├── README.md ├── .gitignore ├── 0x_svm └── two_classes.png ├── 05_neighbors ├── seismic.png └── 05_neighbors.ipynb ├── universal_images ├── mac_icon.png ├── linux_icon.jpg ├── so_confused.jpg ├── windows_icon.jpg ├── changing_stuff.jpg ├── logos.3.600.wide.png ├── red_sticky.600px.png ├── green_sticky.300px.png └── dark_art_logo.600px.png ├── 07_conclusion ├── task_breakdown.jpg └── 07_conclusion.ipynb ├── 04_naive_bayes ├── naive_bayes_ftw.png └── 04_naive_bayes.ipynb ├── 03_linear_reg ├── 500px-Linear_least_squares_example2.svg.png └── 03_linear_reg.ipynb ├── universal_datasets ├── coffee.csv ├── svm_train.csv ├── linreg_train.csv ├── skincancer.txt ├── nbayes_train.csv ├── bananas.csv ├── seeds_dataset.txt └── svm_test.csv ├── behind_the_scenes ├── scikit_learn_outline.txt └── lesson_template.ipynb ├── 00_basics ├── 00_intro.ipynb └── 01_install.ipynb ├── 01_intro_to_sklearn └── 01_intro_to_sklearn.ipynb └── 06_special_topics └── 06_special_topics.ipynb /README.md: -------------------------------------------------------------------------------- 1 | Introduction to Machine Learning 2 | 3 | A class on machine learning fundamentals. 
4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | Icon 4 | toc_generator.py 5 | toc_content.txt 6 | tmp/ 7 | -------------------------------------------------------------------------------- /0x_svm/two_classes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/0x_svm/two_classes.png -------------------------------------------------------------------------------- /05_neighbors/seismic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/05_neighbors/seismic.png -------------------------------------------------------------------------------- /universal_images/mac_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/mac_icon.png -------------------------------------------------------------------------------- /07_conclusion/task_breakdown.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/07_conclusion/task_breakdown.jpg -------------------------------------------------------------------------------- /universal_images/linux_icon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/linux_icon.jpg -------------------------------------------------------------------------------- /universal_images/so_confused.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/so_confused.jpg -------------------------------------------------------------------------------- /04_naive_bayes/naive_bayes_ftw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/04_naive_bayes/naive_bayes_ftw.png -------------------------------------------------------------------------------- /universal_images/windows_icon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/windows_icon.jpg -------------------------------------------------------------------------------- /universal_images/changing_stuff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/changing_stuff.jpg -------------------------------------------------------------------------------- /universal_images/logos.3.600.wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/logos.3.600.wide.png -------------------------------------------------------------------------------- /universal_images/red_sticky.600px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/red_sticky.600px.png -------------------------------------------------------------------------------- /universal_images/green_sticky.300px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/green_sticky.300px.png 
-------------------------------------------------------------------------------- /universal_images/dark_art_logo.600px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/universal_images/dark_art_logo.600px.png -------------------------------------------------------------------------------- /03_linear_reg/500px-Linear_least_squares_example2.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalmerlowe/machine_learning/HEAD/03_linear_reg/500px-Linear_least_squares_example2.svg.png -------------------------------------------------------------------------------- /universal_datasets/coffee.csv: -------------------------------------------------------------------------------- 1 | size,price 2 | 12,2.95 3 | 16,3.65 4 | 20,4.15 5 | 14,3.25 6 | 18,4.20 7 | 12,3.00 8 | 16,3.70 9 | 20,4.25 10 | 14,3.10 11 | 18,4.20 12 | 12,2.90 13 | 16,3.60 14 | 20,4.05 15 | 14,3.15 16 | 18,4.35 -------------------------------------------------------------------------------- /universal_datasets/svm_train.csv: -------------------------------------------------------------------------------- 1 | -2.17754709631256,-9.470645062274183,0 2 | 0.5866631983391473,-1.5769729846575844,1 3 | -0.7122111151907693,-1.4777463824468018,1 4 | 0.22806354259302142,-2.7186868912968043,1 5 | 0.6703812053149667,1.3840879990294623,1 6 | 1.5954782057117587,-2.306608592230297,1 7 | 0.9932612713400982,-0.642729061055628,1 8 | -3.4322246593937606,-10.49157220222962,0 9 | -1.2302546855618406,-10.822985897578395,0 10 | -0.6766442565702735,-10.975821067370852,0 11 | 0.6170397211820922,-0.36833806195116914,1 12 | -1.248459227902333,0.7838694515420985,1 13 | -0.6183571037042156,-6.730825748064225,0 14 | -1.92847153425107,-1.1582803418544843,1 15 | 1.437783001570064,0.33800848757265234,1 16 | -1.3030585529853065,-8.071473900041825,0 17 | 
-0.6332319724629782,-10.196867003209935,0 18 | -0.97221741475837,-10.66801022196572,0 19 | -2.549644619794732,-10.572284501354392,0 20 | -2.333831429048335,-9.669196367903723,0 21 | -------------------------------------------------------------------------------- /universal_datasets/linreg_train.csv: -------------------------------------------------------------------------------- 1 | 22.809099198935886,11.567585959270929 2 | 11.6002266930792,5.214937584688396 3 | 9.619660981755743,4.057581988322643 4 | 6.852974097597672,3.9478072508732525 5 | 14.336799457163249,5.05697380655601 6 | 27.618843876985558,9.992782461863234 7 | 10.29024909134175,5.142966413981185 8 | 13.186541309609833,6.5939678453617665 9 | 1.786110876458814,1.0347533985797999 10 | 13.667809317547082,4.525240768384208 11 | 11.380781174154695,6.406030700340814 12 | 13.41295660992788,7.363962149193506 13 | 2.5807461535222007,0.94853271931713 14 | 7.460994666297125,3.9897745985035664 15 | 28.85091126934303,10.051321497518035 16 | 3.5964057742406252,3.2798432383591973 17 | 19.52171538417383,8.57219785187176 18 | 19.941153477382414,7.02134024799522 19 | 14.058323902941659,7.1724067091601205 20 | 24.45510798931772,12.249732225364653 21 | 24.648900143933385,8.694067487593259 22 | 26.838259421867857,11.824256053953313 23 | 6.3423082978744905,4.3025399313148665 24 | 28.83670420598234,10.755643054594055 25 | 4.770686384401824,5.635315577273484 26 | -------------------------------------------------------------------------------- /universal_datasets/skincancer.txt: -------------------------------------------------------------------------------- 1 | State Lat Mort Ocean Long 2 | Alabama 33.0 219 1 87.0 3 | Arizona 34.5 160 0 112.0 4 | Arkansas 35.0 170 0 92.5 5 | California 37.5 182 1 119.5 6 | Colorado 39.0 149 0 105.5 7 | Connecticut 41.8 159 1 72.8 8 | Delaware 39.0 200 1 75.5 9 | "Wash,D.C." 
39.0 177 0 77.0 10 | Florida 28.0 197 1 82.0 11 | Georgia 33.0 214 1 83.5 12 | Idaho 44.5 116 0 114.0 13 | Illinois 40.0 124 0 89.5 14 | Indiana 40.2 128 0 86.2 15 | Iowa 42.2 128 0 93.8 16 | Kansas 38.5 166 0 98.5 17 | Kentucky 37.8 147 0 85.0 18 | Louisiana 31.2 190 1 91.8 19 | Maine 45.2 117 1 69.0 20 | Maryland 39.0 162 1 76.5 21 | Massachusetts 42.2 143 1 71.8 22 | Michigan 43.5 117 0 84.5 23 | Minnesota 46.0 116 0 94.5 24 | Mississippi 32.8 207 1 90.0 25 | Missouri 38.5 131 0 92.0 26 | Montana 47.0 109 0 110.5 27 | Nebraska 41.5 122 0 99.5 28 | Nevada 39.0 191 0 117.0 29 | NewHampshire 43.8 129 1 71.5 30 | NewJersey 40.2 159 1 74.5 31 | NewMexico 35.0 141 0 106.0 32 | MewYork 43.0 152 1 75.5 33 | NorthCarolina 35.5 199 1 79.5 34 | NorthDakota 47.5 115 0 100.5 35 | Ohio 40.2 131 0 82.8 36 | Oklahoma 35.5 182 0 97.2 37 | Oregon 44.0 136 1 120.5 38 | Pennsylvania 40.8 132 0 77.8 39 | RhodeIsland 41.8 137 1 71.5 40 | SouthCarolina 33.8 178 1 81.0 41 | SouthDakota 44.8 86 0 100.0 42 | Tennessee 36.0 186 0 86.2 43 | Texas 31.5 229 1 98.0 44 | Utah 39.5 142 0 111.5 45 | Vermont 44.0 153 1 72.5 46 | Virginia 37.5 166 1 78.5 47 | Washington 47.5 117 1 121.0 48 | WestVirginia 38.8 136 0 80.8 49 | Wisconsin 44.5 110 0 90.2 50 | Wyoming 43.0 134 0 107.5 51 | -------------------------------------------------------------------------------- /universal_datasets/nbayes_train.csv: -------------------------------------------------------------------------------- 1 | -5.382404587194567,3.099757608778995,0 2 | -4.102454769426915,-0.22318587720290328,1 3 | -0.4753390073762995,-0.20172584185812448,1 4 | -6.9095652430213725,4.632675192089428,0 5 | -6.846011195271036,4.600321549939559,0 6 | -7.656509201546659,4.746101085547851,0 7 | -7.19580142739034,5.161875549959177,0 8 | -4.711391336587894,0.5627270202928222,1 9 | -5.802642378452722,1.023672360976985,1 10 | -5.037444232393378,4.629984227572444,0 11 | -4.293389972832816,-2.808418382062139,1 12 | 
-11.323126282568126,5.550184180900258,0 13 | -4.8570155627180185,7.092042000728633,0 14 | -7.717286574801877,3.9563751696693097,0 15 | -7.0776310695122095,7.939538842461423,0 16 | -7.562696063113033,4.519086752088996,0 17 | -5.698129865720822,1.7984704979523385,1 18 | -2.60172122921113,2.7214232713504187,1 19 | -2.625438418290591,-0.14748265422911344,1 20 | -5.411343701460463,-1.4435342962465745,1 21 | -7.033572036360779,5.749922474394444,0 22 | -6.8671602678098775,3.851870419136311,0 23 | -2.1733506951924717,-0.7266142954139785,1 24 | -7.064118336583272,5.5178249516953,0 25 | -7.308452726945211,0.7640780874799677,1 26 | -3.703422741243391,1.2828341163468846,1 27 | -4.35235223938933,-0.6820592305071826,1 28 | -5.499221484638512,3.5215771249198795,0 29 | -8.3128285248011,3.801595127658697,0 30 | -5.515423287441246,4.729012936907532,0 31 | -3.5129443255240678,-1.3501153523090208,1 32 | -6.662334142960418,2.026832754701778,1 33 | -2.0271979061887766,-0.8121447283980057,1 34 | -5.862343184877673,2.6523840540168337,0 35 | -8.35808544273085,6.494708959792456,0 36 | -5.663716299462879,1.166657620704456,1 37 | -6.5912565372692855,6.015650661543293,0 38 | -10.022692075450513,3.894590838076258,0 39 | -5.046579278266077,5.247125333530684,0 40 | -5.595737605677421,5.301695845965116,0 41 | -6.6157768246245015,4.2628916067331915,0 42 | -9.057322823688411,6.241889576076393,0 43 | -6.138258628898663,5.112801175501586,0 44 | -7.717376638627667,4.652786688842826,0 45 | -5.733255032723233,-0.11185066077215056,1 46 | -4.484849884991392,0.7195962963673159,1 47 | -8.524240952351331,7.761075868345809,0 48 | -6.214121927115906,4.995378578326823,0 49 | -7.662067394981727,0.8799714558836418,1 50 | -7.753695239575434,5.462433721258156,0 51 | -3.7991982154292914,0.7397515581252385,1 52 | -7.389376502476604,3.981618341097725,0 53 | -9.271202266959733,2.297661978821074,0 54 | -8.400990347812272,1.7094527468795904,1 55 | -8.791888061942707,2.4354798418127253,0 56 | 
-6.693040375103849,3.7982249094531513,0 57 | -5.084823494715631,-0.11641218397809415,1 58 | -8.532847481709918,1.7023270591525432,0 59 | -5.759380434769507,1.8546258180107675,1 60 | -8.046517782408685,8.82154108329258,0 61 | -5.484773465774849,0.9518765904458804,1 62 | -7.272208861735192,4.903546834753613,0 63 | -5.386591238920379,1.1735000778359066,1 64 | -6.301757242672959,1.9195203217829664,1 65 | -6.531304540024218,-1.1096130729876057,1 66 | -4.945924283197794,2.4382190668422017,0 67 | -6.64559543579283,-0.9974342683680084,1 68 | -4.680981347817782,0.33011178853495704,1 69 | -3.899427641599536,-2.175418983200336,1 70 | -8.549505415642951,3.38760761707493,0 71 | -6.019677593728213,5.539259661463021,0 72 | -6.319156087139491,1.058795271813489,1 73 | -3.801191061195107,2.6459157448893134,1 74 | -3.2262637138239785,1.315554620842794,1 75 | -5.290731322602657,-1.0881250542469745,1 76 | -4.6620908354242125,-0.6355769376118812,1 77 | -4.5618334160382,3.0397967988983368,1 78 | -6.959431746877026,-0.6965484236578581,1 79 | -2.821505616627221,2.399650690978495,1 80 | -6.277440411755358,-0.42040754951710796,1 81 | -------------------------------------------------------------------------------- /universal_datasets/bananas.csv: -------------------------------------------------------------------------------- 1 | length,width,category 2 | 192.31775277264344,42.11007560961724,0 3 | 237.74108507802904,36.20243450997909,1 4 | 191.60045941916462,43.46940778882997,0 5 | 234.8701831630404,40.06614294198304,1 6 | 228.32958068374643,37.36357939528025,1 7 | 201.5640210495342,50.010346848962094,0 8 | 228.8711201957312,35.16376499623423,1 9 | 208.77022366137354,31.686755406013408,1 10 | 193.2754797044879,36.25323574038754,0 11 | 188.3572766334926,41.54378919667501,0 12 | 198.3314796405088,36.43091673251474,0 13 | 212.2210635834421,25.320415628182857,1 14 | 192.42445114365717,45.44774611909541,0 15 | 182.9786228510589,43.86329325039323,0 16 | 218.3472494657376,32.43700455014038,1 17 | 
217.21173518740238,29.8963931555431,1 18 | 226.4900537124523,42.662368120376684,1 19 | 214.04770392442876,33.58384034135039,1 20 | 226.467817306007,26.13389457309542,1 21 | 198.59803793083216,33.851321727405654,0 22 | 228.18255258853324,25.66287236905584,1 23 | 223.71155079050865,32.75460042384817,1 24 | 234.52887282057452,27.204482603676563,1 25 | 236.42465882478854,33.264421019944734,1 26 | 228.68623213983366,34.95586326713179,1 27 | 214.96001035952295,46.47842176127723,0 28 | 224.97940259954586,32.93612165606207,1 29 | 206.52645343527783,49.17711271694269,0 30 | 231.56887862488688,32.73368819047123,1 31 | 222.9162962374835,43.8399504516882,1 32 | 195.01298226698495,41.887884866642665,0 33 | 181.32492749489995,39.66172663085066,0 34 | 184.9277400372963,44.68296183674407,0 35 | 213.71080584707875,41.92620454209646,0 36 | 201.3684705741947,34.88041187391387,0 37 | 232.28294943490397,41.41209672311486,1 38 | 227.73251158746402,33.13854642857629,1 39 | 201.23256713438084,47.43226108530389,0 40 | 188.69523975711658,40.66129280289728,0 41 | 199.88616893399396,33.979191184877436,0 42 | 205.71180176889115,43.74645054144711,0 43 | 234.77621079416735,25.73104698783016,1 44 | 215.10602303060045,34.979979538638695,1 45 | 231.03313527340998,31.038709255141786,1 46 | 223.12825212706943,27.917884351513962,1 47 | 213.84550388109832,40.585607418905525,0 48 | 197.19458295873218,50.132296008473936,0 49 | 200.84599243665198,44.15751591681757,0 50 | 185.41898467487533,37.47839889229,0 51 | 212.72318293387187,34.381712477724165,1 52 | 225.97054221375797,38.034539625123486,1 53 | 178.61457323129827,40.383861410305286,0 54 | 192.49107476800455,40.826417882129405,0 55 | 229.7631823669321,33.942770850673526,1 56 | 200.25025437616802,34.19181146464484,0 57 | 232.70986222851607,27.44386963791116,1 58 | 198.65652243177007,47.40063545926883,0 59 | 228.6605757010514,31.031297990890856,1 60 | 216.04286267722026,27.6333779653942,1 61 | 201.5666656644656,40.00421371071958,0 62 | 
198.5461871395615,41.59056222969118,0 63 | 223.36943076451664,34.64501070274121,1 64 | 191.76130714750423,45.31348832109596,0 65 | 204.72103803481156,31.25098650901021,1 66 | 238.57207277218245,27.611047812373783,1 67 | 216.10508960488855,32.07708184040278,1 68 | 246.57556631641683,32.172656746965046,1 69 | 182.0195912263871,41.033054856651844,0 70 | 208.1452058229275,32.21874695642551,1 71 | 228.58294275690565,32.96039606501358,1 72 | 192.52164290838397,47.80615612282977,0 73 | 214.93162182666345,28.776977177473075,1 74 | 246.35104102569332,33.544242658998144,1 75 | 238.39857236276623,34.81886817130017,1 76 | 220.91585693036376,35.2976695788017,1 77 | 224.0533785747749,33.22440819855242,1 78 | 226.92368641906506,29.06491169011987,1 79 | 196.79655805155798,47.560538367998184,0 80 | 215.71385482652917,38.476374461149504,1 81 | 197.44019991682435,38.113170828278534,0 82 | 229.0513186093865,28.098397772198272,1 83 | 190.96989665441828,51.450620931659046,0 84 | 206.6366634220767,37.36547919029256,0 85 | 187.6004045582083,36.48795395487299,0 86 | 214.14218055211384,33.54314041189703,1 87 | 196.316538282671,41.580953981208836,0 88 | 189.86608535629077,41.45922774741997,0 89 | 168.77147324395446,46.728065731046186,0 90 | 225.44569193634334,26.01606743239626,1 91 | 186.447527065263,44.40131286038809,0 92 | 241.25825882339808,24.947357733687834,1 93 | 190.98164379983479,41.624082899440566,0 94 | 201.15359305228438,41.19605213408496,0 95 | 190.1425273855761,45.76452890984706,0 96 | 193.81880970714718,39.283908435394,0 97 | 208.9296641363971,47.22937979860898,0 98 | 207.65809268366445,44.467788505266626,0 99 | 216.51279050434854,24.42945865834223,1 100 | 195.172278622729,44.59777853875525,0 101 | 197.6449174120087,41.88582155802365,0 102 | -------------------------------------------------------------------------------- /07_conclusion/07_conclusion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Conclusion\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Review the content of the tutorial\n", 29 | "* Find places to learn more\n", 30 | "* Be inspired to continue the journey" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Review\n", 38 | "---" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## The Processs" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "As we saw, for each model, **The Process** was by and large the same. When I started out, that discovery gave me a sense of relief.\n", 53 | "\n", 54 | "As your familiarity with Scikit Learn increases, you will find that each of the steps is pretty small and pretty straightforward, even if it doesn't feel that way today.\n", 55 | "\n", 56 | "* Prep the data\n", 57 | "* Choose the model\n", 58 | "* Choose appropriate hyperparameters\n", 59 | "* Fit the model\n", 60 | "* Apply the model\n", 61 | "* Examine the results\n", 62 | "\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Where to go next\n", 70 | "---" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "Machine learning is just one part of a data analysis system.\n", 78 | "\n", 79 | "* Frame the problem\n", 80 | "* Data acquisition\n", 81 | "* Data wrangling (cleansing, normalization, filtering, deduplication, etc)\n", 82 | "* Data exploration\n", 83 | "* In-depth analysis (sometimes machine learning)\n", 
84 | "* Communication of results\n", 85 | "\n", 86 | "If you are part of a team, you may get lucky enough to find someone who is adept at some of these steps (hopefully the most tedious and time-consuming steps).\n", 87 | "\n", 88 | "But for the rest of us, we have to do some or all of the steps ourselves.\n", 89 | "\n", 90 | "If that is the case, then it behooves you to grow your skills not only on the In-depth Analysis steps, but on all of the other steps as well. Especially the ones where you will spend the greatest amount of time.\n", 91 | "\n", 92 | "\n", 93 | "\n", 94 | "Source: [Forbes article on data analysis task breakdowns](https://www.forbes.com/sites/gilpress/2016/03/23/data-preparation-most-time-consuming-least-enjoyable-data-science-task-survey-says/#50b1f0236f63)\n", 95 | "\n", 96 | "\n", 97 | "**Yeah, but what about machine learning?**\n", 98 | "\n", 99 | "As you grow your skills in the ancillary skills, how do you grow your machine learning skills?\n", 100 | "\n", 101 | "* find the environment you learn best in: books, videos, classrooms\n", 102 | "* meet folks at meetups\n", 103 | "* find a pet project that will keep your attention\n", 104 | "* write code. every day. no, really, every day!\n", 105 | "\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# A little inspiration\n", 113 | "---" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "As you continue your studies, you will often come across tutorials and code samples and by and large, each of them will follow the steps in **The Process™** (whether they call it that or not). 
When faced with a wall of code...\n", 121 | "\n", 122 | "* take a deep breath\n", 123 | "* break the code into parts \n", 124 | "* build the code line by line (don't just cut and paste)\n", 125 | "* identify which step we are working on in any given code block\n", 126 | "* separate the `must have` code from the `nice to have` code (the machine learning steps from the data engineering OR from the data viz steps)\n", 127 | "* run the code line by line, and thoroughly think about and examine what is produced by each line of code (is it a numpy array, a dataframe, a model, a prediction, etc)\n", 128 | "\n", 129 | "And most importantly:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "**Don't give up**\n", 137 | "\n", 138 | "**Be curious**\n", 139 | "\n", 140 | "**Study hard**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Experience Points!\n", 148 | "---" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "slideshow": { 155 | "slide_type": "slide" 156 | } 157 | }, 158 | "source": [ 159 | "# Final task: task 01\n", 160 | "\n", 161 | "Tell yourself: \"**I've got this!**\"\n", 162 | "\n", 163 | "Repeat" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "---\n", 171 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 172 | "\n", 173 | "If you want to continue on at your own-pace, please feel free to do so.\n", 174 | "\n", 175 | "" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "# References\n", 183 | "---" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Below are references that may assist you in learning more:\n", 191 | " \n", 192 | "|Title (link)|Comments|\n", 193 | "|---|---|\n", 194 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n", 195 | "|[Forbes article](https://www.forbes.com/sites/gilpress/2016/03/23/data-preparation-most-time-consuming-least-enjoyable-data-science-task-survey-says/#50b1f0236f63)|Article on data analysis task breakdowns|\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.6.7" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 2 227 | } 228 | -------------------------------------------------------------------------------- /behind_the_scenes/scikit_learn_outline.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | 3 | 0) Introduction: (10 mins - discussion) 4 | * Teacher introduction [95% done] 5 | * Agenda overview [95% done] 6 | * Accessing the course material 7 | > Individual student environments and course material (notebooks and 8 | datafiles) will be hosted 
in a Jupyter Hub and accessible via the 9 | Internet. 10 | > Instructions will ALSO be provided in advance for students to install 11 | the requisite libraries a) should there be a constraint in accessing the 12 | materials online and b) so that students can use the course materials 13 | after the course is done. 14 | 15 | 1) Machine Learning Overview (30 mins) 16 | * An overview of machine learning types and techniques 17 | * Supervised learning 18 | * Unsupervised learning 19 | * Classification 20 | * Regression 21 | * Clustering 22 | * Dimensionality reduction 23 | * Intro to Scikit-Learn 24 | 25 | 2) Naive bayes Classification (20 mins) 26 | * Overview 27 | * Hands-on code examples 28 | * When to use and when not to use Naive Bayes Classification 29 | 30 | BREAK (10 mins) ------------------- 31 | 32 | 3) Linear Regression (15 mins) 33 | * Overview 34 | * Hands-on code examples 35 | * When to use and when not to use Linear Regression 36 | 37 | 4) Support Vector Machines (20 mins) 38 | * Overview 39 | * Hands-on code examples 40 | * When to use and when not to use Support Vector Machines 41 | 42 | BREAK (10 mins) ------------------- 43 | 44 | 5) Decision Trees and Random Forests (20 mins) 45 | * Overview 46 | * Hands-on code examples 47 | * When to use and when not to use Decision Trees 48 | 49 | 6) Principal Component Analysis (PCA) (20 mins) 50 | * Overview 51 | * Hands-on code examples 52 | * When to use and when not to use PCA 53 | 54 | 7) Conclusion (10 mins) 55 | * Gotchas and problems with using machine learning 56 | * Places to learn more 57 | 58 | TITLE 59 | Scikit-learn, wrapping your head around machine learning 60 | 61 | DESCRIPTION 62 | Edit using Markdown. 63 | 64 | Both your title and this description are made public and displayed in the conference program to help attendees decide whether they are interested in this presentation. Limit this description to a few concise paragraphs. 
65 | 66 | A gentle introduction to machine learning through scikit-learn. This tutorial will enable attendees to understand the capabilities and limitations of machine learning through hands-on code examples and fun and interesting datasets. Learn when to turn to machine learning and which tools apply to your problem. Also learn about gotchas and problems that are likely to show up when attempting to use machine learning. 67 | 68 | AUDIENCE 69 | 1–2 paragraphs that should answer three questions: (1) Who is this tutorial for? (2) What background knowledge or experience do you expect students to have? (3) What do you expect students to learn, or to be able to do after attending your tutorial? 70 | 71 | Students that attend this tutorial should have a basic understanding of the following: 72 | * Python, to include importing libraries, writing simple functions, using datatypes such as dicts, sets and lists, and reading and writing files 73 | * The use of simple Jupyter/IPython notebooks 74 | * Familiarity with pandas and dataframes will be useful 75 | 76 | NOTE: previous knowledge of machine learning OR scikit-learn is not required. 77 | 78 | OUTLINE 79 | 80 | Make an outline that lists the topics and activities you will guide your students through over the 3 hours of your tutorial. Provide timings for each activity — indicate when and for how long you will lecture, and when and for how long students will be tackling hands-on exercises. This is a very important criteria! Generally speaking, the more detailed the outline, the more confidence the committee will have that you can deliver the material in the allotted time. 81 | 82 | ADDITIONAL NOTES 83 | 84 | (a) If you have offered this tutorial before, please provide links to the material and video, if possible. Otherwise, please provide links to one (or two!) previous presentations by each speaker. (b) Please summarize your teaching or public speaking experience and your experience with the subject of the tutorial. 
(c) Let us know if you have specific needs or special requests — for example, requests that involve accessibility, audio, or restrictions on when your talk can be scheduled. 85 | 86 | a) The material in this tutorial is based on a planned series of scikit-learn mentoring sessions designed for the PyHawaii Python meetup. These mentoring sessions are first taught to members of PyHawaii as a community service and are then delivered to clients in the data analysis industry. Thus this mentoring material will undergo at least two cycles of revision before being taught at Pycon. 87 | 88 | I am the past Chair of the Python Education Summit (held annually at PyCon) and have given technical presentations at Pycon, PyHawaii and PyOhio, in business settings, at the collegiate level and for high schoolers and youth. At Pycon and PyOhio, I host the 3-hour workshop on Preparing to Contribute to Open Source using git, virtual environments and Github. At PyHawaii, I am the lead instructor for our fortnightly mentoring sessions. I am the founder and lead instructor for Dark Art of Coding, a programming school. I served as adjunct faculty, teaching Programming in Python, at the University of Hawaii: (https://www.sis.hawaii.edu/uhdad/avail.class?i=MAN&t=201740&c=92245) 89 | 90 | b) I have been teaching technical computer-related topics for decades to a wide range of students, young and old, newbie to advanced. Through this experience, I have been able to evolve a teaching style that helps to match the message to the student: easing students into a subject at a pace that matches their knowledge level. 
Some examples of my training courses include: 91 | 92 | * Statistics and probability: your first steps on the road to data science (3 hours) @ Pycon 2018 (https://www.youtube.com/watch?v=zzbw0JbiI6Y) 93 | * Introduction to Bokeh: data visualization (3 hours) @ Pycon 2017 (https://www.youtube.com/watch?v=xId9B1BVusA) 94 | * Jupyter: Introduction to Jupyter Lab/Notebooks Tutorial (3 hours) 95 | * Python for Analysts bootcamp (40 hours) 96 | * Founder/Lead instructor for Dark Art of Coding (Intro to Programming with Python & Automating Everyday Tasks Using Python) 97 | * Adjunct Faculty, University of Hawaii, Introduction to Programming 98 | * Lead instructor for DjangoGirls Workshop (10 hours) 99 | * Preparing to Contribute to Open Source (4 hours) 100 | * Operating System and Network Security at Champlain College 101 | * Operating System Fundamentals 102 | * BASH Scripting 103 | * Windows Exploitation 104 | * Linux Exploitation 105 | 106 | 107 | 108 | BIO: 109 | Chalmer Lowe has served on Pycon's Python Education Summit Committee for many years. He helps run the Pycon Sprint Workshops every year. He founded Dark Art of Coding, a programming school. Chalmer founded PyHawaii. He performs data analysis for his employer: Booz Allen Hamilton and teaches Python to his colleagues, clients and anyone who will stand still long enough. Chalmer has a long history in the cyber security and programming fields including: Python, scripting/automation, penetration testing, vulnerability assessment, incident response, intel analysis, data analysis and the fundamentals of data science. 
110 | -------------------------------------------------------------------------------- /universal_datasets/seeds_dataset.txt: -------------------------------------------------------------------------------- 1 | 15.26 14.84 0.871 5.763 3.312 2.221 5.22 1 2 | 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1 3 | 14.29 14.09 0.905 5.291 3.337 2.699 4.825 1 4 | 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 1 5 | 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 1 6 | 14.38 14.21 0.8951 5.386 3.312 2.462 4.956 1 7 | 14.69 14.49 0.8799 5.563 3.259 3.586 5.219 1 8 | 14.11 14.1 0.8911 5.42 3.302 2.7 5 1 9 | 16.63 15.46 0.8747 6.053 3.465 2.04 5.877 1 10 | 16.44 15.25 0.888 5.884 3.505 1.969 5.533 1 11 | 15.26 14.85 0.8696 5.714 3.242 4.543 5.314 1 12 | 14.03 14.16 0.8796 5.438 3.201 1.717 5.001 1 13 | 13.89 14.02 0.888 5.439 3.199 3.986 4.738 1 14 | 13.78 14.06 0.8759 5.479 3.156 3.136 4.872 1 15 | 13.74 14.05 0.8744 5.482 3.114 2.932 4.825 1 16 | 14.59 14.28 0.8993 5.351 3.333 4.185 4.781 1 17 | 13.99 13.83 0.9183 5.119 3.383 5.234 4.781 1 18 | 15.69 14.75 0.9058 5.527 3.514 1.599 5.046 1 19 | 14.7 14.21 0.9153 5.205 3.466 1.767 4.649 1 20 | 12.72 13.57 0.8686 5.226 3.049 4.102 4.914 1 21 | 14.16 14.4 0.8584 5.658 3.129 3.072 5.176 1 22 | 14.11 14.26 0.8722 5.52 3.168 2.688 5.219 1 23 | 15.88 14.9 0.8988 5.618 3.507 0.7651 5.091 1 24 | 12.08 13.23 0.8664 5.099 2.936 1.415 4.961 1 25 | 15.01 14.76 0.8657 5.789 3.245 1.791 5.001 1 26 | 16.19 15.16 0.8849 5.833 3.421 0.903 5.307 1 27 | 13.02 13.76 0.8641 5.395 3.026 3.373 4.825 1 28 | 12.74 13.67 0.8564 5.395 2.956 2.504 4.869 1 29 | 14.11 14.18 0.882 5.541 3.221 2.754 5.038 1 30 | 13.45 14.02 0.8604 5.516 3.065 3.531 5.097 1 31 | 13.16 13.82 0.8662 5.454 2.975 0.8551 5.056 1 32 | 15.49 14.94 0.8724 5.757 3.371 3.412 5.228 1 33 | 14.09 14.41 0.8529 5.717 3.186 3.92 5.299 1 34 | 13.94 14.17 0.8728 5.585 3.15 2.124 5.012 1 35 | 15.05 14.68 0.8779 5.712 3.328 2.129 5.36 1 36 | 16.12 15 0.9 5.709 3.485 2.27 5.443 1 37 | 16.2 15.27 0.8734 5.826 
3.464 2.823 5.527 1 38 | 17.08 15.38 0.9079 5.832 3.683 2.956 5.484 1 39 | 14.8 14.52 0.8823 5.656 3.288 3.112 5.309 1 40 | 14.28 14.17 0.8944 5.397 3.298 6.685 5.001 1 41 | 13.54 13.85 0.8871 5.348 3.156 2.587 5.178 1 42 | 13.5 13.85 0.8852 5.351 3.158 2.249 5.176 1 43 | 13.16 13.55 0.9009 5.138 3.201 2.461 4.783 1 44 | 15.5 14.86 0.882 5.877 3.396 4.711 5.528 1 45 | 15.11 14.54 0.8986 5.579 3.462 3.128 5.18 1 46 | 13.8 14.04 0.8794 5.376 3.155 1.56 4.961 1 47 | 15.36 14.76 0.8861 5.701 3.393 1.367 5.132 1 48 | 14.99 14.56 0.8883 5.57 3.377 2.958 5.175 1 49 | 14.79 14.52 0.8819 5.545 3.291 2.704 5.111 1 50 | 14.86 14.67 0.8676 5.678 3.258 2.129 5.351 1 51 | 14.43 14.4 0.8751 5.585 3.272 3.975 5.144 1 52 | 15.78 14.91 0.8923 5.674 3.434 5.593 5.136 1 53 | 14.49 14.61 0.8538 5.715 3.113 4.116 5.396 1 54 | 14.33 14.28 0.8831 5.504 3.199 3.328 5.224 1 55 | 14.52 14.6 0.8557 5.741 3.113 1.481 5.487 1 56 | 15.03 14.77 0.8658 5.702 3.212 1.933 5.439 1 57 | 14.46 14.35 0.8818 5.388 3.377 2.802 5.044 1 58 | 14.92 14.43 0.9006 5.384 3.412 1.142 5.088 1 59 | 15.38 14.77 0.8857 5.662 3.419 1.999 5.222 1 60 | 12.11 13.47 0.8392 5.159 3.032 1.502 4.519 1 61 | 11.42 12.86 0.8683 5.008 2.85 2.7 4.607 1 62 | 11.23 12.63 0.884 4.902 2.879 2.269 4.703 1 63 | 12.36 13.19 0.8923 5.076 3.042 3.22 4.605 1 64 | 13.22 13.84 0.868 5.395 3.07 4.157 5.088 1 65 | 12.78 13.57 0.8716 5.262 3.026 1.176 4.782 1 66 | 12.88 13.5 0.8879 5.139 3.119 2.352 4.607 1 67 | 14.34 14.37 0.8726 5.63 3.19 1.313 5.15 1 68 | 14.01 14.29 0.8625 5.609 3.158 2.217 5.132 1 69 | 14.37 14.39 0.8726 5.569 3.153 1.464 5.3 1 70 | 12.73 13.75 0.8458 5.412 2.882 3.533 5.067 1 71 | 17.63 15.98 0.8673 6.191 3.561 4.076 6.06 2 72 | 16.84 15.67 0.8623 5.998 3.484 4.675 5.877 2 73 | 17.26 15.73 0.8763 5.978 3.594 4.539 5.791 2 74 | 19.11 16.26 0.9081 6.154 3.93 2.936 6.079 2 75 | 16.82 15.51 0.8786 6.017 3.486 4.004 5.841 2 76 | 16.77 15.62 0.8638 5.927 3.438 4.92 5.795 2 77 | 17.32 15.91 0.8599 6.064 3.403 3.824 5.922 2 78 | 
20.71 17.23 0.8763 6.579 3.814 4.451 6.451 2 79 | 18.94 16.49 0.875 6.445 3.639 5.064 6.362 2 80 | 17.12 15.55 0.8892 5.85 3.566 2.858 5.746 2 81 | 16.53 15.34 0.8823 5.875 3.467 5.532 5.88 2 82 | 18.72 16.19 0.8977 6.006 3.857 5.324 5.879 2 83 | 20.2 16.89 0.8894 6.285 3.864 5.173 6.187 2 84 | 19.57 16.74 0.8779 6.384 3.772 1.472 6.273 2 85 | 19.51 16.71 0.878 6.366 3.801 2.962 6.185 2 86 | 18.27 16.09 0.887 6.173 3.651 2.443 6.197 2 87 | 18.88 16.26 0.8969 6.084 3.764 1.649 6.109 2 88 | 18.98 16.66 0.859 6.549 3.67 3.691 6.498 2 89 | 21.18 17.21 0.8989 6.573 4.033 5.78 6.231 2 90 | 20.88 17.05 0.9031 6.45 4.032 5.016 6.321 2 91 | 20.1 16.99 0.8746 6.581 3.785 1.955 6.449 2 92 | 18.76 16.2 0.8984 6.172 3.796 3.12 6.053 2 93 | 18.81 16.29 0.8906 6.272 3.693 3.237 6.053 2 94 | 18.59 16.05 0.9066 6.037 3.86 6.001 5.877 2 95 | 18.36 16.52 0.8452 6.666 3.485 4.933 6.448 2 96 | 16.87 15.65 0.8648 6.139 3.463 3.696 5.967 2 97 | 19.31 16.59 0.8815 6.341 3.81 3.477 6.238 2 98 | 18.98 16.57 0.8687 6.449 3.552 2.144 6.453 2 99 | 18.17 16.26 0.8637 6.271 3.512 2.853 6.273 2 100 | 18.72 16.34 0.881 6.219 3.684 2.188 6.097 2 101 | 16.41 15.25 0.8866 5.718 3.525 4.217 5.618 2 102 | 17.99 15.86 0.8992 5.89 3.694 2.068 5.837 2 103 | 19.46 16.5 0.8985 6.113 3.892 4.308 6.009 2 104 | 19.18 16.63 0.8717 6.369 3.681 3.357 6.229 2 105 | 18.95 16.42 0.8829 6.248 3.755 3.368 6.148 2 106 | 18.83 16.29 0.8917 6.037 3.786 2.553 5.879 2 107 | 18.85 16.17 0.9056 6.152 3.806 2.843 6.2 2 108 | 17.63 15.86 0.88 6.033 3.573 3.747 5.929 2 109 | 19.94 16.92 0.8752 6.675 3.763 3.252 6.55 2 110 | 18.55 16.22 0.8865 6.153 3.674 1.738 5.894 2 111 | 18.45 16.12 0.8921 6.107 3.769 2.235 5.794 2 112 | 19.38 16.72 0.8716 6.303 3.791 3.678 5.965 2 113 | 19.13 16.31 0.9035 6.183 3.902 2.109 5.924 2 114 | 19.14 16.61 0.8722 6.259 3.737 6.682 6.053 2 115 | 20.97 17.25 0.8859 6.563 3.991 4.677 6.316 2 116 | 19.06 16.45 0.8854 6.416 3.719 2.248 6.163 2 117 | 18.96 16.2 0.9077 6.051 3.897 4.334 5.75 2 118 | 19.15 
16.45 0.889 6.245 3.815 3.084 6.185 2 119 | 18.89 16.23 0.9008 6.227 3.769 3.639 5.966 2 120 | 20.03 16.9 0.8811 6.493 3.857 3.063 6.32 2 121 | 20.24 16.91 0.8897 6.315 3.962 5.901 6.188 2 122 | 18.14 16.12 0.8772 6.059 3.563 3.619 6.011 2 123 | 16.17 15.38 0.8588 5.762 3.387 4.286 5.703 2 124 | 18.43 15.97 0.9077 5.98 3.771 2.984 5.905 2 125 | 15.99 14.89 0.9064 5.363 3.582 3.336 5.144 2 126 | 18.75 16.18 0.8999 6.111 3.869 4.188 5.992 2 127 | 18.65 16.41 0.8698 6.285 3.594 4.391 6.102 2 128 | 17.98 15.85 0.8993 5.979 3.687 2.257 5.919 2 129 | 20.16 17.03 0.8735 6.513 3.773 1.91 6.185 2 130 | 17.55 15.66 0.8991 5.791 3.69 5.366 5.661 2 131 | 18.3 15.89 0.9108 5.979 3.755 2.837 5.962 2 132 | 18.94 16.32 0.8942 6.144 3.825 2.908 5.949 2 133 | 15.38 14.9 0.8706 5.884 3.268 4.462 5.795 2 134 | 16.16 15.33 0.8644 5.845 3.395 4.266 5.795 2 135 | 15.56 14.89 0.8823 5.776 3.408 4.972 5.847 2 136 | 15.38 14.66 0.899 5.477 3.465 3.6 5.439 2 137 | 17.36 15.76 0.8785 6.145 3.574 3.526 5.971 2 138 | 15.57 15.15 0.8527 5.92 3.231 2.64 5.879 2 139 | 15.6 15.11 0.858 5.832 3.286 2.725 5.752 2 140 | 16.23 15.18 0.885 5.872 3.472 3.769 5.922 2 141 | 13.07 13.92 0.848 5.472 2.994 5.304 5.395 3 142 | 13.32 13.94 0.8613 5.541 3.073 7.035 5.44 3 143 | 13.34 13.95 0.862 5.389 3.074 5.995 5.307 3 144 | 12.22 13.32 0.8652 5.224 2.967 5.469 5.221 3 145 | 11.82 13.4 0.8274 5.314 2.777 4.471 5.178 3 146 | 11.21 13.13 0.8167 5.279 2.687 6.169 5.275 3 147 | 11.43 13.13 0.8335 5.176 2.719 2.221 5.132 3 148 | 12.49 13.46 0.8658 5.267 2.967 4.421 5.002 3 149 | 12.7 13.71 0.8491 5.386 2.911 3.26 5.316 3 150 | 10.79 12.93 0.8107 5.317 2.648 5.462 5.194 3 151 | 11.83 13.23 0.8496 5.263 2.84 5.195 5.307 3 152 | 12.01 13.52 0.8249 5.405 2.776 6.992 5.27 3 153 | 12.26 13.6 0.8333 5.408 2.833 4.756 5.36 3 154 | 11.18 13.04 0.8266 5.22 2.693 3.332 5.001 3 155 | 11.36 13.05 0.8382 5.175 2.755 4.048 5.263 3 156 | 11.19 13.05 0.8253 5.25 2.675 5.813 5.219 3 157 | 11.34 12.87 0.8596 5.053 2.849 3.347 5.003 3 
158 | 12.13 13.73 0.8081 5.394 2.745 4.825 5.22 3 159 | 11.75 13.52 0.8082 5.444 2.678 4.378 5.31 3 160 | 11.49 13.22 0.8263 5.304 2.695 5.388 5.31 3 161 | 12.54 13.67 0.8425 5.451 2.879 3.082 5.491 3 162 | 12.02 13.33 0.8503 5.35 2.81 4.271 5.308 3 163 | 12.05 13.41 0.8416 5.267 2.847 4.988 5.046 3 164 | 12.55 13.57 0.8558 5.333 2.968 4.419 5.176 3 165 | 11.14 12.79 0.8558 5.011 2.794 6.388 5.049 3 166 | 12.1 13.15 0.8793 5.105 2.941 2.201 5.056 3 167 | 12.44 13.59 0.8462 5.319 2.897 4.924 5.27 3 168 | 12.15 13.45 0.8443 5.417 2.837 3.638 5.338 3 169 | 11.35 13.12 0.8291 5.176 2.668 4.337 5.132 3 170 | 11.24 13 0.8359 5.09 2.715 3.521 5.088 3 171 | 11.02 13 0.8189 5.325 2.701 6.735 5.163 3 172 | 11.55 13.1 0.8455 5.167 2.845 6.715 4.956 3 173 | 11.27 12.97 0.8419 5.088 2.763 4.309 5 3 174 | 11.4 13.08 0.8375 5.136 2.763 5.588 5.089 3 175 | 10.83 12.96 0.8099 5.278 2.641 5.182 5.185 3 176 | 10.8 12.57 0.859 4.981 2.821 4.773 5.063 3 177 | 11.26 13.01 0.8355 5.186 2.71 5.335 5.092 3 178 | 10.74 12.73 0.8329 5.145 2.642 4.702 4.963 3 179 | 11.48 13.05 0.8473 5.18 2.758 5.876 5.002 3 180 | 12.21 13.47 0.8453 5.357 2.893 1.661 5.178 3 181 | 11.41 12.95 0.856 5.09 2.775 4.957 4.825 3 182 | 12.46 13.41 0.8706 5.236 3.017 4.987 5.147 3 183 | 12.19 13.36 0.8579 5.24 2.909 4.857 5.158 3 184 | 11.65 13.07 0.8575 5.108 2.85 5.209 5.135 3 185 | 12.89 13.77 0.8541 5.495 3.026 6.185 5.316 3 186 | 11.56 13.31 0.8198 5.363 2.683 4.062 5.182 3 187 | 11.81 13.45 0.8198 5.413 2.716 4.898 5.352 3 188 | 10.91 12.8 0.8372 5.088 2.675 4.179 4.956 3 189 | 11.23 12.82 0.8594 5.089 2.821 7.524 4.957 3 190 | 10.59 12.41 0.8648 4.899 2.787 4.975 4.794 3 191 | 10.93 12.8 0.839 5.046 2.717 5.398 5.045 3 192 | 11.27 12.86 0.8563 5.091 2.804 3.985 5.001 3 193 | 11.87 13.02 0.8795 5.132 2.953 3.597 5.132 3 194 | 10.82 12.83 0.8256 5.18 2.63 4.853 5.089 3 195 | 12.11 13.27 0.8639 5.236 2.975 4.132 5.012 3 196 | 12.8 13.47 0.886 5.16 3.126 4.873 4.914 3 197 | 12.79 13.53 0.8786 5.224 3.054 5.483 
4.958 3 198 | 13.37 13.78 0.8849 5.32 3.128 4.67 5.091 3 199 | 12.62 13.67 0.8481 5.41 2.911 3.306 5.231 3 200 | 12.76 13.38 0.8964 5.073 3.155 2.828 4.83 3 201 | 12.38 13.44 0.8609 5.219 2.989 5.472 5.045 3 202 | 12.67 13.32 0.8977 4.984 3.135 2.3 4.745 3 203 | 11.18 12.72 0.868 5.009 2.81 4.051 4.828 3 204 | 12.7 13.41 0.8874 5.183 3.091 8.456 5 3 205 | 12.37 13.47 0.8567 5.204 2.96 3.919 5.001 3 206 | 12.19 13.2 0.8783 5.137 2.981 3.631 4.87 3 207 | 11.23 12.88 0.8511 5.14 2.795 4.325 5.003 3 208 | 13.2 13.66 0.8883 5.236 3.232 8.315 5.056 3 209 | 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 3 210 | 12.3 13.34 0.8684 5.243 2.974 5.637 5.063 3 -------------------------------------------------------------------------------- /00_basics/00_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Class overview\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Get to know the instructor\n", 29 | "* Review 'what to expect' and 'what not to expect'\n", 30 | "* Explore how to access the class materials" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Instructor Intro\n", 38 | "---" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "My name is **Chalmer Lowe**. I like to stay busy ... To that end, I:\n", 46 | "\n", 47 | "* work for **[Booz Allen Hamilton](https://www.boozallen.com/about.html)**, a technology and consulting firm with one guiding purpose—to empower people to change the world. 
\n", 48 | "* founded a programming school called **[Dark Art of Coding](https://darkartofcoding.com/)**, where I teach Python, data analysis and data science.\n", 49 | "* founded **[PyHawaii](https://www.meetup.com/PyHawaii-Python-Users-Group/)**, the largest and most active programming meetup in Hawaii. \n", 50 | "* serve on the **[Pycon Education Summit Committee](https://us.pycon.org/2019/events/edusummit/#!)**.\n", 51 | "* help teach the **[Introduction to Sprinting Hands-on Tutorial](https://us.pycon.org/2019/community/sprints/)** (late Sunday night!)\n", 52 | "* this is my third Pycon tutorial... previous trips into the lion's den include:\n", 53 | " * Introduction to Bokeh\n", 54 | " * Introduction to Statistics and Probability\n", 55 | "* have contributed (in minor ways) to bokeh, pandas, Jupyter and more.\n", 56 | "\n", 57 | "|Social Media|Contact |\n", 58 | "|----:|----:|\n", 59 | "|Twitter|@chalmer_lowe|\n", 60 | "|Email|info@darkartofcoding.com|\n", 61 | "|Linkedin|https://www.linkedin.com/in/chalmerlowe/ |" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# What to expect/not expect\n", 69 | "---" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "We have a lot of material to cover...\n", 77 | "\n", 78 | "1. We are gonna move at a **rapid pace**\n", 79 | "\n", 80 | "1. I will gladly take questions, but depending on the depth of the question and/or the relevance of the question to what we are trying to accomplish and/or whether I can answer the question, I might defer the question to a **parking lot** and cover it at the end OR after the tutorial ends\n", 81 | "\n", 82 | "1. During certain points in the tutorial, I may be able to support some one-on-one conversations and/or over-the-shoulder help, but it will be dependant on our progress\n", 83 | "\n", 84 | "1. There is a lot of math under the hood, when doing machine learning. 
There is almost no math in this tutorial\n", 85 | "\n", 86 | "1. If someone runs into a significant snag that can't be rectified in a timely fashion, I will invite you to simply take a deep breath, relax and enjoy the show OR partner with someone beside you >>> Please note, I would be happy to visit with you later during the conference OR during the Sprints to do a deep dive troubleshooting session.\n", 87 | "\n", 88 | "1. My main ambition today is **to teach you to learn**. I will do what I can to:\n", 89 | " * Help you know where to look for more info\n", 90 | " * Help you explore some of the most commonly used grammar and vocabulary\n", 91 | "\n", 92 | "1. Expectation management: No one is gonna walk out of this room, after only three hours, as a ninja/guru/rockstar\n", 93 | "\n", 94 | "1. For you to get there, will take considerable time, effort, practice, repetition and additional study" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "# Lesson layout...\n", 102 | "---" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "slide" 110 | } 111 | }, 112 | "source": [ 113 | "1. Each lesson starts with a high level **overview**\n", 114 | "\n", 115 | "1. The lesson then provides some **code samples** that you should be able to run and explore\n", 116 | "\n", 117 | "1. After the code, there will generally be some written documentation in a **deep dive** that we will cover at a high level\n", 118 | "\n", 119 | "1. To help us keep to our schedule, each section is **time-boxed**... we will cover as much as we can and then we will move on\n", 120 | "\n", 121 | "1. That deep dive is specifically designed for you to go back to for additional clarification at your leisure and to help account for the fact that with limited time, we won't have the ability to dwell deeply on any given topic\n", 122 | "\n", 123 | "1. 
In some of the sessions, there may be some opportunities for you to do some hands-on exercises... when you complete the exercise, put up your **green post-it**. If you run into a snag, put up your **red post-it**. (see a sample exercise below)\n", 124 | "\n", 125 | "1. Each lesson and exercise should be fairly stand-alone with the mindset that if something in a particular lesson doesn't work for you, it hopefully won't affect any of the follow-on lessons. If you run into a snag that we can't resolve, feel free to **make a note of it** and we can explore it together later today OR this weekend. " 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "# Experience Points!\n", 133 | "---" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "slideshow": { 140 | "slide_type": "slide" 141 | } 142 | }, 143 | "source": [ 144 | "Sample exercise...\n", 145 | "\n", 146 | "1. Remind yourself \"I've got this!\"\n", 147 | "1. Repeat\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "---\n", 155 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 156 | "\n", 157 | "If you want to continue on at your own-pace, please feel free to do so.\n", 158 | "\n", 159 | "" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "# Accessing the class materials\n", 167 | "---" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "There are several primary ways to access the class materials:\n", 175 | " \n", 176 | "|Source|Interactive?|\n", 177 | "|:---|:---|\n", 178 | "|Via the class Jupyter Hub |Fully interactive\n", 179 | "|Via a local install you run in an environment on your machine |Fully interactive\n", 180 | "|Via Github |Non-interactive \n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Via Jupyter Hub" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Each of you should have received an **instruction flyer** with your **personal student username** that will walk you through a process to access the class material.\n", 195 | "\n", 196 | "If you follow those steps, you should be able to access all the course content **without having to install anything** on your local machine.\n", 197 | "\n", 198 | "The Jupyter Hub will only be available for the duration of the tutorial, so instructions are provided below to allow you to install the needed software and download all the course content to your local machine." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## Via local install" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "To install the software needed to run the class and to go through the class material on your local machine, go to this link and follow the instructions you find there:\n", 213 | "\n", 214 | "[**https://github.com/chalmerlowe/machine_learning/blob/master/00_basics/01_install.ipynb**](https://github.com/chalmerlowe/machine_learning/blob/master/00_basics/01_install.ipynb)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Via Github" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "If all else fails, you can follow along in a static repository of files hosted on Github. You won't be able to run code, but you can still track the class. Chalmer's Machine Learning Tutorial can be found here:\n", 229 | "\n", 230 | "[**https://github.com/chalmerlowe/machine_learning**](https://github.com/chalmerlowe/machine_learning)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.6.7" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /behind_the_scenes/lesson_template.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "replace_with_topic\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* \n", 29 | "* \n", 30 | "* \n", 31 | "\n", 32 | "\n", 33 | "* DELETE: Short list of expected outcomes\n", 34 | "* DELETE: Use active verbs: Create, Change, Manipulate, Explore, etc.\n", 35 | "* DELETE: Keep outcomes measurable, where possible: Success means the thing was created OR the object was changed " 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Overview: Model X\n", 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Prep the data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We start with a set of standard imports..." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import matplotlib.pyplot as plt\n", 76 | "import numpy as np\n", 77 | "import pandas as pd\n", 78 | "import sklearn\n", 79 | "\n", 80 | "# NOTE: during the Choose the Model step, we will import the \n", 81 | "# model we want, but there is no reason you can't import it here.\n", 82 | "# from sklearn.xx import XX" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Prep the training data and test data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n", 99 | " names=[])\n", 100 | "df.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "df.shape" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "X = df[].to_frame()\n", 119 | "y = df[]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "X[:5]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "y[:5]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "With our data imported, let's separate it into training data and test data." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 2, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from sklearn.model_selection import train_test_split" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "plt.scatter(X_train, y_train)\n", 172 | "plt.title(\"\")\n", 173 | "plt.xlabel(\"\")\n", 174 | "plt.ylabel(\"\");" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Choose the Model" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "## Choose Appropriate Hyperparameters" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Here we choose to assign xx hyperparameters: `xx` and `xx`. We will discuss both later." 
205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "There are a number of hyperparameters\n", 221 | "\n", 222 | "```python\n", 223 | "XX()\n", 224 | "```" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Fit the Model" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "model.fit(X_train, y_train)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Apply the Model" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "y_pred = model.predict(X_test)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "y_pred.shape" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "y_pred[::100]" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": true 299 | }, 300 | "outputs": [], 301 | "source": [] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## Examine the results" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": 
{ 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "# Gotchas\n", 324 | "---" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "collapsed": true 332 | }, 333 | "outputs": [], 334 | "source": [] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "# Deep Dive\n", 341 | "---" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "# Gotchas\n", 358 | "---" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "# How to learn more: tips and hints\n", 375 | "---" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "# Experience Points!\n", 392 | "---" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "slideshow": { 399 | "slide_type": "slide" 400 | } 401 | }, 402 | "source": [ 403 | "# delete_this_line: task 01\n", 404 | "\n", 405 | "In **`jupyter`** create a simple script to complete the following tasks:\n", 406 | "\n", 407 | "\n", 408 | "**REPLACE THE FOLLOWING**\n", 409 | "\n", 410 | "Create a function called `me()` that prints out 3 things:\n", 411 | "\n", 412 | "* Your name\n", 413 | "* Your favorite food\n", 414 | "* Your favorite color\n", 415 | "\n", 416 | "Lastly, call the 
function, so that it executes when the script is run" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "---\n", 424 | "When you complete this exercise, please put your **green** post-it on your monitor. \n", 425 | "\n", 426 | "If you want to continue on at your own-pace, please feel free to do so.\n", 427 | "\n", 428 | "" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# Experience Points!\n", 436 | "---" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "slideshow": { 443 | "slide_type": "slide" 444 | } 445 | }, 446 | "source": [ 447 | "# delete_this_line: task 02\n", 448 | "\n", 449 | "In **`jupyter`** create a simple script to complete the following tasks:\n", 450 | "\n", 451 | "**REPLACE THE FOLLOWING**\n", 452 | "\n", 453 | "Task | Sample Object(s)\n", 454 | ":---|:---\n", 455 | "Compare two items using `and` | 'Bruce', 0\n", 456 | "Compare two items using `or` | '', 42\n", 457 | "Use the `not` operator to make an object False | 'Selina' \n", 458 | "Compare two numbers using comparison operators | `>, <, >=, !=, ==`\n", 459 | "Create a more complex/nested comparison using parenthesis and Boolean operators| `('kara' _ 'clark') _ (0 _ 0.0)`" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "---\n", 467 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 468 | "\n", 469 | "If you want to continue on at your own-pace, please feel free to do so.\n", 470 | "\n", 471 | "" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "# Experience Points!\n", 479 | "---" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "# delete_this_line: sample 03\n", 487 | "\n", 488 | "In your **text editor** create a simple script called:\n", 489 | "\n", 490 | "```bash\n", 491 | "my_lessonname_03.py```\n", 492 | "\n", 493 | "Execute your script on the command line using **`ipython`** via this command:\n", 494 | "\n", 495 | "```bash\n", 496 | "ipython -i my_lessonname_03.py```\n", 497 | "\n", 498 | "**REPLACE THE FOLLOWING**\n", 499 | "\n", 500 | "I suggest that as you add each feature to your script that you run it right away to test it incrementally. \n", 501 | "\n", 502 | "1. Create a variable with your first name as a string AND save it with the label: `myfname`.\n", 503 | "1. Create a variable with your age as an integer AND save it with the label: `myage`.\n", 504 | "\n", 505 | "1. Use `input()` to prompt for your first name AND save it with the label: `fname`.\n", 506 | "1. Create an `if` statement to test whether `fname` is equivalent to `myfname`. \n", 507 | "1. In the `if` code block: \n", 508 | " 1. Use `input()` prompt for your age AND save it with the label: `age` \n", 509 | " 1. NOTE: don't forget to convert the value to an integer.\n", 510 | " 1. Create a nested `if` statement to test whether `myage` and `age` are equivalent.\n", 511 | "1. If both tests pass, have the script print: `Your identity has been verified`" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 519 | "\n", 520 | "If you want to continue on at your own-pace, please feel free to do so.\n", 521 | "\n", 522 | "" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "# References\n", 530 | "---" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "Below are references that may assist you in learning more:\n", 538 | " \n", 539 | "|Title (link)|Comments|\n", 540 | "|---|---|\n", 541 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n", 542 | "|[XX API Reference]()||\n", 543 | "|[User Guide]()||" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "outputs": [], 553 | "source": [] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": { 559 | "collapsed": true 560 | }, 561 | "outputs": [], 562 | "source": [] 563 | } 564 | ], 565 | "metadata": { 566 | "kernelspec": { 567 | "display_name": "Python 3", 568 | "language": "python", 569 | "name": "python3" 570 | }, 571 | "language_info": { 572 | "codemirror_mode": { 573 | "name": "ipython", 574 | "version": 3 575 | }, 576 | "file_extension": ".py", 577 | "mimetype": "text/x-python", 578 | "name": "python", 579 | "nbconvert_exporter": "python", 580 | "pygments_lexer": "ipython3", 581 | "version": "3.6.7" 582 | } 583 | }, 584 | "nbformat": 4, 585 | "nbformat_minor": 2 586 | } 587 | -------------------------------------------------------------------------------- /03_linear_reg/03_linear_reg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Linear Regression\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | 
"metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Cover an overview of Linear Regression\n", 29 | "* Examine code samples that walk us through **The Process™**:\n", 30 | " * Prep the data\n", 31 | " * Choose the model\n", 32 | " * Choose appropriate hyperparameters\n", 33 | " * Fit the model\n", 34 | " * Apply the model\n", 35 | " * Examine the results\n", 36 | "* Explore a deep dive into this model\n", 37 | "* Review some gotchas that might complicate things\n", 38 | "* Review tips related to learning more" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Overview: Linear Regression\n", 46 | "---" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Linear Regression models are popular machine learning models because they:\n", 54 | "* are often fast\n", 55 | "* are often simple with few tunable hyperparameters\n", 56 | "* are very easy to interpret\n", 57 | "* can provide a nice baseline classification to start with before considering more sophisticated models\n", 58 | "\n", 59 | "Several cases where you might use a linear regression to predict an output based on a set of inputs include:\n", 60 | "\n", 61 | "|Inputs|Outputs|\n", 62 | "|:---|:---|\n", 63 | "|ad dollars spent|sales dollars earned|\n", 64 | "|car age|sale price|\n", 65 | "|latitude|skin cancer mortality|\n", 66 | "\n", 67 | "The LinearRegression model that we will examine here relies upon the Ordinary Least Squares (OLS) method to calculate a linear function that fits the input data.\n", 68 | "\n", 69 | "From [Wikipedia](https://en.wikipedia.org/wiki/Ordinary_least_squares): \n", 70 | "\n", 71 | "> \"Geometrically, this is seen as the sum of the squared distances, ... 
between each data point in the set and the corresponding point on the regression surface – **the smaller the differences, the better the model fits the data**.\"\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "Image source: [Wikimedia](https://commons.wikimedia.org/wiki/File:Linear_least_squares_example2.svg)\n", 76 | "\n", 77 | "The result of the simplest type of linear regression calculation is a formula for straight line (although sophisticated curved surfaces can also be determined using linear regression):\n", 78 | "\n", 79 | "$$y = mx + b$$\n", 80 | "\n", 81 | "Where if given some value of $x$, if we know the slope of the line ($m$) and the y-intercept ($b$) we can calculate $y$.\n", 82 | "\n", 83 | "Beyond that, we won't cover the math here. 😀\n", 84 | "\n", 85 | "Scikit-Learn has a number of Linear Models based on calculations besides OLS: \n", 86 | "\n", 87 | "* Ridge \n", 88 | "* Lasso\n", 89 | "* Huber\n", 90 | "* and many more...\n", 91 | "\n", 92 | "Each one has slightly different approaches to calculating a line that fits the data.\n", 93 | "\n", 94 | "**Ridge**: addresses some issues related to OLS by controlling the size of coefficients.\n", 95 | "\n", 96 | "**Lasso**: encourages simple, sparse models (i.e. models with fewer parameters). Can be useful when you want to automate certain parts of model selection, like variable selection/parameter elimination. \n", 97 | "\n", 98 | "**Huber**: applies a linear loss (lower weight) to samples that are classified as outliers, thus minimizing the impact of random outliers.\n", 99 | "\n", 100 | "With this background, let's apply **The Process™** on a LinearRegression model." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Prep the data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "We start with a set of standard imports..." 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import matplotlib.pyplot as plt\n", 124 | "import numpy as np\n", 125 | "import pandas as pd\n", 126 | "import sklearn\n", 127 | "\n", 128 | "# NOTE: during the Choose the Model step, we will import the \n", 129 | "# model we want, but there is no reason you can't import it here.\n", 130 | "# from sklearn.linear_model import LinearRegression" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### Prep the training data and test data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "For this example, we will use a dataset hosted by Penn State:\n", 145 | " \n", 146 | "[skincancer.txt](https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/skincancer/index.txt)\n", 147 | "\n", 148 | "I don't have a clear understanding of the origin of this data and **we are simply using the dataset to demo a technique**. Please don't draw conclusions from the results of this simplistic analysis." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n", 158 | " delim_whitespace=True,\n", 159 | " header=0,\n", 160 | " names=['state', 'lat', 'mort', 'ocean', 'long'])\n", 161 | "df.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df.shape \n", 171 | "\n", 172 | "# Ummm. One line per state?\n", 173 | "# How did we get 49 lines?\n", 174 | "# Weird." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "X = df['lat'].to_frame()\n", 184 | "y = df['mort']" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "X[:5]" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "y[:5]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "With our data imported, let's separate it into training data and test data." 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "from sklearn.model_selection import train_test_split" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "plt.scatter(X_train, y_train)\n", 237 | "plt.title(\"Mortality vs Latitude\")\n", 238 | "plt.xlabel(\"Latitude\")\n", 239 | "plt.ylabel(\"Number of deaths\");" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "## Choose the Model" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "In this case, we have already decided upon using the LinearRegression model, so importing it is straightforward. But if we aren't sure what model we want we can always refer back to the [API Reference](https://scikit-learn.org/stable/modules/classes.html)." 
254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "from sklearn.linear_model import LinearRegression" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Choose Appropriate Hyperparameters" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "For our purposes, this model doesn't require any hyperparameters, so we simply call the `LinearRegression` class." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "model = LinearRegression()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "If we were to look at the possible hyperparameters, we would see this:\n", 297 | "\n", 298 | "```python\n", 299 | "LinearRegression(\n", 300 | " fit_intercept=True,\n", 301 | " normalize=False,\n", 302 | " copy_X=True,\n", 303 | " n_jobs=None,\n", 304 | ")\n", 305 | "```" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "**Yeah, but what do these even mean?**\n", 313 | "\n", 314 | "Some hyperparameters can be tricky to understand. 
Good places to start are the documentation:\n", 315 | "\n", 316 | "> [sklearn.linear_model.LinearRegression¶](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)\n", 317 | "\n", 318 | "A number of these items are also explained on Stackoverflow:\n", 319 | "\n", 320 | "> [how fit intercept parameter impacts linear regression with scikit learn](https://stackoverflow.com/questions/46510242/how-fit-intercept-parameter-impacts-linear-regression-with-scikit-learn)\n", 321 | "\n", 322 | "It might take:\n", 323 | "\n", 324 | "* several readings\n", 325 | "* multiple sources\n", 326 | "* some tests and examples\n", 327 | "\n", 328 | "...before you start to wrap your head around the expected outcomes.\n", 329 | "\n", 330 | "*This is OK. You are just like the rest of us!*\n", 331 | "\n", 332 | "\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Fit the Model" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "model.fit(X_train, y_train)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Apply the Model" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "y_pred = model.predict(X_test)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "y_pred.shape" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "y_pred[:5]" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "## Examine the results" 394 | ] 395 | }, 396 | { 397 | 
"cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": true 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "plt.title(\"Red and Purple Results\")\n", 405 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n", 406 | "plt.scatter(X_test, y_pred, color='red', alpha=0.2);" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "plt.title(\"Red and Purple Results\")\n", 418 | "plt.scatter(X_train, y_train, color='rebeccapurple')\n", 419 | "plt.plot(X_test, y_pred, color='red');" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# For reference, against the above graph:\n", 429 | "\n", 430 | "print('Coefficient/slope:', model.coef_)\n", 431 | "print('y-intercept:', model.intercept_)\n" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "# Gotchas\n", 439 | "---" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "**Understanding the data formats**: When I first began experimenting with scikit-learn, I kept messing up the format of the data. I kept trying to feed it 1D arrays instead of 2D feature matrices. I would try to apply a model from a book OR a tutorial and would end up totally flummoxed, which was very stressful, especially with something that seems as simple as a linear regression. \n", 447 | "\n", 448 | "As we discussed in the data handling lesson, the `X` inputs (`X_train`, `X_test`) needed to be a 2D matrix." 
449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "# Deep Dive\n", 456 | "---" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "N/A" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "# How to learn more: tips and hints\n", 471 | "---" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "**Read the outputs**: Pay close attention to the outputs that Scikit-Learn prints to the screen. Regular exposure to these outputs will regularly expose you to terms, arguments, vocabulary and grammar that are fundamental to understanding the inner workings of the models specifically and machine learning more generally. \n", 479 | "\n", 480 | "**Do outside research**: When you find a new word OR a word used in ways that you are not used to, look it up, read articles about that concept, read stackoverflow answers about that concept, and of course read the documentation. The word **regression** has been a thorn in my side since I first saw it. I just couldn't put my finger on what it means. I know what is happening in a regression calculation, but the **meaning** just escaped me. Why that word, to describe that phenomena? \n", 481 | "\n", 482 | "> \"The term \"regression\" was coined by Francis Galton in the nineteenth century to describe a biological phenomenon. 
The phenomenon was that the heights of descendants of tall ancestors tend to regress down towards a normal average (a phenomenon also known as regression toward the mean).\" \n", 483 | "\n", 484 | "> Source: [Wikipedia: Regression Analysis](https://en.wikipedia.org/wiki/Regression_analysis)\n" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "# Experience Points!\n", 492 | "---" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": { 498 | "slideshow": { 499 | "slide_type": "slide" 500 | } 501 | }, 502 | "source": [ 503 | "# Read the docs...\n", 504 | "\n", 505 | "Explore the docs related to Naive Bayes models for about 3 - 4 minutes, in particular the section related to GaussianNB (naive bayes).\n", 506 | "\n", 507 | "[**GaussianNB (link)**](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)\n", 508 | "\n", 509 | "Find answers to the following:\n", 510 | "\n", 511 | "* Gaussian Naive Bayes has a method you can call that allows you to update models and can be used if the dataset is too large to fit into memory all at once. What is that method?\n", 512 | "* There is a link to the User Guide. Find the link and skim the overview. There are two cases mentioned where Naive Bayes Classifiers have worked quite well in many real-world situations. What are those two cases?" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "---\n", 520 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 521 | "\n", 522 | "If you want to continue on at your own-pace, please feel free to do so.\n", 523 | "\n", 524 | "" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "# References\n", 532 | "---" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "Below are references that may assist you in learning more:\n", 540 | " \n", 541 | "|Title (link)|Comments|\n", 542 | "|---|---|\n", 543 | "|[API docs on linear models](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model)||\n", 544 | "|[sklearn description of overfitting](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)||\n", 545 | "|[Wikipedia article on overfitting](https://en.wikipedia.org/wiki/Overfitting)||\n", 546 | "|[Wikipedia article on regression analysis](https://en.wikipedia.org/wiki/Regression_analysis)||\n", 547 | "|[Skincancer dataset](https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/skincancer/index.txt)||" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.6.7" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 2 579 | } 580 | -------------------------------------------------------------------------------- /04_naive_bayes/04_naive_bayes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Naive Bayes Classification\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Cover an overview of Naive Bayes Classification\n", 29 | "* Examine code samples that walk us through **The Process™**:\n", 30 | " * Prep the data\n", 31 | " * Choose the model\n", 32 | " * Choose appropriate hyperparameters\n", 33 | " * Fit the model\n", 34 | " * Apply the model\n", 35 | " * Examine the results\n", 36 | "* Explore a deep dive into this model\n", 37 | "* Review some gotchas that might complicate things\n", 38 | "* Review tips related to learning more" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Overview: Naive Bayes Classification\n", 46 | "---" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Naive Bayes Classification models are popular machine learning models because they:\n", 54 | "* are fast\n", 55 | "* are simple with few tunable hyperparameters\n", 56 | "* are suitable for datasets with very high dimensions\n", 57 | "* can provide a nice baseline classification to start with before considering more sophisticated models\n", 58 | "\n", 59 | "
\n", 60 | "\n", 61 | "\n", 62 | "\n", 63 | "Naive Bayes Classifiers rely upon Bayes Theorem that allows you to predict the probability of a `label` if given some set of `features`:\n", 64 | "\n", 65 | "$$P(label | features)$$\n", 66 | "\n", 67 | "We won't cover the math here. 😀\n", 68 | "\n", 69 | "I do go into it in my [**Intro to Statistics and Probability**](https://www.youtube.com/watch?v=zzbw0JbiI6Y) tutorial from Pycon 2018. Check it out!\n", 70 | "\n", 71 | "Scikit-learn has a number of Naive Bayes Classifiers. They are referred to as **naive** because they make certain presumptions about the data.\n", 72 | "\n", 73 | "Each of the following has slightly different assumptions about the data. For example, the GaussianNB model that we will look at presumes that the \"likelihood of the features is assumed to be Gaussian\" (i.e. the likelihood of any given feature falls on a bell curve).\n", 74 | "\n", 75 | "* BernoulliNB\n", 76 | "* ComplementNB\n", 77 | "* GaussianNB\n", 78 | "* MultinomialNB" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Let's go through the steps of **The Process™** to see how this works." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Prep the data" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "We start with a set of standard imports..." 
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import matplotlib.pyplot as plt\n", 111 | "import numpy as np\n", 112 | "import pandas as pd\n", 113 | "import sklearn\n", 114 | "\n", 115 | "# NOTE: during the Choose the Model step, we will import the \n", 116 | "# model we want, but there is no reason you can't import it here.\n", 117 | "# from sklearn.naive_bayes import GaussianNB " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "### Prep the training data and test data" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# For the banana lovers in the room:\n", 134 | "# fake data warning...\n", 135 | "\n", 136 | "df = pd.read_csv('../universal_datasets/bananas.csv')\n", 137 | "df.head()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Here two columns from a `pandas DataFrame` represent a suitable 2D matrix for the `features`.\n", 145 | "\n", 146 | "One column from the `pandas DataFrame` (i.e. a `pandas Series`) is suitable as the `target` array." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "X = df[['length', 'width']]\n", 156 | "y = df['category']" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "It can be really useful to take a look at the features matrix and target array of the training data. \n", 164 | "\n", 165 | "* In the raw form\n", 166 | "* In a visualization tool\n", 167 | "\n", 168 | "For this dataset, let's use a scatter plot." 
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "X[:5]" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "y[:5]" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from sklearn.model_selection import train_test_split" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "X_train, X_test, y_train, y_test = train_test_split(X, y, \n", 205 | " test_size=0.33,\n", 206 | " random_state=42)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "for item in X_train, X_test, y_train, y_test:\n", 216 | " print(item[:2]) # Let's look at just two samples\n", 217 | " print(item.shape) # Let's confirm the number of samples\n", 218 | " print()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "plt.scatter(X_train['length'], X_train['width'], c=y_train,\n", 228 | " cmap='seismic')\n", 229 | "plt.title(\"'Cavendish' vs. 'Apple Banana' Training Data\");\n", 230 | "\n", 231 | "# NOTE TO SELF: Blue is cat zero, red is cat one" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "In the following plot of the test data, we chose to set the `alpha` channel for the dots at `0.15` which makes the dots largely transparent, so that they are visually distinct. 
Later we will plot the training data and the test data on the same graph and that transparencey will help to segregate them visually.\n", 239 | "\n", 240 | "AND, although we know what category each of these falls into, we chose to keep them all the same color, since we want to rely upon the model to categorize them." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "plt.title(\"All the test points\")\n", 250 | "plt.scatter(X_test['length'], X_test['width'], alpha=0.15);" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Choose the Model" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "In this case, we have already decided upon using the GaussianNB model, so importing it is straightforward. But if we aren't sure what model we want we can always refer back to the [API Reference](https://scikit-learn.org/stable/modules/classes.html)." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.naive_bayes import GaussianNB" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "## Choose Appropriate Hyperparameters" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "This model doesn't require any hyperparameters, so we simply call the `GaussianNB` class." 
290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "model = GaussianNB()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## Fit the Model" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Here we supply the **features matrix** and a **target array** that we generated above. Notice that it immediately provides a summary of the hyperparameters (in this case, the defaults) that were supplied." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "model.fit(X_train, y_train)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "## Apply the Model" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "We can now supply the test features matrix in expectation that the model will produce an array of labels (categories): one label for each sample in the features matrix." 
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "y_pred = model.predict(X_test)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "y_pred.shape" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": true 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "y_pred[:10]" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "## Examine the results" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "plt.title(\"Red and Blue Results\")\n", 391 | "\n", 392 | "plt.scatter(X_train['length'], X_train['width'], c=y_train,\n", 393 | " cmap='seismic')\n", 394 | "\n", 395 | "plt.scatter(X_test['length'], X_test['width'], c=y_pred,\n", 396 | " cmap='seismic',\n", 397 | " alpha=0.15);" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "# Gotchas\n", 405 | "---" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "A number of problems arose the first time I dove into this model:\n", 413 | " \n", 414 | "**Naming conventions**: I ran into snags with naming conventions. My first toy dataset was randomly generated and I just pictured it as **x and y coordinates**. But having x values and y values on my graph (and in my head) threw everything out of whack when I tried to translate that to the `X` and `y` inputs and outputs that are commonly used in models and in statistics, etc. 
If my data were naturally labeled as anything else, it might have been less painful to mentally translate:\n", 415 | "\n", 416 | "|Alternate Labels||\n", 417 | "|:---|:---|\n", 418 | "|lat|long|\n", 419 | "|price|quantity sold|\n", 420 | "|passing yards|wins|" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "**Making graphs shouldn't distract you from the machine learning**: Above, (and in most of these sessions) we have a number of dataviz steps intermixed with our machine learning steps. And there is the possibility that it might lead to confusion about what parts are *critical* to the machine learning and which parts are *nice to have*. Presuming your data is prepared properly, **just these four lines are necessary** to predict the category OR label for the values in the test set. \n", 428 | "\n", 429 | "```python\n", 430 | " from sklearn.naive_bayes import GaussianNB\n", 431 | " model = GaussianNB()\n", 432 | " model.fit(X_train, y_train)\n", 433 | " y_pred = model.predict(X_test)\n", 434 | "```\n", 435 | "\n", 436 | "**Sometimes starting too big is too confusing**: I often recommend that students pare back their problem to a small handful of items so that they can really see what is happening. Since this model takes a pair of coordinates and returns a label to say whether the coordinates fit in the **blue** category OR the **red** category, let's take a pair of coordinates that we know should fit clearly into the blue category (i.e. something above the dividing line." 
437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "# 83 187.600405 36.487954 0\n", 446 | "\n", 447 | "y_pred_single = model.predict([[187, 36]])\n", 448 | "y_pred_single" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "Now, let's take a pair of points (one in each category **red** and **blue**) and ensure that we get two different labels:" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# 28 231.568879 32.733688 1\n", 465 | "# 93 190.142527 45.764529 0\n", 466 | "\n", 467 | "y_pred_pair = model.predict([[232, 33],\n", 468 | " [190, 46]])\n", 469 | "y_pred_pair" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "# Deep Dive\n", 477 | "---" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "N/A" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "# How to learn more: tips and hints\n", 492 | "---" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "What should you do to advance your skills?\n", 500 | "\n", 501 | "**Play with the tools**:\n", 502 | "\n", 503 | "\n", 504 | "\n", 505 | "**Get familiar with your favorite graphing library**: being able to visualize the results will help you get a sense of whether your model is accurately predicting. It will also help you to better succeed at the **ultimate goal of data science**:\n", 506 | "\n", 507 | "> Data science is meant to inform and thus enable action.\n", 508 | "\n", 509 | "\n", 510 | "**Read the docs**: yeah... I know they can be scary. I love math, but sometimes my eyes glaze over when row after row of equations come rolling out. 
Regardless, the more time you spend reading the docs, the faster you will begin to better understand the nuances of different models, and which models apply in which situations. Don't be afraid if there are words in there that you don't understand. The vocabulary will come, given time and plenty of exposure. From this lesson, several good resources include:\n", 511 | "* [API Reference](https://scikit-learn.org/stable/modules/classes.html)\n", 512 | "* [Gaussian Naive Bayes Page](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)\n", 513 | "* [User Guide: Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)\n", 514 | "\n", 515 | "**Don't just copy-paste other people's models**: go home, find a dataset with values that are suitable to a given model and create your own model. Then put in some test values and see if it predicts properly." 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "# Experience Points!\n", 523 | "---" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": { 529 | "slideshow": { 530 | "slide_type": "slide" 531 | } 532 | }, 533 | "source": [ 534 | "# Read the docs...\n", 535 | "\n", 536 | "Explore the docs related to clustering and KMEANS for about 3 - 4 minutes.\n", 537 | "\n", 538 | "[**Clustering (link)**](https://scikit-learn.org/stable/modules/clustering.html#k-means)\n", 539 | "\n", 540 | "[**KMEANS API (link)**](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)\n", 541 | "\n", 542 | "Find answers to the following:\n", 543 | "\n", 544 | "* On the Clustering page, there is a section focused on K-means. In that section, there are four small graphs. Those graphs indicate four cases where K-means may struggle in producing accurate clusters. What are those four cases? 
(Hint: look for inertia).\n", 545 | "* On the KMEANS API page, there is an argument mentioned that controls the number of times the k-means algorithm will be run with different centroid seeds. What is the default number of times?" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "---\n", 553 | "When you complete this exercise, please put your **green** post-it on your monitor. \n", 554 | "\n", 555 | "If you want to continue on at your own pace, please feel free to do so.\n", 556 | "\n", 557 | "" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "# References\n", 565 | "---" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "Below are references that may assist you in learning more:\n", 573 | " \n", 574 | "|Title (link)|Comments|\n", 575 | "|---|---|\n", 576 | "|[API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n", 577 | "|[Gaussian Naive Bayes Page](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)||\n", 578 | "|[User Guide: Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)||" 579 | ] 580 | } 581 | ], 582 | "metadata": { 583 | "kernelspec": { 584 | "display_name": "Python 3", 585 | "language": "python", 586 | "name": "python3" 587 | }, 588 | "language_info": { 589 | "codemirror_mode": { 590 | "name": "ipython", 591 | "version": 3 592 | }, 593 | "file_extension": ".py", 594 | "mimetype": "text/x-python", 595 | "name": "python", 596 | "nbconvert_exporter": "python", 597 | "pygments_lexer": "ipython3", 598 | "version": "3.6.7" 599 | } 600 | }, 601 | "nbformat": 4, 602 | "nbformat_minor": 2 603 | } 604 | -------------------------------------------------------------------------------- /universal_datasets/svm_test.csv: -------------------------------------------------------------------------------- 1 | 
-1.1128777354788033,-4.198293550514309 2 | -1.5474321011450844,-8.513296710085173 3 | -0.8413329171075709,-4.570114718395777 4 | 0.5912805806348089,-2.0298235499910566 5 | -2.5034573722452205,1.5984402199917822 6 | -4.581650231154441,-10.557697330825775 7 | -1.20990781248348,-11.682959807907126 8 | 0.4394508714648315,-0.17059459964294366 9 | -1.6445130595044544,-1.1214521417663437 10 | 0.19031516462512865,0.08734902225413332 11 | -1.19167337459382,-6.146809041308661 12 | -1.3085533869201318,-4.350813803390592 13 | -2.306946193513669,-1.4452818792232835 14 | -3.14688610811814,-9.70465692233303 15 | -1.0242914317012626,-6.529314650292045 16 | 0.4919143370694552,-6.6995136789470875 17 | 0.016185879349561283,-13.13359909954168 18 | 0.0360166874566048,-5.115758650312007 19 | -1.7698348835851956,-8.466899151755907 20 | -1.0736152379772443,-2.1532893668854047 21 | 0.4098044799186551,-2.6293513081646123 22 | -1.6851622373555832,-0.9835611450213841 23 | -1.6106341312508223,-6.5409431708702845 24 | -0.7430206938447779,-2.14729472131521 25 | -2.266473221541881,-5.3291539664110035 26 | -1.581066658517325,-4.922281617698713 27 | -1.9558827223378632,-6.021700877640337 28 | -0.9387609606326968,-8.718706338879006 29 | -0.2301104407724368,-4.732083990372785 30 | -1.5906587209581595,-2.0727547646030704 31 | -1.3903369815871975,-2.8919671133878744 32 | 1.479076786668335,-6.498974380475772 33 | -2.9346328244050133,1.0986661301577705 34 | -1.15070382621266,-3.4134380455882227 35 | -1.9542548079513282,-5.507156905206045 36 | -0.5684234625902876,-0.610820333397216 37 | -1.5679922068669918,-2.7929728643040432 38 | -0.7041198804938382,-0.1781956770802795 39 | -0.8836271411491521,10.8173982416032 40 | -0.4721529571719072,-5.474383220946176 41 | 0.2960366741598628,-7.297419726991481 42 | -2.095078961965399,-7.818206265656964 43 | -0.6265028246144968,-3.1475975034269634 44 | -1.3150713066566837,-0.27925473687410474 45 | -2.488778585455615,-5.078831149589483 46 | 
-0.6057255796745875,-8.513211751127557 47 | -0.48235756401064556,-8.959721657796088 48 | -0.2838109663390621,-4.523799455667192 49 | -2.0231548987762293,-4.335880453508577 50 | -0.996074358538087,3.0597117469927166 51 | 0.9901845607455066,0.5224254058947455 52 | -3.3771895470127506,-0.9292548641916207 53 | -1.3071800647197884,-3.070239491532864 54 | -1.2758938439899323,-7.000760648613763 55 | -0.9948128966276316,-6.0383081608925195 56 | 0.2434595319317041,-2.7064273482009567 57 | -1.1690940009147741,-0.9043762635196266 58 | -3.335869765263701,-5.628384123440624 59 | 0.9821869305938502,5.261865789762641 60 | -0.19147247303603931,1.1908892227316379 61 | -0.2181997473744839,-16.44773455805294 62 | 0.12960189093060603,-9.60980858058559 63 | -3.009611866707992,-3.0964255940602476 64 | -2.007766638226739,-1.5753611236593463 65 | -2.8251980124114953,-7.031778004054083 66 | 2.00473200982919,-3.274821617856249 67 | -1.7679256563802483,-3.6804803210456596 68 | -2.2335824595287894,-10.616225074507568 69 | 0.053487932897701373,-3.1563098797363254 70 | -2.1857322638354835,-5.277172986142428 71 | -0.6711630915485909,-4.334429142497399 72 | -0.7285043236068133,-3.707072940066287 73 | -1.563981387280442,-3.9247507097832375 74 | -2.631795438719293,-10.45130809027092 75 | 0.4406367794407118,-7.711596650127005 76 | -0.5468932107633326,-5.175736655529768 77 | -0.536519025771141,-6.356582747502822 78 | 1.4970857713734436,-6.01928804348192 79 | -0.02868140249853024,-6.524236218716373 80 | -0.5263339336681936,1.5962961566214293 81 | -1.6071354163066518,-8.423417090968968 82 | -0.9891988917270499,-5.148505008129691 83 | -0.3653995723747301,-3.02437293221283 84 | -0.9822850126215252,-0.5543305941117551 85 | 0.12393723777135057,-5.632331929032225 86 | 0.81010747537499,-9.715903558896468 87 | 1.4070684434538498,-0.798908877631185 88 | -2.4430172601404907,-7.519256236670962 89 | 0.17940627657545782,-2.668367943717899 90 | -0.9615413296626997,-6.087085020051929 91 | 
-0.19223997006077254,4.23465919579189 92 | -1.3128419414454324,-10.37338828506749 93 | -0.9866059510816588,2.0142188988848693 94 | -1.335904576602386,-3.704809267653785 95 | 0.5185367751335748,-6.132426741716719 96 | 1.7071907683790912,-7.463475598725525 97 | -0.16916555502687247,-3.8264338023130167 98 | -2.4132164113431545,-13.043569405732988 99 | 0.20137642533920252,3.2059240047449897 100 | -0.40747209518681127,-7.51457395604197 101 | 0.27382129858365056,-5.917867348765813 102 | -1.4562171240822945,-3.0340017925613747 103 | -0.5482698906275856,-2.980823871339118 104 | 1.2291130733484739,-5.590924248231709 105 | -1.5553596089240154,-2.3355326335062268 106 | -0.8006998821492153,4.39825586935649 107 | -1.3324880544027575,-4.710594485681928 108 | -0.536249055451949,-2.227055346483483 109 | -3.5945315564892124,-0.988128753703394 110 | -1.2605411231623436,2.814814052218316 111 | 2.412182553036236,0.1128229738256259 112 | -1.2871657660629863,-0.1453039966800489 113 | 0.46176674674106044,-6.2351510463130415 114 | -1.0452744568713892,-5.33223840545686 115 | -0.6515562210836497,-11.876539420388724 116 | -2.4198144999165843,-9.042416520761627 117 | -2.727844731412838,-0.0979870392619957 118 | -0.5197555178132393,-2.5988945669278336 119 | -0.714344854470871,-1.5174770901825783 120 | -1.5079231732020961,-3.477998928994165 121 | 0.9226893964735177,-9.888991302372752 122 | -2.5449144755222926,-0.9352796786917144 123 | 0.029554117762038867,-3.9612231798876087 124 | 0.24873022816942592,-7.363411033550648 125 | -1.3146547455503872,-3.1666592242191935 126 | -1.169912733526902,3.5169501219545705 127 | 0.29208822534881507,-5.899481505379233 128 | 0.1397504920912147,-6.741801808912094 129 | -1.3843895153191639,0.8755126119758376 130 | 0.22820574588570675,-2.0662376202181862 131 | -2.5372903787533145,-2.648502256573363 132 | 0.4673792257018752,1.8146046142746766 133 | -1.3120397720769954,-2.89541342871493 134 | 1.9368240880477532,-3.1073675008983055 135 | 
-2.4549412376138826,-6.1293883863116925 136 | -0.2746470893857751,-4.3677310786379575 137 | -1.784459871132847,1.6715553735647246 138 | 0.1402103786720339,-6.490842918160093 139 | 0.179680659791027,-1.8273590187608306 140 | -1.3246019440511527,-7.217612148373873 141 | 0.12031256638922905,-2.813658106757887 142 | -1.326285145078372,-7.330428746758571 143 | -3.1147084957898525,-9.434828059975843 144 | -2.5535537001405273,-4.2864313097643505 145 | -2.025297732713089,0.8081762661781102 146 | 0.4364556450314867,-0.36902614279795776 147 | -0.14295350853438704,-7.091404216909701 148 | 1.7939770213011967,-4.267594335097729 149 | -1.0984189071325916,-8.849200519658748 150 | -3.404224052194068,-1.4929371133042935 151 | -2.592841650911873,-0.8137261872522261 152 | 0.2440419485026808,3.7801304782547884 153 | 0.511612897545638,-4.753008017658611 154 | -1.0715433116632096,-2.490352366355628 155 | -1.60151335822812,1.0572219068535036 156 | 1.470372421121279,-9.909566514087295 157 | -0.57983340915765,-1.5251375534063212 158 | -1.9432703247978385,-3.7075936700221073 159 | 2.0611647957057775,-1.9693761357270425 160 | 0.8353192892179313,-7.58989014746479 161 | -2.2588973984409195,-6.618525497360318 162 | -0.29943419554851824,-10.078916500258547 163 | 0.5447103991330242,-5.286234559925591 164 | -1.263249164014744,-1.465723575583775 165 | 0.24196363595697035,-0.6451136170690162 166 | 1.0590111972933545,-3.155389054839493 167 | -2.155207294566483,4.521830011420883 168 | -1.6213016326065264,-7.991147816925385 169 | 1.0134518329138489,-10.549085068940713 170 | -2.6367527830965445,-0.3862670569628648 171 | -2.962755818320897,-4.803764489168577 172 | -2.3124992092204213,-6.284494590117147 173 | -0.5803291296377401,-6.0125042700666755 174 | -2.094405504981626,-2.3731302649672847 175 | 0.40390456634281313,-2.8360411011601676 176 | 0.3113593084095836,-5.331204209847755 177 | -0.7941343188944578,-8.856231074412683 178 | -0.4559661420037868,-1.9152707824758601 179 | 
-2.0603420398922765,-0.6885666719508574 180 | -2.075788369930776,-1.9163382385696401 181 | -1.2350777886647473,-9.722168418113693 182 | -0.07020042927254444,-3.4198365841517036 183 | -0.5285793163048296,-2.865147670538677 184 | -1.2862490833421456,1.1971108440240128 185 | -2.012721152140739,-1.5785213504708366 186 | -0.15902799791158984,-5.936280986059682 187 | 0.7760866086200271,-7.449449528293108 188 | -0.16763009805643359,-11.754207983464923 189 | -0.9299175118643797,-8.827276806254526 190 | -2.6867687929018103,-2.9181327165713737 191 | -0.9852704699454691,-0.9775181031343916 192 | 1.0207590525289714,-7.886427243400431 193 | -1.6572918282234985,-9.105034909569302 194 | -1.39067345067344,4.041168670705808 195 | 0.8022849707320943,3.187660106071009 196 | -0.3917813091122211,2.1226251494452466 197 | 0.7336349726266675,-6.534662357427745 198 | 0.048954383815224434,-5.655673996480912 199 | -3.5469532640120267,-3.1359491834882007 200 | 0.46764049139741704,-1.9332327370148148 201 | 1.545204507640002,0.9662192776398486 202 | -0.947209598895472,-8.713750176649674 203 | -1.4876012400395109,-4.627899496104078 204 | -2.0836694537214155,-1.2946024051675553 205 | -1.034693731302668,-8.702365178766676 206 | -1.116660332356774,-3.055897693472799 207 | -1.3324359796098209,-7.916101647461197 208 | -1.8322339697284262,-7.271336959293851 209 | -0.5813279766268333,-3.436475114950773 210 | -1.2654560161346229,2.245679198760117 211 | 0.5296821362123993,-5.2793592696408425 212 | -0.5866418398291119,-5.048133604670716 213 | 0.3233017994841798,-5.095824845994039 214 | 1.448422160569376,-0.860826240019513 215 | -1.8161714575819536,-6.737962303756498 216 | -0.38975285182147834,-0.5965182953296884 217 | -1.2487488109951144,-4.292458314434005 218 | -0.48939306735942834,-13.652797915157418 219 | -1.7358565770206127,-3.829973239655861 220 | -2.220932414029518,-5.73661455558891 221 | -3.368987111684689,-3.902579869562497 222 | 0.5877943865755928,1.7449973268795205 223 | 
-0.3540845973764529,-2.0792115449795188 224 | -1.0715985137285198,-12.061838337341282 225 | -1.5042898118276156,-0.7321109895948288 226 | -0.08456099179558785,1.041618513336358 227 | -2.534050217913368,-3.820388386664164 228 | -0.2742876314386189,-4.679599764323371 229 | -0.5543339008814908,-4.699346544242372 230 | 0.6961672068271421,2.251803710561407 231 | -0.7050586462947833,5.085363196753761 232 | -3.770554850925653,-3.592556033486702 233 | -1.1085894774047764,-3.8641139353523286 234 | 0.4551258697912641,-2.421916861182247 235 | 0.24362205114534285,-3.5781485255508088 236 | -1.0880138328041307,-4.2454829492528185 237 | -2.374163440446767,-2.8240230821032575 238 | -0.4003667290980757,-1.9904745066529905 239 | -3.1213814371526833,8.439381149457407 240 | 0.5824419668992531,-0.4690592719057509 241 | -1.9700121172295944,2.4400110048206196 242 | -2.8036777324069906,-7.272866429872092 243 | -1.7270659520237779,-7.882909034550316 244 | -3.120080147253502,-3.981479385425046 245 | -2.302319166513156,-4.381050115632078 246 | 0.28277391601410273,-4.455259677168917 247 | 0.4653973465708314,-8.549125023488344 248 | -2.7683769415331962,-12.688622991951636 249 | -1.2043844226691438,-6.171178064769993 250 | -0.4224743491930061,-2.4185872061228912 251 | 0.8307589376358631,-0.3825808216958424 252 | -1.9788783268539336,-5.219912498877827 253 | -3.0282531355188977,-2.528981443790983 254 | -1.8325168896155002,-4.679277475839315 255 | -0.7029749765706805,-8.677707691382043 256 | -0.38822700186965575,-1.1835637127312397 257 | -0.9288948600748226,-6.787805327829065 258 | -1.7831200486025336,-2.9488994654990757 259 | -1.3666581174181738,0.622504221722811 260 | -2.0014259136181667,-1.8622557198407188 261 | 0.4984192235837317,3.281833406443777 262 | -1.0434966327533859,-2.3054775650520636 263 | 1.1304889945184344,-9.073609061313611 264 | -0.2523506128293508,-4.480874694567567 265 | 0.9891831220914369,-2.371330457585594 266 | 1.17981507670804,-2.294507636202413 267 | 
-2.433890733514144,-8.798023215839837 268 | -0.6837892666734493,-7.194064096846326 269 | -0.11468582618766421,-5.338142301372787 270 | -1.1721409247642693,-1.2110638857332678 271 | -3.962885813923088,3.473522280799205 272 | -2.1402047832081923,-2.075862253029036 273 | 0.10002800442499793,-1.4857525654286285 274 | 0.3383325900782621,-10.478521823512333 275 | -3.004835640877978,-2.0339014307924703 276 | 0.1381214162835216,-3.3641898836511475 277 | 0.4292677251481145,-4.009232669192184 278 | -1.551632148526525,-1.2067185911867124 279 | -0.5783626688735155,-4.741920666867494 280 | -0.4742995565578716,-6.989081942361601 281 | -1.1942403665194754,-4.409391657957 282 | 0.23640163580809936,-4.597337367505926 283 | 0.18967860363430966,-4.05266521947363 284 | -1.9139441746698154,-3.092321440413615 285 | -0.7312765666764722,-1.193218869514641 286 | 0.4600640827095619,-4.150528979550504 287 | -2.0769899561772043,-3.021199695878228 288 | -2.6537950166799815,-3.1448371753847546 289 | 0.4885289132770172,-5.72774367826829 290 | -2.7715119475581598,-3.1998475355543 291 | -1.5990739783790202,-3.700376959494573 292 | -2.264012838682739,-3.481640211557375 293 | -2.383269713327781,-2.867466115796966 294 | -2.1888752645635092,-4.837765085816123 295 | -1.4107858132174655,-6.25379045110369 296 | -0.0763569664967575,-6.238396418830908 297 | -2.7667950531451986,-8.911639519511573 298 | -1.2545826042097925,-3.185261102421432 299 | -2.0677260798091748,-8.404608795569388 300 | -1.1307010403332611,1.5236960730851328 301 | -1.8871714417434489,-3.3239905989209246 302 | -0.6951156192991823,-1.2238946250701783 303 | 1.4103192235590494,-4.001907508163434 304 | -0.1192319775028381,-6.4037388198575185 305 | -1.192042789170495,-2.105600330068654 306 | -2.3005387351149986,-0.3419595360002319 307 | -1.4823391336874172,-5.082767469943232 308 | -2.058698164768944,6.001845664450034 309 | 1.5887038015637804,-8.299243344801436 310 | -0.9916892776054702,-6.698249834933088 311 | 
-1.315624204153219,-7.107155893422096 312 | -2.5426768516597753,-4.177803548411678 313 | -0.6487509857259672,-6.58725057753119 314 | 0.02977790681244108,1.1782908350417278 315 | -1.7118139366846252,-8.627470140338058 316 | -1.597277907383274,-10.073829697745001 317 | 0.4556748715787271,-5.528181692789595 318 | -0.24811094555841073,-1.097820059800568 319 | -0.06871717468816896,-3.501088588352608 320 | -2.3459084798204985,-1.3324235351623983 321 | -1.6086593246047047,-10.037042995927797 322 | -0.5754197688892831,-7.470604284287438 323 | -1.5296687936388906,0.8526555493182189 324 | -3.1163347464018716,-2.1541081061714387 325 | -0.6061807140684177,0.6862802875440588 326 | -1.7522744140333137,-3.5896662037610696 327 | -2.6543854462057475,4.161196397717035 328 | -2.6340863052782666,1.0060654125962802 329 | -2.7122158534069416,-1.99318896871802 330 | -4.139975485154663,-2.451094383189929 331 | -1.4187808026179622,-1.915947688925081 332 | -0.7288644902986623,1.786518734646032 333 | -1.9452277358113772,-0.9072184837671919 334 | -0.5971730681724873,-0.45014879984070344 335 | 0.9707179729986437,-3.8880583205989603 336 | 0.6592075577281982,-4.76190980165592 337 | -1.27604219598435,-5.396304199594539 338 | -2.215717579153721,-6.707947283207385 339 | -0.11535522541635268,1.0629738188498807 340 | -1.9811239007375194,-8.454363501880222 341 | 0.649840596229458,-1.919357510627262 342 | -0.5466551860660194,-4.658693729235955 343 | -0.7521148890263412,-5.5063179134790445 344 | -0.6845949921755581,-9.004190031472334 345 | -1.7810146340970543,-7.43623322937134 346 | 0.9921959632662554,-8.887132092335467 347 | -1.4204751641100768,-2.559195059850611 348 | -2.672087780536039,-10.938191084875562 349 | -1.9715740875742331,0.7923574363632229 350 | -0.6840412142529101,-4.831457010093357 351 | 0.0011800861809636576,-7.472655071780515 352 | -2.853942035088541,-4.634998500812769 353 | -0.42049919841058403,-7.327093701378728 354 | 0.2285443297387042,-5.095645629990317 355 | 
-0.6538558619296958,-3.861309765879895 356 | -1.8272160734297702,-4.6428282411088 357 | -0.564598715004617,-2.829099614537295 358 | -2.7699507652821334,-1.81265514094191 359 | -0.9250525757520809,-3.071344763382676 360 | -2.594266840500027,-4.214897495038795 361 | -0.5205929690840647,-4.586501365762072 362 | -0.8951711876107677,1.6183325723260973 363 | 0.5237495833518975,-2.325722514614627 364 | -0.6458338857860921,-3.8435898878824073 365 | -1.0703143742579868,-1.9675501511892546 366 | -0.7225681876762677,-0.7686424967109806 367 | -2.1651041126047854,-7.471259961532812 368 | -0.37038074055618453,-0.45859163545557813 369 | -0.6219795346224191,-2.1981828058525017 370 | -0.37789298998491216,-1.1865786448800768 371 | -1.353196812367155,-6.865230516562836 372 | -1.977375103898015,-0.3943520453537599 373 | -1.6060245805373023,-1.8566934666772155 374 | -2.6436184354371357,-1.6149194176843102 375 | -0.18183489561980537,-8.489237102752753 376 | -0.8858341205172254,1.90020292387522 377 | -1.5881119795132732,-14.480215294118565 378 | -2.6587449752383456,-6.490094074316531 379 | -2.1336567494660756,-0.8796377901395602 380 | -0.2557174740604984,-0.9772614146373115 381 | -0.11130751984866016,-0.4748158350199758 382 | -2.05429005386282,-7.023205921962287 383 | -0.7841960790757913,-4.5734286882985655 384 | -0.23298206687445122,-0.09363102229937548 385 | 0.639872018532196,-5.713547269906027 386 | -2.4314720691460003,-0.29456504415536555 387 | 0.2130277422909448,-7.819467713103724 388 | -1.9267946814623316,-4.189824482154564 389 | -0.6654687045430873,0.529541933188213 390 | -4.942654856670764,-4.219131790677451 391 | -0.7853355302764866,-4.592963828074572 392 | -0.7060179301336605,-1.430537614726842 393 | -1.0631307594358403,-6.391036849800224 394 | -1.6324106052478293,-6.444010208401991 395 | -3.2755159949646737,-2.9939191265268486 396 | -2.56047150796073,0.5841987698391771 397 | -2.262393700892434,2.055669721963797 398 | 0.11043200692146105,-2.8910807669965406 399 | 
-0.6580171921512933,-11.648501295402882 400 | 1.4008453971047086,-8.831505716943603 401 | -1.2943954122829755,-11.643975912182736 402 | -0.5197910603947153,-8.487847392080011 403 | -0.481585522326919,-6.630750024428158 404 | -1.1472417485195456,0.28532496069873936 405 | -0.7790469474112999,-0.49922926004874624 406 | 0.46245304842934165,-2.0901440849578714 407 | -1.3980062091390344,2.2674671872828007 408 | -3.0170996525795735,-10.462996177896642 409 | -2.6418891000420457,-1.6571518912837928 410 | -1.2720924993920695,-0.010621254246447442 411 | -2.3301604056981478,-2.958922140276937 412 | -1.7940996870630719,-8.324480581162852 413 | -0.026488424662141297,-10.117678707844252 414 | -0.23256257108067013,-8.104835307589259 415 | 0.45905545457326125,-6.400615541482103 416 | -1.8540180406230888,-7.195830995326459 417 | -1.7799935662164998,-10.480595597655421 418 | 0.2148560837940583,-5.677461505474805 419 | -3.9874598533479,-6.4927987338978275 420 | -2.1593225607308013,2.5917387684731246 421 | -2.7161117646470276,-0.0011437797282791706 422 | -0.6477429360935066,-0.5753396324753455 423 | -3.0695584463992214,-5.313628079483232 424 | -0.7701383171503423,-1.4365085728346223 425 | 0.35687187455105507,-0.8606141371916074 426 | -2.3146197401490154,0.6889084450900214 427 | -0.8529635997977232,-7.975193669209747 428 | 1.1409711944777583,-9.212423704276159 429 | 0.2774697239245445,-1.848797266193242 430 | 0.5130375997828074,-5.507060328978735 431 | -2.662561549081989,-3.820615024450105 432 | -2.5352670295141513,-4.114760838242309 433 | -0.18488220171073955,-4.551419196839449 434 | -2.0582730897747146,-7.9681999653974085 435 | -0.6244995981186756,-3.9290029048698423 436 | -2.663028536082492,-3.228255904697134 437 | -2.104197030277769,6.488834366459962 438 | -1.0409752482905588,-7.513788332117563 439 | -0.7971094398286516,1.079105562447075 440 | -2.858587301358905,-6.93014924232981 441 | -0.9123082791698328,-7.457343544647026 442 | -0.406711038479041,-2.8195347517786993 443 | 
-1.4709126731207471,-10.2676065467578 444 | -1.6893430262769895,-4.709391169256514 445 | 0.8919021309588049,-5.717087483592456 446 | -1.4066782954601156,-7.082325011812102 447 | -1.2727656545108998,-12.446137073299711 448 | -1.3561371295370355,-6.3217115244444555 449 | 1.061119765838721,-9.958693089346145 450 | -0.04050844470867698,3.3030485179747657 451 | -3.6897565468019184,-0.005620257604205925 452 | -0.6418820746913338,-6.385889611832532 453 | -1.1841749693175885,-3.6693533102356928 454 | -3.362309558345096,-1.695826985773194 455 | -0.015303768424369757,-2.2396377435434003 456 | 0.03797682408415293,0.9276257596472774 457 | -1.5466131112968382,-9.162976808819367 458 | -0.4035731406373806,-0.706367739427161 459 | 0.4509463666457787,-11.729442532747505 460 | -0.8851773400699331,-10.394053316125312 461 | -4.377192530544903,-5.529454322794213 462 | 0.42830425720463583,-2.8238844682962716 463 | -1.5594108603131156,-3.678004952447857 464 | 0.38743795408864656,-5.0706077146573545 465 | -0.7584332669373329,-13.084355695691887 466 | -0.4612017916542708,1.190697864228702 467 | -5.442679592745944,-0.7842790171019605 468 | -2.0301137313464706,-3.780227382913715 469 | 0.3345370484203216,-2.558521345196407 470 | 0.413834582309232,-5.081969587796571 471 | 1.712227051874871,0.17829571356413165 472 | -2.327724795142675,-4.493660899821206 473 | -2.0875018546953594,-2.783256684416428 474 | -1.562131770127968,-8.039862457958638 475 | -1.6842518241833462,-8.487377033508453 476 | -1.5777397829494197,0.40442719202459987 477 | -1.1508990822032477,0.7393756320446032 478 | 0.07443648093934363,-6.923236872010624 479 | -1.5973860283246681,-7.903461748286943 480 | -3.574039840032565,-5.3297535668958425 481 | 0.4200029428201366,-1.3839853049907886 482 | 0.44661014415683353,-1.2406381415058578 483 | 0.1494828086674762,-3.567655822129713 484 | 0.15927756996516496,1.8943249080662419 485 | -0.5976881226213171,-6.068102563371678 486 | -0.3342659088685722,-1.8915608688768781 487 | 
-3.088372957390626,-8.546943916265866 488 | 0.6308549419261529,-4.670021540262622 489 | 0.322467943804547,-7.855805145044611 490 | -1.591852983742711,-3.010394053029088 491 | 1.120176468942152,-8.681359705682391 492 | -0.11518032383231758,-3.307271215112081 493 | -1.1401047752655846,-6.88038100623831 494 | 1.0260241233712688,-3.2575842350433493 495 | -0.6259514637034067,-7.610907283725005 496 | -0.03719539188837184,-6.932780964839495 497 | -0.9324910148636657,-4.039039081706546 498 | 0.09158484715088999,-5.184091555458755 499 | -1.1918134901862092,-3.9387633301732223 500 | 0.3089777988835327,-1.007219813891004 501 | -------------------------------------------------------------------------------- /00_basics/01_install.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Preparation and installation guide\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "By the end of this module, you will be able to:\n", 27 | "\n", 28 | "* Download the tools we will be using (conda, specific Python libraries)\n", 29 | "* Install the tools\n", 30 | "* Test them for successful installation\n", 31 | "* Open the Juptyer Lab interface that we will be using in class\n", 32 | "* Run the code samples found in the notebooks\n", 33 | "* Understand the importance of the tools for our tasks today" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# Installing the Software You'll Need\n", 41 | "---" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Step Zero: Read through ALL the steps...\n", 
49 | "\n", 50 | "We strongly recommend that you read through **ALL** the steps below, before you start to install, etc. For some more advanced practitioners, you **may** already have some tools installed OR available. \n", 51 | "\n", 52 | "IF you can successfully\n", 53 | "* open Jupyter Lab\n", 54 | "* open the Notebooks in this tutorial AND\n", 55 | "* import the data libraries listed below in Step Three\n", 56 | "\n", 57 | "...then you shouldn't need to do anything. \n", 58 | "\n", 59 | "For folks who aren't sure OR for folks who are fairly new to Python/programming... these steps should get us to the point we need to be." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Step One: Download and Install Miniconda\n", 67 | "\n", 68 | "Follow the instructions for your operating system in the **`miniconda`** quickstart guides.\n", 69 | "\n", 70 | "**Some warnings/cautions:**\n", 71 | "\n", 72 | "1. We **highly recommend** the use of `conda` as a package manager and virtual environment manager for this tutorial. This material has been tested using `conda` but has not been tested using `pip`, `virtualenv`, `pyenv`, etc.\n", 73 | "\n", 74 | "1. **IF you already have `conda`** installed via a previous `Anaconda` OR `miniconda` install, you should not need to reinstall. How can you tell? If you type `conda` on your command line and get a response similar to this, then you should not need to reinstall conda:\n", 75 | "\n", 76 | " ```\n", 77 | " my_macbook:my_folder chalmerlowe$ conda\n", 78 | " usage: conda [-h] [-V] command ...\n", 79 | "\n", 80 | " conda is a tool for managing and deploying applications, environments and packages.\n", 81 | " .\n", 82 | " .\n", 83 | " .\n", 84 | " ```\n", 85 | "1. Be sure you use a **Python 3.x** version of `miniconda` to install Python 3.x.\n", 86 | "1. 
Based on our experience in workshops, **the most common problems** we experience with installs is that a step got missed OR a command was typed incorrectly. It happens to all of us, so stay sharp, folks!" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "With that in mind, please choose the appropriate version and install `conda` as described below:\n", 94 | "\n", 95 | "**`conda` for Windows**:\n", 96 | "\n", 97 | "* Download the installer: [Miniconda installer for Windows.](https://conda.io/miniconda.html)\n", 98 | "* Double-click the .exe file.\n", 99 | "* Follow the instructions on the screen.\n", 100 | "* NOTE: If you are unsure about any setting, accept the defaults. You can change them later.\n", 101 | "* When installation is finished, from the **Start menu**, open the **Anaconda Prompt**.\n", 102 | "\n", 103 | "**`conda` for MacOS**:\n", 104 | "\n", 105 | "* Download the installer: [Miniconda installer for MacOS.](https://conda.io/miniconda.html)\n", 106 | "* In your Terminal window navigate to the folder where you downloaded the miniconda installer\n", 107 | "* At your Terminal prompt, run this command:\n", 108 | " \n", 109 | " ```bash\n", 110 | " $ bash Miniconda3-latest-MacOSX-x86_64.sh\n", 111 | " ```\n", 112 | "
\n", 113 | "* Follow the prompts on the installer screens.\n", 114 | "* NOTE: If you are unsure about any setting, accept the defaults. You can change them later.\n", 115 | "* **Close** and then **re-open** your Terminal window, to make the changes take effect.\n", 116 | "\n", 117 | "\n", 118 | "**`conda` for Linux**:\n", 119 | "\n", 120 | "* Download the installer: [Miniconda installer for Linux.](https://conda.io/miniconda.html)\n", 121 | "* In your Terminal window navigate to the folder where you downloaded the miniconda installer\n", 122 | "* At your Terminal prompt, run this command:\n", 123 | " \n", 124 | " ```bash\n", 125 | " $ bash Miniconda3-latest-Linux-x86_64.sh\n", 126 | " ```\n", 127 | "
\n", 128 | "* Follow the prompts on the installer screens.\n", 129 | "* If you are unsure about any setting, accept the defaults. You can change them later.\n", 130 | "* **Close** and then **re-open** your Terminal window, to make the changes take effect." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Step Two: Confirm your conda install\n", 138 | "\n", 139 | "In a command prompt type `conda list`. If `conda` is installed properly, you will see a summary of the packages installed by `conda`." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### Troubleshooting\n", 147 | "\n", 148 | "Here's a list of error messages & how to fix them.\n", 149 | "\n", 150 | "- **`conda: Command not found.`** IF you see this, the most common reason is that your command shell is not yet aware of the installation of `conda`. The easiest fix is to simply **close** your terminal/command prompt & **reopen** your terminal/command prompt. If that doesn't fix it, ask for help." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## Step Three: Install Python, and other packages..." 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "With `conda` installed, we want to ensure that we have a suitable version of Python installed and that we have the necessary libraries also installed.\n", 165 | "\n", 166 | "We will create a directory to hold our lesson content. For consistency, we will call this directory `mltutorial` and then we will create a virtual environment called `mlenv` and populate it with Python and our libraries.\n", 167 | "\n", 168 | "1. On your command prompt, make sure you are in a directory where you want your project folder to be located (many people put this in their `My Documents` OR `home` folder). 
From that directory, run the following command:\n", 169 | "\n", 170 | " ```bash\n", 171 | " chalmerlowe$ mkdir mltutorial\n", 172 | " ```\n", 173 | "
\n", 174 | "1. Change directories into the new folder:\n", 175 | "\n", 176 | " ```bash\n", 177 | " chalmerlowe$ cd mltutorial\n", 178 | " ```\n", 179 | "
\n", 180 | "1. Create a virtual environment with Python 3, using the following command (don't worry, we will explain this below):\n", 181 | "\n", 182 | " ```bash\n", 183 | " chalmerlowe$ conda create -n mlenv python=3\n", 184 | " ```\n", 185 | "
\n", 186 | "1. Activate your virtual environment using the command appropriate to your operating system. NOTICE your prompt will change to reflect the fact that you are now in a virtual environment:\n", 187 | "\n", 188 | " **Mac/ Linux** \n", 189 | "\n", 190 | " ```bash\n", 191 | " chalmerlowe$ conda activate mlenv\n", 192 | " (mlenv) chalmerlowe$ \n", 193 | " ```\n", 194 | "
\n", 195 | " \n", 196 | " **Windows**\n", 197 | "\n", 198 | " ```bat\n", 199 | " C:\\> activate mlenv\n", 200 | " (mlenv) C:\\>\n", 201 | " ```\n", 202 | "
\n", 203 | "\n", 204 | "1. Install the following additional packages to your virtual environment (there may be a lot, make sure you get them all):\n", 205 | "\n", 206 | " ```bash\n", 207 | " (mlenv) chalmerlowe$ conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn \n", 208 | " ```\n", 209 | "
\n", 210 | "\n", 211 | "1. Test your installation, by typing the following on your command line/terminal:\n", 212 | "\n", 213 | " ```bash\n", 214 | " (mlenv) chalmerlowe$ jupyter lab \n", 215 | " ```\n", 216 | "
\n", 217 | " \n", 218 | "If your browser opens with a Jupyter Lab instance, you will know the install process succeeded. \n", 219 | "\n", 220 | "## Getting the actual class notebooks:\n", 221 | "\n", 222 | "1. You can do this step now, but I would suggest waiting til the morning of the class (see WARNING below). To get the latest version of the course material, navigate to the [**class github repository (link)**](https://github.com/chalmerlowe/machine_learning) and press the Big Green `Clone OR Download` button.\n", 223 | "\n", 224 | "1. Then press the `Download Zip` button to download a zip file of all the course content.\n", 225 | "\n", 226 | "1. Unzip the content into the new folder you have made (`mltutorial`). Once it is unzipped, you should see it in your Jupyter Lab interface.\n", 227 | "\n", 228 | "**WARNING**: the class material will be undergoing revision all the way up until the day of the class (\"...Conference Driven Development\") ... please be prepared to update your copy of the course material on the morning of the tutorial (either by using `git` if you are familiar with that tool OR by deleting your local files and downloading a fresh copy)." 
229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Done with commands for now!\n", 236 | "\n", 237 | "If you have been successful with the above steps, you are done for now.\n", 238 | "\n", 239 | "\n", 240 | "\n", 241 | "If you like reading, you can also keep reading this page to learn more about what we did, why we chose `conda`, what happened behind the scenes, etc!\n", 242 | "\n", 243 | "If you ran into any problems, feel free to reach out via info@darkartofcoding.com" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# The Big Picture \n", 251 | "---" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## What is miniconda (conda) and why did we install it?\n", 259 | "\n", 260 | "Miniconda contains the `conda` package manager/virtual environment manager and `Python`. `conda` is language agnostic, so you can also use it to support work with languages besides Python. Once miniconda is installed, you will be able to: \n", 261 | "\n", 262 | "* create virtual environments and \n", 263 | "* manage separate installations of `Python` (including different versions) \n", 264 | "* manage Python packages/libraries\n", 265 | "* as well as manage packages in other languages ... packages that are fundamentally unmanageable by Python-only tools like `pip` & `virtualenv`.\n", 266 | "\n", 267 | "Whenever you work on a new project, you should create a separate environment for that project. `conda` lets you do this easily and efficiently. \n", 268 | "\n", 269 | "## And what is a virtual environment?\n", 270 | "\n", 271 | "When you create a virtual environment, `conda` will add subdirectories to a miniconda directory on your computer. 
Specifically it will create a directory that will contain:\n", 272 | "\n", 273 | "* a database and metadata about the virtualenv\n", 274 | "* software and libraries related to the project (i.e., Python and any modules you install in the virtualenv)\n", 275 | "\n", 276 | "NOTE: this virtualenv folder is **NOT** a duplicate of your project folder **NOR** does it contain your code/class material" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# Deep dive\n", 284 | "---" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## What is a virtual environment?\n", 292 | "\n", 293 | "As mentioned above, virtual environments (also called virtualenvs) are tools used to keep projects separate, especially in terms of keeping different software versions separate and different library versions separate. For example, virtualenvs prevent Python's site packages folder from getting filled with potentially conflicting versions of software AND thus prevents problems that arise when one project needs **version x.x** of a library but another project needs **version y.y** of the same library. At their core, virtualenvs are glorified directories that use scripts and metadata to organize and control the environment. You are allowed to have an essentially unlimited number of virtual environments. And as you saw above, they are very easy to create using various command line tools, such as `conda`.\n", 294 | "\n", 295 | "## When should we use a virtual environment?\n", 296 | "\n", 297 | "Anytime you have more than one project and there is a possibility of conflicts between your libraries, it is a good time to use a virtual environment. Having said that, many programmers use virtual environments for **all but the most trivial** programming tasks. 
Especially for beginners, using virtual environments early on in your learning career will build a valuable skill AND help prevent sneaky bugs related to version discrepancies. Bugs that can be hard to diagnose.\n", 298 | "\n", 299 | "## How do you create a virtual environment?\n", 300 | "\n", 301 | "While there are several programs or libraries that can generate virtual environments, for today's lesson we will be using the `conda` package manager, which includes the capability to simply and easily produce virtual environments.\n", 302 | "\n", 303 | "Presuming you have `conda` installed, these steps enable you to create and activate a virtual environment.\n", 304 | "\n", 305 | "```bash\n", 306 | "$ conda create -n mlenv python=3\n", 307 | "```\n", 308 | "\n", 309 | "Description:\n", 310 | "* `conda` runs the conda program.\n", 311 | "* `create` tells it to create a virtualenv\n", 312 | "* `-n` identifies the name of the virtualenv, in this case, `mlenv`\n", 313 | "* `python=3` tells conda that you want to install Python version 3 in this virtualenv\n", 314 | "\n", 315 | "**NOTE**: for other projects, you **can** use `python=2` or `python=3` and regardless which you choose, conda will default to the most recent version of Python within the version 2 OR version 3 family. If you need to select a specific minor version of python, use the following syntax:\n", 316 | "\n", 317 | "`python=3.7`\n", 318 | "\n", 319 | "When you execute the `conda create` command, `conda` prepares to install Python and any dependencies that Python relies upon. It will display output similar to the following. 
\n", 320 | "\n", 321 | "```bash\n", 322 | "my_macbook:my_folder chalmerlowe$ conda create -n mlenv python=3\n", 323 | "Fetching package metadata .......\n", 324 | "Solving package specifications: ..........\n", 325 | "\n", 326 | "Package plan for installation in environment /Users/chalmerlowe/miniconda3/envs/stats:\n", 327 | "\n", 328 | "The following packages will be downloaded:\n", 329 | "\n", 330 | " package | build\n", 331 | " ---------------------------|-----------------\n", 332 | " openssl-1.0.2k | 1 3.0 MB\n", 333 | " python-3.6.0 | 0 11.7 MB\n", 334 | " setuptools-27.2.0 | py36_0 523 KB\n", 335 | " wheel-0.29.0 | py36_0 87 KB\n", 336 | " pip-9.0.1 | py36_1 1.7 MB\n", 337 | " ------------------------------------------------------------\n", 338 | " Total: 17.0 MB\n", 339 | "\n", 340 | "The following NEW packages will be INSTALLED:\n", 341 | "\n", 342 | " openssl: 1.0.2k-1\n", 343 | " pip: 9.0.1-py36_1\n", 344 | " python: 3.6.0-0\n", 345 | " readline: 6.2-2\n", 346 | " setuptools: 27.2.0-py36_0\n", 347 | " sqlite: 3.13.0-0\n", 348 | " tk: 8.5.18-0\n", 349 | " wheel: 0.29.0-py36_0\n", 350 | " xz: 5.2.2-1\n", 351 | " zlib: 1.2.8-3\n", 352 | "\n", 353 | "Proceed ([y]/n)?\n", 354 | "```\n", 355 | "\n", 356 | "To finish the creation of the virtualenv and install the software, press `y`.\n", 357 | "\n", 358 | "## Activating a virtualenv\n", 359 | "\n", 360 | "Once you have created a virtualenv, you will need to activate it. Activation has several side effects:\n", 361 | "\n", 362 | "* It temporarily changes your `$PATH` variable so calls to the `python` command (and similar commands) will look first in the virtual environment's `bin/` directory. \n", 363 | "* It temporarily changes your shell prompt to show which virtual environment you are using. 
Your prompt will likely look something like this, with the name of your virtual environment in parentheses in front of the prompt:\n", 364 | " * Mac/Linux: `(mlenv) chalmerlowe$`\n", 365 | " * Windows: `(mlenv) C:\\>`\n", 366 | "\n", 367 | "To activate your virtual environment, run the appropriate command for your operating system:\n", 368 | "\n", 369 | "**Mac/Linux**\n", 370 | "\n", 371 | "```bash\n", 372 | "$ conda activate mlenv\n", 373 | "```\n", 374 | "\n", 375 | "**Windows**\n", 376 | "\n", 377 | "```bat\n", 378 | "C:\\> activate mlenv\n", 379 | "```\n", 380 | "\n", 381 | "**Note:** If you are using **Power Shell**, `activate` won't work out of the box. Type `cmd` first to get a regular command prompt, *then* `activate mlenv`.\n", 382 | "\n", 383 | "### Adding software to your virtualenv \n", 384 | "\n", 385 | "To add more software to the virtualenv, you can use `conda` to install the software. The maintainers of conda provide access to many Python and non-Python libraries, but not all of them. If conda cannot install a particular library that you need, you can generally use `pip` or a similar package installation tool to install it instead (covering `pip` is outside the scope of this workshop).\n", 386 | "\n", 387 | "For example, to install Jupyter, you can use the following `conda` command:\n", 388 | "\n", 389 | "```\n", 390 | "conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn \n", 391 | "```\n", 392 | "\n", 393 | "Conda will prepare to install Jupyter and any dependencies that Jupyter relies upon. 
It will display output similar to the following (truncated to save space).\n", 394 | "\n", 395 | "```bash\n", 396 | "Fetching package metadata .......\n", 397 | "Solving package specifications: ..........\n", 398 | "\n", 399 | "Package plan for installation in environment /Users/chalmerlowe/miniconda3:\n", 400 | "\n", 401 | "The following packages will be downloaded:\n", 402 | "\n", 403 | " package | build\n", 404 | " ---------------------------|-----------------\n", 405 | " conda-env-2.6.0 | 0 601 B\n", 406 | " ...\n", 407 | " ipython-5.3.0 | py35_0 1021 KB\n", 408 | " conda-4.3.14 | py35_0 505 KB\n", 409 | " ------------------------------------------------------------\n", 410 | " Total: 3.8 MB\n", 411 | "\n", 412 | "The following NEW packages will be INSTALLED:\n", 413 | "\n", 414 | " appnope: 0.1.0-py35_0\n", 415 | " ...\n", 416 | " wcwidth: 0.1.7-py35_0\n", 417 | "\n", 418 | "The following packages will be UPDATED:\n", 419 | "\n", 420 | " conda: 4.1.11-py35_0 --> 4.3.14-py35_0\n", 421 | " conda-env: 2.5.2-py35_0 --> 2.6.0-0\n", 422 | " requests: 2.10.0-py35_0 --> 2.13.0-py35_0\n", 423 | "\n", 424 | "Proceed ([y]/n)?\n", 425 | "```\n", 426 | "\n", 427 | "To finish the installation of Jupyter and its dependencies, press `y`.\n", 428 | "\n", 429 | "### Multiple packages\n", 430 | "\n", 431 | "Multiple packages can be installed at the same time, by separating the package names with spaces:\n", 432 | "\n", 433 | "`conda install matplotlib numpy pandas scipy`\n", 434 | "\n", 435 | "**IF** there are special packages that you need to get from a specific repository channel (i.e. 
the conda-forge channel), you can designate a channel using the `-c` flag and the name of the channel (such as `conda-forge`) as shown here:\n", 436 | "\n", 437 | "`conda install -c conda-forge jupyter jupyterlab pandas matplotlib scipy numpy scikit-learn requests ipython seaborn`\n", 438 | "\n", 439 | "### Leaving the virtualenv when you are done\n", 440 | "\n", 441 | "When you are done working in your virtualenv, you can deactivate it using the following command:\n", 442 | "\n", 443 | "**Mac/Linux**\n", 444 | "\n", 445 | "```bash\n", 446 | "(mlenv) $ conda deactivate\n", 447 | "$\n", 448 | "```\n", 449 | "\n", 450 | "**Windows**\n", 451 | "\n", 452 | "```bat\n", 453 | "(mlenv) C:\\> deactivate\n", 454 | "C:\\>\n", 455 | "```" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "## Resources\n", 463 | "\n", 464 | "\n", 465 | "\n", 466 | "* [Using conda](http://conda.pydata.org/docs/using/index.html): A tutorial on how to use `conda`\n", 467 | "\n", 468 | "* [conda cheatsheet](https://conda.io/docs/_downloads/conda-cheatsheet.pdf): A cheatsheet of the most common `conda` commands\n", 469 | "\n", 470 | "* [conda myths and misconceptions](http://jakevdp.github.io/blog/2016/08/25/conda-myths-and-misconceptions/): Reasons why conda was created and how it differs from `pip`, `virtualenv`, etc.\n", 471 | "\n", 472 | "* [Python's `venv` and `virtualenv` can also create virtual environments.](http://stackoverflow.com/questions/41573587/what-is-the-difference-between-venv-pyvenv-pyenv-virtualenv-virtualenvwrappe)\n", 473 | "\n", 474 | "* [`pip` is Python's package manager.](https://en.wikipedia.org/wiki/Pip_(package_manager))" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": "Python 3", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | 
"mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.6.7" 495 | } 496 | }, 497 | "nbformat": 4, 498 | "nbformat_minor": 2 499 | } 500 | -------------------------------------------------------------------------------- /05_neighbors/05_neighbors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "k-Means Clustering\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Cover an overview of k-Means Clustering\n", 29 | "* Examine code samples that walk us through **The Process™**:\n", 30 | " * Prep the data\n", 31 | " * Choose the model\n", 32 | " * Choose appropriate hyperparameters\n", 33 | " * Fit the model\n", 34 | " * Apply the model\n", 35 | " * Examine the results\n", 36 | "* Explore a deep dive into this model\n", 37 | "* Review some gotchas that might complicate things\n", 38 | "* Review tips related to learning more" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Overview: k-Means Clustering\n", 46 | "---" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "The goal of a clustering algorithm is to assign data points to the same group if they are similar and to assign data points to different groups if they are different.\n", 54 | "\n", 55 | "Clustering models are popular machine learning models because they:\n", 56 | "\n", 57 | "* are **unsupervised** and thus don't require pre-determined labels\n", 58 | 
"* can accommodate multidimensional datasets\n", 59 | "* can, for simple cases, be fairly easy to interpret, especially in 2D/3D via charts\n", 60 | "\n", 61 | "The k-Means Clustering algorithm: \n", 62 | "\n", 63 | "* looks for the arithmetic mean of all points in a cluster to identify the cluster centers\n", 64 | "* groups points together by identifying the closest cluster center\n", 65 | "\n", 66 | "For this example, we will use the `KMeans` model. `The sklearn.cluster` module has a number of clustering models, including:\n", 67 | "\n", 68 | "* AffinityPropagation\n", 69 | "* DBSCAN\n", 70 | "* KMeans\n", 71 | "* MeanShift\n", 72 | "* SpectralClustering\n", 73 | "* and more...\n", 74 | "\n", 75 | "With this background, let's apply **The Process™** on the `KMeans` Clustering model." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Prep the data" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "We start with a set of standard imports..." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import matplotlib.pyplot as plt\n", 99 | "import numpy as np\n", 100 | "import pandas as pd\n", 101 | "import sklearn\n", 102 | "from sklearn.model_selection import train_test_split\n", 103 | "\n", 104 | "# NOTE: during the Choose the Model step, we will import the \n", 105 | "# model we want, but there is no reason you can't import it here.\n", 106 | "# from sklearn.cluster import KMeans" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Prep the training data and test data" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "A number of data generating functions exist in Scikit-Learn to help you create data sets that you can use to play with and manipulate the models. 
For this example, I want to explore one of these data generation libraries: \n", 121 | "\n", 122 | "```python\n", 123 | "sklearn.datasets.samples_generator.make_blobs\n", 124 | "```\n", 125 | "\n", 126 | "This dataset generator produces preformatted `features` matrices and `target` arrays." 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "This dataset is composed of:\n", 134 | "\n", 135 | "* a `features matrix` of `x`-`y` vectors that can be plotted on a chart\n", 136 | "* a `target array` of cluster labels" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "from sklearn.datasets.samples_generator import make_blobs\n", 146 | "\n", 147 | "X, y = make_blobs(n_samples=400,\n", 148 | " centers=4,\n", 149 | " cluster_std=0.70,\n", 150 | " random_state=13)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Since we have never made blobs before, we should check to see what the output looks like:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "X.shape" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "X[:5]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Initially, looking at only a five element slice of the labels didn't show me all the possible categories, so I expanded the slice a little, then more, then a lot." 
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "y[:100]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "**TIP**: A quick way to confirm exactly which categories you have, if you have lots of them is to use the `np.unique()` method to deduplicate the elements stored in your array (i.e. `y`)." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "np.unique(y)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "In this case, we are gonna stick with the defaults in terms of the size of the test set and in terms of the random seed." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "plt.scatter(X_train[:, 0], X_train[:, 1])\n", 233 | "plt.title(\"Four well behaved clusters\");" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Choose the Model" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from sklearn.cluster import KMeans" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Choose Appropriate Hyperparameters" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "Here we choose to assign one hyperparameter: `n_clusters`. We will discuss it later." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "model = KMeans(n_clusters=4)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "There are a number of hyperparameters... we will cover several in greater depth later.\n", 284 | "\n", 285 | "```python\n", 286 | "KMeans(\n", 287 | " n_clusters=8,\n", 288 | " init='k-means++',\n", 289 | " n_init=10,\n", 290 | " max_iter=300,\n", 291 | " tol=0.0001,\n", 292 | " precompute_distances='auto',\n", 293 | " verbose=0,\n", 294 | " random_state=None,\n", 295 | " copy_x=True,\n", 296 | " n_jobs=None,\n", 297 | " algorithm='auto',\n", 298 | ")\n", 299 | "```" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "## Fit the Model" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "This model doesn't need OR use any labels, so we simply feed in the `X_train` data." 
314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "model.fit(X_train)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Apply the Model" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": { 338 | "collapsed": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "y_pred = model.predict(X_test)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "y_pred.shape" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "Again, here as above, we don't get to see all the categories by only looking at a five element slice." 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "y_pred[:5]" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## Examine the results" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "If we plot the clusters and use the predicted labels as the basis for assigning colors, we see that the model correctly grouped the samples into clusters." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n", 397 | " c=y_pred,\n", 398 | " cmap='seismic', alpha=0.2);" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "We mentioned that the model drills down to a center for each cluster. 
If you want to know where the centers are, the model stores them as an attribute called `.cluster_centers_`. \n", 406 | "\n", 407 | "**Notice** the trailing underscore (`_`) at the end of the attribute name to show that it is a computed value." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "model.cluster_centers_" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "ctrs = model.cluster_centers_" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "With these points, it is simple enough to plot them on the chart. Here we highlight them as large (`s=150`) white dots (`c='white'`) outlined in black (`edgecolors='black'`)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n", 442 | " c=y_pred,\n", 443 | " cmap='seismic', alpha=0.2)\n", 444 | "\n", 445 | "plt.scatter(ctrs[:, 0], ctrs[:, 1],\n", 446 | " c='white',\n", 447 | " edgecolors='black',\n", 448 | " s=150,\n", 449 | " );" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# Deep Dive\n", 457 | "---\n", 458 | "\n", 459 | "The k-Means Clustering model works based on a process called **Expectation-Maximization**. 
In this process, the model:\n", 460 | "\n", 461 | "* starts by randomly picking some cluster centers\n", 462 | "* repeats the following cycle until the model converges\n", 463 | " * Expectation: assign points to the closest cluster center\n", 464 | " * Maximization: use the points of the newly formed clusters to calculate a new mean to use as a new cluster center\n", 465 | " \n", 466 | "The process is designed such that for every cycle of the Expectation and Maximization steps, the model will always have a better estimation of any given cluster." 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "# Gotchas\n", 474 | "---" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "**No global guarantees**: despite the promise of convergence... there is no guarantee that as a whole the clusters produced will globally be the most suitable clusters.\n", 482 | "\n", 483 | "It really depends on the randomly selected initial cluster centers. To overcome this limitation, the model typically runs the algorithm multiple times. The default `n_init` is set at `10`.\n", 484 | "\n", 485 | "**You must decide on the number of clusters**: when we set the hyperparameters, we need to initialize the model with the some number of clusters. The default `n_clusters` is set at `8`.\n", 486 | "\n", 487 | "* There are other models that may provide some measure of the fitness of the number of clusters: `GaussianMixture`\n", 488 | "* There are other models that can choose a suitable number of clusters: `DBSCAN`, `MeanShift`\n", 489 | "\n", 490 | "**Speed considerations**: clustering algorithms can be slow on large datasets." 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "**What do the colors OR category labels really mean?**:\n", 498 | "\n", 499 | "Turns out, not much. 
The labels aren't magic, they don't carry meaning beyond: every sample in this set with this label is tied to or associated with samples that carry the same label. Due to the way that the model randomizes the clustering process, during one run of the algorithm, a group of samples may be numbered `0`, while during a subsequent run of the algorithm, a similar grouping of samples might be numbered `1`.\n", 500 | "\n", 501 | "To demo this principle, let's plot the **test** data and the **training** data on the same chart.\n", 502 | "\n", 503 | "Remember, in scatter plots:\n", 504 | "\n", 505 | "* `c` values are assigned based on the labels we provide\n", 506 | "* `cmap` maps a color to each value associated with `c`\n", 507 | "* `seismic` is a range of colors from deep blue to deep red\n", 508 | "\n", 509 | "\n", 510 | "\n", 511 | "* the plot selects four colors from the `seismic` range and assigns one color to each of the four labels that are present in `c` \n", 512 | " * Almost black\n", 513 | " * Blue\n", 514 | " * Maroon\n", 515 | " * Red\n", 516 | "\n", 517 | "Notice, the algorithms correctly lump the samples into clusters, but there is no correlation between the labels from the training run and the test run.\n", 518 | "\n", 519 | "The cluster at the top of the chart ended up having two separate labels and thus shows up as two different colors. Don't let that throw you." 
520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n", 529 | " c=y_pred,\n", 530 | " cmap='seismic', alpha=0.5)\n", 531 | "\n", 532 | "plt.scatter(X_train[:, 0], X_train[:, 1],\n", 533 | " c=y_train,\n", 534 | " cmap='seismic', alpha=0.2);\n" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "# How to learn more: tips and hints\n", 542 | "---" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "**Read the error messages**: They are sometimes scarier than the docs but they will often give you some insight into the nature of the problem.\n", 550 | "\n", 551 | "Pay attention to the errors. While putting this lesson together, I cobbled some content together from various notebooks using cut and paste and I failed to paste the line where I actually called the `.fit()` method. This error message was the result.\n", 552 | "\n", 553 | "```python\n", 554 | "NotFittedError: This KMeans instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.```\n", 555 | "\n", 556 | "One of the most common mistakes that I see with students is that they don't read the error messages. Why?\n", 557 | "\n", 558 | "* Sometimes error messages aren't written by humans\n", 559 | "* Sometimes error messages are well-written, clear and concise, but a beginner doesn't yet understand what the message is trying to say\n", 560 | "* Sometimes students make three or four changes to a snippet and then run the code and immediately presume that the last edit was the breaking change. So they go down some rabbithole on the wrong line of code\n", 561 | "\n", 562 | "So get used to reading the error messages, try to understand them. If they are foreign to you, Google some of the key phrases... 
it is often comforting to see how many folks out there have asked the same question about what a specific error means." 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "As far as additional topics to explore? Read up on **Scoring and validating your models**:\n", 570 | "\n", 571 | "[https://scikit-learn.org/stable/modules/learning_curve.html](https://scikit-learn.org/stable/modules/learning_curve.html)\n", 572 | "\n", 573 | "[https://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "# Experience Points!\n", 581 | "---" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": { 587 | "slideshow": { 588 | "slide_type": "slide" 589 | } 590 | }, 591 | "source": [ 592 | "## Task 01" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "Let's play around a bit...\n", 600 | "\n", 601 | "The following code is set up to make it easy for you to change the standard deviation for the clusters that `make_blobs()` will generate.\n", 602 | "\n", 603 | "The initial `cluster_std` is set at `0.7` (just as it was in the examples above).\n", 604 | "Execute the following cells to see the scatter plots showing the training data and the test/predicted data.\n", 605 | "\n", 606 | "Then change the values for `cluster_std` to each of the following values one by one, executing the cells for each value so that you can see how the clustering model responds as the spread of the input data increases.\n", 607 | "\n", 608 | "* 1\n", 609 | "* 2\n", 610 | "* 3" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "slideshow": { 618 | "slide_type": "slide" 619 | } 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "from sklearn.datasets import make_blobs\n", 624 | "\n", 625 | "cluster_std = 0.7 # std of 0.70 gives tight 
clusters, try other options!\n", 626 | "random_state = 13\n", 627 | "\n", 628 | "X, y = make_blobs(n_samples=400,\n", 629 | " centers=4,\n", 630 | " cluster_std=cluster_std,\n", 631 | " random_state=random_state)\n", 632 | "\n", 633 | "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", 634 | "\n", 635 | "plt.scatter(X_train[:, 0], X_train[:, 1],\n", 636 | " c=y_train,\n", 637 | " cmap='seismic', alpha=0.5);" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "from sklearn.cluster import KMeans" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "model = KMeans(n_clusters=4)\n", 656 | "model.fit(X_train)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "y_pred = model.predict(X_test)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "plt.scatter(X_test[:, 0], X_test[:, 1],\n", 675 | " c=y_pred,\n", 676 | " cmap='seismic', alpha=0.5);" 677 | ] 678 | }, 679 | { 680 | "cell_type": "markdown", 681 | "metadata": {}, 682 | "source": [ 683 | "---\n", 684 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 685 | "\n", 686 | "If you want to continue on at your own-pace, please feel free to do so.\n", 687 | "\n", 688 | "" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "# References\n", 696 | "---" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Below are references that may assist you in learning more:\n", 704 | " \n", 705 | "|Title (link)|Comments|\n", 706 | "|---|---|\n", 707 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n", 708 | "|[KMeans API Reference](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)||\n", 709 | "|[User Guide](https://scikit-learn.org/stable/modules/clustering.html#k-means)||\n", 710 | "|[Sample datasets](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets)|Load or create datasets for practice and study|\n", 711 | "|[Make blobs](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs)|Specifically make clusters of values|" 712 | ] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.6.7" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 2 736 | } 737 | -------------------------------------------------------------------------------- /01_intro_to_sklearn/01_intro_to_sklearn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the 
Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Intro to Scikit-Learn\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Explore machine learning techniques, tools and categories\n", 29 | " * Supervised learning\n", 30 | " * Unsupervised learning\n", 31 | " * Classification\n", 32 | " * Regression\n", 33 | " * Clustering\n", 34 | " * Dimensionality reduction\n", 35 | "* Review key characteristics of Scikit-Learn, especially the application programming interface (API)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Machine Learning Techniques, Tools and Categories\n", 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Machine learning falls into two main categories: **supervised learning** and **unsupervised learning**." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Supervised learning" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Supervised learning is the process of modeling the relationship between features of a dataset and targets (labels) associated with each sample of that dataset. With a model in hand, it is possible to use the model to either assign labels to a new dataset that doesn't yet have labels or calculate output values. The most common examples of supervised learning include: **classification** and **regression**." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Classification" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Classification allows you to assign discrete **labels or categories** to new input data.\n", 79 | "\n", 80 | "|Inputs|Classification|\n", 81 | "|:---|:---|\n", 82 | "|Texts, emails, or comments|Spam detection|\n", 83 | "|Flowers, insects, or animals|Species detection|\n", 84 | "|Viewers, readers, buyers|Customer detection|" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Regression" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Regression analysis allows you to predict **continuous quantities** based on new input data. \n", 99 | "\n", 100 | "|Inputs|Outputs|\n", 101 | "|:---|:---|\n", 102 | "|Auto characteristics (color, model, age, etc)|Price|\n", 103 | "|Advertising dollars spent|Sales revenue|\n", 104 | "|Candidate characteristics|Salary|" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Unsupervised learning" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Unsupervised learning is the process of modeling relationships amongst features of a dataset in a way that classifies the raw data without supplying any input labels. There are many algorithms that enable relationships to be identified and each of these models seek to replicate human logic in finding patterns in data. Two of the most common unsupervised learning approaches are **clustering** and **dimensionality reduction**." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Clustering" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Cluster analysis or clustering is a technique for grouping a collection of objects so that all the objects in a single cluster are more similar to each other than to objects in other clusters.\n", 133 | "\n", 134 | "|Inputs|Classification|\n", 135 | "|:---|:---|\n", 136 | "|Images|Grouping/categorization|\n", 137 | "|Marketing data|Customer segmentation|\n", 138 | "|Social network data|Community classification|" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Dimensionality reduction" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Dimensionality reduction (also dimension reduction) is the process of reducing the number of random variables in a dataset by identifying a set of principal variables. Dimensionality reduction can be used for feature selection or feature extraction.\n", 153 | "\n", 154 | "As an example, presume you have a dataset with 10 features for coffees:\n", 155 | "* Cup size\n", 156 | "* Roast (dark, etc)\n", 157 | "* Flavoring (nutmeg, vanilla, etc)\n", 158 | "* Country of origin\n", 159 | "* Organic status (organic, not organic)\n", 160 | "* Sustainability status (sustainably harvested?)\n", 161 | "* Preparation (espresso, latte, etc)\n", 162 | "* etc\n", 163 | "\n", 164 | "If, through dimensionality reduction, we can determine that the most influential determinant of whether a coffee will sell well is cup size, roast, flavoring, and preparation, we may be able to speed up our analysis OR reduce our computational overhead by reducing the 10 features down to three OR four. 
\n", 165 | "\n", 166 | "In some cases, data analysis such as regression or classification can be done in the reduced space more easily and/or accurately than in the original space.\n", 167 | "\n", 168 | "Some benefits from using dimensionality reduction include:\n", 169 | "\n", 170 | "* It reduces the computation time and storage space requirements\n", 171 | "* It can enable easier data visualization if the dimensions can be reduced to much lower dimensions like 2D/3D\n", 172 | "* It can improve the interpretation of the parameters of a machine learning model\n", 173 | "* It helps to avoid issues related to increase in data sparsity as data volume increases ([curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Key Characteristics of Scikit-Learn\n", 181 | "---" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Scikit-Learn is a well known package that provides access to many common machine learning algorithms through a consistent, well-organized Application Programming Interface (API) and is supported by very thorough and comprehensive documentation.\n", 189 | "\n", 190 | "The uniform syntax and the consistency in how the API is designed means that once you learn one model, it is surprisingly easy to pick up additional models.\n", 191 | "\n", 192 | "A key goal for this workshop is for you to walk away:\n", 193 | "\n", 194 | "* understanding the API\n", 195 | "* with an improved knowledge of the vocabulary of machine learning\n", 196 | "* knowing how to learn more\n", 197 | "\n", 198 | "If we succeed in these goals, you will be well-poised to continue your journey and to pursue future studies in the awesomeness that is machine learning." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "# The Scikit-Learn API\n", 206 | "---" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "The Scikit-Learn interface follows a number of guidelines covered in the API Contract (as defined in the [**API design paper**](https://arxiv.org/abs/1309.0238)). Quoting from that paper:\n", 214 | "\n", 215 | "> As much as possible, our design choices have been guided so as to avoid the\n", 216 | "proliferation of framework code. We try to adopt simple conventions and to\n", 217 | "limit to a minimum the number of methods an object must implement. The API\n", 218 | "is designed to adhere to the following broad principles:\n", 219 | "\n", 220 | "> **Consistency**. All objects (basic or composite) share a consistent interface composed of a limited set of methods. This interface is documented in a consistent manner for all objects.\n", 221 | "\n", 222 | "> **Inspection**. Constructor parameters and parameter values determined by learning algorithms are stored and exposed as public attributes.\n", 223 | "\n", 224 | "> **Non-proliferation of classes**. Learning algorithms are the only objects to be\n", 225 | "represented using custom classes. Datasets are represented as NumPy arrays\n", 226 | "or SciPy sparse matrices. Hyper-parameter names and values are represented\n", 227 | "as standard Python strings or numbers whenever possible. This keeps scikit-learn easy to use and easy to combine with other libraries.\n", 228 | "\n", 229 | "> **Composition**. Many machine learning tasks are expressible as sequences or\n", 230 | "combinations of transformations to data. Some learning algorithms are also\n", 231 | "naturally viewed as meta-algorithms parametrized on other algorithms. Whenever feasible, such algorithms are implemented and composed from existing\n", 232 | "building blocks.\n", 233 | "\n", 234 | "> **Sensible defaults**. 
Whenever an operation requires a user-defined parameter,\n", 235 | "an appropriate default value is defined by the library. The default value\n", 236 | "should cause the operation to be performed in a sensible way (giving a baseline solution for the task at hand)." 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "For some concrete details on how the API is put together: **[Contributors API Overview](https://scikit-learn.org/stable/developers/contributing.html#api-overview)**" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Using the Scikit-Learn API" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "By and large, using any given model in Scikit-Learn will follow a set of straightforward steps. Each of our examples will follow what I call **The Process™**.\n", 258 | "\n", 259 | "1. **Prep the data**: the data must be well prepared for it to be usable in the various models. This preparation may include normalization, cleansing, wrangling of the data. It often needs to be separated into a `features` matrix and a `target` vector (array) and/or may need to be broken into separate collections of data for training versus testing purposes.\n", 260 | "\n", 261 | "1. **Choose the model**: to choose a model, we will import the appropriate estimator class\n", 262 | "\n", 263 | "1. **Choose appropriate hyperparameters**: to prepare the model, we create a class instance and provide hyperparameters as arguments to the class\n", 264 | "\n", 265 | "1. **Fit the model**: to fit the model to the existing data, we call the `.fit()` method on the model instance and provide training data\n", 266 | "\n", 267 | "1. 
**Apply the model**: next, we apply the model to new data, primarily by calling one of two methods:\n", 268 | "\n", 269 | " * **Supervised learning**: generally, we use the `.predict()` method to predict new labels\n", 270 | " * **Unsupervised learning**: generally, we use either the `.predict()` or `.transform()` methods to predict properties OR transform properties of the data.\n", 271 | " \n", 272 | "1. **Examine the results**: lastly, it is recommended that we look over the results and do a sanity check. Some of this can be done by simply looking at output values. Other times it really helps to have some form of data visualization (i.e. graph/chart) to help us examine the model predictions or transformations." 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## A quick demo" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "To whet our appetite for what's to come, we will take a quick look at coffee prices near the North Shore of Oahu, Hawaii. Our goal will be to predict the price of a cup of coffee, given a cup size.\n", 287 | "\n", 288 | "These prices come from several coffee shops in the area, in 2019.\n", 289 | "\n", 290 | "|Size (oz)|Price ($)|\n", 291 | "|----|----|\n", 292 | "|12|2.95|\n", 293 | "|16|3.65|\n", 294 | "|20|4.15|\n", 295 | "|14|3.25|\n", 296 | "|18|4.20|\n", 297 | "|20|4.00|\n" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "### Prep the data" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "Let's look at the data in a simple scatter plot to compare the cost of coffee versus the size of the cup." 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "We start with a set of standard imports..." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "import matplotlib.pyplot as plt\n", 328 | "import numpy as np\n", 329 | "\n", 330 | "# NOTE: during the Choose the Model step, we will import the \n", 331 | "# model we want, but there is no reason you can't import it here.\n", 332 | "# from sklearn.linear_model import LinearRegression" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Prep the training and test data" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "**The training data**:\n", 347 | "\n", 348 | "We start off by making two `numpy` arrays." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "x_train = np.array([12, 16, 20, 14, 18, 20]) # Coffee sizes\n", 358 | "y_train = np.array([2.95, 3.65, 4.15, 3.25, 4.20, 4.00]) # Coffee prices" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "Then we plot them using a `matplotlib` scatter plot." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "plt.scatter(x_train, y_train);" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "In order to put this data into a linear regression machine learning algorithm, we need to create our features matrix, which includes just our coffee sizes (`x_train` values).\n", 382 | "\n", 383 | "In this case, we will use one of the `numpy` techniques to increase the dimensionality of the `x_train` array. 
We will discuss this process in greater detail in a few minutes.\n", 384 | "```\n", 385 | "X_train = x_train[:, np.newaxis]\n", 386 | "```\n", 387 | "\n", 388 | "We will call our training set: `X_train` (with an upper case `X`)." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "X_train = x_train[:, np.newaxis] # creates an array of arrays\n", 398 | "X_train" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "Our target values are generally labeled `y_train` (with a lower case `y`) and these values can be a simple array." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "y_train" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "**Now, the test data**:" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "We need to have some test data to see what values the model will predict. Let's presume that some friends will be coming to the North Shore of Oahu and want to buy some coffee in various sizes, including some potentially unusual sizes.\n", 429 | "\n", 430 | "Based on their requests, we prep several cup sizes to see what price the model will predict.\n", 431 | "\n", 432 | "We generate a set of `x_test` values (representing size in oz.) in an array. Then we convert the array to a 2D matrix for inclusion as an argument when we get to the prediction phase. As noted above, we will discuss this in detail shortly." 
433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "x_test = np.array([16, 15, 12, 20, 17])" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "X_test = x_test[:, None] # None will accomplish the same\n", 451 | "X_test # outcome as np.newaxis" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Choose the Model" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "For this quick example, we are gonna import a simple **linear regression** model from the sklearn collection of linear models." 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "from sklearn.linear_model import LinearRegression" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "### Choose Appropriate Hyperparameters" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "This model comes, as do most of the models in sklearn with arguments (or hyperparameters) set to sane defaults, so for this case, we won't add or change any arguments.\n", 489 | "\n", 490 | "**NOTE**: When Jupyter evaluates a model, it displays a string representation of that model with the current settings for the model, including any defaults." 
491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "model = LinearRegression()\n", 500 | "model" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### Fit the model" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "With a prepared model, we need to feed it data to evaluate. For this linear regression model, we give it two arguments: `X` and `y`." 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "model.fit(X_train, y_train)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "With these inputs, the model was able to calculate the **slope** (coefficient) and the **y-intercept** of the line that aligns most closely with our training data.\n", 531 | "\n", 532 | "Let's look at both of these calculated results.\n", 533 | "\n", 534 | "```python\n", 535 | "model.coef_\n", 536 | "model.intercept_\n", 537 | "```\n", 538 | "\n", 539 | "**NOTE**: scikit-learn appends an `_` to the end of attributes that return **calculated** values. 
It does this to help distinguish between inputs and outputs" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "model.coef_" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "model.intercept_" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "### Apply the model" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "y_pred = model.predict(X_test)\n", 574 | "y_pred\n", 575 | "\n", 576 | "# reminder, these were the test cup sizes: \n", 577 | "# [16, 15, 12, 20, 17]" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### Examine the Results" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "From here, we can plot all of the data points together on one chart:\n", 592 | "\n", 593 | "* original values in purple\n", 594 | "* predicted values in red\n", 595 | "* predicted slope of the line that best fits the original training data" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "plt.scatter(x_train, y_train, color='rebeccapurple')\n", 605 | "plt.scatter(x_test, y_pred, color='red', alpha=0.20)" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "plt.scatter(x_train, y_train, color='rebeccapurple')\n", 615 | "plt.plot(x_test, y_pred, color='red');" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "### Deep Dive" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | 
"metadata": {}, 628 | "source": [ 629 | "**The scikit-learn API**: The scikit-learn API is very rich and has many well-thought out approaches. The [API design contract document ](https://arxiv.org/pdf/1309.0238.pdf) helps characterize some of the philosophy behind the tool, which I found to be useful in understand how to use the tool. I probably wouldn't suggest reading the whole document right off the bat, but everything up to *Section 3 Advanced API* is a good overview of the philosophy, some aspects of data formats, and scikit-learn Estimators and Predictors.\n", 630 | "\n", 631 | "**Linear Regression**: We will revisit Linear Regression model in a later lesson and will provide more of a deep dive there." 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "### Gotchas" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "A significant struggle for beginners can be ensuring that the data is in the right format. We will cover that topic in the next session." 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "### How to learn more: tips and tricks" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "As we explore the Scikit-Learn API, and as we progress through the upcoming examples I want to pre-position you for success by showing you where and how you can learn more.\n", 660 | "\n", 661 | "One great resource to better understand the many options available to you in terms of the machine learning algorithms and the hyper parameters in scikit-learn is the API Reference. 
Throughout the following discussions, we will revisit the API reference repeatedly.\n", 662 | "\n", 663 | "**[API Reference](https://scikit-learn.org/stable/modules/classes.html)**: A one-stop shop for the classes and functions in `sklearn`" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "# Experience Points!\n", 671 | "---" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": { 677 | "slideshow": { 678 | "slide_type": "slide" 679 | } 680 | }, 681 | "source": [ 682 | "**Task 01**\n", 683 | "\n", 684 | "* Open the API Reference (mentioned above) and find the section on `model_selection.train_test_split`\n", 685 | "* Review that section (at a high level) for about 2 minutes looking for the following:\n", 686 | " * Make notes of any words that you aren't familiar with. See if you hear them later in this tutorial\n", 687 | " * Explore the section that describes what type of data the `train_test_split()` function returns. What will you get back?" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "---\n", 695 | "When you complete this exercise, please put your **green** post-it on your monitor. 
\n", 696 | "\n", 697 | "If you want to continue on at your own-pace, please feel free to do so.\n", 698 | "\n", 699 | "" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": {}, 705 | "source": [ 706 | "# References\n", 707 | "---" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "Below are references that may assist you in learning more:\n", 715 | " \n", 716 | "|Title (link)|Comments|\n", 717 | "|---|---|\n", 718 | "|[API Reference](https://scikit-learn.org/stable/modules/classes.html)|One stop shop for the classes and functions in `sklearn`|\n", 719 | "|[Contributors API Overview](https://scikit-learn.org/stable/developers/contributing.html#api-overview)|Overview of the API for contributors to scikit learn|\n", 720 | "|[API design contract](https://arxiv.org/abs/1309.0238)|An overview of the philosophy behind the API design|\n", 721 | "|[Regression Analysis](https://en.wikipedia.org/wiki/Regression_analysis)|An article on regression analysis|\n", 722 | "|[Cluster analysis](https://en.wikipedia.org/wiki/Cluster_analysis)|An article on cluster analysis|\n", 723 | "|[curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)|An article on the curse of dimensionality|" 724 | ] 725 | } 726 | ], 727 | "metadata": { 728 | "kernelspec": { 729 | "display_name": "Python 3", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.6.7" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 2 748 | } 749 | -------------------------------------------------------------------------------- /06_special_topics/06_special_topics.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Welcome to the Dark Art of Coding:\n", 8 | "## Introduction to Machine Learning\n", 9 | "Special Topics\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Objectives\n", 19 | "---" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In this session, students should expect to:\n", 27 | "\n", 28 | "* Understand the use of the `PolynomialFeatures()` method\n", 29 | "* Explore the use of `Pipelines` to create a workflow of transforms in combination with a final estimator\n", 30 | "* Use `PolynomialFeatures` in a `Pipeline` to explore underfitting and overfitting" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Overview: PolynomialFeatures\n", 38 | "---" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## PolynomialFeatures\n", 46 | "\n", 47 | "The PolynomialFeature class has a `.fit_transform()` method that transforms input values into a series of output values. These values are often used as inputs in other models.\n", 48 | "\n", 49 | "PolynomialFeatures generates a new feature matrix that has all the polynomial combinations of the original features with a degree less than or equal to the specified degree. \n", 50 | "\n", 51 | "As an example: \n", 52 | "\n", 53 | "An input sample has two dimensions (i.e. $[a, b]$) the resulting degree-2 polynomial features will be $[1, a, b, a^2, ab, b^2]$." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "We start with some standard imports:\n", 61 | " " 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import matplotlib.pyplot as plt\n", 71 | "import numpy as np\n", 72 | "import pandas as pd\n", 73 | "import sklearn\n", 74 | "from sklearn.preprocessing import PolynomialFeatures" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Let's start with a three element matrix:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "X = np.arange(3).reshape(3, 1)\n", 91 | "X" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "The simplest PolynomialFeatures is simply to return the original array, but notice that in this case, the function returns a column of `1`s as well as the original matrix." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "poly = PolynomialFeatures(1)\n", 108 | "poly.fit_transform(X)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Yields $1, a$ for each element in the X matrix" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "If you want to have a features matrix that doesn't include the column of `1`s, you can avoid it by using the `include_bias=False` argument.\n", 123 | "\n", 124 | "Including a bias column acts as an intercept term in a linear model." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "poly = PolynomialFeatures(1, include_bias=False)\n", 134 | "poly.fit_transform(X)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "poly = PolynomialFeatures(2)\n", 144 | "poly.fit_transform(X)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Yields $1, a, a^2$ for each element in the X matrix" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "poly = PolynomialFeatures(4)\n", 161 | "poly.fit_transform(X)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Yields $1, a, a^2, a^3, a^4$ for each element in the X matrix" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "X2 = np.arange(6).reshape(3, 2)\n", 178 | "X2" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "poly = PolynomialFeatures(1)\n", 188 | "poly.fit_transform(X2)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "Yields $1, a, b$ for each element in the X matrix" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "poly = PolynomialFeatures(2)\n", 205 | "poly.fit_transform(X2)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Yields $1, a, b, a^2, ab, b^2$ for each element in the X matrix" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | 
"execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "poly = PolynomialFeatures(3)\n", 222 | "poly.fit_transform(X2)\n", 223 | "\n", 224 | "# 1 a b a^2 ab b^2 a^3 a^2*b a*b^2 b^3" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Yields $1, a, b, a^2, ab, b^2, a^3, a^2b, ab^2, b^3$ for each element in the X matrix" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Thus for any `degree` that we feed into the PolynomialFeature model, we can transform an input matrix into a higher order matrix that may allow for potentially more precise calculations of `y` values, given values of `x`." 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Why does this matter?... if you recall from your math days it is possible to create very sophisticated curves using formulas such as this:\n", 246 | "\n", 247 | "$$\n", 248 | "y = mx + b \\\\\n", 249 | "y = ax^2 + bx + c \\\\\n", 250 | "y = ax^3 + bx^2 + cx + d \\\\\n", 251 | "y = ax^4 + bx^3 + cx^2 + dx + e \\\\\n", 252 | "$$\n", 253 | "\n", 254 | "With every additional argument and with the appropriate slopes, you have the ability to match a wide array of datasets.\n", 255 | "\n", 256 | "PolynomialFeatures helps you to generate matrices with multiple degrees so that you can run them through models like the LinearRegression model to identify the coefficients and intercept values for equations that resemble those above." 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "With that very brief intro to `PolynomialFeatures`, we will turn our attention to a new topic, **Pipelines**, but will come back to `PolynomialFeatures` momentarily." 
264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "# Overview: Pipelines\n", 271 | "---" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "In some cases, it might be necessary to transform the data in some way before feeding it into a particular machine learning model.\n", 279 | "\n", 280 | "The data may need to be:\n", 281 | "* scaled (i.e. using `StandardScaler`)\n", 282 | "* changed into another format (i.e. using `PolynomialFeatures` or `CountVectorizer`)\n", 283 | "* normalized (i.e. using `TfidfTransformer`)\n", 284 | "\n", 285 | "In the example we just looked at, we used a `PolynomialFeatures` function to generate a higher degree matrix.\n", 286 | "\n", 287 | "Pipelines allow you to feed inputs into one \"end\" of a series of components and get transformations or predictions out the other end, without having to take the output of one model and manually drop it into the inputs of the next model.\n", 288 | "\n", 289 | "The following example uses the `PolynomialFeatures` model to transform inputs from a degree 1 polynomial into higher degree polynomials. It then takes the results of those transformations and feeds them into the `LinearRegression` model. \n", 290 | "\n", 291 | "The `Pipeline` simplifies things so that we only have to call `.fit()` once on the pipeline." 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## A first trivial example..."
299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "## Prep the data\n", 306 | "\n", 307 | "Start with some standard imports" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "import matplotlib.pyplot as plt\n", 317 | "import numpy as np\n", 318 | "import pandas as pd\n", 319 | "import sklearn" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "### Prep the training and test data" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "df = pd.read_csv('../universal_datasets/skincancer.txt',\n", 336 | " delim_whitespace=True,\n", 337 | " header=0,\n", 338 | " names=['state', 'lat', 'mort', 'ocean', 'long'])\n", 339 | "df.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "X = df['lat'].to_frame()\n", 349 | "y = df['mort']" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "from sklearn.model_selection import train_test_split" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "plt.scatter(X_train, y_train)\n", 377 | "plt.title(\"Mortality vs Latitude\")\n", 378 | "plt.xlabel(\"Latitude\")\n", 379 | "plt.ylabel(\"Number of deaths\");" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": 
{}, 386 | "outputs": [], 387 | "source": [ 388 | "from sklearn.linear_model import LinearRegression\n", 389 | "from sklearn.preprocessing import PolynomialFeatures\n", 390 | "from sklearn.pipeline import Pipeline" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "**NOTE**: for this example, we are simply gonna regurgitate the input data rather than change the degree, so we choose to use a `degree=1` and to avoid the bias column (column of `1`s), we set `include_bias=False`. In a moment, we will look at tweaking the degree to explore underfitting and overfitting. In this first example, I merely want to focus on putting the `Pipeline` together." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "polynomial_features = PolynomialFeatures(degree=1,\n", 407 | " include_bias=False)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "linear_regression = LinearRegression()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "This is where the magic comes into play. By providing as an argument to the Pipeline constructor a list containing a series of tuples, we can establish which models to call and in what order.\n", 424 | "\n", 425 | "* Each tuple is a step in the pipeline.\n", 426 | "* Each tuple is comprised of a name for that step and the function or model to call during that step.\n", 427 | "* Each step should be sequentially in the order we want\n", 428 | "* Every step, except for the last step must have either a `.transform()` OR `.fit_transform()` method. As we have seen, `PolynomialFeatures` does indeed have a `.fit_transform()` method." 
429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "pipeline = Pipeline([(\"poly_f\", polynomial_features),\n", 438 | " (\"linear_r\", linear_regression)])" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "NOTE: in the next cell, we simply call `.fit()` on the Pipeline. We don't have to call the `fit_transform()` method on the PolynomialFeatures at all, the Pipeline does it automagically." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "pipeline.fit(X_train, y_train)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "Now that our model has been fit, we simply call `.predict()`, like normal." 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "y_test = pipeline.predict(X_test)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "Of course, let's take a quick look via a chart." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "plt.plot(X_test, y_test, label=\"Model\")\n", 487 | "plt.scatter(X_train, y_train);" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "## An example of under/overfitting" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "Now that we have a sense for how we can use a Pipeline, we are gonna create one and use it to explore the phenomena of **Underfitting** and **Overfitting**." 
502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "A risk in machine learning is using a model that doesn't match the data well enough (**underfitting**) OR matches the training data so well, that it doesn't apply well to test data, it only applies to the training data (**overfitting**).\n", 509 | "\n", 510 | "\n", 511 | "\n", 512 | "\n", 513 | "\n", 514 | "\n", 515 | "Image source: [Overfitting mattress](http://cdn.stylefrizz.com/img/human-body-shape-matress.jpg)\n", 516 | "\n", 517 | "\n", 518 | "For this example, we will look at three graphs. This example comes from the Scikit Learn [Underfitting/Overfitting documentation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html), with various modifications by me." 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "We will do this process three times using `degree=` of `1`, `4`, and `15` to demonstrate underfitting, a good fit, and overfitting.\n", 526 | "\n", 527 | "Two of these cases will generate linear regressions that are not straight lines." 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "## Prep the training and test data" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "from sklearn.pipeline import Pipeline\n", 544 | "from sklearn.preprocessing import PolynomialFeatures\n", 545 | "from sklearn.linear_model import LinearRegression" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "In the example, they create a function (`true_fun`) that generates a series of points on a graph in the shape of a Cosine." 
553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "def true_fun(X):\n", 562 | " return np.cos(1.5 * np.pi * X)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "Using 30 random values as `X` inputs, they use the function to generate 30 related `y` values." 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "np.random.seed(0)\n", 579 | "\n", 580 | "n_samples = 30\n", 581 | "\n", 582 | "x = np.sort(np.random.rand(n_samples))\n", 583 | "y = true_fun(x) + np.random.randn(n_samples) * 0.1" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "Let's look at X and y." 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "X = x[:, np.newaxis]\n", 600 | "\n", 601 | "X[:5]" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "y[:5]" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "plt.scatter(X, y)\n", 620 | "plt.title(\"Cosine Dots\");" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "X_test = np.linspace(0.05, 1, 100)[:, np.newaxis]" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": {}, 635 | "source": [ 636 | "## Choose Appropriate Hyperparameters" 637 | ] 638 | }, 639 | { 640 | "cell_type": "markdown", 641 | "metadata": {}, 642 | "source": [ 643 | "Let's:\n", 644 | "* start with PolynomialFeatures **degree of 1**\n", 645 | "* use the default values for 
LinearRegression\n", 646 | "* feed each into our Pipeline" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "polynomial_features = PolynomialFeatures(degree=1,\n", 656 | " include_bias=False)\n", 657 | "linear_regression = LinearRegression()\n", 658 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n", 659 | " (\"linear_regression\", linear_regression)])" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "## Fit the Model" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "We only have to call `.fit()` on the pipeline, not on each of the components in the pipeline." 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "pipeline.fit(X, y)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "## Apply the Model" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "y_test = pipeline.predict(X_test)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "## Examine the results" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "plt.plot(X_test, y_test, label=\"Model\")\n", 715 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n", 716 | "\n", 717 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n", 718 | "plt.legend()\n", 719 | "plt.title(\"Underfit\"); " 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "## Choose Appropriate Hyperparameters" 727 | ] 728 | }, 729 | { 730 | "cell_type": 
"markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "Repeating the process to generate polynomial features of **degree 4**:" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "polynomial_features = PolynomialFeatures(degree=4,\n", 743 | " include_bias=False)\n", 744 | "linear_regression = LinearRegression()\n", 745 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n", 746 | " (\"linear_regression\", linear_regression)])" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "## Fit the Model" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "pipeline.fit(X, y)" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "## Apply the Model" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "y_test = pipeline.predict(X_test)" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": {}, 784 | "source": [ 785 | "## Examine the results" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "plt.plot(X_test, y_test, label=\"Model\")\n", 795 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n", 796 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n", 797 | "plt.legend()\n", 798 | "plt.title(\"Good match\"); " 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 805 | "## Choose Appropriate Hyperparameters" 806 | ] 807 | }, 808 | { 809 | "cell_type": "markdown", 810 | "metadata": {}, 811 | "source": [ 812 | "Lastly, let's generate polynomial features of **degree 15**:" 813 | ] 814 | 
}, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "polynomial_features = PolynomialFeatures(degree=15,\n", 822 | " include_bias=False)\n", 823 | "linear_regression = LinearRegression()\n", 824 | "pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n", 825 | " (\"linear_regression\", linear_regression)])" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "## Fit the Model" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "pipeline.fit(X, y)" 842 | ] 843 | }, 844 | { 845 | "cell_type": "markdown", 846 | "metadata": {}, 847 | "source": [ 848 | "## Apply the Model" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "y_test = pipeline.predict(X_test)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "## Examine the results" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": {}, 871 | "outputs": [], 872 | "source": [ 873 | "plt.plot(X_test, y_test, label=\"Model\")\n", 874 | "plt.plot(X_test, true_fun(X_test), label=\"True function\")\n", 875 | "plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n", 876 | "plt.legend()\n", 877 | "plt.title(\"Overfit\"); " 878 | ] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": {}, 883 | "source": [ 884 | "# Gotchas\n", 885 | "---" 886 | ] 887 | }, 888 | { 889 | "cell_type": "markdown", 890 | "metadata": {}, 891 | "source": [ 892 | "N/A" 893 | ] 894 | }, 895 | { 896 | "cell_type": "markdown", 897 | "metadata": {}, 898 | "source": [ 899 | "# Deep Dive\n", 900 | "---" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 
| "N/A" 908 | ] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "metadata": {}, 913 | "source": [ 914 | "# How to learn more: tips and hints\n", 915 | "---" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "\n", 923 | "**Tear apart the examples**: The [original example](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html) showing underfitting/overfitting was a bit more complicated than what I showed here, because they opted to create a three-panel chart in `matplotlib` and to automate the processing by putting the degrees into a list and cycling through the list using a for loop to generate all the charts...\n", 924 | "\n", 925 | "I took individual lines, looked at each line, stripped away as many of the extraneous complications as I could to look at just the machine learning components, and that greatly helped clarify what was going on." 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": {}, 931 | "source": [ 932 | "# Experience Points!\n", 933 | "---" 934 | ] 935 | }, 936 | { 937 | "cell_type": "markdown", 938 | "metadata": { 939 | "slideshow": { 940 | "slide_type": "slide" 941 | } 942 | }, 943 | "source": [ 944 | "**Task 01**\n", 945 | "\n", 946 | "\n", 947 | "Explore this documentation regarding underfitting/overfitting.\n", 948 | "\n", 949 | "[**Overfitting (link)**](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)\n", 950 | "\n", 951 | "Answer this question:\n", 952 | "\n", 953 | "* What technique can be used to quantitatively evaluate underfitting/overfitting?" 954 | ] 955 | }, 956 | { 957 | "cell_type": "markdown", 958 | "metadata": {}, 959 | "source": [ 960 | "---\n", 961 | "When you complete this exercise, please put your **green** post-it on your monitor.
\n", 962 | "\n", 963 | "If you want to continue on at your own-pace, please feel free to do so.\n", 964 | "\n", 965 | "" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": {}, 971 | "source": [ 972 | "# References\n", 973 | "---" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": {}, 979 | "source": [ 980 | "Below are references that may assist you in learning more:\n", 981 | " \n", 982 | "|Title (link)|Comments|\n", 983 | "|---|---|\n", 984 | "|[General API Reference](https://scikit-learn.org/stable/modules/classes.html)||\n", 985 | "|[Overfitting Reference](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html)||" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": null, 991 | "metadata": {}, 992 | "outputs": [], 993 | "source": [] 994 | } 995 | ], 996 | "metadata": { 997 | "kernelspec": { 998 | "display_name": "Python 3", 999 | "language": "python", 1000 | "name": "python3" 1001 | }, 1002 | "language_info": { 1003 | "codemirror_mode": { 1004 | "name": "ipython", 1005 | "version": 3 1006 | }, 1007 | "file_extension": ".py", 1008 | "mimetype": "text/x-python", 1009 | "name": "python", 1010 | "nbconvert_exporter": "python", 1011 | "pygments_lexer": "ipython3", 1012 | "version": "3.6.7" 1013 | } 1014 | }, 1015 | "nbformat": 4, 1016 | "nbformat_minor": 2 1017 | } 1018 | --------------------------------------------------------------------------------