├── .gitignore ├── 01_intro ├── 01_01_intro.pdf └── 01_02_version_control.pdf ├── 02_data ├── 02_03_data.pdf ├── 02_04_pandas.ipynb ├── data │ └── my_data.csv └── img │ ├── github.png │ ├── join.png │ └── pandas.png ├── 03_ethics ├── 03_05_ethics.pdf ├── 03_06_datavis.ipynb └── QRCodes │ ├── 03_06_01_viz.png │ ├── 03_06_02_viz.png │ ├── 03_06_03_viz.png │ └── 03_06_04_viz.png ├── 04_analysis └── 04_07_questions.pdf ├── 05_eda ├── 05_08_EDA.ipynb ├── 05_09_inference.pdf └── data │ └── woc_wi25.csv ├── 06_inference ├── 06_10_inference.ipynb └── 06_11_nonparametric.pdf ├── 07_text ├── 07_12_text.pdf └── 07_13_nlp.ipynb ├── 08_ml ├── 08_14_machine_learning.pdf └── 08_15_ml.ipynb ├── 09_geospatial └── 09_16_geospatial.pdf ├── 10_communication ├── 10_17_communication.pdf ├── 10_18_be_wrong.pdf └── 10_19_jobs_future.pdf ├── LICENSE ├── README.md └── XX_section ├── D1.pdf ├── D2.pdf ├── D3.pdf ├── D4.pdf ├── D5.pdf ├── D6.pdf ├── D7.pdf ├── D7_notebook.ipynb └── D8.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | QRCodes/* -------------------------------------------------------------------------------- /01_intro/01_01_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/01_intro/01_01_intro.pdf -------------------------------------------------------------------------------- /01_intro/01_02_version_control.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/01_intro/01_02_version_control.pdf -------------------------------------------------------------------------------- /02_data/02_03_data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/02_03_data.pdf -------------------------------------------------------------------------------- /02_data/data/my_data.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,age,score,value 2 | 295,Andrea,Clark,46,-1,24547.87 3 | 620,Bill,Woods,46,492,46713.9 4 | 891,Alexander,Jacobson,48,489,32071.74 5 | 914,Derrick,Bradley,52,-1,30650.48 6 | 1736,Allison,Thomas,44,-1,9553.12 7 | 2049,Stephen,Williams,57,333,138936.92 8 | 2241,Malik,Wood,46,-1,10804.47 9 | 2607,Amber,Garcia,50,536,9367.27 10 | 2635,David,Coleman,68,351,66035.28 11 | 3585,Eric,Atkins,56,582,103977.32 12 | 4199,Justin,Johnson,59,500,34938.08 13 | 6739,Donna,Barnes,48,500,130915.2 14 | 7099,Larry,Prince,52,519,28474.33 15 | 7264,Megan,Mcmahon,45,349,0.0 16 | 7799,Sarah,Jones,47,497,29356.49 17 | 8343,Brian,Weber,49,477,17976.51 18 | 9386,Jackie,Clark,44,432,15446.53 19 | 10753,Laurie,Wood,58,335, 20 | 12243,Monica,Sanchez,49,524,35256.88 21 | 12250,Erica,Adams,41,-1,2126.22 22 | 12841,James,Williams,38,496,2298.05 23 | 12913,William,Blevins,39,249,4287.85 24 | 13120,Daniel,Key,64,165,17444.32 25 | 13255,Kelsey,Palmer,32,519,18858.85 26 | 13806,Ashley,Jones,57,-1,80325.66 27 | 14033,Larry,Ibarra,34,632,4315.74 28 | 14294,Michelle,Walters,58,472,33163.31 29 | 15137,Andrea,Simpson,55,-1,12155.47 30 | 15391,Charles,Santiago,44,266,0.0 31 | 15887,Cassandra,Mann,27,684,17864.92 32 | 16263,Victor,Dawson,42,323, 33 | 17184,Anne,Zuniga,56,549,81928.82 34 | 17345,Kristy,Fletcher,53,492,23577.41 
35 | 18031,Michael,Watson,39,378,16165.32 36 | 18610,Jeffrey,Harrell,52,790,59912.55 37 | 19129,Brian,Travis,46,-1,49955.06 38 | 19557,Sabrina,Simon,44,-1,14022.29 39 | 20708,Dylan,Blake,33,750,6984.92 40 | 21427,Benjamin,Tran,34,543,9099.38 41 | 22584,Kenneth,Johnson,65,-1,50174.14 42 | 23915,Mary,Harris,52,266,106602.81 43 | 24650,Stephanie,Hayes,55,-1,15625.5 44 | 24779,Scott,Reyes,54,-1,65163.92 45 | 24794,Jasmine,Mitchell,53,552,86540.27 46 | 24952,James,Wright,51,399, 47 | 25259,Jonathan,Martinez,41,305,3659.99 48 | 25654,Ronald,Perkins,37,533,20991.69 49 | 25735,Anna,Ray,39,578,19122.05 50 | 26091,Marie,Wyatt,44,620, 51 | 26144,Cameron,Walters,44,416,6641.85 52 | 26898,Tina,Riddle,38,310,19281.5 53 | 27225,Dennis,Mason,62,622,105449.11 54 | 27687,Alice,Murphy,47,355,9317.1 55 | 29566,James,Kennedy,69,224, 56 | 29868,Jason,Bentley,69,343,60605.08 57 | 30323,Martin,Obrien,49,497,13049.86 58 | 30524,Kara,Mccoy,47,243,12559.49 59 | 31457,John,Shields,51,-1,25184.43 60 | 32151,Charles,Lam,47,942,35391.52 61 | 32413,Sarah,Harris,53,609,16980.92 62 | 33105,Kevin,Sanchez,31,251,1688.31 63 | 33863,Brian,Murphy,45,-1,20802.53 64 | 34069,Cynthia,West,29,558,1952.0 65 | 35466,Samantha,Park,55,483, 66 | 35621,Vanessa,Hernandez,35,613,28944.29 67 | 35701,Andrew,Nelson,37,406,8145.81 68 | 35870,Sabrina,Mcneil,41,649,32979.4 69 | 36124,Alison,Sullivan,40,362,6123.06 70 | 36575,Elizabeth,Bailey,53,642,16579.52 71 | 37045,Mary,Choi,42,682,39519.13 72 | 37836,Kristin,King,34,-1,24409.19 73 | 37970,Lisa,Myers,50,688,11203.32 74 | 38892,Kimberly,Jefferson,60,-1,28155.47 75 | 39773,Candace,Johnson,44,429,12802.62 76 | 40154,Marie,Daniels,41,512,14380.37 77 | 41347,Anthony,Tucker,52,508,21005.42 78 | 42138,John,Stuart,28,543, 79 | 42140,Aaron,Hart,37,-1,16873.92 80 | 42472,Jacqueline,Young,35,618,42726.56 81 | 42747,Johnathan,Brown,46,567,15226.86 82 | 43140,Nancy,Farley,52,545,30315.75 83 | 44627,Eric,Nelson,31,416,0.0 84 | 45647,Melissa,Bailey,46,-1,4104.35 85 | 47234,Kimberly,Richard,53,465,71801.97 86 | 47459,Todd,Davis,44,708,8012.19 87 | 47669,Michael,Cowan,61,263,16167.9 88 | 48880,Kristen,Dalton,40,555,18512.25 89 | 49196,Elizabeth,Gordon,45,642,12377.85 90 | 49712,David,Richardson,52,400,24932.04 91 | 50960,Maxwell,Mcbride,63,347,105598.79 92 | 51723,Megan,Nguyen,40,443, 93 | 51784,Elizabeth,Stephens,55,149,12632.32 94 | 51909,Michael,Brennan,45,-1,12838.06 95 | 52214,Amy,Scott,36,664,11623.7 96 | 52428,Bryan,Barnett,64,445,59795.5 97 | 53785,Jonathan,Hill,51,344,18568.89 98 | 53812,Melissa,Walton,51,-1,0.0 99 | 53929,James,Miller,43,419,18051.92 100 | 53932,Nicole,Ruiz,36,730,23236.53 101 | 54438,Charles,Schneider,47,553,50773.76 102 | 54849,Sara,Mendoza,42,124,10769.18 103 | 54949,Christine,Myers,45,-1,15230.16 104 | 55691,Katherine,Stafford,54,457,169406.7 105 | 55821,Sarah,Kennedy,41,709, 106 | 56933,Mark,Fletcher,57,253,45151.13 107 | 57077,Jason,Price,32,-1,24557.68 108 | 57542,Ryan,Lee,43,570,5869.58 109 | 57964,Heidi,Allen,57,496,42284.5 110 | 58307,Shannon,Bailey,53,596,22575.35 111 | 58794,Jared,Brown,61,731,51968.96 112 | 59036,Peter,Molina,51,453,15810.33 113 | 59363,Steven,Valentine,46,265,6614.88 114 | 60177,Christopher,Jones,28,748,25035.69 115 | 61015,Nicholas,Chapman,52,289,47600.67 116 | 61534,Alexandra,Chavez,39,598, 117 | 61557,Kathryn,Boyle,48,369,33674.81 118 | 61721,Jason,Murphy,57,163,6766.66 119 | 61808,Krista,Smith,56,469,51671.27 120 | 63587,Paul,Steele,54,616,69185.93 121 | 64423,Dennis,Hernandez,58,329,11611.97 122 | 64535,William,Irwin,60,304,24581.63 123 | 
64654,Pamela,Wilson,52,612,65061.89 124 | 64731,Sheila,Zimmerman,32,-1,9726.45 125 | 65382,Laura,Burke,35,613,10589.0 126 | 66968,Cynthia,Davis,43,424,15326.1 127 | 67135,Madeline,Rivera,68,343,30993.97 128 | 68932,Nicholas,Chan,46,299,17580.28 129 | 68961,David,Simmons,34,702,35725.49 130 | 69588,Valerie,Griffin,48,668,10678.15 131 | 70182,Monica,Phillips,42,716,17987.82 132 | 70600,Lindsey,Young,45,630,13339.26 133 | 70721,Jennifer,Perkins,60,338,115356.55 134 | 70735,Eric,Olson,27,665,3975.59 135 | 71120,Jonathan,Blevins,39,423,13456.24 136 | 71787,Maria,West,29,453,7271.65 137 | 71943,Catherine,Sherman,45,416,11873.05 138 | 72035,Mike,Evans,53,288,24561.64 139 | 72631,Scott,Johnson,47,517,43254.45 140 | 74271,Mike,Fisher,44,629,15712.7 141 | 75015,Kelly,Murray,43,542,0.0 142 | 75593,Regina,Morgan,47,614,95351.02 143 | 76282,Ashley,Lynch,53,519,14612.16 144 | 76613,Craig,Lewis,31,340,14766.8 145 | 78041,Heather,Ibarra,52,494,79594.31 146 | 79083,Michael,Simmons,54,490,0.0 147 | 79114,Steven,Kent,32,574,10087.18 148 | 80217,Kellie,Ryan,60,183,5060.72 149 | 80411,Tina,Yu,56,525,29622.86 150 | 80593,Donald,Melton,42,625,20393.78 151 | 80765,David,Andrews,53,545,0.0 152 | 81068,Aaron,Roberts,32,617,4745.92 153 | 81282,Emily,Medina,23,775,9116.03 154 | 81648,Debbie,Barrett,49,766,18573.58 155 | 82183,Tamara,Jenkins,46,-1,23914.62 156 | 82826,Allison,Day,34,523,23130.71 157 | 83528,Brenda,Green,28,794,5233.1 158 | 83590,Alexander,Murphy,60,377,8930.89 159 | 84203,Warren,Wilson,34,-1,17172.97 160 | 84475,Brenda,Cox,64,-1,91349.57 161 | 84683,Jessica,Bryant,42,840,204999.96 162 | 84812,Amanda,Williams,14,749,5241.51 163 | 84838,Renee,Lyons,39,399,0.0 164 | 84846,Eric,Smith,39,714,15188.88 165 | 84965,Robert,Black,59,-1,13873.11 166 | 85812,Mark,Payne,51,563,32609.97 167 | 86230,Jose,Adams,41,551,7416.68 168 | 86560,John,Blevins,50,534,29842.6 169 | 86711,Ashley,Alexander,54,459,11137.09 170 | 87122,Douglas,Hogan,49,-1,21598.29 171 | 87738,Gregory,Sutton,46,462,42052.3 172 | 87876,Austin,Dixon,44,303,5787.5 173 | 87928,Alexandra,Miller,36,752,12297.85 174 | 88273,David,Matthews,49,309,7018.32 175 | 88340,Autumn,Brooks,52,587,36261.02 176 | 88868,Ralph,Wilkinson,51,496,12232.08 177 | 89550,Daniel,Sharp,43,448,8150.64 178 | 89765,Charles,Thompson,46,678,8285.88 179 | 89922,Robert,Woods,64,282,161540.81 180 | 90113,Karen,Morgan,37,371,21187.93 181 | 90367,Kevin,Stewart,48,700,0.0 182 | 91524,Jamie,Gardner,50,291,21141.69 183 | 91623,Amanda,Webb,54,339,62692.36 184 | 91893,Kelsey,Martin,52,-1,40206.46 185 | 91921,Javier,Brooks,38,658,15437.74 186 | 91946,Susan,Garcia,34,664,7084.82 187 | 92298,Jennifer,Marks,39,-1,36443.84 188 | 93114,Kerri,Fields,44,478,33086.07 189 | 93989,David,Adkins,48,460,2434.35 190 | 94421,Tracy,Reyes,57,284,11442.11 191 | 94628,Shannon,Andrews,45,426,9593.03 192 | 94730,Lisa,Dominguez,22,-1, 193 | 95502,Julia,Oliver,57,354,23522.87 194 | 96101,Wayne,Bentley,28,918,18303.2 195 | 96293,Michael,Phillips,32,-1,9159.9 196 | 96371,Sherri,Austin,46,698,29412.01 197 | 97441,Krista,Ortiz,34,-1,24074.79 198 | 97728,Anna,Chambers,37,598,0.0 199 | 98115,Jennifer,Pitts,29,606,6876.75 200 | 98284,Brittany,Jenkins,34,665,43525.88 201 | 98366,Katelyn,Brown,45,501,29668.38 202 | -------------------------------------------------------------------------------- /02_data/img/github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/github.png 
-------------------------------------------------------------------------------- /02_data/img/join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/join.png -------------------------------------------------------------------------------- /02_data/img/pandas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/pandas.png -------------------------------------------------------------------------------- /03_ethics/03_05_ethics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/03_05_ethics.pdf -------------------------------------------------------------------------------- /03_ethics/QRCodes/03_06_01_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_01_viz.png -------------------------------------------------------------------------------- /03_ethics/QRCodes/03_06_02_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_02_viz.png -------------------------------------------------------------------------------- /03_ethics/QRCodes/03_06_03_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_03_viz.png -------------------------------------------------------------------------------- /03_ethics/QRCodes/03_06_04_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_04_viz.png -------------------------------------------------------------------------------- /04_analysis/04_07_questions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/04_analysis/04_07_questions.pdf -------------------------------------------------------------------------------- /05_eda/05_09_inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/05_eda/05_09_inference.pdf -------------------------------------------------------------------------------- /05_eda/data/woc_wi25.csv: -------------------------------------------------------------------------------- 1 | How fast does human hair grow (cm/yr)?,"If every living person stood crammed together side-by-side, how large of an area would they occupy (km²)?","How many days would it take to walk from San Diego to New York City (assuming no stopping to fix shoes, apply sunscreen, or for sleeping, eating, or other biological needs)?" 
2 | 10,40000,300 3 | 12,10000,40 4 | 20,5,20 5 | 3,2,50 6 | 45,200,45 7 | 300/1,2,45 8 | 10,10000,60 9 | 50,100,1000 10 | 30,10000,300 11 | 14,1000,15 12 | 50,100000,500 13 | 15,"1,000,000",17 14 | 5cm per year,100000,100 days 15 | 17.5,"100,000,000",200 16 | 10,1000000,4000 days 17 | 20,15,14 18 | 9 inches,10000,45 19 | 10 cm/year,1 billion,160 days 20 | 60 seconds,3200,1600 miles 21 | 7,500,60 22 | 8,10000,365 23 | 7,"100,000",30 24 | 10,1000000,80 25 | 10,10000,45 26 | 5cm/yr,1000000,100 27 | 20,1000,200 28 | 10,10000,40 29 | 10,"100,000","1,000,000" 30 | 10000,700000,8 31 | 150cm/yr,300km^2,80 32 | 35,9,100 33 | 5,5,90 34 | 100,800000000,730 35 | 8,100000,1000 36 | 40,7000000,50 37 | 15,100000,30 38 | 48,10^5,60 39 | 25,10000,78 40 | 10,10000,100 41 | 6,800,30 42 | 14 cm/yr,10000,56 43 | 20,3.5 billion,90 days 44 | 50,3000,14 days 45 | 36,8000000,5 46 | 10,3,60 47 | 24,5000000,45 48 | 10,1000000000,150 49 | 10cm/year,10^9,100 50 | 100,80km^2,3 51 | 12 cm,25000 km2,3 months 52 | 10,100000000000,50 53 | 30 cm/yr,10000km^2,25 days 54 | 15,10000,80 55 | 7,2000000000,80 56 | 10,100000,479 57 | 7cm/yr,10^10,100 58 | 10,100000,1000 59 | 25,1000,102 60 | 60,3,100 61 | 6,50000,10 62 | 10,10000,100 63 | 12,1000,500 64 | 15cm,1000000,50 65 | 48cm/yr,100000,6 months 66 | 13,7,15 67 | 13,2.5,25 68 | 12,1000000,4 69 | 12,5,125 70 | 15 cm / year,10000000,200 71 | 15,10^5,8 72 | 12cm,1000,90days 73 | 80,2,100 74 | 8,100000000,450 75 | 10,25000000,500 76 | 6,1000,1000 77 | 20,10000,20 78 | 20,500000,31 79 | 20,10000,150 80 | 5,1000000,20 81 | 26cm/yr,200000,95 82 | 11cm,10000,50 83 | 24,10000000,60 84 | 20,1000000,350 85 | 5,500000000,50 86 | 20,100,200 87 | 20cm/yr,10000000,2months 88 | 10,10000,50 89 | 24,10000,100 90 | 15,10000,30 91 | 15,10000000000,30 92 | 15,2000000000,30 93 | 20,200000,15 94 | 84cm/yr,1000km2,100 days 95 | 2,2000000,7 96 | 6,1000,75 97 | 50,5000,1 month 98 | (48/yr),10000000,50 99 | 12,100000,18 100 | 4cm,5,6 101 | 10^2,10000000,100 102 | 4,50,12 103 | 20 cm/yr,"8,000",10 104 | 10cm/yr,"10,000km^2",20 105 | 12,1000,20 106 | 72cm,500,100 days 107 | 30,100000,30 108 | 20,1000000,300 109 | 10,1000000,1 month 110 | 25,600000,25 111 | 20 cm/yr,1000000,54 112 | 100,1000,50 113 | 25cm/yr,10^50 km^2,10000000 114 | 18,1000,300 115 | 24,4,80 116 | 60,40,20 117 | 20,5000,30 118 | 3,"4,000,000 km2",100 days 119 | 15,1,1 120 | 10,5000000,20 121 | 30,500,400 122 | 24,32 million,100 123 | 8cm/yr,10000000,30 124 | 10,100000,10000 125 | 12,1000000000,8 126 | 18 cm,4 million,100 days 127 | 100,10,150 128 | 35/1,"140,000,000",100000 129 | 55,300000000,100 130 | 80,100000,50000 131 | 5 inches,200,180 132 | 7,89000,300 133 | 20,"100,000",50 days 134 | 4.5,1000000,365 135 | 50,200,300 136 | 9,5,1000 137 | 16,400000,15 138 | 5,100000,250 139 | 6.5,1000,100 140 | 15,4000,100 141 | 15 cm,100000,90 142 | 40cm/yr,900,3 143 | 25,10^12,130 144 | 185 cm/yr,"1,000,000 km²",25 145 | 24,1000,80 146 | 40,1000,900 147 | 60 cm/yr,20000 km^2,1000 148 | 7,5000000,90 149 | 5,5,H 150 | 30,10000,30 151 | 8 cm/yr,"100,000 km^2",500 152 | 10cm/yr,1000000,30 153 | 2,30000000000,20 154 | 15,100000,800 155 | 40,10000,100 156 | 39,50000,50 157 | 100,5000000,100 158 | 9,10000,200 159 | 48 cm/yr,10^4,30 160 | 10 cm,100000,50 161 | 18 cm,800000000,80 162 | 15cm/yr,100000 km^2,500 days 163 | 60cm/yr,?,30 164 | 15,10^10,1000 165 | 24,40,38 166 | 22,5,3 167 | 25,"100,000,000",ed 168 | 5 cm,3,5 169 | 20 cm,1000000,20 days 170 | 15cm/yr,They could fit in paris,200 days 171 | 5,5,5 172 | 13,500000,26 173 | Around 7-8 cm,2,100 174 | 
10,100,10 175 | 50,100,50 176 | 80,50,13 177 | 10,2000,1 week 178 | 30,200000,365 179 | 15,1000000,180 180 | 12cm,"6,000",70 181 | 25,10000000,1000 182 | 0,700000,1000 183 | 24,240,80 184 | 20,100,50 185 | 2,1000000,365 186 | 36,40,583837 187 | 10,1000,1 month 188 | 14,2000,250 189 | 9,100000,24 190 | 50,100000,100 191 | 24,1000000,200 192 | 10,10^8,100 193 | 9,60000,200 194 | 6,17,168 195 | 50,60,1000 196 | 36,7000000,48 days 197 | 3,9999,365 198 | 20,1000,10000 199 | 27,1000,30 days 200 | 15,10000000000,10 201 | 30,10^8,600? 202 | 20,50000,200 203 | 8cm a year,500,10 204 | 25,10000,5 205 | 2 inches,"250,000km^2",18 days 206 | cm,100000,10000 207 | 360,20,40 208 | 5,1000000,100 209 | 12,100000,90 210 | 6cm /yr,1E+38,10000 211 | 15,1000,40 212 | 7,50,3000 213 | 20,100,70 214 | 10,100,60 215 | 15,1150,980 216 | 50,80000,40 217 | 40,1 million,3 months 218 | 5,1000000000,400 219 | 100,10000000,30 220 | 20,10^9,48 221 | 20cm a year,100,200 days 222 | 122,10000,732 223 | 24cm/year,"1,000,000 km^2",60 days 224 | 25,20000,50 225 | 20,3000000,365 226 | 10,100,1 year 227 | 30,20,90 228 | 3000,100000,23 229 | 10,10000,60 230 | 25 cm,93682737856,1000 231 | 5,100,50 232 | 7,37103,498 233 | 3,10000,15 234 | 20 cm,1000,30 days 235 | 10000/yr,1000000,2 months 236 | 10,a lot,2 months 237 | 5 cm/yr,20,3 weeks 238 | 36,100000,10 days 239 | 10 cm/ yr,10000000,500 240 | 30,10000,1000 241 | 20,8000000000,162 242 | 12,10000,25 243 | 20,100000000000,80 244 | 10,100000,50 245 | 18,8000,30 246 | 20,100000,20 247 | 10,1E+19,50000 248 | 10,10000,50 249 | 30,1000,80 250 | 30,10^8,30 251 | 45,1000,10 252 | 25,10000000,28 253 | 4,100000,14 254 | 20,1000000,20 255 | 30,1600,30000 mins 256 | 5,10x 10^3,12 days 257 | 50,1000000,20 258 | 20,1 million,42 259 | 15,1000,1000 260 | 10,10000,8 261 | 48,200,300 262 | 28,10000000,700 263 | 100cm/year,100000000000000km^2,1000days 264 | 150,1000000000,45 265 | 20,800000,29 266 | "exactly 456,000",Whatever the size of Los Angels is ,about 1/3 of a year 267 | 5cm/year,8000km^2,300 days 268 | 30,300000,90 269 | 15,1000,1000 270 | 100,1000,100 271 | 25,10,28 272 | 5,100000,2 years 273 | 2.5,10^12,40 days 274 | 15,500,80 275 | 8,100000,28 276 | 30,10000,31 277 | 1000,10000000,100 278 | 0.4,978,71 279 | 30,10000,100 280 | 20,100000000000,80 281 | 20cm/yr,9 x 10^3,30 days 282 | 18,1000,15 283 | 300cm / yr,250km,14 days 284 | 0.01 cm,100,30 days 285 | 25,10^23,60 286 | 20,80000,80 287 | 100,10000,1000 288 | 16,150,20 days 289 | 7,200,50 290 | 15,20000 km^2,34 291 | 16,"60,000",37 292 | 10,1000,40 293 | 100,1000,100 294 | 100,10000,120 295 | 25,1400,30 296 | 36,10000,30 297 | 20,1000,80 298 | 8,100000000000,8 299 | 100cm/yr,100000km^2,239 days 300 | 1.5,5000000000,150 days 301 | 100,2,1000 302 | 12,100000,1000 303 | ~70cm/yr,10000,150 304 | 15,100000,20 305 | 3018,501,250 306 | 12,100000000,12 * 3000 307 | 10,100,150 308 | 500,10000,25 309 | 20,100000000000,15 310 | 2.5,1000,37 311 | 10,80000,50 312 | 25,8million,6 months 313 | 10cm,10000000,10000 314 | 20,100000,36500 315 | 10 cm/year,50,21 316 | 20,10000,100 317 | 14,9000,30 days 318 | 36 cm,100000000,1000 319 | 50,20,298 days 320 | 7,10^10,10^7 321 | 10,1000,60 322 | 50,100,3 months 323 | 10,10km^2,60 324 | 20,1000000,90 325 | 25,10000,1000000 326 | 15,100000,50000 327 | 15cm/yr,3 billion,150 days 328 | 10,10000000,50 329 | 15cm,100000,50 330 | 20,15,19 331 | 20,450000,85 332 | 10,1000,1 year 333 | 30,1000000,100 334 | 90cm/yr,100,100 335 | 6,8000,800 336 | 12,1000,75 337 | 80,1000000,7 338 | 25,10000,35 339 | 4,78000,"1,500" 340 | 
10cm/year,one million km^2,700 days 341 | 100,10^15,28 342 | 300,10000,30 343 | 15,1000,90 344 | 10/yr,10000,1000 345 | 10 cm/yr,"1,000,000 km^2",5 months = 155 days 346 | 12,100000,50 347 | 10,10000000000,50 348 | 10,100000000,300 349 | 20cm,100,150 350 | 10cm/year,100,50 351 | 20,200000,200 352 | 12 cm/yr,10000,1000 353 | 100,1000^2,400 354 | 10,"100,000",100 355 | 6,7,30 356 | 25,10000,10000 357 | 16,"100,000",43 358 | 20,10,2000 359 | 10,10,50 360 | 36,10000,50 361 | 24,10000000,11 362 | 30,50000000000,200 363 | 15,5x10^4,week 364 | 12cm/yr,2000km^2,35 days 365 | 10,1000000,10 366 | 30,1000000,365 367 | 210cm/yr,6000000km^2,53 368 | 50,10000,1000 369 | 2,100,100 370 | 60,10000,20 days 371 | 10,700,300 372 | 20,600,600 373 | 1,200000000,30 374 | 20cm/yr,1000km^2,500 375 | 7,100000,40 376 | 24,1250000,42 377 | 15cm/yr,"2,500,000,200",60 days 378 | 100,7000000,39 379 | 15,10000,100 380 | 16,100,100 381 | 1000,1000000,1000 382 | 50,100000,30 383 | 1 cm per month,2 mile diameter sphere,"Approx 5,500 miles? 400 days?" 384 | 15,20000000,45 385 | 15,10,30 386 | 70,1600,90 387 | 16,25,150 388 | 100,4000,10 389 | 17,8000,"1,000" 390 | 1000,100000,10 391 | 30,100,50 392 | 15,1000,20 393 | 10cm/yr,10*10^100=1×10¹⁰¹,100 394 | 15,1000,60 395 | 7 cm/year,>10000000,24 396 | 7 cm/yr,1000000,600 397 | 12,1E+20,41 398 | 15,280000000,430 399 | 40,60,15 400 | 30,7.8,720 401 | 15,1000000,30 402 | 12 cm/yr,100,30 403 | 1800 cm/year,The would occupy one trillion km 2,75 days 404 | 10,20000,25 405 | 65cm/yr,"88,560km",32 days 406 | 12cm/yr,0.3*0.3*80 billion =7.2 * 10^3km^3,30 407 | 20cm/yr,10000km*2,30 408 | 7 cm/yr,3 mil km,180 days 409 | 25cm a year,"25,000,000km^2",1000 days 410 | 15 cm,800,38 days 411 | 15,1000000,74 412 | 10,5000,150 413 | 10,10,1000 414 | 10.5 cm/yr,10000 km^2,50 days 415 | 6cm,1000,45 416 | 5,10^3,350 417 | 15,400000,47 418 | 8,100,25 419 | 13,5000,45 days 420 | 8,"20,000",75 421 | 20,100,20 422 | 30,1000,100 423 | 24,4000000,30 424 | 17,2000,100 425 | 5 cm/yr,100,60 426 | 15,1500,10 427 | 20 cm/yr,10000,41 days 428 | 1.5 per year,100000 km2,500 days 429 | 5cm a month,80000,187 430 | 5 centimeter per month,100 yards,100 431 | 12,550,100 432 | 12,8000,60 433 | 5,8000000000,60 434 | 15,8,40 435 | 21,16000000,300 436 | 10,1 million,300 437 | 30,250000000,90 438 | 30,300,21 439 | 18,2000,150 days 440 | 10,50,60 441 | 10 cm/yr,"about 3,600,000,000 km",45 442 | 12,100000,10000 -------------------------------------------------------------------------------- /06_inference/06_11_nonparametric.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/06_inference/06_11_nonparametric.pdf -------------------------------------------------------------------------------- /07_text/07_12_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/07_text/07_12_text.pdf -------------------------------------------------------------------------------- /07_text/07_13_nlp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "### Course Announcements\n", 12 | "\n", 13 | "**Due this Sunday** (11:59 PM): D6, Q7, Checkpoint #1, Weekly Project Survey (*optional*)\n", 14 | 
"\n", 15 | "Notes: \n", 16 | "- No Prof OH on Friday\n", 17 | "- For Data Checkpoint:\n", 18 | " - All changes to proposal sections go in the *data checkpoint notebook*\n", 19 | " - Respond to grader on Proposal Issue\n", 20 | "- To follow along (and get lecture attendance credit): https://forms.gle/sZC9kebUm64pts9ZA (will remain open until Sunday 11:59 PM)\n", 21 | "\n", 22 | "![](../QRCodes/07_13_01.png)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "slide" 30 | } 31 | }, 32 | "source": [ 33 | "# Text Analysis (NLP: Natural Language Processing)\n", 34 | "\n", 35 | "- **Sentiment Analysis**\n", 36 | " - tokenization\n", 37 | " - stop words\n", 38 | " - stemming\n", 39 | "- **TF-IDF**\n", 40 | " - Bag of Words\n", 41 | " - term frequency\n", 42 | " - inverse document frequency\n", 43 | "- Tools: `nltk`" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# if you've never installed nltk before you will have to run this cell\n", 53 | "# but you only need to do this once\n", 54 | "# %pip install nltk" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | } 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# pandas and matplotlib setup\n", 68 | "import pandas as pd\n", 69 | "\n", 70 | "import matplotlib.pyplot as plt\n", 71 | "plt.rcParams['figure.figsize'] = (17, 7)\n", 72 | "plt.rcParams.update({'font.size': 14})\n", 73 | "import seaborn as sns\n", 74 | "\n", 75 | "#improve resolution\n", 76 | "#comment this line if erroring on your machine/screen\n", 77 | "%config InlineBackend.figure_format ='retina'\n", 78 | "\n", 79 | "import warnings\n", 80 | "warnings.filterwarnings('ignore')\n", 81 | "\n", 82 | "#import natural language toolkit\n", 83 | "import nltk\n", 84 | "\n", 85 | "# download stopwords & punkt\n", 86 | "nltk.download('stopwords')\n", 87 | "nltk.download('punkt')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "slide" 95 | } 96 | }, 97 | "source": [ 98 | "#### Reminder: **Natural Language Processing** is a whole field of study.\n", 99 | "\n", 100 | "Like most topics in this course, there are many courses solely focused on the appropriate analysis of text. We'll cover the general concepts in this course, but know you're missing lots of important details." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "slideshow": { 107 | "slide_type": "slide" 108 | } 109 | }, 110 | "source": [ 111 | "## Natural Language Toolkit (`nltk`)\n", 112 | "\n", 113 | "For more details on using the functionality within this package, check out the [NLTK Book](http://www.nltk.org/book/).\n", 114 | "\n", 115 | "0. Preface\n", 116 | "1. Language Processing and Python\n", 117 | "2. Accessing Text Corpora and Lexical Resources\n", 118 | "3. Processing Raw Text\n", 119 | "4. Writing Structured Programs\n", 120 | "5. Categorizing and Tagging Words \n", 121 | "6. Learning to Classify Text\n", 122 | "7. Extracting Information from Text\n", 123 | "8. Analyzing Sentence Structure\n", 124 | "9. Building Feature Based Grammars\n", 125 | "10. Analyzing the Meaning of Sentences \n", 126 | "11. Managing Linguistic Data\n", 127 | "12. 
Afterword: Facing the Language Challenge" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "slideshow": { 134 | "slide_type": "fragment" 135 | } 136 | }, 137 | "source": [ 138 | "[VADER](https://github.com/cjhutto/vaderSentiment) is a particularly helpful tool/lexicon when working with sentiments expressed in social media (tweets, online reviews, etc.)\n", 139 | "\n", 140 | "Its functionality is available through `nltk`, so we'll download the vader lexicon for use later in this notebook." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "slideshow": { 148 | "slide_type": "fragment" 149 | } 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# get lexicon we'll be working with today\n", 154 | "nltk.download('vader_lexicon') " 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "slideshow": { 161 | "slide_type": "slide" 162 | } 163 | }, 164 | "source": [ 165 | "## The Data" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "slideshow": { 173 | "slide_type": "-" 174 | } 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "quarters = ['Wi25', 'Wi24', 'Wi21', 'Fa20', 'Wi20', 'Sp20', 'Sp19']" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": "-" 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "def read_data(quarter):\n", 192 | " '''read data in from specified quarter, extract columns of interest \n", 193 | " and add a column indicating quarter from which data originated'''\n", 194 | " \n", 195 | " df = pd.read_csv('https://raw.githubusercontent.com/shanellis/datasets/master/COGS108_feedback_' + quarter + '.csv')\n", 196 | " df = df[['enjoyed_most', 'enjoyed_least']]\n", 197 | " df['quarter'] = quarter\n", 198 | " \n", 199 | " return df" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "scrolled": true, 207 | "slideshow": { 208 | "slide_type": "fragment" 209 | } 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "# read in data for all quarters\n", 214 | "df = pd.DataFrame()\n", 215 | "\n", 216 | "for quarter in quarters:\n", 217 | " qtr = read_data(quarter)\n", 218 | " df = pd.concat([df, qtr], ignore_index=True)\n", 219 | " \n", 220 | "df" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "slideshow": { 227 | "slide_type": "slide" 228 | } 229 | }, 230 | "source": [ 231 | "## Describe & Explore\n", 232 | "\n", 233 | "We'll quickly describe and explore the data to see what information we have before moving on to Text Analysis." 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "slideshow": { 240 | "slide_type": "fragment" 241 | } 242 | }, 243 | "source": [ 244 | "### Data Considerations\n", 245 | "\n", 246 | "- duplicate responses?\n", 247 | "- PIDs for individuals in the class (typos?)\n", 248 | "- missingness?\n", 249 | "- reflect reality?" 
250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "slideshow": { 257 | "slide_type": "fragment" 258 | } 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "# how many from each quarter?\n", 263 | "df.value_counts('quarter')" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "slideshow": { 270 | "slide_type": "fragment" 271 | } 272 | }, 273 | "source": [ 274 | "Note: Response Rates\n", 275 | "- Spring 2019: 384/826 (46%)\n", 276 | "- Winter 2020: 295/444 (66%)\n", 277 | "- Spring 2020: 397/475 (84%)\n", 278 | "- Fall 2020: 321/447 (72%)\n", 279 | "- Winter 2021: 314/438 (72%)\n", 280 | "- Winter 2024: 584/701 (83%)\n", 281 | "- **Winter 2025: 690/817 (84%)**" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "slideshow": { 288 | "slide_type": "slide" 289 | } 290 | }, 291 | "source": [ 292 | "### Missingness" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": { 299 | "slideshow": { 300 | "slide_type": "fragment" 301 | } 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "# how many nonresponses\n", 306 | "df.isnull().sum()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "slideshow": { 313 | "slide_type": "fragment" 314 | } 315 | }, 316 | "source": [ 317 | "We see that there are more nonresponses in the `enjoyed_least` category than the `enjoyed_most` category. So, more people left what they enjoyed least blank than they did what they enjoyed most." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "scrolled": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# how does that look by quarter?\n", 329 | "null_most = df.groupby('quarter')['enjoyed_most'].apply(lambda x: x.isnull().sum())\n", 330 | "null_least = df.groupby('quarter')['enjoyed_least'].apply(lambda x: x.isnull().sum())\n", 331 | "\n", 332 | "print(null_most, null_least)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "We also see a decrease in Wi21 on. This is when I started requiring these questions (b/c I shortened the survey overall). " 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": { 345 | "slideshow": { 346 | "slide_type": "fragment" 347 | } 348 | }, 349 | "source": [ 350 | "#### Previous Quarters\n", 351 | "\n", 352 | "Typically, there are a few people who have what they enjoy least but don't have an enjoy most....but often these students' feedback is of particular interest to me." 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "# overall\n", 362 | "check_least = df[df['enjoyed_most'].isnull() & df['enjoyed_least'].notnull()]\n", 363 | "list(check_least['enjoyed_least'])" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "slideshow": { 370 | "slide_type": "fragment" 371 | } 372 | }, 373 | "source": [ 374 | "Missing data causes a problem in `nltk`, so we either get rid of individuals who didn't respond to both, or we can replace their missing data with 'No response', knowing that this text will be included in the analysis now." 
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "slideshow": { 382 | "slide_type": "fragment" 383 | } 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "def fill_no_response(df):\n", 388 | " '''replace missing data in enjoyed_most/least series with string No response'''\n", 389 | " \n", 390 | " df['enjoyed_most'] = df['enjoyed_most'].fillna('No response')\n", 391 | " df['enjoyed_least'] = df['enjoyed_least'].fillna('No response')" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "slideshow": { 399 | "slide_type": "-" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "# fill NAs with string 'No response'\n", 405 | "fill_no_response(df)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "slideshow": { 412 | "slide_type": "slide" 413 | } 414 | }, 415 | "source": [ 416 | "## Quick checks: Words of interest\n", 417 | "\n" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "def check_word_freq(df, word):\n", 427 | " \"\"\"checks for frequency of word specified in most and least enjoyed responses\"\"\"\n", 428 | " \n", 429 | " # calculate proportion within quarter\n", 430 | " word_most = df[df['enjoyed_most'].str.contains(word, case=False, na=False)]\n", 431 | " proportion_most = word_most.groupby('quarter').size() / df.groupby('quarter').size()\n", 432 | " \n", 433 | " word_least = df[df['enjoyed_least'].str.contains(word, case=False, na=False)]\n", 434 | " proportion_least = word_least.groupby('quarter').size() / df.groupby('quarter').size()\n", 435 | " \n", 436 | " out = pd.concat([proportion_most, proportion_least], keys=['most', 'least'], axis=1)\n", 437 | "\n", 438 | " return out" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "slideshow": { 445 | "slide_type": "fragment" 446 | } 447 | }, 448 | "source": [ 449 | "#### Assignment" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "slideshow": { 457 | "slide_type": "-" 458 | } 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "## check for assignment\n", 463 | "check_word_freq(df, 'assignment')" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": { 469 | "slideshow": { 470 | "slide_type": "slide" 471 | } 472 | }, 473 | "source": [ 474 | "#### Project" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": { 481 | "scrolled": true, 482 | "slideshow": { 483 | "slide_type": "fragment" 484 | } 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "## check for project in free text\n", 489 | "check_word_freq(df, 'project')" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "scrolled": true, 497 | "slideshow": { 498 | "slide_type": "fragment" 499 | } 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "## check for group in free text\n", 504 | "check_word_freq(df, 'group')" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": { 510 | "slideshow": { 511 | "slide_type": "fragment" 512 | } 513 | }, 514 | "source": [ 515 | "#### Quizzes" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "scrolled": true, 523 | "slideshow": { 524 | "slide_type": "-" 525 | } 526 | }, 527 | 
"outputs": [], 528 | "source": [ 529 | "check_word_freq(df, 'quiz')" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": { 535 | "slideshow": { 536 | "slide_type": "fragment" 537 | } 538 | }, 539 | "source": [ 540 | "#### Labs" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "slideshow": { 548 | "slide_type": "-" 549 | } 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "check_word_freq(df, 'lab')" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": { 559 | "slideshow": { 560 | "slide_type": "slide" 561 | } 562 | }, 563 | "source": [ 564 | "## Sentiment Analysis\n", 565 | "\n", 566 | "We get a quick snapshot of what's going on in COGS 108, but we really want to understand the details. To do this, analyzing the sentiment of the text is a good next step." 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": { 572 | "slideshow": { 573 | "slide_type": "slide" 574 | } 575 | }, 576 | "source": [ 577 | "#### Step 1: Tokenization\n", 578 | "\n", 579 | "Tokenization is the first step in analyzing text. \n", 580 | "\n", 581 | "1. Acquire text of interest\n", 582 | "2. Break text down (tokenize) into smaller chunks (e.g., words, bigrams, sentences)\n", 583 | "\n", 584 | "A **token** is a single entity - think of it as a building block of language." 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "slideshow": { 591 | "slide_type": "slide" 592 | } 593 | }, 594 | "source": [ 595 | "### Tokenization Example\n", 596 | "\n", 597 | "Here we demonstrate what a tokenized single response looks like." 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": { 604 | "slideshow": { 605 | "slide_type": "fragment" 606 | } 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "# import regex word tokenizer\n", 611 | "from nltk.tokenize import RegexpTokenizer\n", 612 | "tokenizer = RegexpTokenizer(r'\\w+')" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "df.loc[0,'enjoyed_most']" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "scrolled": true, 629 | "slideshow": { 630 | "slide_type": "fragment" 631 | } 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "tokenized_word = tokenizer.tokenize(df.loc[0,'enjoyed_most'])\n", 636 | "print(tokenized_word)" 637 | ] 638 | }, 639 | { 640 | "cell_type": "markdown", 641 | "metadata": { 642 | "slideshow": { 643 | "slide_type": "slide" 644 | } 645 | }, 646 | "source": [ 647 | "#### Tokenize COGS108 data\n", 648 | "\n", 649 | "Using that concept, we'll tokenize the words in the `enjoyed_most` and `enjoyed_least` columns of our COGS108 data."
650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": { 656 | "scrolled": true, 657 | "slideshow": { 658 | "slide_type": "fragment" 659 | } 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "# tokenize most and least responses\n", 664 | "df['most_token'] = df['enjoyed_most'].apply(tokenizer.tokenize) \n", 665 | "df['least_token'] = df['enjoyed_least'].apply(tokenizer.tokenize) \n", 666 | "df.head()" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "slideshow": { 673 | "slide_type": "slide" 674 | } 675 | }, 676 | "source": [ 677 | "#### Step 2: Stop Words\n", 678 | "\n", 679 | "**Stop words** are words that are of less interest to your analysis. \n", 680 | "\n", 681 | "For example, you wouldn't expect the following words to be important: is, am, are, this, a, an, the, etc.\n", 682 | "\n", 683 | "By removing stopwords, you can lower the computational burden, focusing on only the words of interest.\n", 684 | "\n", 685 | "To do so in `nltk`, you need to create a list of stopwords and filter them from your tokens.\n" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": { 692 | "scrolled": true, 693 | "slideshow": { 694 | "slide_type": "fragment" 695 | } 696 | }, 697 | "outputs": [], 698 | "source": [ 699 | "# import stop words\n", 700 | "from nltk.corpus import stopwords\n", 701 | "stop_words = set(stopwords.words('english'))\n", 702 | "\n", 703 | "# look at stop words\n", 704 | "print(stop_words)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Stop Words Example\n", 716 | "\n", 717 | "Here we compare a sentence after tokenization to one that has been tokenized _and had stop words removed_." 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": { 724 | "slideshow": { 725 | "slide_type": "fragment" 726 | } 727 | }, 728 | "outputs": [], 729 | "source": [ 730 | "# example of removing stop words\n", 731 | "filtered_sent=[]\n", 732 | "for w in tokenized_word:\n", 733 | " if w not in stop_words:\n", 734 | " filtered_sent.append(w)\n", 735 | "print(\"Tokenized Sentence:\", tokenized_word)\n", 736 | "print(\"Filtered Sentence:\", filtered_sent)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": { 742 | "slideshow": { 743 | "slide_type": "slide" 744 | } 745 | }, 746 | "source": [ 747 | "#### Remove Stop Words: COGS108 data\n", 748 | "\n", 749 | "Using that idea, we can go ahead and remove stop words from our tokenized most and least liked data." 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": { 756 | "slideshow": { 757 | "slide_type": "fragment" 758 | } 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "# remove stop words\n", 763 | "df['most_stop'] = df['most_token'].apply(lambda x: [item for item in x if item not in stop_words])\n", 764 | "df['least_stop'] = df['least_token'].apply(lambda x: [item for item in x if item not in stop_words])\n", 765 | "df.head()" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": { 771 | "jp-MarkdownHeadingCollapsed": true, 772 | "slideshow": { 773 | "slide_type": "slide" 774 | } 775 | }, 776 | "source": [ 777 | "#### Step 3: Lexicon Normalization (**Stemming**)\n", 778 | "\n", 779 | "In language, many different words come from the same root word. 
\n", 780 | "\n", 781 | "For example, \"intersection\", \"intersecting\", \"intersects\", and \"intersected\" are all related to the common root word - \"intersect\".\n", 782 | "\n", 783 | "**Stemming** is how linguistic normalization occurs - it reduces words to their root words (and chops off additional things like 'ing') - all of the above words would be reduced to their common stem \"intersect.\"\n", 784 | "\n", 785 | "\n", 786 | "\n", 787 | "\n" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": { 793 | "slideshow": { 794 | "slide_type": "slide" 795 | } 796 | }, 797 | "source": [ 798 | "### Stemming Example\n", 799 | "\n", 800 | "After tokenization and removing stop words, we can get the stem for all tokens (words) in our dataset." 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": { 807 | "scrolled": true, 808 | "slideshow": { 809 | "slide_type": "fragment" 810 | } 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "# Stemming\n", 815 | "from nltk.stem import PorterStemmer\n", 816 | "\n", 817 | "ps = PorterStemmer()\n", 818 | "\n", 819 | "stemmed_words=[]\n", 820 | "for w in filtered_sent:\n", 821 | " stemmed_words.append(ps.stem(w))\n", 822 | "\n", 823 | "print(\"Filtered Sentence:\", filtered_sent)\n", 824 | "print(\"Stemmed Sentence:\", stemmed_words)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "**Lecture participation**: Pause & do Q1 now." 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": { 837 | "slideshow": { 838 | "slide_type": "slide" 839 | } 840 | }, 841 | "source": [ 842 | "#### Stemming: COGS108 data\n", 843 | "\n", 844 | "Here, we obtain the stem (root word) for all tokens in our dataset." 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": { 851 | "scrolled": true, 852 | "slideshow": { 853 | "slide_type": "fragment" 854 | } 855 | }, 856 | "outputs": [], 857 | "source": [ 858 | "df['most_stem'] = df['most_stop'].apply(lambda x: [ps.stem(y) for y in x])\n", 859 | "df['least_stem'] = df['least_stop'].apply(lambda x: [ps.stem(y) for y in x])\n", 860 | "df.head()" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": { 866 | "slideshow": { 867 | "slide_type": "slide" 868 | } 869 | }, 870 | "source": [ 871 | "#### Step 4: Frequency Distribution\n", 872 | "\n", 873 | "It can be helpful to get a sense of which words are most frequent in our dataset." 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": { 880 | "slideshow": { 881 | "slide_type": "fragment" 882 | } 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "# get series of all most and least liked words after stemming\n", 887 | "# note that \"No Response\" is still being included in the analysis\n", 888 | "most = df['most_stem'].apply(pd.Series).stack()\n", 889 | "least = df['least_stem'].apply(pd.Series).stack()" 890 | ] 891 | }, 892 | { 893 | "cell_type": "markdown", 894 | "metadata": { 895 | "slideshow": { 896 | "slide_type": "fragment" 897 | } 898 | }, 899 | "source": [ 900 | "`FreqDist` calculates the frequency of each word in the text and we can plot the most frequent words." 
901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": { 907 | "slideshow": { 908 | "slide_type": "fragment" 909 | } 910 | }, 911 | "outputs": [], 912 | "source": [ 913 | "from nltk.probability import FreqDist\n", 914 | "import string\n", 915 | "\n", 916 | "# calculate word frequency\n", 917 | "fdist_most = FreqDist(most)\n", 918 | "fdist_least = FreqDist(least)\n", 919 | "\n", 920 | "# remove punctuation counts\n", 921 | "for punc in string.punctuation:\n", 922 | " del fdist_most[punc]\n", 923 | " del fdist_least[punc]" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": {}, 930 | "outputs": [], 931 | "source": [ 932 | "# Frequency Distribution Plot - top 20\n", 933 | "# for words in what students like least\n", 934 | "fdist_least.plot(20, cumulative=False);" 935 | ] 936 | }, 937 | { 938 | "cell_type": "markdown", 939 | "metadata": {}, 940 | "source": [ 941 | "**Lecture participation**: Pause & do Q2 now." 942 | ] 943 | }, 944 | { 945 | "cell_type": "markdown", 946 | "metadata": { 947 | "slideshow": { 948 | "slide_type": "slide" 949 | } 950 | }, 951 | "source": [ 952 | "#### Step 5: Sentiment Analysis!\n", 953 | "\n", 954 | "**Sentiment Analysis** quantifies the content, ideas, beliefs, and opinions conveyed in text. \n", 955 | "\n", 956 | "Two general approaches:\n", 957 | "\n", 958 | "1. **Lexicon-based** - count number of words in a text belonging to each sentiment (positive, negative, happy, angry, etc.)\n", 959 | "2. **Machine learning-based** - develop a classification model on pre-labeled data\n", 960 | "\n" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": { 966 | "slideshow": { 967 | "slide_type": "slide" 968 | } 969 | }, 970 | "source": [ 971 | "### Sentiment Example\n", 972 | "\n", 973 | "To get a measure of overall sentiment in our text, we'll compare our text to the VADER lexicon." 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": null, 979 | "metadata": { 980 | "slideshow": { 981 | "slide_type": "fragment" 982 | } 983 | }, 984 | "outputs": [], 985 | "source": [ 986 | "from nltk.sentiment.vader import SentimentIntensityAnalyzer \n", 987 | "analyser = SentimentIntensityAnalyzer()" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": { 993 | "slideshow": { 994 | "slide_type": "fragment" 995 | } 996 | }, 997 | "source": [ 998 | "VADER handles:\n", 999 | "\n", 1000 | "- capitalization (great vs GREAT) & punctuation (exclamation makes more positive!)\n", 1001 | "- emojis and emoticons\n", 1002 | "- degree modifiers (extremely good vs. 
marginally good)\n", 1003 | "- contractions and conjunctions (but signals shift)" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": { 1009 | "slideshow": { 1010 | "slide_type": "fragment" 1011 | } 1012 | }, 1013 | "source": [ 1014 | "`pos` + `neg` + `neu` = 1\n", 1015 | "\n", 1016 | "**`compound`** score - metric that calculates sum of all the lexicon ratings and normalizes between -1 (most extreme negative) and +1 (most extreme positive)\n", 1017 | "- positive: `compound` >= 0.05 \n", 1018 | "- neutral: -0.05 < `compound` < 0.05\n", 1019 | "- negative : `compound` <= -0.05" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | "metadata": { 1026 | "slideshow": { 1027 | "slide_type": "fragment" 1028 | } 1029 | }, 1030 | "outputs": [], 1031 | "source": [ 1032 | "analyser.polarity_scores(\"The class is super cool.\")" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": null, 1038 | "metadata": { 1039 | "slideshow": { 1040 | "slide_type": "fragment" 1041 | } 1042 | }, 1043 | "outputs": [], 1044 | "source": [ 1045 | "analyser.polarity_scores(\"The class is not super cool.\")" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": null, 1051 | "metadata": { 1052 | "slideshow": { 1053 | "slide_type": "fragment" 1054 | } 1055 | }, 1056 | "outputs": [], 1057 | "source": [ 1058 | "analyser.polarity_scores(\"The class is NOT super cool!\")" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "metadata": {}, 1064 | "source": [ 1065 | "**Lecture participation**: Pause & do Q3 now." 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "markdown", 1070 | "metadata": { 1071 | "slideshow": { 1072 | "slide_type": "slide" 1073 | } 1074 | }, 1075 | "source": [ 1076 | "#### Sentiment Analysis: COGS108 data\n", 1077 | "\n", 1078 | "Here, we will calculate the sentiment of each most liked and least liked student response from the survey." 
1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "code", 1083 | "execution_count": null, 1084 | "metadata": { 1085 | "slideshow": { 1086 | "slide_type": "fragment" 1087 | } 1088 | }, 1089 | "outputs": [], 1090 | "source": [ 1091 | "# get list of the 'sentences' (responses) from each individual\n", 1092 | "most_list = list(df[df['quarter'] == 'Wi25']['enjoyed_most'].values)\n", 1093 | "least_list = list(df[df['quarter'] == 'Wi25']['enjoyed_least'].values)" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": null, 1099 | "metadata": { 1100 | "slideshow": { 1101 | "slide_type": "fragment" 1102 | } 1103 | }, 1104 | "outputs": [], 1105 | "source": [ 1106 | "# create function that will output dataframe \n", 1107 | "# that stores sentiment information\n", 1108 | "def get_sentiments(input_list):\n", 1109 | " \n", 1110 | " output = pd.DataFrame()\n", 1111 | "\n", 1112 | " for sentence in input_list:\n", 1113 | " ss = analyser.polarity_scores(sentence)\n", 1114 | " ss['sentence'] = sentence\n", 1115 | " # Note use of pd.concat\n", 1116 | " output = pd.concat([output, pd.DataFrame([ss])], ignore_index=True)\n", 1117 | "\n", 1118 | "\n", 1119 | " return output" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": null, 1125 | "metadata": { 1126 | "slideshow": { 1127 | "slide_type": "fragment" 1128 | } 1129 | }, 1130 | "outputs": [], 1131 | "source": [ 1132 | "# get sentiment measures\n", 1133 | "least_sentiments = get_sentiments(least_list)\n", 1134 | "most_sentiments = get_sentiments(most_list)" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "markdown", 1139 | "metadata": { 1140 | "slideshow": { 1141 | "slide_type": "slide" 1142 | } 1143 | }, 1144 | "source": [ 1145 | "#### Sentiment Analysis: COGS108 data output\n", 1146 | "\n", 1147 | "After calculating the sentiment of each response, we can look at the output of each." 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": null, 1153 | "metadata": { 1154 | "slideshow": { 1155 | "slide_type": "fragment" 1156 | } 1157 | }, 1158 | "outputs": [], 1159 | "source": [ 1160 | "# let's get rid of those no response values here\n", 1161 | "most_sentiments = most_sentiments[most_sentiments['sentence'] != 'No response']\n", 1162 | "least_sentiments = least_sentiments[least_sentiments['sentence'] != 'No response']" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": null, 1168 | "metadata": { 1169 | "scrolled": true, 1170 | "slideshow": { 1171 | "slide_type": "fragment" 1172 | } 1173 | }, 1174 | "outputs": [], 1175 | "source": [ 1176 | "# take a look at the output\n", 1177 | "least_sentiments.sort_values(by='compound', ascending=True).head(10)" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "metadata": { 1184 | "slideshow": { 1185 | "slide_type": "fragment" 1186 | } 1187 | }, 1188 | "outputs": [], 1189 | "source": [ 1190 | "# take a look at the output\n", 1191 | "most_sentiments.sort_values(by='compound', ascending=False).head(10)" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "markdown", 1196 | "metadata": { 1197 | "slideshow": { 1198 | "slide_type": "slide" 1199 | } 1200 | }, 1201 | "source": [ 1202 | "#### Sentiment Analysis: COGS108 data - `describe`\n", 1203 | "\n", 1204 | "To get an overall sense of the values stored in each of these dataframes, we can use `describe`." 
1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": null, 1210 | "metadata": { 1211 | "scrolled": true, 1212 | "slideshow": { 1213 | "slide_type": "fragment" 1214 | } 1215 | }, 1216 | "outputs": [], 1217 | "source": [ 1218 | "most_sentiments.describe()" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": null, 1224 | "metadata": { 1225 | "slideshow": { 1226 | "slide_type": "fragment" 1227 | } 1228 | }, 1229 | "outputs": [], 1230 | "source": [ 1231 | "least_sentiments.describe()" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": { 1237 | "slideshow": { 1238 | "slide_type": "slide" 1239 | } 1240 | }, 1241 | "source": [ 1242 | "#### Sentiment Analysis: COGS108 data - plotting\n", 1243 | "\n", 1244 | "We can compare the distribution of the `compound` metric between the two analyses." 1245 | ] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": null, 1250 | "metadata": { 1251 | "slideshow": { 1252 | "slide_type": "fragment" 1253 | } 1254 | }, 1255 | "outputs": [], 1256 | "source": [ 1257 | "most_sentiments['compound'].plot.density(label='most')\n", 1258 | "least_sentiments['compound'].plot.density(label='least')\n", 1259 | "plt.legend()\n", 1260 | "plt.xlabel('Compound Sentiment Scores')\n", 1261 | "plt.xlim(-1,1);" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "markdown", 1266 | "metadata": {}, 1267 | "source": [ 1268 | "**Lecture participation**: Pause & do Q4 now." 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "metadata": { 1275 | "slideshow": { 1276 | "slide_type": "slide" 1277 | } 1278 | }, 1279 | "outputs": [], 1280 | "source": [ 1281 | "# include label for boxplot\n", 1282 | "most_sentiments['which'] = 'most'\n", 1283 | "least_sentiments['which'] = 'least'\n", 1284 | "# concatenate data frames together\n", 1285 | "compound_out = pd.concat([most_sentiments, least_sentiments])\n", 1286 | "compound_out.head()" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "code", 1291 | "execution_count": null, 1292 | "metadata": {}, 1293 | "outputs": [], 1294 | "source": [ 1295 | "# plot compound by response type\n", 1296 | "sns.boxplot(data=compound_out, x='which', y='compound')\n", 1297 | "plt.xlabel('response');" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "markdown", 1302 | "metadata": { 1303 | "slideshow": { 1304 | "slide_type": "fragment" 1305 | } 1306 | }, 1307 | "source": [ 1308 | "Unsurprisingly, the overall sentiment of what students like most tends to be more positive than the sentiment of what they like least. \n", 1309 | "\n", 1310 | "That makes sense given the data and the questions on the survey. But let's dig deeper into these data, moving beyond sentiment analysis..." 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "markdown", 1315 | "metadata": { 1316 | "slideshow": { 1317 | "slide_type": "slide" 1318 | } 1319 | }, 1320 | "source": [ 1321 | "## TF-IDF\n", 1322 | "\n", 1323 | "Term Frequency - Inverse Document Frequency (**TF-IDF**) sets out to identify the tokens most unique to your document of interest (relative to all documents in your corpus). 
" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "markdown", 1328 | "metadata": { 1329 | "slideshow": { 1330 | "slide_type": "fragment" 1331 | } 1332 | }, 1333 | "source": [ 1334 | "**Term Frequency (TF)** - counts the number of words (tokens) occurring in each document.\n", 1335 | "\n", 1336 | "**Inverse Document Frequency (IDF)** - weights the word by their relative frequency across documents. " 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "markdown", 1341 | "metadata": { 1342 | "slideshow": { 1343 | "slide_type": "fragment" 1344 | } 1345 | }, 1346 | "source": [ 1347 | "$$IDF_{word} = log(\\frac{\\# documents}{\\# \\ documents\\_containing\\_word})$$" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "markdown", 1352 | "metadata": { 1353 | "slideshow": { 1354 | "slide_type": "fragment" 1355 | } 1356 | }, 1357 | "source": [ 1358 | "$$TF-IDF = TF \\times IDF$$" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "markdown", 1363 | "metadata": { 1364 | "slideshow": { 1365 | "slide_type": "fragment" 1366 | } 1367 | }, 1368 | "source": [ 1369 | "words with a high TF-IDF are those with high frequency in one document & relatively low frequency in other documents" 1370 | ] 1371 | }, 1372 | { 1373 | "cell_type": "markdown", 1374 | "metadata": { 1375 | "slideshow": { 1376 | "slide_type": "slide" 1377 | } 1378 | }, 1379 | "source": [ 1380 | "For our purposes, our **corpus** will be students' responses to what they like most and least about COGS108.\n", 1381 | "\n", 1382 | "We'll treat this as **two separate documents**:\n", 1383 | "1. What students like most\n", 1384 | "2. What students like least" 1385 | ] 1386 | }, 1387 | { 1388 | "cell_type": "markdown", 1389 | "metadata": { 1390 | "slideshow": { 1391 | "slide_type": "slide" 1392 | } 1393 | }, 1394 | "source": [ 1395 | "### Bag of Words (BoW) approach\n", 1396 | "\n", 1397 | "Converts the text into a co-occurrence matrix across documents within the corpus." 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "markdown", 1402 | "metadata": { 1403 | "slideshow": { 1404 | "slide_type": "fragment" 1405 | } 1406 | }, 1407 | "source": [ 1408 | "To do this, let's get our text ready.\n", 1409 | "\n", 1410 | "We're going to make sure all our words are lower case, remove punctuation from each, and then provide the text (`corpus`) to `TfidfVectorizer`." 
1411 | ] 1412 | }, 1413 | { 1414 | "cell_type": "code", 1415 | "execution_count": null, 1416 | "metadata": { 1417 | "slideshow": { 1418 | "slide_type": "fragment" 1419 | } 1420 | }, 1421 | "outputs": [], 1422 | "source": [ 1423 | "import string \n", 1424 | "\n", 1425 | "# lowercase text\n", 1426 | "least = list(map(str.lower, least_list))\n", 1427 | "most = list(map(str.lower, most_list))\n", 1428 | "\n", 1429 | "# remove punctuation\n", 1430 | "for c in string.punctuation:\n", 1431 | " least = str(least).replace(c, \"\")\n", 1432 | " most = str(most).replace(c, \"\")\n", 1433 | "\n", 1434 | "# get list of two documents together\n", 1435 | "corpus = [str(least), str(most)]" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "markdown", 1440 | "metadata": { 1441 | "slideshow": { 1442 | "slide_type": "slide" 1443 | } 1444 | }, 1445 | "source": [ 1446 | "### Calculate TF-IDF\n", 1447 | "\n", 1448 | "With our text ready for analysis, it's time to calculate TF-IDF" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "markdown", 1453 | "metadata": { 1454 | "slideshow": { 1455 | "slide_type": "fragment" 1456 | } 1457 | }, 1458 | "source": [ 1459 | "To start our TF-IDF analysis, we'll first **create a `TfidfVectorizer` object to transform our text data into vectors.**" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": null, 1465 | "metadata": { 1466 | "slideshow": { 1467 | "slide_type": "fragment" 1468 | } 1469 | }, 1470 | "outputs": [], 1471 | "source": [ 1472 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 1473 | "from nltk.tokenize import word_tokenize" 1474 | ] 1475 | }, 1476 | { 1477 | "cell_type": "code", 1478 | "execution_count": null, 1479 | "metadata": { 1480 | "slideshow": { 1481 | "slide_type": "fragment" 1482 | } 1483 | }, 1484 | "outputs": [], 1485 | "source": [ 1486 | "# create vectorizer\n", 1487 | "tfidf = TfidfVectorizer(sublinear_tf=True,\n", 1488 | " analyzer='word',\n", 1489 | " max_features=2000,\n", 1490 | " tokenizer=word_tokenize,\n", 1491 | " stop_words='english')" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "markdown", 1496 | "metadata": {}, 1497 | "source": [ 1498 | "**Lecture participation**: Pause & do Q5 now. Submit when you're done." 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "markdown", 1503 | "metadata": { 1504 | "slideshow": { 1505 | "slide_type": "slide" 1506 | } 1507 | }, 1508 | "source": [ 1509 | "#### TF-IDF: COGS108 data - calculation\n", 1510 | "\n", 1511 | "Here, we use our vectorizer to calculate TF-IDF across the words in our word matrix." 1512 | ] 1513 | }, 1514 | { 1515 | "cell_type": "code", 1516 | "execution_count": null, 1517 | "metadata": { 1518 | "slideshow": { 1519 | "slide_type": "fragment" 1520 | } 1521 | }, 1522 | "outputs": [], 1523 | "source": [ 1524 | "# calculate TF-IDF\n", 1525 | "cogs_tfidf = pd.DataFrame(\n", 1526 | " tfidf.fit_transform(corpus)\n", 1527 | " .toarray()\n", 1528 | ")\n", 1529 | "cogs_tfidf.columns = tfidf.get_feature_names_out()\n", 1530 | "cogs_tfidf = cogs_tfidf.rename(index={0:'least', 1:'most'})" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "markdown", 1535 | "metadata": { 1536 | "slideshow": { 1537 | "slide_type": "slide" 1538 | } 1539 | }, 1540 | "source": [ 1541 | "#### TF-IDF: COGS108 data - output\n", 1542 | "\n", 1543 | "If we just want to look at the word most uniuqe in each document..." 
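One way to pull that out, sketched with plain pandas on the `cogs_tfidf` frame from the previous cell (this is not a cell from the original notebook):

# single highest-scoring (most distinctive) token per document
cogs_tfidf.idxmax(axis=1)

# or the ten most distinctive tokens for each document
cogs_tfidf.loc['most'].nlargest(10)
cogs_tfidf.loc['least'].nlargest(10)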
1544 | ] 1545 | }, 1546 | { 1547 | "cell_type": "markdown", 1548 | "metadata": { 1549 | "slideshow": { 1550 | "slide_type": "fragment" 1551 | } 1552 | }, 1553 | "source": [ 1554 | "Alternatively, we can sort by the set or words most unique to each document:" 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "code", 1559 | "execution_count": null, 1560 | "metadata": { 1561 | "slideshow": { 1562 | "slide_type": "fragment" 1563 | } 1564 | }, 1565 | "outputs": [], 1566 | "source": [ 1567 | "cogs_tfidf.sort_values(by='most', axis=1, ascending=False)" 1568 | ] 1569 | }, 1570 | { 1571 | "cell_type": "code", 1572 | "execution_count": null, 1573 | "metadata": { 1574 | "slideshow": { 1575 | "slide_type": "fragment" 1576 | } 1577 | }, 1578 | "outputs": [], 1579 | "source": [ 1580 | "cogs_tfidf.sort_values(by='least', axis=1, ascending=False)" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "markdown", 1585 | "metadata": { 1586 | "slideshow": { 1587 | "slide_type": "fragment" 1588 | } 1589 | }, 1590 | "source": [ 1591 | "**Sentiment Analysis** and **TF-IDF** are really helpful when analyzing documents and corpuses of text.\n", 1592 | "\n", 1593 | "But, what if, from the text itself we wanted to predict whether or not the text was likely a 'most' liked or a 'least' liked comment? We'll discuss how to do this in the coming **machine learning** lectures!" 1594 | ] 1595 | } 1596 | ], 1597 | "metadata": { 1598 | "celltoolbar": "Slideshow", 1599 | "kernelspec": { 1600 | "display_name": "Python 3 (ipykernel)", 1601 | "language": "python", 1602 | "name": "python3" 1603 | }, 1604 | "language_info": { 1605 | "codemirror_mode": { 1606 | "name": "ipython", 1607 | "version": 3 1608 | }, 1609 | "file_extension": ".py", 1610 | "mimetype": "text/x-python", 1611 | "name": "python", 1612 | "nbconvert_exporter": "python", 1613 | "pygments_lexer": "ipython3", 1614 | "version": "3.11.8" 1615 | }, 1616 | "rise": { 1617 | "scroll": true 1618 | } 1619 | }, 1620 | "nbformat": 4, 1621 | "nbformat_minor": 4 1622 | } 1623 | -------------------------------------------------------------------------------- /08_ml/08_14_machine_learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/08_ml/08_14_machine_learning.pdf -------------------------------------------------------------------------------- /09_geospatial/09_16_geospatial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/09_geospatial/09_16_geospatial.pdf -------------------------------------------------------------------------------- /10_communication/10_17_communication.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_17_communication.pdf -------------------------------------------------------------------------------- /10_communication/10_18_be_wrong.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_18_be_wrong.pdf -------------------------------------------------------------------------------- /10_communication/10_19_jobs_future.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_19_jobs_future.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 COGS108 - Data Science in Practice 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lectures (Winter 2025) 2 | 3 | Course materials are organized by week. 4 | 5 | |Week | General Topic | Link to Materials | 6 | |---|:---|:---| 7 | | 01 | Introduction to Data Science | [01_intro](https://github.com/COGS108/Lectures-Ellis/tree/wi25/01_intro) | 8 | | 02 | Version Control & Data | [02_data](https://github.com/COGS108/Lectures-Ellis/tree/wi25/02_data) | 9 | | 03 | Data Ethics & Wrangling | [03_ethics](https://github.com/COGS108/Lectures-Ellis/tree/wi25/03_ethics) | 10 | | 04 | Data Viz & Analysis | [04_analysis](https://github.com/COGS108/Lectures-Ellis/tree/wi25/04_analysis) | 11 | | 05 | Exploratory Data Analysis | [05_eda](https://github.com/COGS108/Lectures-Ellis/tree/wi25/05_eda) | 12 | | 06 | Inference | [06_inference](https://github.com/COGS108/Lectures-Ellis/tree/wi25/06_inference) | 13 | | 07 | Text Analysis | [07_text](https://github.com/COGS108/Lectures-Ellis/tree/wi25/07_text) | 14 | | 08 | Machine Learning | [08_ml](https://github.com/COGS108/Lectures-Ellis/tree/wi25/08_ml) | 15 | | 09 | Geospatial | [09_geospatial](https://github.com/COGS108/Lectures-Ellis/tree/wi25/09_geospatial) | 16 | | 10 | Data Science Communication & Jobs | [10_communication](https://github.com/COGS108/Lectures-Ellis/tree/wi25/10_communication) | 17 | | -- | Discussion Section Slides | [XX_section](https://github.com/COGS108/Lectures-Ellis/tree/wi25/XX_section) | 18 | 19 | --- 20 | ## License 21 | 22 | The content of this project itself is licensed under the [Creative Commons Attribution 3.0 Unported license](https://creativecommons.org/licenses/by/3.0/), and the underlying source code used to format and display that content is licensed under the [MIT license](https://github.com/github/choosealicense.com/blob/gh-pages/LICENSE.md). 
23 | -------------------------------------------------------------------------------- /XX_section/D1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D1.pdf -------------------------------------------------------------------------------- /XX_section/D2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D2.pdf -------------------------------------------------------------------------------- /XX_section/D3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D3.pdf -------------------------------------------------------------------------------- /XX_section/D4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D4.pdf -------------------------------------------------------------------------------- /XX_section/D5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D5.pdf -------------------------------------------------------------------------------- /XX_section/D6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D6.pdf -------------------------------------------------------------------------------- /XX_section/D7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D7.pdf -------------------------------------------------------------------------------- /XX_section/D7_notebook.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"8ae41600","metadata":{"id":"8ae41600"},"source":["## Due Dates\n","\n","### D8, Q9 Due Monday 6/3\n","### Checkpoint 2 Due Friday 5/30\n","### A4 (Released this Friday) Due next Friday 6/10"]},{"cell_type":"markdown","id":"8e1ba1ce","metadata":{"id":"8e1ba1ce"},"source":["## D8 Review"]},{"cell_type":"markdown","id":"c7d1b4ff","metadata":{"id":"c7d1b4ff"},"source":["## Part 1"]},{"cell_type":"code","execution_count":1,"id":"0e0b883e","metadata":{"id":"0e0b883e","executionInfo":{"status":"ok","timestamp":1740553021021,"user_tz":480,"elapsed":1866,"user":{"displayName":"Yueyan Tang","userId":"00093492675056380488"}},"outputId":"fa70a9ae-f123-4040-eadc-704acd92a502","colab":{"base_uri":"https://localhost:8080/","height":242}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. 
William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S "],"text/html":["\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"titan","summary":"{\n \"name\": \"titan\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"PassengerId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 257,\n \"min\": 1,\n \"max\": 891,\n \"num_unique_values\": 891,\n \"samples\": [\n 710,\n 440,\n 841\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Survived\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 891,\n \"samples\": [\n \"Moubarek, Master. Halim Gonios (\\\"William George\\\")\",\n \"Kvillner, Mr. Johan Henrik Johannesson\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SibSp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 7,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Parch\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ticket\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 681,\n \"samples\": [\n \"11774\",\n \"248740\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cabin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 147,\n \"samples\": [\n \"D45\",\n \"B49\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Embarked\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"S\",\n \"C\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":1}],"source":["import pandas as pd\n","\n","titan = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')\n","titan.head()"]},{"cell_type":"code","execution_count":null,"id":"c0ada0a1","metadata":{"id":"c0ada0a1","outputId":"3e90de2b-fc0a-42e6-8f2b-b2b7fd23335c"},"outputs":[{"data":{"text/plain":["(891, 
12)"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["titan.shape"]},{"cell_type":"markdown","id":"aec0eba2","metadata":{"id":"aec0eba2"},"source":["### Finding the number of missing values in each column"]},{"cell_type":"code","execution_count":null,"id":"2233cc78","metadata":{"id":"2233cc78","outputId":"dd68f0a1-098d-4c11-dced-5d18021739ce"},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Survived 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 177\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 0\n","Cabin 687\n","Embarked 2\n","dtype: int64"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["titan.isna().sum(axis = 0)"]},{"cell_type":"code","execution_count":null,"id":"ac85caa0","metadata":{"id":"ac85caa0","outputId":"0913a981-0747-4c07-a494-045b65ab7378"},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Survived 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 177\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 0\n","Cabin 687\n","Embarked 2\n","dtype: int64"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["titan.isnull().sum(axis = 0)"]},{"cell_type":"markdown","id":"2bc64de6","metadata":{"id":"2bc64de6"},"source":["### Dropping data where there's missing values for Age and Embarked"]},{"cell_type":"code","execution_count":null,"id":"2dfc1947","metadata":{"id":"2dfc1947"},"outputs":[],"source":["titan = titan.dropna(subset = ['Age', 'Embarked'])"]},{"cell_type":"code","execution_count":null,"id":"1e137e01","metadata":{"id":"1e137e01","outputId":"aa085d11-71cf-4011-b796-023e1ddd8769"},"outputs":[{"data":{"text/plain":["(712, 12)"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["titan.shape"]},{"cell_type":"code","execution_count":null,"id":"85c5c820","metadata":{"id":"85c5c820","outputId":"86532bf4-a93e-4ded-8855-1f123d23ec5d"},"outputs":[{"data":{"text/html":["
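Dropping rows is one option; if keeping all 891 passengers mattered, a common alternative is to impute the missing values instead. A minimal sketch (median for the numeric Age, most frequent value for Embarked), assuming it is applied to the `titan` frame before the `dropna` call above:

# fill Age with the median and Embarked with the most common port,
# rather than dropping those rows
titan['Age'] = titan['Age'].fillna(titan['Age'].median())
titan['Embarked'] = titan['Embarked'].fillna(titan['Embarked'].mode()[0])
titan.isna().sum()  # Age and Embarked should now show 0 missing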
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n",".. ... ... ... \n","885 886 0 3 \n","886 887 0 2 \n","887 888 1 1 \n","889 890 1 1 \n","890 891 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n",".. ... ... ... ... \n","885 Rice, Mrs. William (Margaret Norton) female 39.0 0 \n","886 Montvila, Rev. Juozas male 27.0 0 \n","887 Graham, Miss. Margaret Edith female 19.0 0 \n","889 Behr, Mr. Karl Howell male 26.0 0 \n","890 Dooley, Mr. Patrick male 32.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S \n",".. ... ... ... ... ... \n","885 5 382652 29.1250 NaN Q \n","886 0 211536 13.0000 NaN S \n","887 0 112053 30.0000 B42 S \n","889 0 111369 30.0000 C148 C \n","890 0 370376 7.7500 NaN Q \n","\n","[712 rows x 12 columns]"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["titan"]},{"cell_type":"code","execution_count":null,"id":"0c8e824a","metadata":{"id":"0c8e824a"},"outputs":[],"source":["import numpy as np"]},{"cell_type":"code","execution_count":null,"id":"eea18af5","metadata":{"id":"eea18af5","outputId":"7a267873-9317-4f7b-f1c0-5d833e817b94"},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Weight\n","0 1 241\n","1 2 161\n","2 3 199\n","3 4 203\n","4 5 245"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["passenger_ids = np.arange(1, 892)\n","weights = np.random.randint(150, 251, size=891)\n","passenger_weight_df = pd.DataFrame({\n"," 'PassengerId': passenger_ids,\n"," 'Weight': weights\n","})\n","\n","passenger_weight_df.head()"]},{"cell_type":"markdown","id":"c70bfece","metadata":{"id":"c70bfece"},"source":["### Left joining Titanic dataset with Passenger Weight data"]},{"cell_type":"markdown","id":"54cb4b82","metadata":{"id":"54cb4b82"},"source":["pd.merge in pandas is used to combine two dataframes based on common columns. It automatically merges the dataframes on all columns that both dataframes have in common.\n","\n","Specifying how = 'left' performs a left join, meaning that the resulting dataframe include all rows from the left dataframe and the matched rows from the right dataframe. If there is no match, the right wide will contain NaN."]},{"cell_type":"code","execution_count":null,"id":"31762718","metadata":{"id":"31762718"},"outputs":[],"source":["titan_df = pd.merge(titan, passenger_weight_df, how=\"left\")"]},{"cell_type":"code","execution_count":null,"id":"ff2b4ef4","metadata":{"id":"ff2b4ef4","outputId":"46743d86-8852-4859-e76a-ed24c5a5fc3a"},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked Weight \n","0 0 A/5 21171 7.2500 NaN S 241 \n","1 0 PC 17599 71.2833 C85 C 161 \n","2 0 STON/O2. 3101282 7.9250 NaN S 199 \n","3 0 113803 53.1000 C123 S 203 \n","4 0 373450 8.0500 NaN S 245 "]},"execution_count":43,"metadata":{},"output_type":"execute_result"}],"source":["titan_df.head()"]},{"cell_type":"markdown","id":"b21ae968","metadata":{"id":"b21ae968"},"source":["In this case, the common column is PassengerId, so the dataframes merged on PassengerId"]},{"cell_type":"markdown","id":"6e7b9cf0","metadata":{"id":"6e7b9cf0"},"source":["### Checking how many different type of Cabin there is in the dataset"]},{"cell_type":"code","execution_count":null,"id":"d7154e2a","metadata":{"id":"d7154e2a","outputId":"309c691f-4f64-4b87-f3ff-3f738f98c4f5"},"outputs":[{"data":{"text/plain":["Cabin\n","G6 4\n","B96 B98 4\n","C23 C25 C27 4\n","F33 3\n","D 3\n"," ..\n","C91 1\n","C124 1\n","C32 1\n","E34 1\n","C148 1\n","Name: count, Length: 133, dtype: int64"]},"execution_count":50,"metadata":{},"output_type":"execute_result"}],"source":["titan_df['Cabin'].value_counts()"]},{"cell_type":"markdown","id":"576a4ca1","metadata":{"id":"576a4ca1"},"source":["### Limiting to just top 100 Cabins"]},{"cell_type":"code","execution_count":null,"id":"0c52aca1","metadata":{"id":"0c52aca1"},"outputs":[],"source":["cabins = titan_df['Cabin'].value_counts()[:100].index.tolist()\n","temp = titan_df[titan_df['Cabin'].isin(cabins)]"]},{"cell_type":"code","execution_count":null,"id":"ef03e263","metadata":{"id":"ef03e263","outputId":"8e2ac7c4-5645-4742-e982-e5178d5e4cd0"},"outputs":[{"data":{"text/plain":["(150, 13)"]},"execution_count":48,"metadata":{},"output_type":"execute_result"}],"source":["temp.shape"]},{"cell_type":"markdown","id":"8476f25f","metadata":{"id":"8476f25f"},"source":["## Part 2"]},{"cell_type":"code","execution_count":null,"id":"058211f2","metadata":{"id":"058211f2","outputId":"1fad5ed6-8551-4adc-c979-67eeeab2fc25"},"outputs":[{"data":{"text/html":["
"],"text/plain":[" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n","0 5.1 3.5 1.4 0.2 \n","1 4.9 3.0 1.4 0.2 \n","2 4.7 3.2 1.3 0.2 \n","3 4.6 3.1 1.5 0.2 \n","4 5.0 3.6 1.4 0.2 \n","\n"," species \n","0 0 \n","1 0 \n","2 0 \n","3 0 \n","4 0 "]},"execution_count":59,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","from sklearn.datasets import load_iris\n","\n","iris = load_iris()\n","df = pd.DataFrame(iris.data, columns=iris.feature_names)\n","\n","# Add the target variable\n","df['species'] = pd.DataFrame(iris.target)\n","\n","df.head()"]},{"cell_type":"markdown","id":"8fdd8c96","metadata":{"id":"8fdd8c96"},"source":["### Splitting into predictors (everything else) and outcome variable (species)"]},{"cell_type":"code","execution_count":null,"id":"8ae4e66e","metadata":{"id":"8ae4e66e"},"outputs":[],"source":["x = df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)','petal width (cm)']]\n","y = df['species']"]},{"cell_type":"markdown","id":"1f733f44","metadata":{"id":"1f733f44"},"source":["### Splitting into 80% train and 20% test"]},{"cell_type":"markdown","id":"8afb0072","metadata":{"id":"8afb0072"},"source":["For the purpose of this lab, do **not** use train_test_split from sklearn"]},{"cell_type":"code","execution_count":null,"id":"fbeba70e","metadata":{"id":"fbeba70e"},"outputs":[],"source":["n_train = int(len(df)*0.8)\n","n_test = len(df) - n_train"]},{"cell_type":"code","execution_count":null,"id":"29d30598","metadata":{"id":"29d30598"},"outputs":[],"source":["train_x = x[:n_train]\n","train_y = y[:n_train]\n","test_x = x[n_train:]\n","test_y = y[n_train:]"]},{"cell_type":"markdown","id":"6b96acb2","metadata":{"id":"6b96acb2"},"source":["train_x is the predictors for the train data, test_x is the outcome for the train data\n","\n","train_y is the predictors for the test data, test_y is the outcome for the test data"]},{"cell_type":"markdown","id":"172aa656","metadata":{"id":"172aa656"},"source":["### Traininig a model"]},{"cell_type":"code","execution_count":null,"id":"e4c4868f","metadata":{"id":"e4c4868f"},"outputs":[],"source":["from sklearn.svm import SVC\n","from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support"]},{"cell_type":"code","execution_count":null,"id":"31c1e700","metadata":{"id":"31c1e700"},"outputs":[],"source":["def train_SVM(X,y, kernel = 'linear'):\n"," clf = SVC(kernel = kernel)\n"," clf.fit(X, y)\n","\n"," return clf"]},{"cell_type":"markdown","id":"e20760be","metadata":{"id":"e20760be"},"source":["Training on the train data"]},{"cell_type":"code","execution_count":null,"id":"095187e1","metadata":{"id":"095187e1"},"outputs":[],"source":["iris_clf = train_SVM(train_x, train_y)"]},{"cell_type":"markdown","id":"eeb041a3","metadata":{"id":"eeb041a3"},"source":["### Making Predictions"]},{"cell_type":"markdown","id":"a3271c98","metadata":{"id":"a3271c98"},"source":["Making predictions on the train and test data"]},{"cell_type":"code","execution_count":null,"id":"278f027d","metadata":{"id":"278f027d"},"outputs":[],"source":["predicted_train_y = iris_clf.predict(train_x)\n","predicted_test_y = iris_clf.predict(test_x)"]},{"cell_type":"markdown","id":"a1348319","metadata":{"id":"a1348319"},"source":["## Part 3 - Model Assessment"]},{"cell_type":"markdown","id":"f1307229","metadata":{"id":"f1307229"},"source":["### Classification report on the predictions generated from training 
data"]},{"cell_type":"code","execution_count":null,"id":"b67705ae","metadata":{"id":"b67705ae","outputId":"88cdd795-e1bb-4ca4-f255-515700053323"},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 50\n"," 1 1.00 0.98 0.99 50\n"," 2 0.95 1.00 0.98 20\n","\n"," accuracy 0.99 120\n"," macro avg 0.98 0.99 0.99 120\n","weighted avg 0.99 0.99 0.99 120\n","\n"]}],"source":["print(classification_report(train_y, predicted_train_y))"]},{"cell_type":"markdown","id":"52484698","metadata":{"id":"52484698"},"source":["### Confusion Matrix on the predictions generated from training data"]},{"cell_type":"code","execution_count":null,"id":"e649dace","metadata":{"id":"e649dace","outputId":"3ff50e90-e803-4f7f-a71b-9a0067d6a674"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[50 0 0]\n"," [ 0 49 1]\n"," [ 0 0 20]]\n"]}],"source":["print(confusion_matrix(train_y, predicted_train_y, sample_weight=None))"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.5"},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /XX_section/D8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D8.pdf --------------------------------------------------------------------------------