├── .gitignore
├── 01_intro
│   ├── 01_01_intro.pdf
│   └── 01_02_version_control.pdf
├── 02_data
│   ├── 02_03_data.pdf
│   ├── 02_04_pandas.ipynb
│   ├── data
│   │   └── my_data.csv
│   └── img
│       ├── github.png
│       ├── join.png
│       └── pandas.png
├── 03_ethics
│   ├── 03_05_ethics.pdf
│   ├── 03_06_datavis.ipynb
│   └── QRCodes
│       ├── 03_06_01_viz.png
│       ├── 03_06_02_viz.png
│       ├── 03_06_03_viz.png
│       └── 03_06_04_viz.png
├── 04_analysis
│   └── 04_07_questions.pdf
├── 05_eda
│   ├── 05_08_EDA.ipynb
│   ├── 05_09_inference.pdf
│   └── data
│       └── woc_wi25.csv
├── 06_inference
│   ├── 06_10_inference.ipynb
│   └── 06_11_nonparametric.pdf
├── 07_text
│   ├── 07_12_text.pdf
│   └── 07_13_nlp.ipynb
├── 08_ml
│   ├── 08_14_machine_learning.pdf
│   └── 08_15_ml.ipynb
├── 09_geospatial
│   └── 09_16_geospatial.pdf
├── 10_communication
│   ├── 10_17_communication.pdf
│   ├── 10_18_be_wrong.pdf
│   └── 10_19_jobs_future.pdf
├── LICENSE
├── README.md
└── XX_section
    ├── D1.pdf
    ├── D2.pdf
    ├── D3.pdf
    ├── D4.pdf
    ├── D5.pdf
    ├── D6.pdf
    ├── D7.pdf
    ├── D7_notebook.ipynb
    └── D8.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | QRCodes/*
--------------------------------------------------------------------------------
/01_intro/01_01_intro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/01_intro/01_01_intro.pdf
--------------------------------------------------------------------------------
/01_intro/01_02_version_control.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/01_intro/01_02_version_control.pdf
--------------------------------------------------------------------------------
/02_data/02_03_data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/02_03_data.pdf
--------------------------------------------------------------------------------
/02_data/data/my_data.csv:
--------------------------------------------------------------------------------
1 | id,first_name,last_name,age,score,value
2 | 295,Andrea,Clark,46,-1,24547.87
3 | 620,Bill,Woods,46,492,46713.9
4 | 891,Alexander,Jacobson,48,489,32071.74
5 | 914,Derrick,Bradley,52,-1,30650.48
6 | 1736,Allison,Thomas,44,-1,9553.12
7 | 2049,Stephen,Williams,57,333,138936.92
8 | 2241,Malik,Wood,46,-1,10804.47
9 | 2607,Amber,Garcia,50,536,9367.27
10 | 2635,David,Coleman,68,351,66035.28
11 | 3585,Eric,Atkins,56,582,103977.32
12 | 4199,Justin,Johnson,59,500,34938.08
13 | 6739,Donna,Barnes,48,500,130915.2
14 | 7099,Larry,Prince,52,519,28474.33
15 | 7264,Megan,Mcmahon,45,349,0.0
16 | 7799,Sarah,Jones,47,497,29356.49
17 | 8343,Brian,Weber,49,477,17976.51
18 | 9386,Jackie,Clark,44,432,15446.53
19 | 10753,Laurie,Wood,58,335,
20 | 12243,Monica,Sanchez,49,524,35256.88
21 | 12250,Erica,Adams,41,-1,2126.22
22 | 12841,James,Williams,38,496,2298.05
23 | 12913,William,Blevins,39,249,4287.85
24 | 13120,Daniel,Key,64,165,17444.32
25 | 13255,Kelsey,Palmer,32,519,18858.85
26 | 13806,Ashley,Jones,57,-1,80325.66
27 | 14033,Larry,Ibarra,34,632,4315.74
28 | 14294,Michelle,Walters,58,472,33163.31
29 | 15137,Andrea,Simpson,55,-1,12155.47
30 | 15391,Charles,Santiago,44,266,0.0
31 | 15887,Cassandra,Mann,27,684,17864.92
32 | 16263,Victor,Dawson,42,323,
33 | 17184,Anne,Zuniga,56,549,81928.82
34 | 17345,Kristy,Fletcher,53,492,23577.41
35 | 18031,Michael,Watson,39,378,16165.32
36 | 18610,Jeffrey,Harrell,52,790,59912.55
37 | 19129,Brian,Travis,46,-1,49955.06
38 | 19557,Sabrina,Simon,44,-1,14022.29
39 | 20708,Dylan,Blake,33,750,6984.92
40 | 21427,Benjamin,Tran,34,543,9099.38
41 | 22584,Kenneth,Johnson,65,-1,50174.14
42 | 23915,Mary,Harris,52,266,106602.81
43 | 24650,Stephanie,Hayes,55,-1,15625.5
44 | 24779,Scott,Reyes,54,-1,65163.92
45 | 24794,Jasmine,Mitchell,53,552,86540.27
46 | 24952,James,Wright,51,399,
47 | 25259,Jonathan,Martinez,41,305,3659.99
48 | 25654,Ronald,Perkins,37,533,20991.69
49 | 25735,Anna,Ray,39,578,19122.05
50 | 26091,Marie,Wyatt,44,620,
51 | 26144,Cameron,Walters,44,416,6641.85
52 | 26898,Tina,Riddle,38,310,19281.5
53 | 27225,Dennis,Mason,62,622,105449.11
54 | 27687,Alice,Murphy,47,355,9317.1
55 | 29566,James,Kennedy,69,224,
56 | 29868,Jason,Bentley,69,343,60605.08
57 | 30323,Martin,Obrien,49,497,13049.86
58 | 30524,Kara,Mccoy,47,243,12559.49
59 | 31457,John,Shields,51,-1,25184.43
60 | 32151,Charles,Lam,47,942,35391.52
61 | 32413,Sarah,Harris,53,609,16980.92
62 | 33105,Kevin,Sanchez,31,251,1688.31
63 | 33863,Brian,Murphy,45,-1,20802.53
64 | 34069,Cynthia,West,29,558,1952.0
65 | 35466,Samantha,Park,55,483,
66 | 35621,Vanessa,Hernandez,35,613,28944.29
67 | 35701,Andrew,Nelson,37,406,8145.81
68 | 35870,Sabrina,Mcneil,41,649,32979.4
69 | 36124,Alison,Sullivan,40,362,6123.06
70 | 36575,Elizabeth,Bailey,53,642,16579.52
71 | 37045,Mary,Choi,42,682,39519.13
72 | 37836,Kristin,King,34,-1,24409.19
73 | 37970,Lisa,Myers,50,688,11203.32
74 | 38892,Kimberly,Jefferson,60,-1,28155.47
75 | 39773,Candace,Johnson,44,429,12802.62
76 | 40154,Marie,Daniels,41,512,14380.37
77 | 41347,Anthony,Tucker,52,508,21005.42
78 | 42138,John,Stuart,28,543,
79 | 42140,Aaron,Hart,37,-1,16873.92
80 | 42472,Jacqueline,Young,35,618,42726.56
81 | 42747,Johnathan,Brown,46,567,15226.86
82 | 43140,Nancy,Farley,52,545,30315.75
83 | 44627,Eric,Nelson,31,416,0.0
84 | 45647,Melissa,Bailey,46,-1,4104.35
85 | 47234,Kimberly,Richard,53,465,71801.97
86 | 47459,Todd,Davis,44,708,8012.19
87 | 47669,Michael,Cowan,61,263,16167.9
88 | 48880,Kristen,Dalton,40,555,18512.25
89 | 49196,Elizabeth,Gordon,45,642,12377.85
90 | 49712,David,Richardson,52,400,24932.04
91 | 50960,Maxwell,Mcbride,63,347,105598.79
92 | 51723,Megan,Nguyen,40,443,
93 | 51784,Elizabeth,Stephens,55,149,12632.32
94 | 51909,Michael,Brennan,45,-1,12838.06
95 | 52214,Amy,Scott,36,664,11623.7
96 | 52428,Bryan,Barnett,64,445,59795.5
97 | 53785,Jonathan,Hill,51,344,18568.89
98 | 53812,Melissa,Walton,51,-1,0.0
99 | 53929,James,Miller,43,419,18051.92
100 | 53932,Nicole,Ruiz,36,730,23236.53
101 | 54438,Charles,Schneider,47,553,50773.76
102 | 54849,Sara,Mendoza,42,124,10769.18
103 | 54949,Christine,Myers,45,-1,15230.16
104 | 55691,Katherine,Stafford,54,457,169406.7
105 | 55821,Sarah,Kennedy,41,709,
106 | 56933,Mark,Fletcher,57,253,45151.13
107 | 57077,Jason,Price,32,-1,24557.68
108 | 57542,Ryan,Lee,43,570,5869.58
109 | 57964,Heidi,Allen,57,496,42284.5
110 | 58307,Shannon,Bailey,53,596,22575.35
111 | 58794,Jared,Brown,61,731,51968.96
112 | 59036,Peter,Molina,51,453,15810.33
113 | 59363,Steven,Valentine,46,265,6614.88
114 | 60177,Christopher,Jones,28,748,25035.69
115 | 61015,Nicholas,Chapman,52,289,47600.67
116 | 61534,Alexandra,Chavez,39,598,
117 | 61557,Kathryn,Boyle,48,369,33674.81
118 | 61721,Jason,Murphy,57,163,6766.66
119 | 61808,Krista,Smith,56,469,51671.27
120 | 63587,Paul,Steele,54,616,69185.93
121 | 64423,Dennis,Hernandez,58,329,11611.97
122 | 64535,William,Irwin,60,304,24581.63
123 | 64654,Pamela,Wilson,52,612,65061.89
124 | 64731,Sheila,Zimmerman,32,-1,9726.45
125 | 65382,Laura,Burke,35,613,10589.0
126 | 66968,Cynthia,Davis,43,424,15326.1
127 | 67135,Madeline,Rivera,68,343,30993.97
128 | 68932,Nicholas,Chan,46,299,17580.28
129 | 68961,David,Simmons,34,702,35725.49
130 | 69588,Valerie,Griffin,48,668,10678.15
131 | 70182,Monica,Phillips,42,716,17987.82
132 | 70600,Lindsey,Young,45,630,13339.26
133 | 70721,Jennifer,Perkins,60,338,115356.55
134 | 70735,Eric,Olson,27,665,3975.59
135 | 71120,Jonathan,Blevins,39,423,13456.24
136 | 71787,Maria,West,29,453,7271.65
137 | 71943,Catherine,Sherman,45,416,11873.05
138 | 72035,Mike,Evans,53,288,24561.64
139 | 72631,Scott,Johnson,47,517,43254.45
140 | 74271,Mike,Fisher,44,629,15712.7
141 | 75015,Kelly,Murray,43,542,0.0
142 | 75593,Regina,Morgan,47,614,95351.02
143 | 76282,Ashley,Lynch,53,519,14612.16
144 | 76613,Craig,Lewis,31,340,14766.8
145 | 78041,Heather,Ibarra,52,494,79594.31
146 | 79083,Michael,Simmons,54,490,0.0
147 | 79114,Steven,Kent,32,574,10087.18
148 | 80217,Kellie,Ryan,60,183,5060.72
149 | 80411,Tina,Yu,56,525,29622.86
150 | 80593,Donald,Melton,42,625,20393.78
151 | 80765,David,Andrews,53,545,0.0
152 | 81068,Aaron,Roberts,32,617,4745.92
153 | 81282,Emily,Medina,23,775,9116.03
154 | 81648,Debbie,Barrett,49,766,18573.58
155 | 82183,Tamara,Jenkins,46,-1,23914.62
156 | 82826,Allison,Day,34,523,23130.71
157 | 83528,Brenda,Green,28,794,5233.1
158 | 83590,Alexander,Murphy,60,377,8930.89
159 | 84203,Warren,Wilson,34,-1,17172.97
160 | 84475,Brenda,Cox,64,-1,91349.57
161 | 84683,Jessica,Bryant,42,840,204999.96
162 | 84812,Amanda,Williams,14,749,5241.51
163 | 84838,Renee,Lyons,39,399,0.0
164 | 84846,Eric,Smith,39,714,15188.88
165 | 84965,Robert,Black,59,-1,13873.11
166 | 85812,Mark,Payne,51,563,32609.97
167 | 86230,Jose,Adams,41,551,7416.68
168 | 86560,John,Blevins,50,534,29842.6
169 | 86711,Ashley,Alexander,54,459,11137.09
170 | 87122,Douglas,Hogan,49,-1,21598.29
171 | 87738,Gregory,Sutton,46,462,42052.3
172 | 87876,Austin,Dixon,44,303,5787.5
173 | 87928,Alexandra,Miller,36,752,12297.85
174 | 88273,David,Matthews,49,309,7018.32
175 | 88340,Autumn,Brooks,52,587,36261.02
176 | 88868,Ralph,Wilkinson,51,496,12232.08
177 | 89550,Daniel,Sharp,43,448,8150.64
178 | 89765,Charles,Thompson,46,678,8285.88
179 | 89922,Robert,Woods,64,282,161540.81
180 | 90113,Karen,Morgan,37,371,21187.93
181 | 90367,Kevin,Stewart,48,700,0.0
182 | 91524,Jamie,Gardner,50,291,21141.69
183 | 91623,Amanda,Webb,54,339,62692.36
184 | 91893,Kelsey,Martin,52,-1,40206.46
185 | 91921,Javier,Brooks,38,658,15437.74
186 | 91946,Susan,Garcia,34,664,7084.82
187 | 92298,Jennifer,Marks,39,-1,36443.84
188 | 93114,Kerri,Fields,44,478,33086.07
189 | 93989,David,Adkins,48,460,2434.35
190 | 94421,Tracy,Reyes,57,284,11442.11
191 | 94628,Shannon,Andrews,45,426,9593.03
192 | 94730,Lisa,Dominguez,22,-1,
193 | 95502,Julia,Oliver,57,354,23522.87
194 | 96101,Wayne,Bentley,28,918,18303.2
195 | 96293,Michael,Phillips,32,-1,9159.9
196 | 96371,Sherri,Austin,46,698,29412.01
197 | 97441,Krista,Ortiz,34,-1,24074.79
198 | 97728,Anna,Chambers,37,598,0.0
199 | 98115,Jennifer,Pitts,29,606,6876.75
200 | 98284,Brittany,Jenkins,34,665,43525.88
201 | 98366,Katelyn,Brown,45,501,29668.38
202 |
--------------------------------------------------------------------------------
/02_data/img/github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/github.png
--------------------------------------------------------------------------------
/02_data/img/join.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/join.png
--------------------------------------------------------------------------------
/02_data/img/pandas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/02_data/img/pandas.png
--------------------------------------------------------------------------------
/03_ethics/03_05_ethics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/03_05_ethics.pdf
--------------------------------------------------------------------------------
/03_ethics/QRCodes/03_06_01_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_01_viz.png
--------------------------------------------------------------------------------
/03_ethics/QRCodes/03_06_02_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_02_viz.png
--------------------------------------------------------------------------------
/03_ethics/QRCodes/03_06_03_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_03_viz.png
--------------------------------------------------------------------------------
/03_ethics/QRCodes/03_06_04_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/03_ethics/QRCodes/03_06_04_viz.png
--------------------------------------------------------------------------------
/04_analysis/04_07_questions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/04_analysis/04_07_questions.pdf
--------------------------------------------------------------------------------
/05_eda/05_09_inference.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/05_eda/05_09_inference.pdf
--------------------------------------------------------------------------------
/05_eda/data/woc_wi25.csv:
--------------------------------------------------------------------------------
1 | How fast does human hair grow (cm/yr)?,"If every living person stood crammed together side-by-side, how large of an area would they occupy (km²)?","How many days would it take to walk from San Diego to New York City (assuming no stopping to fix shoes, apply sunscreen, or for sleeping, eating, or other biological needs)?"
2 | 10,40000,300
3 | 12,10000,40
4 | 20,5,20
5 | 3,2,50
6 | 45,200,45
7 | 300/1,2,45
8 | 10,10000,60
9 | 50,100,1000
10 | 30,10000,300
11 | 14,1000,15
12 | 50,100000,500
13 | 15,"1,000,000",17
14 | 5cm per year,100000,100 days
15 | 17.5,"100,000,000",200
16 | 10,1000000,4000 days
17 | 20,15,14
18 | 9 inches,10000,45
19 | 10 cm/year,1 billion,160 days
20 | 60 seconds,3200,1600 miles
21 | 7,500,60
22 | 8,10000,365
23 | 7,"100,000",30
24 | 10,1000000,80
25 | 10,10000,45
26 | 5cm/yr,1000000,100
27 | 20,1000,200
28 | 10,10000,40
29 | 10,"100,000","1,000,000"
30 | 10000,700000,8
31 | 150cm/yr,300km^2,80
32 | 35,9,100
33 | 5,5,90
34 | 100,800000000,730
35 | 8,100000,1000
36 | 40,7000000,50
37 | 15,100000,30
38 | 48,10^5,60
39 | 25,10000,78
40 | 10,10000,100
41 | 6,800,30
42 | 14 cm/yr,10000,56
43 | 20,3.5 billion,90 days
44 | 50,3000,14 days
45 | 36,8000000,5
46 | 10,3,60
47 | 24,5000000,45
48 | 10,1000000000,150
49 | 10cm/year,10^9,100
50 | 100,80km^2,3
51 | 12 cm,25000 km2,3 months
52 | 10,100000000000,50
53 | 30 cm/yr,10000km^2,25 days
54 | 15,10000,80
55 | 7,2000000000,80
56 | 10,100000,479
57 | 7cm/yr,10^10,100
58 | 10,100000,1000
59 | 25,1000,102
60 | 60,3,100
61 | 6,50000,10
62 | 10,10000,100
63 | 12,1000,500
64 | 15cm,1000000,50
65 | 48cm/yr,100000,6 months
66 | 13,7,15
67 | 13,2.5,25
68 | 12,1000000,4
69 | 12,5,125
70 | 15 cm / year,10000000,200
71 | 15,10^5,8
72 | 12cm,1000,90days
73 | 80,2,100
74 | 8,100000000,450
75 | 10,25000000,500
76 | 6,1000,1000
77 | 20,10000,20
78 | 20,500000,31
79 | 20,10000,150
80 | 5,1000000,20
81 | 26cm/yr,200000,95
82 | 11cm,10000,50
83 | 24,10000000,60
84 | 20,1000000,350
85 | 5,500000000,50
86 | 20,100,200
87 | 20cm/yr,10000000,2months
88 | 10,10000,50
89 | 24,10000,100
90 | 15,10000,30
91 | 15,10000000000,30
92 | 15,2000000000,30
93 | 20,200000,15
94 | 84cm/yr,1000km2,100 days
95 | 2,2000000,7
96 | 6,1000,75
97 | 50,5000,1 month
98 | (48/yr),10000000,50
99 | 12,100000,18
100 | 4cm,5,6
101 | 10^2,10000000,100
102 | 4,50,12
103 | 20 cm/yr,"8,000",10
104 | 10cm/yr,"10,000km^2",20
105 | 12,1000,20
106 | 72cm,500,100 days
107 | 30,100000,30
108 | 20,1000000,300
109 | 10,1000000,1 month
110 | 25,600000,25
111 | 20 cm/yr,1000000,54
112 | 100,1000,50
113 | 25cm/yr,10^50 km^2,10000000
114 | 18,1000,300
115 | 24,4,80
116 | 60,40,20
117 | 20,5000,30
118 | 3,"4,000,000 km2",100 days
119 | 15,1,1
120 | 10,5000000,20
121 | 30,500,400
122 | 24,32 million,100
123 | 8cm/yr,10000000,30
124 | 10,100000,10000
125 | 12,1000000000,8
126 | 18 cm,4 million,100 days
127 | 100,10,150
128 | 35/1,"140,000,000",100000
129 | 55,300000000,100
130 | 80,100000,50000
131 | 5 inches,200,180
132 | 7,89000,300
133 | 20,"100,000",50 days
134 | 4.5,1000000,365
135 | 50,200,300
136 | 9,5,1000
137 | 16,400000,15
138 | 5,100000,250
139 | 6.5,1000,100
140 | 15,4000,100
141 | 15 cm,100000,90
142 | 40cm/yr,900,3
143 | 25,10^12,130
144 | 185 cm/yr,"1,000,000 km²",25
145 | 24,1000,80
146 | 40,1000,900
147 | 60 cm/yr,20000 km^2,1000
148 | 7,5000000,90
149 | 5,5,H
150 | 30,10000,30
151 | 8 cm/yr,"100,000 km^2",500
152 | 10cm/yr,1000000,30
153 | 2,30000000000,20
154 | 15,100000,800
155 | 40,10000,100
156 | 39,50000,50
157 | 100,5000000,100
158 | 9,10000,200
159 | 48 cm/yr,10^4,30
160 | 10 cm,100000,50
161 | 18 cm,800000000,80
162 | 15cm/yr,100000 km^2,500 days
163 | 60cm/yr,?,30
164 | 15,10^10,1000
165 | 24,40,38
166 | 22,5,3
167 | 25,"100,000,000",ed
168 | 5 cm,3,5
169 | 20 cm,1000000,20 days
170 | 15cm/yr,They could fit in paris,200 days
171 | 5,5,5
172 | 13,500000,26
173 | Around 7-8 cm,2,100
174 | 10,100,10
175 | 50,100,50
176 | 80,50,13
177 | 10,2000,1 week
178 | 30,200000,365
179 | 15,1000000,180
180 | 12cm,"6,000",70
181 | 25,10000000,1000
182 | 0,700000,1000
183 | 24,240,80
184 | 20,100,50
185 | 2,1000000,365
186 | 36,40,583837
187 | 10,1000,1 month
188 | 14,2000,250
189 | 9,100000,24
190 | 50,100000,100
191 | 24,1000000,200
192 | 10,10^8,100
193 | 9,60000,200
194 | 6,17,168
195 | 50,60,1000
196 | 36,7000000,48 days
197 | 3,9999,365
198 | 20,1000,10000
199 | 27,1000,30 days
200 | 15,10000000000,10
201 | 30,10^8,600?
202 | 20,50000,200
203 | 8cm a year,500,10
204 | 25,10000,5
205 | 2 inches,"250,000km^2",18 days
206 | cm,100000,10000
207 | 360,20,40
208 | 5,1000000,100
209 | 12,100000,90
210 | 6cm /yr,1E+38,10000
211 | 15,1000,40
212 | 7,50,3000
213 | 20,100,70
214 | 10,100,60
215 | 15,1150,980
216 | 50,80000,40
217 | 40,1 million,3 months
218 | 5,1000000000,400
219 | 100,10000000,30
220 | 20,10^9,48
221 | 20cm a year,100,200 days
222 | 122,10000,732
223 | 24cm/year,"1,000,000 km^2",60 days
224 | 25,20000,50
225 | 20,3000000,365
226 | 10,100,1 year
227 | 30,20,90
228 | 3000,100000,23
229 | 10,10000,60
230 | 25 cm,93682737856,1000
231 | 5,100,50
232 | 7,37103,498
233 | 3,10000,15
234 | 20 cm,1000,30 days
235 | 10000/yr,1000000,2 months
236 | 10,a lot,2 months
237 | 5 cm/yr,20,3 weeks
238 | 36,100000,10 days
239 | 10 cm/ yr,10000000,500
240 | 30,10000,1000
241 | 20,8000000000,162
242 | 12,10000,25
243 | 20,100000000000,80
244 | 10,100000,50
245 | 18,8000,30
246 | 20,100000,20
247 | 10,1E+19,50000
248 | 10,10000,50
249 | 30,1000,80
250 | 30,10^8,30
251 | 45,1000,10
252 | 25,10000000,28
253 | 4,100000,14
254 | 20,1000000,20
255 | 30,1600,30000 mins
256 | 5,10x 10^3,12 days
257 | 50,1000000,20
258 | 20,1 million,42
259 | 15,1000,1000
260 | 10,10000,8
261 | 48,200,300
262 | 28,10000000,700
263 | 100cm/year,100000000000000km^2,1000days
264 | 150,1000000000,45
265 | 20,800000,29
266 | "exactly 456,000",Whatever the size of Los Angels is ,about 1/3 of a year
267 | 5cm/year,8000km^2,300 days
268 | 30,300000,90
269 | 15,1000,1000
270 | 100,1000,100
271 | 25,10,28
272 | 5,100000,2 years
273 | 2.5,10^12,40 days
274 | 15,500,80
275 | 8,100000,28
276 | 30,10000,31
277 | 1000,10000000,100
278 | 0.4,978,71
279 | 30,10000,100
280 | 20,100000000000,80
281 | 20cm/yr,9 x 10^3,30 days
282 | 18,1000,15
283 | 300cm / yr,250km,14 days
284 | 0.01 cm,100,30 days
285 | 25,10^23,60
286 | 20,80000,80
287 | 100,10000,1000
288 | 16,150,20 days
289 | 7,200,50
290 | 15,20000 km^2,34
291 | 16,"60,000",37
292 | 10,1000,40
293 | 100,1000,100
294 | 100,10000,120
295 | 25,1400,30
296 | 36,10000,30
297 | 20,1000,80
298 | 8,100000000000,8
299 | 100cm/yr,100000km^2,239 days
300 | 1.5,5000000000,150 days
301 | 100,2,1000
302 | 12,100000,1000
303 | ~70cm/yr,10000,150
304 | 15,100000,20
305 | 3018,501,250
306 | 12,100000000,12 * 3000
307 | 10,100,150
308 | 500,10000,25
309 | 20,100000000000,15
310 | 2.5,1000,37
311 | 10,80000,50
312 | 25,8million,6 months
313 | 10cm,10000000,10000
314 | 20,100000,36500
315 | 10 cm/year,50,21
316 | 20,10000,100
317 | 14,9000,30 days
318 | 36 cm,100000000,1000
319 | 50,20,298 days
320 | 7,10^10,10^7
321 | 10,1000,60
322 | 50,100,3 months
323 | 10,10km^2,60
324 | 20,1000000,90
325 | 25,10000,1000000
326 | 15,100000,50000
327 | 15cm/yr,3 billion,150 days
328 | 10,10000000,50
329 | 15cm,100000,50
330 | 20,15,19
331 | 20,450000,85
332 | 10,1000,1 year
333 | 30,1000000,100
334 | 90cm/yr,100,100
335 | 6,8000,800
336 | 12,1000,75
337 | 80,1000000,7
338 | 25,10000,35
339 | 4,78000,"1,500"
340 | 10cm/year,one million km^2,700 days
341 | 100,10^15,28
342 | 300,10000,30
343 | 15,1000,90
344 | 10/yr,10000,1000
345 | 10 cm/yr,"1,000,000 km^2",5 months = 155 days
346 | 12,100000,50
347 | 10,10000000000,50
348 | 10,100000000,300
349 | 20cm,100,150
350 | 10cm/year,100,50
351 | 20,200000,200
352 | 12 cm/yr,10000,1000
353 | 100,1000^2,400
354 | 10,"100,000",100
355 | 6,7,30
356 | 25,10000,10000
357 | 16,"100,000",43
358 | 20,10,2000
359 | 10,10,50
360 | 36,10000,50
361 | 24,10000000,11
362 | 30,50000000000,200
363 | 15,5x10^4,week
364 | 12cm/yr,2000km^2,35 days
365 | 10,1000000,10
366 | 30,1000000,365
367 | 210cm/yr,6000000km^2,53
368 | 50,10000,1000
369 | 2,100,100
370 | 60,10000,20 days
371 | 10,700,300
372 | 20,600,600
373 | 1,200000000,30
374 | 20cm/yr,1000km^2,500
375 | 7,100000,40
376 | 24,1250000,42
377 | 15cm/yr,"2,500,000,200",60 days
378 | 100,7000000,39
379 | 15,10000,100
380 | 16,100,100
381 | 1000,1000000,1000
382 | 50,100000,30
383 | 1 cm per month,2 mile diameter sphere,"Approx 5,500 miles? 400 days?"
384 | 15,20000000,45
385 | 15,10,30
386 | 70,1600,90
387 | 16,25,150
388 | 100,4000,10
389 | 17,8000,"1,000"
390 | 1000,100000,10
391 | 30,100,50
392 | 15,1000,20
393 | 10cm/yr,10*10^100=1×10¹⁰¹,100
394 | 15,1000,60
395 | 7 cm/year,>10000000,24
396 | 7 cm/yr,1000000,600
397 | 12,1E+20,41
398 | 15,280000000,430
399 | 40,60,15
400 | 30,7.8,720
401 | 15,1000000,30
402 | 12 cm/yr,100,30
403 | 1800 cm/year,The would occupy one trillion km 2,75 days
404 | 10,20000,25
405 | 65cm/yr,"88,560km",32 days
406 | 12cm/yr,0.3*0.3*80 billion =7.2 * 10^3km^3,30
407 | 20cm/yr,10000km*2,30
408 | 7 cm/yr,3 mil km,180 days
409 | 25cm a year,"25,000,000km^2",1000 days
410 | 15 cm,800,38 days
411 | 15,1000000,74
412 | 10,5000,150
413 | 10,10,1000
414 | 10.5 cm/yr,10000 km^2,50 days
415 | 6cm,1000,45
416 | 5,10^3,350
417 | 15,400000,47
418 | 8,100,25
419 | 13,5000,45 days
420 | 8,"20,000",75
421 | 20,100,20
422 | 30,1000,100
423 | 24,4000000,30
424 | 17,2000,100
425 | 5 cm/yr,100,60
426 | 15,1500,10
427 | 20 cm/yr,10000,41 days
428 | 1.5 per year,100000 km2,500 days
429 | 5cm a month,80000,187
430 | 5 centimeter per month,100 yards,100
431 | 12,550,100
432 | 12,8000,60
433 | 5,8000000000,60
434 | 15,8,40
435 | 21,16000000,300
436 | 10,1 million,300
437 | 30,250000000,90
438 | 30,300,21
439 | 18,2000,150 days
440 | 10,50,60
441 | 10 cm/yr,"about 3,600,000,000 km",45
442 | 12,100000,10000
--------------------------------------------------------------------------------
/06_inference/06_11_nonparametric.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/06_inference/06_11_nonparametric.pdf
--------------------------------------------------------------------------------
/07_text/07_12_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/07_text/07_12_text.pdf
--------------------------------------------------------------------------------
/07_text/07_13_nlp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "### Course Announcements\n",
12 | "\n",
13 | "**Due this Sunday** (11:59 PM): D6, Q7, Checkpoint #1, Weekly Project Survey (*optional*)\n",
14 | "\n",
15 | "Notes: \n",
16 | "- No Prof OH on Friday\n",
17 | "- For Data Checkpoint:\n",
18 | " - All changes to proposal sections go in the *data checkpoint notebook*\n",
19 | " - Respond to grader on Proposal Issue\n",
20 | "- To follow along (and get lecture attendance credit): https://forms.gle/sZC9kebUm64pts9ZA (will remain open until Sunday 11:59 PM)\n",
21 | "\n",
22 | ""
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {
28 | "slideshow": {
29 | "slide_type": "slide"
30 | }
31 | },
32 | "source": [
33 | "# Text Analysis (NLP: Natural Language Processing)\n",
34 | "\n",
35 | "- **Sentiment Analysis**\n",
36 | " - tokenization\n",
37 | " - stop words\n",
38 | " - stemming\n",
39 | "- **TF-IDF**\n",
40 | " - Bag of Words\n",
41 | " - term frequency\n",
42 | " - inverse document frequency\n",
43 | "- Tools: `nltk`"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# if you've never installed nltk before you will have to run this cell\n",
53 | "# but you only need to do this once\n",
54 | "# %pip install nltk"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "slideshow": {
62 | "slide_type": "slide"
63 | }
64 | },
65 | "outputs": [],
66 | "source": [
67 | "# pandas and matplotlib setup\n",
68 | "import pandas as pd\n",
69 | "\n",
70 | "import matplotlib.pyplot as plt\n",
71 | "plt.rcParams['figure.figsize'] = (17, 7)\n",
72 | "plt.rcParams.update({'font.size': 14})\n",
73 | "import seaborn as sns\n",
74 | "\n",
75 | "#improve resolution\n",
76 | "#comment this line if erroring on your machine/screen\n",
77 | "%config InlineBackend.figure_format ='retina'\n",
78 | "\n",
79 | "import warnings\n",
80 | "warnings.filterwarnings('ignore')\n",
81 | "\n",
82 | "#import natural language toolkit\n",
83 | "import nltk\n",
84 | "\n",
85 | "# download stopwords & punkt\n",
86 | "nltk.download('stopwords')\n",
87 | "nltk.download('punkt')"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {
93 | "slideshow": {
94 | "slide_type": "slide"
95 | }
96 | },
97 | "source": [
98 | "#### Reminder: **Natural Language Processing** is a whole field of study.\n",
99 | "\n",
100 | "Like most topics in this course, there are many courses solely focused on the appropriate analysis of text. We'll cover the general concepts in this course, but know you're missing lots of important details."
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {
106 | "slideshow": {
107 | "slide_type": "slide"
108 | }
109 | },
110 | "source": [
111 | "## Natural Language Toolkit (`nltk`)\n",
112 | "\n",
113 | "For more details on using the functionality within this package, check out the [NLTK Book](http://www.nltk.org/book/).\n",
114 | "\n",
115 | "0. Preface\n",
116 | "1. Language Processing and Python\n",
117 | "2. Accessing Text Corpora and Lexical Resources\n",
118 | "3. Processing Raw Text\n",
119 | "4. Writing Structured Programs\n",
120 | "5. Categorizing and Tagging Words \n",
121 | "6. Learning to Classify Text\n",
122 | "7. Extracting Information from Text\n",
123 | "8. Analyzing Sentence Structure\n",
124 | "9. Building Feature Based Grammars\n",
125 | "10. Analyzing the Meaning of Sentences \n",
126 | "11. Managing Linguistic Data\n",
127 | "12. Afterword: Facing the Language Challenge"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {
133 | "slideshow": {
134 | "slide_type": "fragment"
135 | }
136 | },
137 | "source": [
138 | "[VADER](https://github.com/cjhutto/vaderSentiment) is a particularly helpful tool/lexicon when working with sentiments expressed in social media (tweets, online reviews, etc.)\n",
139 | "\n",
140 | "Its functionality is available through `nltk`, so we'll download the vader lexicon for use later in this notebook."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "slideshow": {
148 | "slide_type": "fragment"
149 | }
150 | },
151 | "outputs": [],
152 | "source": [
153 | "# get lexicon we'll be working with today\n",
154 | "nltk.download('vader_lexicon') "
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {
160 | "slideshow": {
161 | "slide_type": "slide"
162 | }
163 | },
164 | "source": [
165 | "## The Data"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "slideshow": {
173 | "slide_type": "-"
174 | }
175 | },
176 | "outputs": [],
177 | "source": [
178 | "quarters = ['Wi25', 'Wi24', 'Wi21', 'Fa20', 'Wi20', 'Sp20', 'Sp19']"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {
185 | "slideshow": {
186 | "slide_type": "-"
187 | }
188 | },
189 | "outputs": [],
190 | "source": [
191 | "def read_data(quarter):\n",
192 | " '''read data in from specified quarter, extract columns of interest \n",
193 | " and add a column indicating quarter from which data originated'''\n",
194 | " \n",
195 | " df = pd.read_csv('https://raw.githubusercontent.com/shanellis/datasets/master/COGS108_feedback_' + quarter + '.csv')\n",
196 | " df = df[['enjoyed_most', 'enjoyed_least']]\n",
197 | " df['quarter'] = quarter\n",
198 | " \n",
199 | " return df"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {
206 | "scrolled": true,
207 | "slideshow": {
208 | "slide_type": "fragment"
209 | }
210 | },
211 | "outputs": [],
212 | "source": [
213 | "# read in data for all quarters\n",
214 | "df = pd.DataFrame()\n",
215 | "\n",
216 | "for quarter in quarters:\n",
217 | " qtr = read_data(quarter)\n",
218 | " df = pd.concat([df, qtr], ignore_index=True)\n",
219 | " \n",
220 | "df"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {
226 | "slideshow": {
227 | "slide_type": "slide"
228 | }
229 | },
230 | "source": [
231 | "## Describe & Explore\n",
232 | "\n",
233 | "We'll quickly describe and explore the data to see what information we have before moving on to Text Analysis."
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {
239 | "slideshow": {
240 | "slide_type": "fragment"
241 | }
242 | },
243 | "source": [
244 | "### Data Considerations\n",
245 | "\n",
246 | "- duplicate responses?\n",
247 | "- PIDs for individuals in the class (typos?)\n",
248 | "- missingness?\n",
249 | "- reflect reality?"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "slideshow": {
257 | "slide_type": "fragment"
258 | }
259 | },
260 | "outputs": [],
261 | "source": [
262 | "# how many from each quarter?\n",
263 | "df.value_counts('quarter')"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "slideshow": {
270 | "slide_type": "fragment"
271 | }
272 | },
273 | "source": [
274 | "Note: Response Rates\n",
275 | "- Spring 2019: 384/826 (46%)\n",
276 | "- Winter 2020: 295/444 (66%)\n",
277 | "- Spring 2020: 397/475 (84%)\n",
278 | "- Fall 2020: 321/447 (72%)\n",
279 | "- Winter 2021: 314/438 (72%)\n",
280 | "- Winter 2024: 584/701 (83%)\n",
281 | "- **Winter 2025: 690/817 (84%)**"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {
287 | "slideshow": {
288 | "slide_type": "slide"
289 | }
290 | },
291 | "source": [
292 | "### Missingness"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {
299 | "slideshow": {
300 | "slide_type": "fragment"
301 | }
302 | },
303 | "outputs": [],
304 | "source": [
305 | "# how many nonresponses\n",
306 | "df.isnull().sum()"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {
312 | "slideshow": {
313 | "slide_type": "fragment"
314 | }
315 | },
316 | "source": [
317 | "We see that there are more nonresponses in the `enjoyed_least` category than the `enjoyed_most` category. So, more people left what they enjoyed least blank than they did what they enjoyed most."
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {
324 | "scrolled": true
325 | },
326 | "outputs": [],
327 | "source": [
328 | "# how does that look by quarter?\n",
329 | "null_most = df.groupby('quarter')['enjoyed_most'].apply(lambda x: x.isnull().sum())\n",
330 | "null_least = df.groupby('quarter')['enjoyed_least'].apply(lambda x: x.isnull().sum())\n",
331 | "\n",
332 | "print(null_most, null_least)"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "We also see a decrease in Wi21 on. This is when I started requiring these questions (b/c I shortened the survey overall). "
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {
345 | "slideshow": {
346 | "slide_type": "fragment"
347 | }
348 | },
349 | "source": [
350 | "#### Previous Quarters\n",
351 | "\n",
352 | "Typically, there are a few people who have what they enjoy least but don't have an enjoy most....but often these students' feedback is of particular interest to me."
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "# overall\n",
362 | "check_least = df[df['enjoyed_most'].isnull() & df['enjoyed_least'].notnull()]\n",
363 | "list(check_least['enjoyed_least'])"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {
369 | "slideshow": {
370 | "slide_type": "fragment"
371 | }
372 | },
373 | "source": [
374 | "Missing data causes a problem in `nltk`, so we either get rid of individuals who didn't respond to both, or we can replace their missing data with 'No response', knowing that this text will be included in the analysis now."
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {
381 | "slideshow": {
382 | "slide_type": "fragment"
383 | }
384 | },
385 | "outputs": [],
386 | "source": [
387 | "def fill_no_response(df):\n",
388 | " '''replace missing data in enjoyed_most/least series with string No response'''\n",
389 | " \n",
390 | " df['enjoyed_most'] = df['enjoyed_most'].fillna('No response')\n",
391 | " df['enjoyed_least'] = df['enjoyed_least'].fillna('No response')"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {
398 | "slideshow": {
399 | "slide_type": "-"
400 | }
401 | },
402 | "outputs": [],
403 | "source": [
404 | "# fill NAs with string 'No response'\n",
405 | "fill_no_response(df)"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {
411 | "slideshow": {
412 | "slide_type": "slide"
413 | }
414 | },
415 | "source": [
416 | "## Quick checks: Words of interest\n",
417 | "\n"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "def check_word_freq(df, word):\n",
427 | " \"\"\"checks for frequency of word specified in most and least enjoyed responses\"\"\"\n",
428 | " \n",
429 | " # calculate proportion within quarter\n",
430 | " word_most = df[df['enjoyed_most'].str.contains(word, case=False, na=False)]\n",
431 | " proportion_most = word_most.groupby('quarter').size() / df.groupby('quarter').size()\n",
432 | " \n",
433 | " word_least = df[df['enjoyed_least'].str.contains(word, case=False, na=False)]\n",
434 | " proportion_least = word_least.groupby('quarter').size() / df.groupby('quarter').size()\n",
435 | " \n",
436 | " out = combined_df = pd.concat([proportion_most, proportion_least], keys=['most', 'least'], axis=1)\n",
437 | "\n",
438 | " return out"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {
444 | "slideshow": {
445 | "slide_type": "fragment"
446 | }
447 | },
448 | "source": [
449 | "#### Assignment"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {
456 | "slideshow": {
457 | "slide_type": "-"
458 | }
459 | },
460 | "outputs": [],
461 | "source": [
462 | "## check for assignment\n",
463 | "check_word_freq(df, 'assignment')"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "metadata": {
469 | "slideshow": {
470 | "slide_type": "slide"
471 | }
472 | },
473 | "source": [
474 | "#### Project"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {
481 | "scrolled": true,
482 | "slideshow": {
483 | "slide_type": "fragment"
484 | }
485 | },
486 | "outputs": [],
487 | "source": [
488 | "## check for project in free text\n",
489 | "check_word_freq(df, 'project')"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {
496 | "scrolled": true,
497 | "slideshow": {
498 | "slide_type": "fragment"
499 | }
500 | },
501 | "outputs": [],
502 | "source": [
503 | "## check for group in free text\n",
504 | "check_word_freq(df, 'group')"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {
510 | "slideshow": {
511 | "slide_type": "fragment"
512 | }
513 | },
514 | "source": [
515 | "#### Quizzes"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "scrolled": true,
523 | "slideshow": {
524 | "slide_type": "-"
525 | }
526 | },
527 | "outputs": [],
528 | "source": [
529 | "check_word_freq(df, 'quiz')"
530 | ]
531 | },
532 | {
533 | "cell_type": "markdown",
534 | "metadata": {
535 | "slideshow": {
536 | "slide_type": "fragment"
537 | }
538 | },
539 | "source": [
540 | "#### Labs"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {
547 | "slideshow": {
548 | "slide_type": "-"
549 | }
550 | },
551 | "outputs": [],
552 | "source": [
553 | "check_word_freq(df, 'lab')"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {
559 | "slideshow": {
560 | "slide_type": "slide"
561 | }
562 | },
563 | "source": [
564 | "## Sentiment Analysis\n",
565 | "\n",
566 | "We get a quick snapshot of what's going on in COGS 108, but we really want to understand the details. To do this, analyzing the sentiment of the text is a good next step."
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {
572 | "slideshow": {
573 | "slide_type": "slide"
574 | }
575 | },
576 | "source": [
577 | "#### Step 1: Tokenization\n",
578 | "\n",
579 | "Tokenization is the first step in analyzing text. \n",
580 | "\n",
581 | "1. Aquire text of interest\n",
582 | "2. Break text down (tokenize) into smaller chunks (i.e. words, bigrams, sentences, etc.)\n",
583 | "\n",
584 | "A **token** is a single entity - think of it as a building block of language."
585 | ]
586 | },
587 | {
588 | "cell_type": "markdown",
589 | "metadata": {
590 | "slideshow": {
591 | "slide_type": "slide"
592 | }
593 | },
594 | "source": [
595 | "### Tokenization Example\n",
596 | "\n",
597 | "Here we demonstrate what a tokenized single response looks like."
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": null,
603 | "metadata": {
604 | "slideshow": {
605 | "slide_type": "fragment"
606 | }
607 | },
608 | "outputs": [],
609 | "source": [
610 | "# import regex word tokenizer\n",
611 | "from nltk.tokenize import RegexpTokenizer\n",
612 | "tokenizer = RegexpTokenizer(r'\\w+')"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {},
619 | "outputs": [],
620 | "source": [
621 | "df.loc[0,'enjoyed_most']"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": null,
627 | "metadata": {
628 | "scrolled": true,
629 | "slideshow": {
630 | "slide_type": "fragment"
631 | }
632 | },
633 | "outputs": [],
634 | "source": [
635 | "tokenized_word = tokenizer.tokenize(df.loc[0,'enjoyed_most'])\n",
636 | "print(tokenized_word)"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "metadata": {
642 | "slideshow": {
643 | "slide_type": "slide"
644 | }
645 | },
646 | "source": [
647 | "#### Tokenize COGS108 data\n",
648 | "\n",
649 | "Using that concept we'll tokenize the words in the enjoyed_most and `enjoyed_least` columns for the data in our COGS108 data."
650 | ]
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": null,
655 | "metadata": {
656 | "scrolled": true,
657 | "slideshow": {
658 | "slide_type": "fragment"
659 | }
660 | },
661 | "outputs": [],
662 | "source": [
663 | "# tokenize most and least responses\n",
664 | "df['most_token'] = df['enjoyed_most'].apply(tokenizer.tokenize) \n",
665 | "df['least_token'] = df['enjoyed_least'].apply(tokenizer.tokenize) \n",
666 | "df.head()"
667 | ]
668 | },
669 | {
670 | "cell_type": "markdown",
671 | "metadata": {
672 | "slideshow": {
673 | "slide_type": "slide"
674 | }
675 | },
676 | "source": [
677 | "#### Step 2: Stop Words\n",
678 | "\n",
679 | "**Stop words** are words that are of less interest to your analysis. \n",
680 | "\n",
681 | "For example, you wouldn't expect the following words to be important: is, am, are, this, a, an, the, etc.\n",
682 | "\n",
683 | "By removing stopwords, you can lower the computational burden, focusing on only the words of interest.\n",
684 | "\n",
685 | "To do so in `nltk`, you need to create a list of stopwords and filter them from your tokens.\n"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {
692 | "scrolled": true,
693 | "slideshow": {
694 | "slide_type": "fragment"
695 | }
696 | },
697 | "outputs": [],
698 | "source": [
699 | "# import stop words\n",
700 | "from nltk.corpus import stopwords\n",
701 | "stop_words = set(stopwords.words('english'))\n",
702 | "\n",
703 | "# look at stop words\n",
704 | "print(stop_words)"
705 | ]
706 | },
707 | {
708 | "cell_type": "markdown",
709 | "metadata": {
710 | "slideshow": {
711 | "slide_type": "slide"
712 | }
713 | },
714 | "source": [
715 | "### Stop Words Example\n",
716 | "\n",
717 | "Here we compare a sentence after tokenization to one that has been tokenized _and had stop words removed_."
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {
724 | "slideshow": {
725 | "slide_type": "fragment"
726 | }
727 | },
728 | "outputs": [],
729 | "source": [
730 | "# example of removing stop words\n",
731 | "filtered_sent=[]\n",
732 | "for w in tokenized_word:\n",
733 | " if w not in stop_words:\n",
734 | " filtered_sent.append(w)\n",
735 | "print(\"Tokenized Sentence:\", tokenized_word)\n",
736 | "print(\"Filtered Sentence:\", filtered_sent)"
737 | ]
738 | },
739 | {
740 | "cell_type": "markdown",
741 | "metadata": {
742 | "slideshow": {
743 | "slide_type": "slide"
744 | }
745 | },
746 | "source": [
747 | "#### Remove Stop Words: COGS108 data\n",
748 | "\n",
749 | "Using that idea, we can go ahead and remove stop words from our tokenized most and least liked tokenized data."
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": null,
755 | "metadata": {
756 | "slideshow": {
757 | "slide_type": "fragment"
758 | }
759 | },
760 | "outputs": [],
761 | "source": [
762 | "# remove stop words\n",
763 | "df['most_stop'] = df['most_token'].apply(lambda x: [item for item in x if item not in stop_words])\n",
764 | "df['least_stop'] = df['least_token'].apply(lambda x: [item for item in x if item not in stop_words])\n",
765 | "df.head()"
766 | ]
767 | },
768 | {
769 | "cell_type": "markdown",
770 | "metadata": {
771 | "jp-MarkdownHeadingCollapsed": true,
772 | "slideshow": {
773 | "slide_type": "slide"
774 | }
775 | },
776 | "source": [
777 | "#### Step 3: Lexicon Normalization (**Stemming**)\n",
778 | "\n",
779 | "In language, many different words come from the same root word. \n",
780 | "\n",
781 | "For example, \"intersection\", \"intersecting\", \"intersects\", and \"intersected\" are all related to the common root word - \"intersect\".\n",
782 | "\n",
783 | "**Stemming** is how linguistic normalization occurs - it reduces words to their root words (and chops off additional things like 'ing') - all of the above words would be reduced to their common stem \"intersect.\"\n",
784 | "\n",
785 | "\n",
786 | "\n",
787 | "\n"
788 | ]
789 | },
790 | {
791 | "cell_type": "markdown",
792 | "metadata": {
793 | "slideshow": {
794 | "slide_type": "slide"
795 | }
796 | },
797 | "source": [
798 | "### Stemming Example\n",
799 | "\n",
800 | "After tokenization and removing stop words, we can get the stem for all tokens (words) in our dataset."
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": null,
806 | "metadata": {
807 | "scrolled": true,
808 | "slideshow": {
809 | "slide_type": "fragment"
810 | }
811 | },
812 | "outputs": [],
813 | "source": [
814 | "# Stemming\n",
815 | "from nltk.stem import PorterStemmer\n",
816 | "\n",
817 | "ps = PorterStemmer()\n",
818 | "\n",
819 | "stemmed_words=[]\n",
820 | "for w in filtered_sent:\n",
821 | " stemmed_words.append(ps.stem(w))\n",
822 | "\n",
823 | "print(\"Filtered Sentence:\", filtered_sent)\n",
824 | "print(\"Stemmed Sentence:\", stemmed_words)"
825 | ]
826 | },
827 | {
828 | "cell_type": "markdown",
829 | "metadata": {},
830 | "source": [
831 | "**Lecture participation**: Pause & do Q1 now."
832 | ]
833 | },
834 | {
835 | "cell_type": "markdown",
836 | "metadata": {
837 | "slideshow": {
838 | "slide_type": "slide"
839 | }
840 | },
841 | "source": [
842 | "#### Stemming: COGS108 data\n",
843 | "\n",
844 | "Here, we obtain the stem (root word) for all tokens in our dataset."
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": null,
850 | "metadata": {
851 | "scrolled": true,
852 | "slideshow": {
853 | "slide_type": "fragment"
854 | }
855 | },
856 | "outputs": [],
857 | "source": [
858 | "df['most_stem'] = df['most_stop'].apply(lambda x: [ps.stem(y) for y in x])\n",
859 | "df['least_stem'] = df['least_stop'].apply(lambda x: [ps.stem(y) for y in x])\n",
860 | "df.head()"
861 | ]
862 | },
863 | {
864 | "cell_type": "markdown",
865 | "metadata": {
866 | "slideshow": {
867 | "slide_type": "slide"
868 | }
869 | },
870 | "source": [
871 | "#### Step 4: Frequency Distribution\n",
872 | "\n",
873 | "It can be helpful to get a sense of which words are most frequent in our dataset."
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": null,
879 | "metadata": {
880 | "slideshow": {
881 | "slide_type": "fragment"
882 | }
883 | },
884 | "outputs": [],
885 | "source": [
886 | "# get series of all most and least liked words after stemming\n",
887 | "# note that \"No Response\" is still being included in the analysis\n",
888 | "most = df['most_stem'].apply(pd.Series).stack()\n",
889 | "least = df['least_stem'].apply(pd.Series).stack()"
890 | ]
891 | },
892 | {
893 | "cell_type": "markdown",
894 | "metadata": {
895 | "slideshow": {
896 | "slide_type": "fragment"
897 | }
898 | },
899 | "source": [
900 | "`FreqDist` calculates the frequency of each word in the text and we can plot the most frequent words."
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": null,
906 | "metadata": {
907 | "slideshow": {
908 | "slide_type": "fragment"
909 | }
910 | },
911 | "outputs": [],
912 | "source": [
913 | "from nltk.probability import FreqDist\n",
914 | "import string\n",
915 | "\n",
916 | "# calculation word frequency\n",
917 | "fdist_most = FreqDist(most)\n",
918 | "fdist_least = FreqDist(least)\n",
919 | "\n",
920 | "# remove punctuation counts\n",
921 | "for punc in string.punctuation:\n",
922 | " del fdist_most[punc]\n",
923 | " del fdist_least[punc]"
924 | ]
925 | },
926 | {
927 | "cell_type": "code",
928 | "execution_count": null,
929 | "metadata": {},
930 | "outputs": [],
931 | "source": [
932 | "# Frequency Distribution Plot - top 20\n",
933 | "# for words in what students like most\n",
934 | "fdist_least.plot(20, cumulative=False);"
935 | ]
936 | },
937 | {
938 | "cell_type": "markdown",
939 | "metadata": {},
940 | "source": [
941 | "**Lecture participation**: Pause & do Q2 now."
942 | ]
943 | },
944 | {
945 | "cell_type": "markdown",
946 | "metadata": {
947 | "slideshow": {
948 | "slide_type": "slide"
949 | }
950 | },
951 | "source": [
952 | "#### Step 5: Sentiment Analysis!\n",
953 | "\n",
954 | "**Sentiment Analysis** quantifies the content, idea, beliefs and opinions conveyed in text. \n",
955 | "\n",
956 | "Two general approaches:\n",
957 | "\n",
958 | "1. **Lexicon-based** - count number of words in a text belonging to each sentiment (positive, negative, happy, angry, etc.)\n",
959 | "2. **Machine learning-based** - develop a classification model on pre-labeled data\n",
960 | "\n"
961 | ]
962 | },
963 | {
964 | "cell_type": "markdown",
965 | "metadata": {
966 | "slideshow": {
967 | "slide_type": "slide"
968 | }
969 | },
970 | "source": [
971 | "### Sentiment Example\n",
972 | "\n",
973 | "To get a measure of overall sentiment in our text, we'll compare our text to the VADER lexicon."
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": null,
979 | "metadata": {
980 | "slideshow": {
981 | "slide_type": "fragment"
982 | }
983 | },
984 | "outputs": [],
985 | "source": [
986 | "from nltk.sentiment.vader import SentimentIntensityAnalyzer \n",
987 | "analyser = SentimentIntensityAnalyzer()"
988 | ]
989 | },
990 | {
991 | "cell_type": "markdown",
992 | "metadata": {
993 | "slideshow": {
994 | "slide_type": "fragment"
995 | }
996 | },
997 | "source": [
998 | "VADER handles:\n",
999 | "\n",
1000 | "- capitalization (great vs GREAT) & punctuation (exclamation makes more positive!)\n",
1001 | "- emojis and emoticons\n",
1002 | "- degree modifiers (extremely good vs. marginally good)\n",
1003 | "- contractions and conjunctions (but signals shift)"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "markdown",
1008 | "metadata": {
1009 | "slideshow": {
1010 | "slide_type": "fragment"
1011 | }
1012 | },
1013 | "source": [
1014 | "`pos` + `neg` + `neu` = 1\n",
1015 | "\n",
1016 | "**`compound`** score - metric that calculates sum of all the lexicon ratings and normalizes between -1 (most extreme negative) and +1 (most extreme positive)\n",
1017 | "- positive: `compound` >= 0.05 \n",
1018 | "- neutral: -0.05 < `compound` < 0.05\n",
1019 | "- negative : `compound` <= -0.05"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "code",
1024 | "execution_count": null,
1025 | "metadata": {
1026 | "slideshow": {
1027 | "slide_type": "fragment"
1028 | }
1029 | },
1030 | "outputs": [],
1031 | "source": [
1032 | "analyser.polarity_scores(\"The class is super cool.\")"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": null,
1038 | "metadata": {
1039 | "slideshow": {
1040 | "slide_type": "fragment"
1041 | }
1042 | },
1043 | "outputs": [],
1044 | "source": [
1045 | "analyser.polarity_scores(\"The class is not super cool.\")"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "execution_count": null,
1051 | "metadata": {
1052 | "slideshow": {
1053 | "slide_type": "fragment"
1054 | }
1055 | },
1056 | "outputs": [],
1057 | "source": [
1058 | "analyser.polarity_scores(\"The class is NOT super cool!\")"
1059 | ]
1060 | },
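  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustration (added here as a sketch, not part of the original demo) of two more behaviors from the list above - degree modifiers and emoticons - here are a few extra calls. Exact scores depend on the version of the `vader_lexicon` you downloaded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustration: degree modifiers and emoticons shift the compound score\n",
    "print(analyser.polarity_scores(\"The class is good.\"))\n",
    "print(analyser.polarity_scores(\"The class is extremely good.\"))\n",
    "print(analyser.polarity_scores(\"The class is marginally good.\"))\n",
    "print(analyser.polarity_scores(\"The class is good :)\"))"
   ]
  },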
1061 | {
1062 | "cell_type": "markdown",
1063 | "metadata": {},
1064 | "source": [
1065 | "**Lecture participation**: Pause & do Q3 now."
1066 | ]
1067 | },
1068 | {
1069 | "cell_type": "markdown",
1070 | "metadata": {
1071 | "slideshow": {
1072 | "slide_type": "slide"
1073 | }
1074 | },
1075 | "source": [
1076 | "#### Sentiment Analysis: COGS108 data\n",
1077 | "\n",
1078 | "Here, we will calculate the sentiment of each most liked and least liked student response from the survey."
1079 | ]
1080 | },
1081 | {
1082 | "cell_type": "code",
1083 | "execution_count": null,
1084 | "metadata": {
1085 | "slideshow": {
1086 | "slide_type": "fragment"
1087 | }
1088 | },
1089 | "outputs": [],
1090 | "source": [
1091 | "# get list of the 'sentences' (responses) from each individual\n",
1092 | "most_list = list(df[df['quarter'] == 'Wi25']['enjoyed_most'].values)\n",
1093 | "least_list = list(df[df['quarter'] == 'Wi25']['enjoyed_least'].values)"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "code",
1098 | "execution_count": null,
1099 | "metadata": {
1100 | "slideshow": {
1101 | "slide_type": "fragment"
1102 | }
1103 | },
1104 | "outputs": [],
1105 | "source": [
1106 | "# create function that will output dataframe \n",
1107 | "# that stores sentiment information\n",
1108 | "def get_sentiments(input_list):\n",
1109 | " \n",
1110 | " output = pd.DataFrame()\n",
1111 | "\n",
1112 | " for sentence in input_list:\n",
1113 | " ss = analyser.polarity_scores(sentence)\n",
1114 | " ss['sentence'] = sentence\n",
1115 | " # Note use of pd.concat\n",
1116 | " output = pd.concat([output, pd.DataFrame([ss])], ignore_index=True)\n",
1117 | "\n",
1118 | "\n",
1119 | " return output"
1120 | ]
1121 | },
1122 | {
1123 | "cell_type": "code",
1124 | "execution_count": null,
1125 | "metadata": {
1126 | "slideshow": {
1127 | "slide_type": "fragment"
1128 | }
1129 | },
1130 | "outputs": [],
1131 | "source": [
1132 | "# get sentiment measures\n",
1133 | "least_sentiments = get_sentiments(least_list)\n",
1134 | "most_sentiments = get_sentiments(most_list)"
1135 | ]
1136 | },
1137 | {
1138 | "cell_type": "markdown",
1139 | "metadata": {
1140 | "slideshow": {
1141 | "slide_type": "slide"
1142 | }
1143 | },
1144 | "source": [
1145 | "#### Sentiment Analysis: COGS108 data output\n",
1146 | "\n",
1147 | "After calculating the sentiment of each response, we can look at the output of each."
1148 | ]
1149 | },
1150 | {
1151 | "cell_type": "code",
1152 | "execution_count": null,
1153 | "metadata": {
1154 | "slideshow": {
1155 | "slide_type": "fragment"
1156 | }
1157 | },
1158 | "outputs": [],
1159 | "source": [
1160 | "# let's get rid of those no response values here\n",
1161 | "most_sentiments = most_sentiments[most_sentiments['sentence'] != 'No response']\n",
1162 | "least_sentiments = least_sentiments[least_sentiments['sentence'] != 'No response']"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": null,
1168 | "metadata": {
1169 | "scrolled": true,
1170 | "slideshow": {
1171 | "slide_type": "fragment"
1172 | }
1173 | },
1174 | "outputs": [],
1175 | "source": [
1176 | "# take a look at the output\n",
1177 | "least_sentiments.sort_values(by='compound', ascending=True).head(10)"
1178 | ]
1179 | },
1180 | {
1181 | "cell_type": "code",
1182 | "execution_count": null,
1183 | "metadata": {
1184 | "slideshow": {
1185 | "slide_type": "fragment"
1186 | }
1187 | },
1188 | "outputs": [],
1189 | "source": [
1190 | "# take a look at the output\n",
1191 | "most_sentiments.sort_values(by='compound', ascending=False).head(10)"
1192 | ]
1193 | },
1194 | {
1195 | "cell_type": "markdown",
1196 | "metadata": {
1197 | "slideshow": {
1198 | "slide_type": "slide"
1199 | }
1200 | },
1201 | "source": [
1202 | "#### Sentiment Analysis: COGS108 data - `describe`\n",
1203 | "\n",
1204 | "To get an overall sense of the values stored in each of these dataframes, we can use `describe`."
1205 | ]
1206 | },
1207 | {
1208 | "cell_type": "code",
1209 | "execution_count": null,
1210 | "metadata": {
1211 | "scrolled": true,
1212 | "slideshow": {
1213 | "slide_type": "fragment"
1214 | }
1215 | },
1216 | "outputs": [],
1217 | "source": [
1218 | "most_sentiments.describe()"
1219 | ]
1220 | },
1221 | {
1222 | "cell_type": "code",
1223 | "execution_count": null,
1224 | "metadata": {
1225 | "slideshow": {
1226 | "slide_type": "fragment"
1227 | }
1228 | },
1229 | "outputs": [],
1230 | "source": [
1231 | "least_sentiments.describe()"
1232 | ]
1233 | },
1234 | {
1235 | "cell_type": "markdown",
1236 | "metadata": {
1237 | "slideshow": {
1238 | "slide_type": "slide"
1239 | }
1240 | },
1241 | "source": [
1242 | "#### Sentiment Analysis: COGS108 data - plotting\n",
1243 | "\n",
1244 | "We can compare the distribution of the `compound` metric between the two analyses."
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": null,
1250 | "metadata": {
1251 | "slideshow": {
1252 | "slide_type": "fragment"
1253 | }
1254 | },
1255 | "outputs": [],
1256 | "source": [
1257 | "most_sentiments['compound'].plot.density(label='most')\n",
1258 | "least_sentiments['compound'].plot.density(label='least')\n",
1259 | "plt.legend()\n",
1260 | "plt.xlabel('Compound Sentiment Scores')\n",
1261 | "plt.xlim(-1,1);"
1262 | ]
1263 | },
1264 | {
1265 | "cell_type": "markdown",
1266 | "metadata": {},
1267 | "source": [
1268 | "**Lecture participation**: Pause & do Q4 now."
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "code",
1273 | "execution_count": null,
1274 | "metadata": {
1275 | "slideshow": {
1276 | "slide_type": "slide"
1277 | }
1278 | },
1279 | "outputs": [],
1280 | "source": [
1281 | "# include label for boxplot\n",
1282 | "most_sentiments['which'] = 'most'\n",
1283 | "least_sentiments['which'] = 'least'\n",
1284 | "# concatenate data frames together\n",
1285 | "compound_out = pd.concat([most_sentiments, least_sentiments])\n",
1286 | "compound_out.head()"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": null,
1292 | "metadata": {},
1293 | "outputs": [],
1294 | "source": [
1295 | "# plot compound by resonse type\n",
1296 | "sns.boxplot(data=compound_out, x='which', y='compound')\n",
1297 | "plt.xlabel('response');"
1298 | ]
1299 | },
1300 | {
1301 | "cell_type": "markdown",
1302 | "metadata": {
1303 | "slideshow": {
1304 | "slide_type": "fragment"
1305 | }
1306 | },
1307 | "source": [
1308 | "Probably unsurprisingly, the overall sentiment of what students like tends to be more positive than what students like less. \n",
1309 | "\n",
1310 | "Probably not surprising given the data and question on the survey. But, let's dig deeper into these data moving beyond sentiment analysis..."
1311 | ]
1312 | },
1313 | {
1314 | "cell_type": "markdown",
1315 | "metadata": {
1316 | "slideshow": {
1317 | "slide_type": "slide"
1318 | }
1319 | },
1320 | "source": [
1321 | "## TF-IDF\n",
1322 | "\n",
1323 | "Term Frequency - Inverse Document Frequency (**TF-IDF**) sets out to identify the tokens most unique to your document of interest (relative to all documents in your corpus). "
1324 | ]
1325 | },
1326 | {
1327 | "cell_type": "markdown",
1328 | "metadata": {
1329 | "slideshow": {
1330 | "slide_type": "fragment"
1331 | }
1332 | },
1333 | "source": [
1334 | "**Term Frequency (TF)** - counts the number of words (tokens) occurring in each document.\n",
1335 | "\n",
1336 | "**Inverse Document Frequency (IDF)** - weights the word by their relative frequency across documents. "
1337 | ]
1338 | },
1339 | {
1340 | "cell_type": "markdown",
1341 | "metadata": {
1342 | "slideshow": {
1343 | "slide_type": "fragment"
1344 | }
1345 | },
1346 | "source": [
1347 | "$$IDF_{word} = log(\\frac{\\# documents}{\\# \\ documents\\_containing\\_word})$$"
1348 | ]
1349 | },
1350 | {
1351 | "cell_type": "markdown",
1352 | "metadata": {
1353 | "slideshow": {
1354 | "slide_type": "fragment"
1355 | }
1356 | },
1357 | "source": [
1358 | "$$TF-IDF = TF \\times IDF$$"
1359 | ]
1360 | },
1361 | {
1362 | "cell_type": "markdown",
1363 | "metadata": {
1364 | "slideshow": {
1365 | "slide_type": "fragment"
1366 | }
1367 | },
1368 | "source": [
1369 | "words with a high TF-IDF are those with high frequency in one document & relatively low frequency in other documents"
1370 | ]
1371 | },
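1372 | {
1373 | "cell_type": "markdown",
1374 | "metadata": {
1375 | "slideshow": {
1376 | "slide_type": "fragment"
1377 | }
1378 | },
1379 | "source": [
1380 | "To make the formula concrete, here is a small worked example on a toy two-document corpus (made-up sentences, not the COGS108 data). Note that `TfidfVectorizer`, used below, applies a smoothed variant of this formula, so its exact values will differ."
1381 | ]
1382 | },
1383 | {
1384 | "cell_type": "code",
1385 | "execution_count": null,
1386 | "metadata": {
1387 | "slideshow": {
1388 | "slide_type": "fragment"
1389 | }
1390 | },
1391 | "outputs": [],
1392 | "source": [
1393 | "import numpy as np\n",
1394 | "\n",
1395 | "# toy corpus: two tiny documents (made up for illustration)\n",
1396 | "doc_most = 'projects projects lectures'\n",
1397 | "doc_least = 'deadlines lectures'\n",
1398 | "\n",
1399 | "# term frequency of 'projects' in each document\n",
1400 | "tf_most = doc_most.split().count('projects')\n",
1401 | "tf_least = doc_least.split().count('projects')\n",
1402 | "\n",
1403 | "# IDF: log(# documents / # documents containing the word) = log(2 / 1)\n",
1404 | "idf_projects = np.log(2 / 1)\n",
1405 | "\n",
1406 | "# TF-IDF of 'projects' in each document\n",
1407 | "print('most :', tf_most * idf_projects)\n",
1408 | "print('least:', tf_least * idf_projects)"
1409 | ]
1410 | },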
1372 | {
1373 | "cell_type": "markdown",
1374 | "metadata": {
1375 | "slideshow": {
1376 | "slide_type": "slide"
1377 | }
1378 | },
1379 | "source": [
1380 | "For our purposes, our **corpus** will be students' responses to what they like most and least about COGS108.\n",
1381 | "\n",
1382 | "We'll treat this as **two separate documents**:\n",
1383 | "1. What students like most\n",
1384 | "2. What students like least"
1385 | ]
1386 | },
1387 | {
1388 | "cell_type": "markdown",
1389 | "metadata": {
1390 | "slideshow": {
1391 | "slide_type": "slide"
1392 | }
1393 | },
1394 | "source": [
1395 | "### Bag of Words (BoW) approach\n",
1396 | "\n",
1397 | "Converts the text into a co-occurrence matrix across documents within the corpus."
1398 | ]
1399 | },
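1400 | {
1401 | "cell_type": "markdown",
1402 | "metadata": {
1403 | "slideshow": {
1404 | "slide_type": "fragment"
1405 | }
1406 | },
1407 | "source": [
1408 | "As a quick illustration of what that matrix looks like (a toy corpus of made-up responses, not the COGS108 data), scikit-learn's `CountVectorizer` builds a bag-of-words count matrix:"
1409 | ]
1410 | },
1411 | {
1412 | "cell_type": "code",
1413 | "execution_count": null,
1414 | "metadata": {
1415 | "slideshow": {
1416 | "slide_type": "fragment"
1417 | }
1418 | },
1419 | "outputs": [],
1420 | "source": [
1421 | "from sklearn.feature_extraction.text import CountVectorizer\n",
1422 | "\n",
1423 | "# toy corpus: two short documents (made up for illustration)\n",
1424 | "toy_corpus = ['the projects were great', 'too many deadlines and the deadlines were tight']\n",
1425 | "\n",
1426 | "# bag-of-words: one row per document, one column per token\n",
1427 | "vec = CountVectorizer()\n",
1428 | "counts = vec.fit_transform(toy_corpus)\n",
1429 | "pd.DataFrame(counts.toarray(), columns=vec.get_feature_names_out())"
1430 | ]
1431 | },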
1400 | {
1401 | "cell_type": "markdown",
1402 | "metadata": {
1403 | "slideshow": {
1404 | "slide_type": "fragment"
1405 | }
1406 | },
1407 | "source": [
1408 | "To do this, let's get our text ready.\n",
1409 | "\n",
1410 | "We're going to make sure all our words are lower case, remove punctuation from each, and then provide the text (`corpus`) to `TfidfVectorizer`."
1411 | ]
1412 | },
1413 | {
1414 | "cell_type": "code",
1415 | "execution_count": null,
1416 | "metadata": {
1417 | "slideshow": {
1418 | "slide_type": "fragment"
1419 | }
1420 | },
1421 | "outputs": [],
1422 | "source": [
1423 | "import string \n",
1424 | "\n",
1425 | "# lowercase text\n",
1426 | "least = list(map(str.lower, least_list))\n",
1427 | "most = list(map(str.lower, most_list))\n",
1428 | "\n",
1429 | "# remove punctuation\n",
1430 | "for c in string.punctuation:\n",
1431 | " least = str(least).replace(c, \"\")\n",
1432 | " most = str(most).replace(c, \"\")\n",
1433 | "\n",
1434 | "# get list of two documents together\n",
1435 | "corpus = [str(least), str(most)]"
1436 | ]
1437 | },
1438 | {
1439 | "cell_type": "markdown",
1440 | "metadata": {
1441 | "slideshow": {
1442 | "slide_type": "slide"
1443 | }
1444 | },
1445 | "source": [
1446 | "### Calculate TF-IDF\n",
1447 | "\n",
1448 | "With our text ready for analysis, it's time to calculate TF-IDF"
1449 | ]
1450 | },
1451 | {
1452 | "cell_type": "markdown",
1453 | "metadata": {
1454 | "slideshow": {
1455 | "slide_type": "fragment"
1456 | }
1457 | },
1458 | "source": [
1459 | "To start our TF-IDF analysis, we'll first **create a `TfidfVectorizer` object to transform our text data into vectors.**"
1460 | ]
1461 | },
1462 | {
1463 | "cell_type": "code",
1464 | "execution_count": null,
1465 | "metadata": {
1466 | "slideshow": {
1467 | "slide_type": "fragment"
1468 | }
1469 | },
1470 | "outputs": [],
1471 | "source": [
1472 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1473 | "from nltk.tokenize import word_tokenize"
1474 | ]
1475 | },
1476 | {
1477 | "cell_type": "code",
1478 | "execution_count": null,
1479 | "metadata": {
1480 | "slideshow": {
1481 | "slide_type": "fragment"
1482 | }
1483 | },
1484 | "outputs": [],
1485 | "source": [
1486 | "# create vectorizer\n",
1487 | "tfidf = TfidfVectorizer(sublinear_tf=True,\n",
1488 | " analyzer='word',\n",
1489 | " max_features=2000,\n",
1490 | " tokenizer=word_tokenize,\n",
1491 | " stop_words='english')"
1492 | ]
1493 | },
1494 | {
1495 | "cell_type": "markdown",
1496 | "metadata": {},
1497 | "source": [
1498 | "**Lecture participation**: Pause & do Q5 now. Submit when you're done."
1499 | ]
1500 | },
1501 | {
1502 | "cell_type": "markdown",
1503 | "metadata": {
1504 | "slideshow": {
1505 | "slide_type": "slide"
1506 | }
1507 | },
1508 | "source": [
1509 | "#### TF-IDF: COGS108 data - calculation\n",
1510 | "\n",
1511 | "Here, we use our vectorizer to calculate TF-IDF across the words in our word matrix."
1512 | ]
1513 | },
1514 | {
1515 | "cell_type": "code",
1516 | "execution_count": null,
1517 | "metadata": {
1518 | "slideshow": {
1519 | "slide_type": "fragment"
1520 | }
1521 | },
1522 | "outputs": [],
1523 | "source": [
1524 | "# calculate TF-IDF\n",
1525 | "cogs_tfidf = pd.DataFrame(\n",
1526 | " tfidf.fit_transform(corpus)\n",
1527 | " .toarray()\n",
1528 | ")\n",
1529 | "cogs_tfidf.columns = tfidf.get_feature_names_out()\n",
1530 | "cogs_tfidf = cogs_tfidf.rename(index={0:'least', 1:'most'})"
1531 | ]
1532 | },
1533 | {
1534 | "cell_type": "markdown",
1535 | "metadata": {
1536 | "slideshow": {
1537 | "slide_type": "slide"
1538 | }
1539 | },
1540 | "source": [
1541 | "#### TF-IDF: COGS108 data - output\n",
1542 | "\n",
1543 | "If we just want to look at the word most uniuqe in each document..."
1544 | ]
1545 | },
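1546 | {
1547 | "cell_type": "markdown",
1548 | "metadata": {
1549 | "slideshow": {
1550 | "slide_type": "fragment"
1551 | }
1552 | },
1553 | "source": [
1554 | "...one simple sketch is `idxmax`, which returns, for each row (document), the column (word) with the largest TF-IDF value."
1555 | ]
1556 | },
1557 | {
1558 | "cell_type": "code",
1559 | "execution_count": null,
1560 | "metadata": {
1561 | "slideshow": {
1562 | "slide_type": "fragment"
1563 | }
1564 | },
1565 | "outputs": [],
1566 | "source": [
1567 | "# word with the highest TF-IDF score in each document\n",
1568 | "cogs_tfidf.idxmax(axis=1)"
1569 | ]
1570 | },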
1546 | {
1547 | "cell_type": "markdown",
1548 | "metadata": {
1549 | "slideshow": {
1550 | "slide_type": "fragment"
1551 | }
1552 | },
1553 | "source": [
1554 | "Alternatively, we can sort by the set or words most unique to each document:"
1555 | ]
1556 | },
1557 | {
1558 | "cell_type": "code",
1559 | "execution_count": null,
1560 | "metadata": {
1561 | "slideshow": {
1562 | "slide_type": "fragment"
1563 | }
1564 | },
1565 | "outputs": [],
1566 | "source": [
1567 | "cogs_tfidf.sort_values(by='most', axis=1, ascending=False)"
1568 | ]
1569 | },
1570 | {
1571 | "cell_type": "code",
1572 | "execution_count": null,
1573 | "metadata": {
1574 | "slideshow": {
1575 | "slide_type": "fragment"
1576 | }
1577 | },
1578 | "outputs": [],
1579 | "source": [
1580 | "cogs_tfidf.sort_values(by='least', axis=1, ascending=False)"
1581 | ]
1582 | },
1583 | {
1584 | "cell_type": "markdown",
1585 | "metadata": {
1586 | "slideshow": {
1587 | "slide_type": "fragment"
1588 | }
1589 | },
1590 | "source": [
1591 | "**Sentiment Analysis** and **TF-IDF** are really helpful when analyzing documents and corpuses of text.\n",
1592 | "\n",
1593 | "But, what if, from the text itself we wanted to predict whether or not the text was likely a 'most' liked or a 'least' liked comment? We'll discuss how to do this in the coming **machine learning** lectures!"
1594 | ]
1595 | }
1596 | ],
1597 | "metadata": {
1598 | "celltoolbar": "Slideshow",
1599 | "kernelspec": {
1600 | "display_name": "Python 3 (ipykernel)",
1601 | "language": "python",
1602 | "name": "python3"
1603 | },
1604 | "language_info": {
1605 | "codemirror_mode": {
1606 | "name": "ipython",
1607 | "version": 3
1608 | },
1609 | "file_extension": ".py",
1610 | "mimetype": "text/x-python",
1611 | "name": "python",
1612 | "nbconvert_exporter": "python",
1613 | "pygments_lexer": "ipython3",
1614 | "version": "3.11.8"
1615 | },
1616 | "rise": {
1617 | "scroll": true
1618 | }
1619 | },
1620 | "nbformat": 4,
1621 | "nbformat_minor": 4
1622 | }
1623 |
--------------------------------------------------------------------------------
/08_ml/08_14_machine_learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/08_ml/08_14_machine_learning.pdf
--------------------------------------------------------------------------------
/09_geospatial/09_16_geospatial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/09_geospatial/09_16_geospatial.pdf
--------------------------------------------------------------------------------
/10_communication/10_17_communication.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_17_communication.pdf
--------------------------------------------------------------------------------
/10_communication/10_18_be_wrong.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_18_be_wrong.pdf
--------------------------------------------------------------------------------
/10_communication/10_19_jobs_future.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/10_communication/10_19_jobs_future.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 COGS108 - Data Science in Practice
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lectures (Winter 2025)
2 |
3 | Course materials are organized by week.
4 |
5 | |Week | General Topic | Link to Materials |
6 | |---|:---|:---|
7 | | 01 | Introduction to Data Science | [01_intro](https://github.com/COGS108/Lectures-Ellis/tree/wi25/01_intro) |
8 | | 02 | Version Control & Data | [02_data](https://github.com/COGS108/Lectures-Ellis/tree/wi25/02_data) |
9 | | 03 | Data Ethics & Wrangling | [03_ethics](https://github.com/COGS108/Lectures-Ellis/tree/wi25/03_ethics) |
10 | | 04 | Data Viz & Analysis | [04_analysis](https://github.com/COGS108/Lectures-Ellis/tree/wi25/04_analysis) |
11 | | 05 | Exploratory Data Analysis | [05_eda](https://github.com/COGS108/Lectures-Ellis/tree/wi25/05_eda) |
12 | | 06 | Inference | [06_inference](https://github.com/COGS108/Lectures-Ellis/tree/wi25/06_inference) |
13 | | 07 | Text Analysis | [07_text](https://github.com/COGS108/Lectures-Ellis/tree/wi25/07_text) |
14 | | 08 | Machine Learning | [08_ml](https://github.com/COGS108/Lectures-Ellis/tree/wi25/08_ml) |
15 | | 09 | Geospatial | [09_geospatial](https://github.com/COGS108/Lectures-Ellis/tree/wi25/09_geospatial) |
16 | | 10 | Data Science Communication & Jobs | [10_communication](https://github.com/COGS108/Lectures-Ellis/tree/wi25/10_communication) |
17 | | -- | Discussion Section Slides | [XX_section](https://github.com/COGS108/Lectures-Ellis/tree/wi25/XX_section) |
18 |
19 | ---
20 | ## License
21 |
22 | The content of this project itself is licensed under the [Creative Commons Attribution 3.0 Unported license](https://creativecommons.org/licenses/by/3.0/), and the underlying source code used to format and display that content is licensed under the [MIT license](https://github.com/github/choosealicense.com/blob/gh-pages/LICENSE.md).
23 |
--------------------------------------------------------------------------------
/XX_section/D1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D1.pdf
--------------------------------------------------------------------------------
/XX_section/D2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D2.pdf
--------------------------------------------------------------------------------
/XX_section/D3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D3.pdf
--------------------------------------------------------------------------------
/XX_section/D4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D4.pdf
--------------------------------------------------------------------------------
/XX_section/D5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D5.pdf
--------------------------------------------------------------------------------
/XX_section/D6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D6.pdf
--------------------------------------------------------------------------------
/XX_section/D7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D7.pdf
--------------------------------------------------------------------------------
/XX_section/D7_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","id":"8ae41600","metadata":{"id":"8ae41600"},"source":["## Due Dates\n","\n","### D8, Q9 Due Monday 6/3\n","### Checkpoint 2 Due Friday 5/30\n","### A4 (Released this Friday) Due next Friday 6/10"]},{"cell_type":"markdown","id":"8e1ba1ce","metadata":{"id":"8e1ba1ce"},"source":["## D8 Review"]},{"cell_type":"markdown","id":"c7d1b4ff","metadata":{"id":"c7d1b4ff"},"source":["## Part 1"]},{"cell_type":"code","execution_count":1,"id":"0e0b883e","metadata":{"id":"0e0b883e","executionInfo":{"status":"ok","timestamp":1740553021021,"user_tz":480,"elapsed":1866,"user":{"displayName":"Yueyan Tang","userId":"00093492675056380488"}},"outputId":"fa70a9ae-f123-4040-eadc-704acd92a502","colab":{"base_uri":"https://localhost:8080/","height":242}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S "],"text/html":["\n","
\n","
\n","\n","
\n"," \n"," \n"," | \n"," PassengerId | \n"," Survived | \n"," Pclass | \n"," Name | \n"," Sex | \n"," Age | \n"," SibSp | \n"," Parch | \n"," Ticket | \n"," Fare | \n"," Cabin | \n"," Embarked | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 1 | \n"," 0 | \n"," 3 | \n"," Braund, Mr. Owen Harris | \n"," male | \n"," 22.0 | \n"," 1 | \n"," 0 | \n"," A/5 21171 | \n"," 7.2500 | \n"," NaN | \n"," S | \n","
\n"," \n"," 1 | \n"," 2 | \n"," 1 | \n"," 1 | \n"," Cumings, Mrs. John Bradley (Florence Briggs Th... | \n"," female | \n"," 38.0 | \n"," 1 | \n"," 0 | \n"," PC 17599 | \n"," 71.2833 | \n"," C85 | \n"," C | \n","
\n"," \n"," 2 | \n"," 3 | \n"," 1 | \n"," 3 | \n"," Heikkinen, Miss. Laina | \n"," female | \n"," 26.0 | \n"," 0 | \n"," 0 | \n"," STON/O2. 3101282 | \n"," 7.9250 | \n"," NaN | \n"," S | \n","
\n"," \n"," 3 | \n"," 4 | \n"," 1 | \n"," 1 | \n"," Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n"," female | \n"," 35.0 | \n"," 1 | \n"," 0 | \n"," 113803 | \n"," 53.1000 | \n"," C123 | \n"," S | \n","
\n"," \n"," 4 | \n"," 5 | \n"," 0 | \n"," 3 | \n"," Allen, Mr. William Henry | \n"," male | \n"," 35.0 | \n"," 0 | \n"," 0 | \n"," 373450 | \n"," 8.0500 | \n"," NaN | \n"," S | \n","
\n"," \n","
\n","
\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"titan","summary":"{\n \"name\": \"titan\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"PassengerId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 257,\n \"min\": 1,\n \"max\": 891,\n \"num_unique_values\": 891,\n \"samples\": [\n 710,\n 440,\n 841\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Survived\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 891,\n \"samples\": [\n \"Moubarek, Master. Halim Gonios (\\\"William George\\\")\",\n \"Kvillner, Mr. Johan Henrik Johannesson\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SibSp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 7,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Parch\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ticket\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 681,\n \"samples\": [\n \"11774\",\n \"248740\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cabin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 147,\n \"samples\": [\n \"D45\",\n \"B49\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Embarked\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"S\",\n \"C\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":1}],"source":["import pandas as pd\n","\n","titan = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')\n","titan.head()"]},{"cell_type":"code","execution_count":null,"id":"c0ada0a1","metadata":{"id":"c0ada0a1","outputId":"3e90de2b-fc0a-42e6-8f2b-b2b7fd23335c"},"outputs":[{"data":{"text/plain":["(891, 
12)"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["titan.shape"]},{"cell_type":"markdown","id":"aec0eba2","metadata":{"id":"aec0eba2"},"source":["### Finding the number of missing values in each column"]},{"cell_type":"code","execution_count":null,"id":"2233cc78","metadata":{"id":"2233cc78","outputId":"dd68f0a1-098d-4c11-dced-5d18021739ce"},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Survived 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 177\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 0\n","Cabin 687\n","Embarked 2\n","dtype: int64"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["titan.isna().sum(axis = 0)"]},{"cell_type":"code","execution_count":null,"id":"ac85caa0","metadata":{"id":"ac85caa0","outputId":"0913a981-0747-4c07-a494-045b65ab7378"},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Survived 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 177\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 0\n","Cabin 687\n","Embarked 2\n","dtype: int64"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["titan.isnull().sum(axis = 0)"]},{"cell_type":"markdown","id":"2bc64de6","metadata":{"id":"2bc64de6"},"source":["### Dropping data where there's missing values for Age and Embarked"]},{"cell_type":"code","execution_count":null,"id":"2dfc1947","metadata":{"id":"2dfc1947"},"outputs":[],"source":["titan = titan.dropna(subset = ['Age', 'Embarked'])"]},{"cell_type":"code","execution_count":null,"id":"1e137e01","metadata":{"id":"1e137e01","outputId":"aa085d11-71cf-4011-b796-023e1ddd8769"},"outputs":[{"data":{"text/plain":["(712, 12)"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["titan.shape"]},{"cell_type":"code","execution_count":null,"id":"85c5c820","metadata":{"id":"85c5c820","outputId":"86532bf4-a93e-4ded-8855-1f123d23ec5d"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," PassengerId | \n"," Survived | \n"," Pclass | \n"," Name | \n"," Sex | \n"," Age | \n"," SibSp | \n"," Parch | \n"," Ticket | \n"," Fare | \n"," Cabin | \n"," Embarked | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 1 | \n"," 0 | \n"," 3 | \n"," Braund, Mr. Owen Harris | \n"," male | \n"," 22.0 | \n"," 1 | \n"," 0 | \n"," A/5 21171 | \n"," 7.2500 | \n"," NaN | \n"," S | \n","
\n"," \n"," 1 | \n"," 2 | \n"," 1 | \n"," 1 | \n"," Cumings, Mrs. John Bradley (Florence Briggs Th... | \n"," female | \n"," 38.0 | \n"," 1 | \n"," 0 | \n"," PC 17599 | \n"," 71.2833 | \n"," C85 | \n"," C | \n","
\n"," \n"," 2 | \n"," 3 | \n"," 1 | \n"," 3 | \n"," Heikkinen, Miss. Laina | \n"," female | \n"," 26.0 | \n"," 0 | \n"," 0 | \n"," STON/O2. 3101282 | \n"," 7.9250 | \n"," NaN | \n"," S | \n","
\n"," \n"," 3 | \n"," 4 | \n"," 1 | \n"," 1 | \n"," Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n"," female | \n"," 35.0 | \n"," 1 | \n"," 0 | \n"," 113803 | \n"," 53.1000 | \n"," C123 | \n"," S | \n","
\n"," \n"," 4 | \n"," 5 | \n"," 0 | \n"," 3 | \n"," Allen, Mr. William Henry | \n"," male | \n"," 35.0 | \n"," 0 | \n"," 0 | \n"," 373450 | \n"," 8.0500 | \n"," NaN | \n"," S | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 885 | \n"," 886 | \n"," 0 | \n"," 3 | \n"," Rice, Mrs. William (Margaret Norton) | \n"," female | \n"," 39.0 | \n"," 0 | \n"," 5 | \n"," 382652 | \n"," 29.1250 | \n"," NaN | \n"," Q | \n","
\n"," \n"," 886 | \n"," 887 | \n"," 0 | \n"," 2 | \n"," Montvila, Rev. Juozas | \n"," male | \n"," 27.0 | \n"," 0 | \n"," 0 | \n"," 211536 | \n"," 13.0000 | \n"," NaN | \n"," S | \n","
\n"," \n"," 887 | \n"," 888 | \n"," 1 | \n"," 1 | \n"," Graham, Miss. Margaret Edith | \n"," female | \n"," 19.0 | \n"," 0 | \n"," 0 | \n"," 112053 | \n"," 30.0000 | \n"," B42 | \n"," S | \n","
\n"," \n"," 889 | \n"," 890 | \n"," 1 | \n"," 1 | \n"," Behr, Mr. Karl Howell | \n"," male | \n"," 26.0 | \n"," 0 | \n"," 0 | \n"," 111369 | \n"," 30.0000 | \n"," C148 | \n"," C | \n","
\n"," \n"," 890 | \n"," 891 | \n"," 0 | \n"," 3 | \n"," Dooley, Mr. Patrick | \n"," male | \n"," 32.0 | \n"," 0 | \n"," 0 | \n"," 370376 | \n"," 7.7500 | \n"," NaN | \n"," Q | \n","
\n"," \n","
\n","
712 rows × 12 columns
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n",".. ... ... ... \n","885 886 0 3 \n","886 887 0 2 \n","887 888 1 1 \n","889 890 1 1 \n","890 891 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n",".. ... ... ... ... \n","885 Rice, Mrs. William (Margaret Norton) female 39.0 0 \n","886 Montvila, Rev. Juozas male 27.0 0 \n","887 Graham, Miss. Margaret Edith female 19.0 0 \n","889 Behr, Mr. Karl Howell male 26.0 0 \n","890 Dooley, Mr. Patrick male 32.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S \n",".. ... ... ... ... ... \n","885 5 382652 29.1250 NaN Q \n","886 0 211536 13.0000 NaN S \n","887 0 112053 30.0000 B42 S \n","889 0 111369 30.0000 C148 C \n","890 0 370376 7.7500 NaN Q \n","\n","[712 rows x 12 columns]"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["titan"]},{"cell_type":"code","execution_count":null,"id":"0c8e824a","metadata":{"id":"0c8e824a"},"outputs":[],"source":["import numpy as np"]},{"cell_type":"code","execution_count":null,"id":"eea18af5","metadata":{"id":"eea18af5","outputId":"7a267873-9317-4f7b-f1c0-5d833e817b94"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," PassengerId | \n"," Weight | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 1 | \n"," 241 | \n","
\n"," \n"," 1 | \n"," 2 | \n"," 161 | \n","
\n"," \n"," 2 | \n"," 3 | \n"," 199 | \n","
\n"," \n"," 3 | \n"," 4 | \n"," 203 | \n","
\n"," \n"," 4 | \n"," 5 | \n"," 245 | \n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Weight\n","0 1 241\n","1 2 161\n","2 3 199\n","3 4 203\n","4 5 245"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["passenger_ids = np.arange(1, 892)\n","weights = np.random.randint(150, 251, size=891)\n","passenger_weight_df = pd.DataFrame({\n"," 'PassengerId': passenger_ids,\n"," 'Weight': weights\n","})\n","\n","passenger_weight_df.head()"]},{"cell_type":"markdown","id":"c70bfece","metadata":{"id":"c70bfece"},"source":["### Left joining Titanic dataset with Passenger Weight data"]},{"cell_type":"markdown","id":"54cb4b82","metadata":{"id":"54cb4b82"},"source":["pd.merge in pandas is used to combine two dataframes based on common columns. It automatically merges the dataframes on all columns that both dataframes have in common.\n","\n","Specifying how = 'left' performs a left join, meaning that the resulting dataframe include all rows from the left dataframe and the matched rows from the right dataframe. If there is no match, the right wide will contain NaN."]},{"cell_type":"code","execution_count":null,"id":"31762718","metadata":{"id":"31762718"},"outputs":[],"source":["titan_df = pd.merge(titan, passenger_weight_df, how=\"left\")"]},{"cell_type":"code","execution_count":null,"id":"ff2b4ef4","metadata":{"id":"ff2b4ef4","outputId":"46743d86-8852-4859-e76a-ed24c5a5fc3a"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," PassengerId | \n"," Survived | \n"," Pclass | \n"," Name | \n"," Sex | \n"," Age | \n"," SibSp | \n"," Parch | \n"," Ticket | \n"," Fare | \n"," Cabin | \n"," Embarked | \n"," Weight | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 1 | \n"," 0 | \n"," 3 | \n"," Braund, Mr. Owen Harris | \n"," male | \n"," 22.0 | \n"," 1 | \n"," 0 | \n"," A/5 21171 | \n"," 7.2500 | \n"," NaN | \n"," S | \n"," 241 | \n","
\n"," \n"," 1 | \n"," 2 | \n"," 1 | \n"," 1 | \n"," Cumings, Mrs. John Bradley (Florence Briggs Th... | \n"," female | \n"," 38.0 | \n"," 1 | \n"," 0 | \n"," PC 17599 | \n"," 71.2833 | \n"," C85 | \n"," C | \n"," 161 | \n","
\n"," \n"," 2 | \n"," 3 | \n"," 1 | \n"," 3 | \n"," Heikkinen, Miss. Laina | \n"," female | \n"," 26.0 | \n"," 0 | \n"," 0 | \n"," STON/O2. 3101282 | \n"," 7.9250 | \n"," NaN | \n"," S | \n"," 199 | \n","
\n"," \n"," 3 | \n"," 4 | \n"," 1 | \n"," 1 | \n"," Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n"," female | \n"," 35.0 | \n"," 1 | \n"," 0 | \n"," 113803 | \n"," 53.1000 | \n"," C123 | \n"," S | \n"," 203 | \n","
\n"," \n"," 4 | \n"," 5 | \n"," 0 | \n"," 3 | \n"," Allen, Mr. William Henry | \n"," male | \n"," 35.0 | \n"," 0 | \n"," 0 | \n"," 373450 | \n"," 8.0500 | \n"," NaN | \n"," S | \n"," 245 | \n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked Weight \n","0 0 A/5 21171 7.2500 NaN S 241 \n","1 0 PC 17599 71.2833 C85 C 161 \n","2 0 STON/O2. 3101282 7.9250 NaN S 199 \n","3 0 113803 53.1000 C123 S 203 \n","4 0 373450 8.0500 NaN S 245 "]},"execution_count":43,"metadata":{},"output_type":"execute_result"}],"source":["titan_df.head()"]},{"cell_type":"markdown","id":"b21ae968","metadata":{"id":"b21ae968"},"source":["In this case, the common column is PassengerId, so the dataframes merged on PassengerId"]},{"cell_type":"markdown","id":"6e7b9cf0","metadata":{"id":"6e7b9cf0"},"source":["### Checking how many different type of Cabin there is in the dataset"]},{"cell_type":"code","execution_count":null,"id":"d7154e2a","metadata":{"id":"d7154e2a","outputId":"309c691f-4f64-4b87-f3ff-3f738f98c4f5"},"outputs":[{"data":{"text/plain":["Cabin\n","G6 4\n","B96 B98 4\n","C23 C25 C27 4\n","F33 3\n","D 3\n"," ..\n","C91 1\n","C124 1\n","C32 1\n","E34 1\n","C148 1\n","Name: count, Length: 133, dtype: int64"]},"execution_count":50,"metadata":{},"output_type":"execute_result"}],"source":["titan_df['Cabin'].value_counts()"]},{"cell_type":"markdown","id":"576a4ca1","metadata":{"id":"576a4ca1"},"source":["### Limiting to just top 100 Cabins"]},{"cell_type":"code","execution_count":null,"id":"0c52aca1","metadata":{"id":"0c52aca1"},"outputs":[],"source":["cabins = titan_df['Cabin'].value_counts()[:100].index.tolist()\n","temp = titan_df[titan_df['Cabin'].isin(cabins)]"]},{"cell_type":"code","execution_count":null,"id":"ef03e263","metadata":{"id":"ef03e263","outputId":"8e2ac7c4-5645-4742-e982-e5178d5e4cd0"},"outputs":[{"data":{"text/plain":["(150, 13)"]},"execution_count":48,"metadata":{},"output_type":"execute_result"}],"source":["temp.shape"]},{"cell_type":"markdown","id":"8476f25f","metadata":{"id":"8476f25f"},"source":["## Part 2"]},{"cell_type":"code","execution_count":null,"id":"058211f2","metadata":{"id":"058211f2","outputId":"1fad5ed6-8551-4adc-c979-67eeeab2fc25"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," sepal length (cm) | \n"," sepal width (cm) | \n"," petal length (cm) | \n"," petal width (cm) | \n"," species | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 5.1 | \n"," 3.5 | \n"," 1.4 | \n"," 0.2 | \n"," 0 | \n","
\n"," \n"," 1 | \n"," 4.9 | \n"," 3.0 | \n"," 1.4 | \n"," 0.2 | \n"," 0 | \n","
\n"," \n"," 2 | \n"," 4.7 | \n"," 3.2 | \n"," 1.3 | \n"," 0.2 | \n"," 0 | \n","
\n"," \n"," 3 | \n"," 4.6 | \n"," 3.1 | \n"," 1.5 | \n"," 0.2 | \n"," 0 | \n","
\n"," \n"," 4 | \n"," 5.0 | \n"," 3.6 | \n"," 1.4 | \n"," 0.2 | \n"," 0 | \n","
\n"," \n","
\n","
"],"text/plain":[" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n","0 5.1 3.5 1.4 0.2 \n","1 4.9 3.0 1.4 0.2 \n","2 4.7 3.2 1.3 0.2 \n","3 4.6 3.1 1.5 0.2 \n","4 5.0 3.6 1.4 0.2 \n","\n"," species \n","0 0 \n","1 0 \n","2 0 \n","3 0 \n","4 0 "]},"execution_count":59,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","from sklearn.datasets import load_iris\n","\n","iris = load_iris()\n","df = pd.DataFrame(iris.data, columns=iris.feature_names)\n","\n","# Add the target variable\n","df['species'] = pd.DataFrame(iris.target)\n","\n","df.head()"]},{"cell_type":"markdown","id":"8fdd8c96","metadata":{"id":"8fdd8c96"},"source":["### Splitting into predictors (everything else) and outcome variable (species)"]},{"cell_type":"code","execution_count":null,"id":"8ae4e66e","metadata":{"id":"8ae4e66e"},"outputs":[],"source":["x = df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)','petal width (cm)']]\n","y = df['species']"]},{"cell_type":"markdown","id":"1f733f44","metadata":{"id":"1f733f44"},"source":["### Splitting into 80% train and 20% test"]},{"cell_type":"markdown","id":"8afb0072","metadata":{"id":"8afb0072"},"source":["For the purpose of this lab, do **not** use train_test_split from sklearn"]},{"cell_type":"code","execution_count":null,"id":"fbeba70e","metadata":{"id":"fbeba70e"},"outputs":[],"source":["n_train = int(len(df)*0.8)\n","n_test = len(df) - n_train"]},{"cell_type":"code","execution_count":null,"id":"29d30598","metadata":{"id":"29d30598"},"outputs":[],"source":["train_x = x[:n_train]\n","train_y = y[:n_train]\n","test_x = x[n_train:]\n","test_y = y[n_train:]"]},{"cell_type":"markdown","id":"6b96acb2","metadata":{"id":"6b96acb2"},"source":["train_x is the predictors for the train data, test_x is the outcome for the train data\n","\n","train_y is the predictors for the test data, test_y is the outcome for the test data"]},{"cell_type":"markdown","id":"172aa656","metadata":{"id":"172aa656"},"source":["### Traininig a model"]},{"cell_type":"code","execution_count":null,"id":"e4c4868f","metadata":{"id":"e4c4868f"},"outputs":[],"source":["from sklearn.svm import SVC\n","from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support"]},{"cell_type":"code","execution_count":null,"id":"31c1e700","metadata":{"id":"31c1e700"},"outputs":[],"source":["def train_SVM(X,y, kernel = 'linear'):\n"," clf = SVC(kernel = kernel)\n"," clf.fit(X, y)\n","\n"," return clf"]},{"cell_type":"markdown","id":"e20760be","metadata":{"id":"e20760be"},"source":["Training on the train data"]},{"cell_type":"code","execution_count":null,"id":"095187e1","metadata":{"id":"095187e1"},"outputs":[],"source":["iris_clf = train_SVM(train_x, train_y)"]},{"cell_type":"markdown","id":"eeb041a3","metadata":{"id":"eeb041a3"},"source":["### Making Predictions"]},{"cell_type":"markdown","id":"a3271c98","metadata":{"id":"a3271c98"},"source":["Making predictions on the train and test data"]},{"cell_type":"code","execution_count":null,"id":"278f027d","metadata":{"id":"278f027d"},"outputs":[],"source":["predicted_train_y = iris_clf.predict(train_x)\n","predicted_test_y = iris_clf.predict(test_x)"]},{"cell_type":"markdown","id":"a1348319","metadata":{"id":"a1348319"},"source":["## Part 3 - Model Assessment"]},{"cell_type":"markdown","id":"f1307229","metadata":{"id":"f1307229"},"source":["### Classification report on the predictions generated from training 
data"]},{"cell_type":"code","execution_count":null,"id":"b67705ae","metadata":{"id":"b67705ae","outputId":"88cdd795-e1bb-4ca4-f255-515700053323"},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 50\n"," 1 1.00 0.98 0.99 50\n"," 2 0.95 1.00 0.98 20\n","\n"," accuracy 0.99 120\n"," macro avg 0.98 0.99 0.99 120\n","weighted avg 0.99 0.99 0.99 120\n","\n"]}],"source":["print(classification_report(train_y, predicted_train_y))"]},{"cell_type":"markdown","id":"52484698","metadata":{"id":"52484698"},"source":["### Confusion Matrix on the predictions generated from training data"]},{"cell_type":"code","execution_count":null,"id":"e649dace","metadata":{"id":"e649dace","outputId":"3ff50e90-e803-4f7f-a71b-9a0067d6a674"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[50 0 0]\n"," [ 0 49 1]\n"," [ 0 0 20]]\n"]}],"source":["print(confusion_matrix(train_y, predicted_train_y, sample_weight=None))"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.5"},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":5}
--------------------------------------------------------------------------------
/XX_section/D8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COGS108/Lectures-Ellis/c9dbb70bd3b6e72b2835115aa6f1c1bfdf89ee79/XX_section/D8.pdf
--------------------------------------------------------------------------------