├── .gitignore
├── Data
├── 13May16
│ ├── contains.txt
│ ├── subjects.p
│ └── users.p
├── CrowdstormingDataJuly1st.csv
├── love_those_books.csv
├── salary.csv
├── salary_exercise.csv
└── student-mat.csv
├── LICENSE
├── Notebooks
├── .gitignore
├── 1_Introduction_to_Programming.ipynb
├── 2.1_MatplotlibTemplates.ipynb
├── 2_Introduction_to_Dataframes_&_Plotting.ipynb
├── 3_Introduction_to_Regression.ipynb
├── 4_Mediation_&_Moderation.ipynb
├── 5_Prediction_&_Regularization.ipynb
├── 6_Introduction_to_SocialNetworkAnalysis.ipynb
├── 7_Introduction_to_Scraping.ipynb
├── Figures
│ ├── centrality.png
│ ├── estimating_coefficients.png
│ ├── hw2-3.png
│ ├── hw2-4.png
│ ├── mediation1.png
│ ├── moderator1.gif
│ ├── moderator2.gif
│ ├── moderator3.jpeg
│ ├── nodeedge.png
│ └── slope_intercept.png
└── Old
│ ├── Classification.ipynb
│ └── phacking.ipynb
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/Data/13May16/contains.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Data/13May16/contains.txt
--------------------------------------------------------------------------------
/Data/13May16/users.p:
--------------------------------------------------------------------------------
1 | (lp0
2 | (dp1
3 | Vservices
4 | p2
5 | (dp3
6 | Vpassword
7 | p4
8 | (dp5
9 | Vbcrypt
10 | p6
11 | V$2a$10$pbAb8K6OmsOmonCzbGRf0e1MrLp7ec1JnYIzEctMLyn9btOpzvAmG
12 | p7
13 | ssVresume
14 | p8
15 | (dp9
16 | VloginTokens
17 | p10
18 | (lp11
19 | sssV_id
20 | p12
21 | Vogrw4dL9mksHEKigC
22 | p13
23 | sVemails
24 | p14
25 | (lp15
26 | (dp16
27 | Vverified
28 | p17
29 | I00
30 | sVaddress
31 | p18
32 | Vejolly@test.com
33 | p19
34 | sasVcreatedAt
35 | p20
36 | cdatetime
37 | datetime
38 | p21
39 | (S'\x07\xe0\x05\x08\x17#\x1d\n\xcd\xa0'
40 | p22
41 | tp23
42 | Rp24
43 | sa(dp25
44 | Vservices
45 | p26
46 | (dp27
47 | Vpassword
48 | p28
49 | (dp29
50 | Vbcrypt
51 | p30
52 | V$2a$10$5P3UUGulQ7RACVROaU56l.H1gYvRhDbuvmuXM7UXHzC/6iyt3gOJ.
53 | p31
54 | ssVresume
55 | p32
56 | (dp33
57 | VloginTokens
58 | p34
59 | (lp35
60 | sssV_id
61 | p36
62 | VGwbhewAmNJ3nA8g4d
63 | p37
64 | sVemails
65 | p38
66 | (lp39
67 | (dp40
68 | Vverified
69 | p41
70 | I00
71 | sVaddress
72 | p42
73 | Vjames.e.carmichael.gr@dartmouth.edu
74 | p43
75 | sasVcreatedAt
76 | p44
77 | g21
78 | (S"\x07\xe0\x05\x08\x17'6\x0bZ@"
79 | p45
80 | tp46
81 | Rp47
82 | sa(dp48
83 | Vservices
84 | p49
85 | (dp50
86 | Vpassword
87 | p51
88 | (dp52
89 | Vbcrypt
90 | p53
91 | V$2a$10$ymYsiX7HMD4inQ20MHD/VutAr844y3JvmjeOcqLH4wu8krSX8fUXK
92 | p54
93 | ssVresume
94 | p55
95 | (dp56
96 | VloginTokens
97 | p57
98 | (lp58
99 | sssV_id
100 | p59
101 | VtksS632f4u9JtZ3LZ
102 | p60
103 | sVemails
104 | p61
105 | (lp62
106 | (dp63
107 | Vverified
108 | p64
109 | I00
110 | sVaddress
111 | p65
112 | Vcbentivoglio@middlebury.edu
113 | p66
114 | sasVcreatedAt
115 | p67
116 | g21
117 | (S'\x07\xe0\x05\t\x006;\x02Y\x90'
118 | p68
119 | tp69
120 | Rp70
121 | sa(dp71
122 | Vservices
123 | p72
124 | (dp73
125 | Vpassword
126 | p74
127 | (dp75
128 | Vbcrypt
129 | p76
130 | V$2a$10$uBmZiohM3ZdT/EpsZWAlUeffalyYaZAToT.wtqAmPEMTABfWk14wi
131 | p77
132 | ssVresume
133 | p78
134 | (dp79
135 | VloginTokens
136 | p80
137 | (lp81
138 | sssV_id
139 | p82
140 | VQa8shq3L9omL5xHKo
141 | p83
142 | sVemails
143 | p84
144 | (lp85
145 | (dp86
146 | Vverified
147 | p87
148 | I00
149 | sVaddress
150 | p88
151 | Vtest@test.com
152 | p89
153 | sasVcreatedAt
154 | p90
155 | g21
156 | (S'\x07\xe0\x05\t\x10\x1d#\x05\xcc`'
157 | p91
158 | tp92
159 | Rp93
160 | sa(dp94
161 | Vservices
162 | p95
163 | (dp96
164 | Vpassword
165 | p97
166 | (dp98
167 | Vbcrypt
168 | p99
169 | V$2a$10$0Bhtui2bp05B9GyS3KBhtubQz9xlVCXYhDvefDAW47TCS/iPxRNUG
170 | p100
171 | ssVresume
172 | p101
173 | (dp102
174 | VloginTokens
175 | p103
176 | (lp104
177 | (dp105
178 | VhashedToken
179 | p106
180 | VbyXZrVYdeasFKoHzaKN0kjoffxeHAEIdcJSJZttl2Ik=
181 | p107
182 | sVwhen
183 | p108
184 | g21
185 | (S"\x07\xe0\x05\n\x16\x18'\x07V\xe8"
186 | p109
187 | tp110
188 | Rp111
189 | sa(dp112
190 | VhashedToken
191 | p113
192 | V5deRxFJTT8QhMfCES+CpwsQ2R31GgE8/RYnVb2UHw7g=
193 | p114
194 | sVwhen
195 | p115
196 | g21
197 | (S'\x07\xe0\x05\n\x16\x19.\t\xa0\xd8'
198 | p116
199 | tp117
200 | Rp118
201 | sa(dp119
202 | VhashedToken
203 | p120
204 | V9ToUywNIWQBE9ZFr1ZK5hNBafsheg1zqOSs8IMCr9YE=
205 | p121
206 | sVwhen
207 | p122
208 | g21
209 | (S'\x07\xe0\x05\n\x17/\x01\x05\xcc`'
210 | p123
211 | tp124
212 | Rp125
213 | sasssV_id
214 | p126
215 | V3fcdRR5Y3hzdABSXn
216 | p127
217 | sVemails
218 | p128
219 | (lp129
220 | (dp130
221 | Vverified
222 | p131
223 | I00
224 | sVaddress
225 | p132
226 | Vdanrwhitcomb@gmail.com
227 | p133
228 | sasVcreatedAt
229 | p134
230 | g21
231 | (S'\x07\xe0\x05\t\x17#\x19\x01\xd0\xd8'
232 | p135
233 | tp136
234 | Rp137
235 | sa(dp138
236 | Vservices
237 | p139
238 | (dp140
239 | Vpassword
240 | p141
241 | (dp142
242 | Vbcrypt
243 | p143
244 | V$2a$10$eKtiz4QTXwGoGksJumGP0O6NQc9QhzlDHWC91gRop4ZKmbTbgpQ52
245 | p144
246 | ssVresume
247 | p145
248 | (dp146
249 | VloginTokens
250 | p147
251 | (lp148
252 | sssV_id
253 | p149
254 | Vm3q2uoH5waEFCdKLQ
255 | p150
256 | sVemails
257 | p151
258 | (lp152
259 | (dp153
260 | Vverified
261 | p154
262 | I00
263 | sVaddress
264 | p155
265 | Vanna.t.prescott.gr@dartmouth.edu
266 | p156
267 | sasVcreatedAt
268 | p157
269 | g21
270 | (S'\x07\xe0\x05\r\x0f\x1e\x03\x08)\xd8'
271 | p158
272 | tp159
273 | Rp160
274 | sa(dp161
275 | Vservices
276 | p162
277 | (dp163
278 | Vpassword
279 | p164
280 | (dp165
281 | Vbcrypt
282 | p166
283 | V$2a$10$URpEJIxBHRr4WHldL4jLkuGBnrIkLGXrEIvpwVeRcnU1Y17XgUOye
284 | p167
285 | ssVresume
286 | p168
287 | (dp169
288 | VloginTokens
289 | p170
290 | (lp171
291 | sssV_id
292 | p172
293 | VhZHxzvxjpzMzA4nJT
294 | p173
295 | sVemails
296 | p174
297 | (lp175
298 | (dp176
299 | Vverified
300 | p177
301 | I00
302 | sVaddress
303 | p178
304 | Varatidoo@gmail.com
305 | p179
306 | sasVcreatedAt
307 | p180
308 | g21
309 | (S'\x07\xe0\x05\r\x13&\x00\x08\x7f\xc8'
310 | p181
311 | tp182
312 | Rp183
313 | sa.
--------------------------------------------------------------------------------
/Data/love_those_books.csv:
--------------------------------------------------------------------------------
1 | enjoy,buy,read
4,16,6
15,19,13
1,0,1
11,19,13
13,25,12
19,24,11
6,22,7
10,21,8
15,13,12
3,7,4
11,28,15
20,31,14
7,4,7
11,26,14
10,11,9
6,12,5
7,14,7
18,16,12
8,20,10
2,13,6
7,12,9
12,23,13
13,22,9
15,19,13
4,12,9
3,10,5
9,7,7
7,22,8
10,7,8
2,0,2
15,16,7
1,17,6
3,11,9
6,5,9
13,29,15
15,29,11
16,20,9
14,16,7
1,3,2
8,8,10
--------------------------------------------------------------------------------
/Data/salary.csv:
--------------------------------------------------------------------------------
1 | salary,gender,departm,years,age,publications
86285,0,bio,26,64,72
77125,0,bio,28,58,43
71922,0,bio,10,38,23
70499,0,bio,16,46,64
66624,0,bio,11,41,23
64451,0,bio,23,60,44
64366,0,bio,23,53,22
59344,0,bio,5,40,11
58560,0,bio,8,38,8
58294,0,bio,20,50,12
56092,0,bio,2,40,4
54452,0,bio,13,43,7
54269,0,bio,26,56,12
55125,0,bio,8,38,9
97630,0,chem,34,64,43
82444,0,chem,31,61,42
76291,0,chem,29,65,33
75382,0,chem,26,56,39
64762,0,chem,25,,29
62607,0,chem,20,45,34
60373,0,chem,26,56,43
58892,0,chem,18,48,21
47021,0,chem,4,34,12
44687,0,chem,4,34,19
104828,0,geol,,50,44
71456,0,geol,11,41,32
65144,0,geol,7,37,12
52766,0,geol,4,38,32
112800,0,neuro,14,44,33
105761,0,neuro,9,39,30
92951,0,neuro,11,41,20
86621,0,neuro,19,49,10
85569,0,neuro,20,46,35
83896,0,neuro,10,40,22
79735,0,neuro,11,41,32
71518,0,neuro,7,37,34
68029,0,neuro,15,45,33
66482,0,neuro,14,44,42
61680,0,neuro,18,48,20
60455,0,neuro,8,38,49
58932,0,neuro,11,41,49
106412,0,stat,23,53,29
86980,0,stat,23,53,42
78114,0,stat,8,38,24
74085,0,stat,11,41,33
72250,0,stat,26,56,9
69596,0,stat,20,50,18
65285,0,stat,20,50,15
62557,0,stat,28,58,14
61947,0,stat,22,58,17
58565,0,stat,29,59,11
58365,0,stat,18,48,21
53656,0,stat,2,32,4
51391,0,stat,5,35,8
96936,0,physics,15,50,17
83216,0,physics,11,37,19
72044,0,physics,2,32,16
64048,0,physics,23,53,4
58888,0,physics,26,56,7
58744,0,physics,20,50,9
55944,0,physics,21,51,8
54076,0,physics,19,49,12
82142,0,math,9,39,9
70509,0,math,23,53,7
60320,0,math,14,44,7
55814,0,math,8,38,6
53638,0,math,4,42,8
53517,2,math,5,35,5
59139,1,bio,8,38,23
52968,1,bio,18,48,32
55949,1,chem,4,34,12
58893,1,neuro,10,35,4
53662,1,neuro,1,31,3
57185,1,stat,9,39,7
52254,1,stat,2,32,9
61885,1,math,23,60,9
49542,1,math,3,33,5
--------------------------------------------------------------------------------
/Data/salary_exercise.csv:
--------------------------------------------------------------------------------
1 | sx,rk,yr,dg,yd,sl
male,full,25,doctorate,35,36350
male,full,13,doctorate,22,35350
male,full,10,doctorate,23,28200
female,full,7,doctorate,27,26775
male,full,19,masters,30,33696
male,full,16,doctorate,21,28516
female,full,0,masters,32,24900
male,full,16,doctorate,18,31909
male,full,13,masters,30,31850
male,full,13,masters,31,32850
male,full,12,doctorate,22,27025
male,associate,15,doctorate,19,24750
male,full,9,doctorate,17,28200
male,associate,9,,27,23712
male,full,9,doctorate,24,25748
male,full,7,doctorate,15,29342
male,full,13,doctorate,20,31114
male,associate,11,masters,14,24742
male,associate,10,masters,15,22906
male,full,6,masters,21,24450
male,assistant,16,masters,23,19175
male,associate,8,masters,31,20525
male,full,7,doctorate,13,27959
female,full,8,doctorate,24,38045
male,associate,9,doctorate,12,
male,full,5,doctorate,18,25400
male,associate,11,doctorate,14,24800
female,full,5,doctorate,16,25500
male,associate,3,masters,7,26182
male,associate,3,masters,17,23725
female,assistant,10,masters,15,21600
male,associate,11,masters,31,23300
male,assistant,9,masters,14,23713
female,associate,4,masters,33,20690
female,associate,6,masters,29,22450
male,associate,1,doctorate,9,20850
female,assistant,8,doctorate,14,18304
male,assistant,4,doctorate,4,17095
male,assistant,4,doctorate,5,16700
male,assistant,4,doctorate,4,17600
,assistant,3,doctorate,4,18075
male,assistant,3,masters,11,18000
male,associate,0,doctorate,7,20999
female,assistant,3,doctorate,3,17250
male,assistant,2,doctorate,3,16500
male,assistant,2,doctorate,1,16094
female,assistant,2,doctorate,6,16150
female,assistant,2,doctorate,2,15350
male,assistant,1,doctorate,1,16244
female,assistant,1,doctorate,1,16686
female,assistant,1,doctorate,1,15000
female,assistant,0,doctorate,2,20300
--------------------------------------------------------------------------------
/Data/student-mat.csv:
--------------------------------------------------------------------------------
1 | school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,10,15,15,15
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,6,6,5,6
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,14,15,15
GP,F,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,1,2,2,0,10,8,9
GP,F,15,U,GT3,T,2,1,services,other,reputation,father,3,3,0,no,yes,no,yes,yes,yes,yes,no,5,2,2,1,1,4,4,10,12,12
GP,M,15,U,LE3,T,4,4,health,services,course,father,1,1,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,3,5,2,14,14,14
GP,M,15,U,GT3,T,4,3,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,5,4,3,1,2,3,2,10,10,11
GP,M,15,U,GT3,A,2,2,other,other,home,other,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,5,2,4,4,3,0,14,16,16
GP,F,16,U,GT3,T,4,4,health,other,home,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,4,4,4,1,2,2,4,14,14,14
GP,F,16,U,GT3,T,4,4,services,services,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,2,3,1,2,2,6,13,14,14
GP,F,16,U,GT3,T,3,3,other,other,reputation,mother,3,2,0,yes,yes,no,yes,yes,yes,no,no,5,3,2,1,1,4,4,8,10,10
GP,M,17,U,GT3,T,3,2,services,services,course,mother,1,1,3,no,yes,no,yes,yes,yes,yes,no,5,5,5,5,4,5,16,6,5,5
GP,M,16,U,LE3,T,4,3,health,other,home,father,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,1,3,1,3,5,4,8,10,10
GP,M,15,U,GT3,T,4,3,teacher,other,reputation,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,1,1,1,1,0,13,14,15
GP,M,15,U,GT3,T,4,4,health,health,other,father,1,1,0,no,yes,yes,no,yes,yes,yes,no,5,4,2,1,1,5,0,12,15,15
GP,M,16,U,LE3,T,4,2,teacher,other,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,4,5,1,5,3,5,2,15,15,16
GP,M,16,U,LE3,T,2,2,other,other,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,2,4,5,0,13,13,12
GP,F,15,R,GT3,T,2,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,10,9,8
GP,F,16,U,GT3,T,2,2,services,services,home,mother,1,1,2,no,yes,yes,no,no,yes,yes,no,1,2,2,1,3,5,14,6,9,8
GP,M,15,U,GT3,T,2,2,other,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,5,2,12,12,11
GP,M,15,U,GT3,T,4,2,health,services,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,2,2,4,2,4,1,4,15,16,15
GP,M,16,U,LE3,A,3,4,services,other,home,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,4,11,11,11
GP,M,16,U,GT3,T,4,4,teacher,teacher,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,5,5,5,5,16,10,12,11
GP,M,15,U,GT3,T,4,4,health,services,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,4,2,5,4,5,0,9,11,12
GP,M,15,U,GT3,T,4,4,services,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,1,1,1,5,0,17,16,17
GP,M,15,R,GT3,T,4,3,teacher,at_home,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,5,2,1,1,5,0,17,16,16
GP,M,15,U,LE3,T,3,3,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,2,1,1,2,0,8,10,12
GP,M,16,U,GT3,T,3,2,other,other,home,mother,1,1,0,no,yes,yes,no,no,yes,yes,no,5,4,3,1,1,5,0,12,14,15
GP,F,15,U,GT3,T,2,3,other,other,other,father,2,1,0,no,yes,no,yes,yes,yes,no,no,3,5,1,1,1,5,0,8,7,6
GP,M,15,U,LE3,T,4,3,teacher,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,1,4,2,15,16,18
GP,M,16,R,GT3,A,4,4,other,teacher,reputation,mother,2,3,0,no,yes,no,yes,yes,yes,yes,yes,2,4,3,1,1,5,7,15,16,15
GP,F,15,R,GT3,T,3,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,12,12,11
GP,F,15,R,GT3,T,2,2,at_home,other,reputation,mother,1,1,0,yes,yes,yes,yes,yes,yes,no,no,4,3,1,1,1,2,8,14,13,13
GP,F,16,U,LE3,T,2,2,other,other,home,mother,2,2,1,no,yes,no,yes,no,yes,yes,yes,3,3,3,1,2,3,25,7,10,11
GP,M,15,U,LE3,T,4,4,teacher,other,home,other,1,1,0,no,yes,no,no,no,yes,yes,yes,5,4,3,5,4,5,8,12,12,12
GP,M,15,U,GT3,T,4,4,services,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,3,1,1,5,2,19,18,18
GP,M,15,U,GT3,T,2,2,services,services,course,father,1,1,0,yes,yes,no,no,yes,yes,yes,no,5,4,1,1,1,1,0,8,8,11
GP,F,16,U,LE3,T,2,2,other,at_home,course,father,2,2,1,yes,no,no,yes,yes,yes,yes,no,4,3,3,2,2,5,14,10,10,9
GP,F,15,U,LE3,A,4,3,other,other,course,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,yes,5,2,2,1,1,5,8,8,8,6
GP,F,16,U,LE3,A,3,3,other,services,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,3,5,1,4,3,12,11,12,11
GP,M,16,U,GT3,T,4,3,health,services,reputation,mother,1,4,0,no,no,no,yes,yes,yes,yes,no,4,2,2,1,1,2,4,19,19,20
GP,M,15,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,3,2,2,5,2,15,15,14
GP,F,15,U,GT3,T,4,4,services,teacher,other,father,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,4,1,1,3,2,7,7,7
GP,F,16,U,LE3,T,2,2,services,services,course,mother,3,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,4,2,12,13,13
GP,F,15,U,LE3,T,4,2,health,other,other,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,5,2,11,13,13
GP,M,15,U,LE3,A,4,2,health,health,other,father,2,1,1,no,no,no,no,yes,yes,no,no,5,5,5,5,4,5,6,11,11,10
GP,F,15,U,GT3,T,4,4,services,services,course,mother,1,1,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,2,3,5,0,8,10,11
GP,F,15,U,LE3,A,3,3,other,other,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,5,3,4,4,4,1,6,10,13,13
GP,F,16,U,GT3,A,2,1,other,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,3,4,1,1,2,8,8,9,10
GP,F,15,U,GT3,A,4,3,services,services,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,1,0,14,15,15
GP,M,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,3,2,2,1,1,5,4,14,15,15
GP,M,15,U,LE3,T,1,2,other,at_home,home,father,1,2,0,yes,yes,no,yes,yes,yes,yes,no,4,3,2,1,1,5,2,9,10,9
GP,F,16,U,GT3,T,4,2,services,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,2,3,1,1,5,2,15,16,16
GP,F,16,R,GT3,T,4,4,health,teacher,other,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,2,4,4,5,3,4,6,10,11,11
GP,F,16,U,GT3,T,1,1,services,services,course,father,4,1,0,yes,yes,no,yes,no,yes,yes,yes,5,5,5,5,5,5,6,10,8,11
GP,F,16,U,LE3,T,1,2,other,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,no,4,4,3,1,1,1,4,8,10,9
GP,F,16,U,GT3,T,4,3,teacher,health,home,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,3,4,4,5,4,4,2,10,9,9
GP,F,15,U,LE3,T,4,3,services,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,yes,4,4,4,5,4,2,0,10,10,10
GP,F,16,U,LE3,T,4,3,teacher,services,course,mother,3,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,2,1,2,16,15,15
GP,M,15,U,GT3,A,4,4,other,services,reputation,mother,1,4,0,no,yes,no,yes,no,yes,yes,yes,1,3,3,5,5,3,4,13,13,12
GP,F,16,U,GT3,T,3,1,services,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,5,4,7,7,6
GP,F,15,R,LE3,T,2,2,health,services,reputation,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,1,3,1,3,4,2,8,9,8
GP,F,15,R,LE3,T,3,1,other,other,reputation,father,2,4,0,no,yes,no,no,no,yes,yes,no,4,4,2,5,3,3,12,16,16,16
GP,M,16,U,GT3,T,3,1,other,other,reputation,father,2,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,2,1,1,5,0,13,15,15
GP,M,15,U,GT3,T,4,2,other,other,course,mother,1,4,0,no,no,no,no,yes,yes,yes,no,3,3,3,1,1,3,0,10,10,10
GP,F,15,R,GT3,T,1,1,other,other,reputation,mother,1,2,2,yes,yes,no,no,no,yes,yes,yes,3,3,4,2,4,5,2,8,6,5
GP,M,16,U,GT3,T,3,1,other,other,reputation,mother,1,1,0,no,no,no,yes,yes,yes,no,no,5,3,2,2,2,5,2,12,12,14
GP,F,16,U,GT3,T,3,3,other,services,home,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,3,2,4,5,54,11,12,11
GP,M,15,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,2,3,5,6,9,9,10
GP,M,15,U,GT3,T,4,0,teacher,other,course,mother,2,4,0,no,no,no,yes,yes,yes,yes,no,3,4,3,1,1,1,8,11,11,10
GP,F,16,U,GT3,T,2,2,other,other,reputation,mother,1,4,0,no,no,yes,no,yes,yes,yes,yes,5,2,3,1,3,3,0,11,11,11
GP,M,17,U,GT3,T,2,1,other,other,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,5,5,3,2,8,8,10
GP,F,16,U,GT3,T,3,4,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,4,3,1,2,3,12,5,5,5
GP,M,15,U,GT3,T,2,3,other,services,course,father,1,1,0,yes,yes,yes,yes,no,yes,yes,yes,3,2,2,1,3,3,2,10,12,12
GP,M,15,U,GT3,T,2,3,other,other,home,mother,1,3,0,yes,no,yes,no,no,yes,yes,no,5,3,2,1,2,5,4,11,10,11
GP,F,15,U,LE3,T,3,2,services,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,1,1,5,10,7,6,6
GP,M,15,U,LE3,T,2,2,services,services,home,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,5,3,3,1,3,4,4,15,15,15
GP,F,15,U,GT3,T,1,1,other,other,home,father,1,2,0,no,yes,no,yes,no,yes,yes,no,4,3,2,2,3,4,2,9,10,10
GP,F,15,U,GT3,T,4,4,services,services,reputation,father,2,2,2,no,no,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,7,9,8
GP,F,16,U,LE3,T,2,2,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,4,1,2,2,4,8,7,6
GP,F,15,U,GT3,T,4,2,other,other,reputation,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,3,1,4,13,14,14
GP,M,16,U,GT3,T,2,2,services,other,reputation,father,2,2,1,no,no,yes,yes,no,yes,yes,no,4,4,2,1,1,3,12,11,10,10
GP,M,16,U,LE3,A,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,1,3,3,5,5,18,8,6,7
GP,F,16,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,3,4,0,7,7,8
GP,F,15,U,GT3,T,4,3,services,other,reputation,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,4,5,5,5,3,1,4,16,17,18
GP,F,16,U,LE3,T,3,1,other,other,home,father,1,2,0,yes,yes,no,no,yes,yes,no,no,3,3,3,2,3,2,4,7,6,6
GP,F,16,U,GT3,T,4,2,teacher,services,home,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,1,1,0,11,10,10
GP,M,15,U,LE3,T,2,2,services,health,reputation,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,3,4,1,1,4,6,11,13,14
GP,F,15,R,GT3,T,1,1,at_home,other,home,mother,2,4,1,yes,yes,yes,yes,yes,yes,yes,no,3,1,2,1,1,1,2,7,10,10
GP,M,16,R,GT3,T,4,3,services,other,reputation,mother,2,1,0,yes,yes,no,yes,no,yes,yes,no,3,3,3,1,1,4,2,11,15,15
GP,F,16,U,GT3,T,2,1,other,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,no,yes,4,3,5,1,1,5,2,8,9,10
GP,F,16,U,GT3,T,4,4,other,other,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,2,1,6,11,14,14
GP,F,16,U,GT3,T,4,3,other,at_home,course,mother,1,3,0,yes,yes,yes,no,yes,yes,yes,no,5,3,5,1,1,3,0,7,9,8
GP,M,16,U,GT3,T,4,4,services,services,other,mother,1,1,0,yes,yes,yes,yes,yes,yes,yes,no,4,5,5,5,5,4,14,7,7,5
GP,M,16,U,GT3,T,4,4,services,teacher,other,father,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,4,3,1,1,4,0,16,17,17
GP,M,15,U,GT3,T,4,4,services,other,course,mother,1,1,0,no,yes,no,yes,no,yes,yes,no,5,3,3,1,1,5,4,10,13,14
GP,F,15,U,GT3,T,3,2,services,other,home,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,3,5,1,1,2,26,7,6,6
GP,M,15,U,GT3,A,3,4,services,other,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,1,1,0,16,18,18
GP,F,15,U,GT3,A,3,3,other,health,reputation,father,1,4,0,yes,no,no,no,yes,yes,no,no,4,3,3,1,1,4,10,10,11,11
GP,F,15,U,GT3,T,2,2,other,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,5,1,2,1,1,3,8,7,8,8
GP,M,16,U,GT3,T,3,3,services,other,home,father,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,2,16,18,18
GP,M,15,R,GT3,T,4,4,other,other,home,father,4,4,0,no,yes,yes,yes,yes,yes,yes,yes,1,3,5,3,5,1,6,10,13,13
GP,F,16,U,LE3,T,4,4,health,health,other,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,1,4,4,14,15,16
GP,M,15,U,LE3,A,4,4,teacher,teacher,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,5,3,5,5,4,6,18,19,19
GP,F,16,R,GT3,T,3,3,services,other,reputation,father,1,3,1,yes,yes,no,yes,yes,yes,yes,no,4,1,2,1,1,2,0,7,10,10
GP,F,16,U,GT3,T,2,2,at_home,other,home,mother,1,2,1,yes,no,no,yes,yes,yes,yes,no,3,1,2,1,1,5,6,10,13,13
GP,M,15,U,LE3,T,4,2,teacher,other,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,5,2,5,5,3,10,18,19,19
GP,M,15,R,GT3,T,2,1,health,services,reputation,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,5,4,2,1,1,5,8,9,9,9
GP,M,16,U,GT3,T,4,4,teacher,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,1,2,5,2,15,15,16
GP,M,15,U,GT3,T,4,4,other,teacher,reputation,father,2,2,0,no,yes,no,yes,yes,yes,no,no,4,4,3,1,1,2,2,11,13,14
GP,M,16,U,GT3,T,3,3,other,services,home,father,2,1,0,no,no,no,yes,yes,yes,yes,no,5,4,2,1,1,5,0,13,14,13
GP,M,17,R,GT3,T,1,3,other,other,course,father,3,2,1,no,yes,no,yes,yes,yes,yes,no,5,2,4,1,4,5,20,9,7,8
GP,M,15,U,GT3,T,3,4,other,other,reputation,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,3,1,2,4,6,14,13,13
GP,F,15,U,GT3,T,1,2,at_home,services,course,mother,1,2,0,no,no,no,no,no,yes,yes,no,3,2,3,1,2,1,2,16,15,15
GP,M,15,U,GT3,T,2,2,services,services,home,father,1,4,0,no,yes,yes,yes,yes,yes,yes,no,5,5,4,1,2,5,6,16,14,15
GP,F,16,U,LE3,T,2,4,other,health,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,2,2,1,2,5,2,13,13,13
GP,M,16,U,GT3,T,4,4,health,other,course,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,3,4,4,5,4,5,18,14,11,13
GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,5,4,4,1,1,5,0,8,7,8
GP,M,15,U,GT3,T,3,4,services,services,home,father,1,1,0,yes,no,no,no,yes,yes,yes,no,5,5,5,3,2,5,0,13,13,12
GP,F,15,U,LE3,A,3,4,other,other,home,mother,1,2,0,yes,no,no,yes,yes,yes,yes,yes,5,3,2,1,1,1,0,7,10,11
GP,F,19,U,GT3,T,0,1,at_home,other,course,other,1,2,3,no,yes,no,no,no,no,no,no,3,4,2,1,1,5,2,7,8,9
GP,M,18,R,GT3,T,2,2,services,other,reputation,mother,1,1,2,no,yes,no,yes,yes,yes,yes,no,3,3,3,1,2,4,0,7,4,0
GP,M,16,R,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,5,5,4,5,4,8,18,18,18
GP,F,15,R,GT3,T,3,4,services,teacher,course,father,2,3,2,no,yes,no,no,yes,yes,yes,yes,4,2,2,2,2,5,0,12,0,0
GP,F,15,U,GT3,T,1,1,at_home,other,course,mother,3,1,0,no,yes,no,yes,no,yes,yes,yes,4,3,3,1,2,4,0,8,0,0
GP,F,17,U,LE3,T,2,2,other,other,course,father,1,1,0,no,yes,no,no,yes,yes,yes,yes,3,4,4,1,3,5,12,10,13,12
GP,F,16,U,GT3,A,3,4,services,other,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,1,1,4,5,16,12,11,11
GP,M,15,R,GT3,T,3,4,at_home,teacher,course,mother,4,2,0,no,yes,no,no,yes,yes,no,yes,5,3,3,1,1,5,0,9,0,0
GP,F,15,U,GT3,T,4,4,services,at_home,course,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,1,1,5,0,11,0,0
GP,M,17,R,GT3,T,3,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,no,5,4,5,5,4,5,0,10,0,0
GP,F,16,U,GT3,A,3,3,other,other,course,other,2,1,2,no,yes,no,yes,no,yes,yes,yes,4,3,2,1,1,5,0,4,0,0
GP,M,16,U,LE3,T,1,1,services,other,course,mother,1,2,1,no,no,no,no,yes,yes,no,yes,4,4,4,5,3,5,0,14,12,12
GP,F,15,U,GT3,T,4,4,teacher,teacher,course,mother,2,1,0,no,no,no,yes,yes,yes,yes,no,4,3,2,1,1,5,0,16,16,15
GP,M,15,U,GT3,T,4,3,teacher,services,course,father,2,4,0,yes,yes,no,no,yes,yes,yes,no,2,2,2,1,1,3,0,7,9,0
GP,M,16,U,LE3,T,2,2,services,services,reputation,father,2,1,2,no,yes,no,yes,yes,yes,yes,no,2,3,3,2,2,2,8,9,9,9
GP,F,15,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,2,2,1,1,5,2,9,11,11
GP,F,16,U,LE3,T,1,1,at_home,at_home,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,4,5,3,1,2,14,14,13
GP,M,17,U,GT3,T,2,1,other,other,home,mother,1,1,3,no,yes,no,no,yes,yes,yes,no,5,4,5,1,2,5,0,5,0,0
GP,F,15,U,GT3,T,1,1,other,services,course,father,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,2,1,2,5,0,8,11,11
GP,F,15,U,GT3,T,3,2,health,services,home,father,1,2,3,no,yes,no,no,yes,yes,yes,no,3,3,2,1,1,3,0,6,7,0
GP,F,15,U,GT3,T,1,2,at_home,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,5,2,10,11,11
GP,M,16,U,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,yes,no,no,yes,no,yes,yes,3,3,2,2,1,5,0,7,6,0
GP,M,15,U,LE3,A,2,1,services,other,course,mother,4,1,3,no,no,no,no,yes,yes,yes,no,4,5,5,4,5,5,0,8,9,10
GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,3,no,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,0,6,5,0
GP,M,16,U,LE3,T,2,1,at_home,other,course,mother,1,1,1,no,no,no,yes,yes,yes,no,yes,4,4,4,5,5,5,6,12,13,14
GP,F,15,R,GT3,T,3,3,services,services,reputation,other,2,3,2,no,yes,yes,yes,yes,yes,yes,yes,4,2,1,2,3,3,8,10,10,10
GP,M,19,U,GT3,T,3,2,services,at_home,home,mother,1,1,3,no,yes,no,no,yes,no,yes,yes,4,5,4,1,1,4,0,5,0,0
GP,F,17,U,GT3,T,4,4,other,teacher,course,mother,1,1,0,yes,yes,no,no,yes,yes,no,yes,4,2,1,1,1,4,0,11,11,12
GP,M,15,R,GT3,T,2,3,at_home,services,course,mother,1,2,0,yes,no,yes,yes,yes,yes,no,no,4,4,4,1,1,1,2,11,8,8
GP,M,17,R,LE3,T,1,2,other,other,reputation,mother,1,1,0,no,no,no,no,yes,yes,no,no,2,2,2,3,3,5,8,16,12,13
GP,F,18,R,GT3,T,1,1,at_home,other,course,mother,3,1,3,no,yes,no,yes,no,yes,no,no,5,2,5,1,5,4,6,9,8,10
GP,M,16,R,GT3,T,2,2,at_home,other,course,mother,3,1,0,no,no,no,no,no,yes,no,no,4,2,2,1,2,3,2,17,15,15
GP,M,16,U,GT3,T,3,3,other,services,course,father,1,2,1,no,yes,yes,no,yes,yes,yes,yes,4,5,5,4,4,5,4,10,12,12
GP,M,17,R,LE3,T,2,1,at_home,other,course,mother,2,1,2,no,no,no,yes,yes,no,yes,yes,3,3,2,2,2,5,0,7,6,0
GP,M,15,R,GT3,T,3,2,other,other,course,mother,2,2,2,yes,yes,no,no,yes,yes,yes,yes,4,4,4,5,4,3,6,5,9,7
GP,M,16,U,LE3,T,1,2,other,other,course,mother,2,1,1,no,no,no,yes,yes,yes,no,no,4,4,4,5,4,5,0,7,0,0
GP,M,17,U,GT3,T,1,3,at_home,services,course,father,1,1,0,no,no,no,no,yes,no,yes,no,5,3,3,1,4,2,2,10,10,10
GP,M,17,R,LE3,T,1,1,other,services,course,mother,4,2,3,no,no,no,yes,yes,no,no,yes,5,3,5,1,5,5,0,5,8,7
GP,M,16,U,GT3,T,3,2,services,services,course,mother,2,1,1,no,yes,no,yes,no,no,no,no,4,5,2,1,1,2,16,12,11,12
GP,M,16,U,GT3,T,2,2,other,other,course,father,1,2,0,no,no,no,no,yes,no,yes,no,4,3,5,2,4,4,4,10,10,10
GP,F,16,U,GT3,T,4,2,health,services,home,father,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,2,3,1,1,3,0,14,15,16
GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,1,5,1,1,4,0,6,7,0
GP,F,16,U,GT3,T,4,4,health,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,2,1,1,3,0,14,14,14
GP,M,16,U,GT3,T,3,4,other,other,course,father,3,1,2,no,yes,no,yes,no,yes,yes,no,3,4,5,5,4,2,0,6,5,0
GP,M,16,U,GT3,T,1,0,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,1,3,2,13,15,16
GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,3,5,0,13,11,10
GP,F,16,U,GT3,T,1,3,at_home,services,home,mother,1,2,3,no,no,no,yes,no,yes,yes,yes,4,3,5,1,1,3,0,8,7,0
GP,F,16,U,LE3,T,3,3,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,5,1,1,4,4,10,11,9
GP,M,17,U,LE3,T,4,3,teacher,other,course,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,4,4,4,4,4,4,10,9,9
GP,F,16,U,GT3,T,2,2,services,other,reputation,mother,2,2,0,no,no,yes,yes,no,yes,yes,no,3,4,4,1,4,5,2,13,13,11
GP,M,17,U,GT3,T,3,3,other,other,reputation,father,1,2,0,no,no,no,yes,no,yes,yes,no,4,3,4,1,4,4,4,6,5,6
GP,M,16,R,GT3,T,4,2,teacher,services,other,mother,1,1,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,3,4,3,10,10,8,9
GP,M,17,U,GT3,T,4,3,other,other,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,2,3,1,1,2,4,10,10,11
GP,M,16,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,3,4,3,5,3,3,10,9,8,8
GP,M,16,U,GT3,T,3,3,services,other,home,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,3,2,12,13,12
GP,F,17,U,GT3,T,2,4,services,services,reputation,father,1,2,0,no,yes,no,yes,yes,yes,no,no,5,4,2,5,3,5,0,16,17,17
GP,F,17,U,LE3,T,3,3,other,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,3,3,2,3,1,56,9,9,8
GP,F,16,U,GT3,T,3,2,other,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,1,2,2,1,2,1,14,12,13,12
GP,M,17,U,GT3,T,3,3,services,services,other,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,3,4,2,3,4,12,12,12,11
GP,M,16,U,GT3,T,1,2,services,services,other,mother,1,1,0,no,yes,yes,yes,yes,yes,yes,yes,3,3,3,1,2,3,2,11,12,11
GP,M,16,U,LE3,T,2,1,other,other,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,5,0,15,15,15
GP,F,17,U,GT3,A,3,3,health,other,reputation,mother,1,2,0,no,yes,no,no,no,yes,yes,yes,3,3,3,1,3,3,6,8,7,9
GP,M,17,R,GT3,T,1,2,at_home,other,home,mother,1,2,0,no,no,no,no,yes,yes,no,no,3,1,3,1,5,3,4,8,9,10
GP,F,16,U,GT3,T,2,3,services,services,course,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,2,10,11,12,13
GP,F,17,U,GT3,T,1,1,at_home,services,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,3,1,1,3,0,8,8,9
GP,M,17,U,GT3,T,1,2,at_home,services,other,other,2,2,0,no,no,yes,yes,no,yes,yes,no,4,4,4,4,5,5,12,7,8,8
GP,M,16,R,GT3,T,3,3,services,services,reputation,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,4,3,2,3,4,5,8,8,9,10
GP,M,16,U,GT3,T,2,3,other,other,home,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,3,3,1,1,3,0,13,14,14
GP,F,17,U,LE3,T,2,4,services,services,course,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,2,1,1,5,0,14,15,15
GP,M,17,U,GT3,T,4,4,services,teacher,home,mother,1,1,0,no,no,no,no,yes,yes,yes,no,5,2,3,1,2,5,4,17,15,16
GP,M,16,R,LE3,T,3,3,teacher,other,home,father,3,1,0,no,yes,yes,yes,yes,yes,yes,no,3,3,4,3,5,3,8,9,9,10
GP,F,17,U,GT3,T,4,4,services,teacher,home,mother,2,1,1,no,yes,no,no,yes,yes,yes,no,4,2,4,2,3,2,24,18,18,18
GP,F,16,U,LE3,T,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,2,1,2,3,0,9,9,10
GP,F,16,U,GT3,T,4,3,health,other,home,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,5,1,5,2,2,16,16,16
GP,F,16,U,GT3,T,2,3,other,other,reputation,mother,1,2,0,yes,yes,yes,yes,yes,yes,no,no,4,4,3,1,3,4,6,8,10,10
GP,F,17,U,GT3,T,1,1,other,other,course,mother,1,2,0,no,yes,yes,no,no,yes,no,no,4,4,4,1,3,1,4,9,9,10
GP,F,17,R,GT3,T,2,2,other,other,reputation,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,5,3,2,1,2,3,18,7,6,6
GP,F,16,R,GT3,T,2,2,services,services,reputation,mother,2,4,0,no,yes,yes,yes,no,yes,yes,no,5,3,5,1,1,5,6,10,10,11
GP,F,17,U,GT3,T,3,4,at_home,services,home,mother,1,3,1,no,yes,yes,no,yes,yes,yes,yes,4,4,3,5,4,5,28,10,9,9
GP,F,16,U,GT3,A,3,1,services,other,course,mother,1,2,3,no,yes,yes,no,yes,yes,yes,no,2,3,3,2,2,4,5,7,7,7
GP,F,16,U,GT3,T,4,3,teacher,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,1,3,2,1,1,1,10,11,12,13
GP,F,16,U,GT3,T,1,1,at_home,other,home,mother,2,1,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,4,5,6,9,9,10
GP,F,17,R,GT3,T,4,3,teacher,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,2,1,1,4,6,7,7,7
GP,F,19,U,GT3,T,3,3,other,other,reputation,other,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,3,10,8,8,8
GP,M,17,U,LE3,T,4,4,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,3,5,4,5,3,13,12,12,13
GP,F,16,U,GT3,A,2,2,other,other,reputation,mother,1,2,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,1,1,4,0,12,13,14
GP,M,18,U,GT3,T,2,2,services,other,home,mother,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,4,5,15,6,7,8
GP,F,17,R,LE3,T,4,4,services,other,other,mother,1,1,0,no,yes,yes,no,yes,yes,no,no,5,2,1,1,2,3,12,8,10,10
GP,F,17,U,LE3,T,3,2,other,other,reputation,mother,2,2,0,no,no,yes,no,yes,yes,yes,no,4,4,4,1,3,1,2,14,15,15
GP,F,17,U,GT3,T,4,3,other,other,reputation,mother,1,2,2,no,no,yes,no,yes,yes,yes,yes,3,4,5,5,4,1,22,6,6,4
GP,M,18,U,LE3,T,3,3,services,health,home,father,1,2,1,no,yes,yes,no,yes,yes,yes,no,3,2,4,2,4,4,13,6,6,8
GP,F,17,U,GT3,T,2,3,at_home,other,home,father,2,1,0,no,yes,yes,no,yes,yes,no,no,3,3,3,1,4,3,3,7,7,8
GP,F,17,U,GT3,T,2,2,at_home,at_home,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,1,4,4,9,10,10
GP,F,17,R,GT3,T,2,1,at_home,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,2,5,1,2,5,2,6,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,reputation,mother,1,3,1,no,yes,no,yes,yes,yes,no,yes,4,3,4,1,1,5,0,6,5,0
GP,F,16,U,GT3,T,2,3,services,teacher,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,2,3,1,1,1,3,2,16,16,17
GP,M,18,U,GT3,T,2,2,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,5,5,4,0,12,13,13
GP,F,16,U,GT3,T,4,4,teacher,services,home,mother,1,3,0,no,yes,no,yes,no,yes,yes,no,5,3,2,1,1,5,0,13,13,14
GP,F,18,R,GT3,T,3,1,other,other,reputation,mother,1,2,1,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,1,4,16,9,8,7
GP,F,17,U,GT3,T,3,2,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,3,3,10,16,15,15
GP,M,17,U,LE3,T,2,3,services,services,reputation,father,1,2,0,no,yes,yes,no,no,yes,yes,no,5,3,3,1,3,3,2,12,11,12
GP,M,18,U,LE3,T,2,1,at_home,other,course,mother,4,2,0,yes,yes,yes,yes,yes,yes,yes,yes,4,3,2,4,5,3,14,10,8,9
GP,F,17,U,GT3,A,2,1,other,other,course,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,10,12,10,12
GP,F,17,U,LE3,T,4,3,health,other,reputation,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,14,13,13,14
GP,M,17,R,GT3,T,2,2,other,other,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,5,2,1,1,1,4,11,11,11
GP,M,17,U,GT3,T,4,4,teacher,teacher,reputation,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,yes,4,5,5,1,3,2,14,11,9,9
GP,M,16,U,GT3,T,4,4,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,4,2,4,1,2,14,13,13
GP,M,16,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,4,2,1,1,5,18,9,7,6
GP,M,16,U,GT3,T,3,2,at_home,other,reputation,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,3,2,10,11,9,10
GP,M,17,U,LE3,T,2,2,other,other,home,father,1,2,0,no,no,yes,yes,no,yes,yes,yes,4,4,2,5,5,4,4,14,13,13
GP,F,16,U,GT3,T,2,1,other,other,home,mother,1,1,0,no,no,no,no,yes,yes,yes,yes,4,5,2,1,1,5,20,13,12,12
GP,F,17,R,GT3,T,2,1,at_home,services,course,mother,3,2,0,no,no,no,yes,yes,yes,no,no,2,1,1,1,1,3,2,13,11,11
GP,M,18,U,GT3,T,2,2,other,services,reputation,father,1,2,1,no,no,no,no,yes,no,yes,no,5,5,4,5,5,2,0,7,7,0
GP,M,17,U,LE3,T,4,3,health,other,course,mother,2,2,0,no,no,no,yes,yes,yes,yes,yes,2,5,5,5,4,5,14,12,12,12
GP,M,17,R,LE3,A,4,4,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,2,3,4,2,10,11,12
GP,M,16,U,LE3,T,4,3,teacher,other,course,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,4,5,1,1,3,0,6,0,0
GP,M,16,U,GT3,T,4,4,services,services,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,3,2,1,2,5,0,13,12,12
GP,F,18,U,GT3,T,2,1,other,other,course,other,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,4,4,1,1,3,0,7,0,0
GP,M,16,U,GT3,T,2,1,other,other,course,mother,3,1,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,4,6,18,18,18
GP,M,17,U,GT3,T,2,3,other,other,course,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,2,2,1,1,2,4,12,12,13
GP,M,22,U,GT3,T,3,1,services,services,other,mother,1,1,3,no,no,no,no,no,no,yes,yes,5,4,5,5,5,1,16,6,8,8
GP,M,18,R,LE3,T,3,3,other,services,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8,3,5,5
GP,M,16,U,GT3,T,0,2,other,other,other,mother,1,1,0,no,no,yes,no,no,yes,yes,no,4,3,2,2,4,5,0,13,15,15
GP,M,18,U,GT3,T,3,2,services,other,course,mother,2,1,1,no,no,no,no,yes,no,yes,no,4,4,5,5,4,5,0,6,8,8
GP,M,16,U,GT3,T,3,3,at_home,other,reputation,other,3,2,0,yes,yes,no,no,no,yes,yes,no,5,3,3,1,3,2,6,7,10,10
GP,M,18,U,GT3,T,2,1,services,services,other,mother,1,1,1,no,no,no,no,no,no,yes,no,3,2,5,2,5,5,4,6,9,8
GP,M,16,R,GT3,T,2,1,other,other,course,mother,2,1,0,no,no,no,yes,no,yes,no,no,3,3,2,1,3,3,0,8,9,8
GP,M,17,R,GT3,T,2,1,other,other,course,mother,1,1,0,no,no,no,no,no,yes,yes,no,4,4,2,5,4,5,0,8,12,12
GP,M,17,U,LE3,T,1,1,health,other,course,mother,2,1,1,no,yes,no,yes,yes,yes,yes,no,4,4,4,1,2,5,2,7,9,8
GP,F,17,U,LE3,T,4,2,teacher,services,reputation,mother,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,2,3,1,1,4,6,14,12,13
GP,M,19,U,LE3,A,4,3,services,at_home,reputation,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,1,1,1,1,12,11,11,11
GP,M,18,U,GT3,T,2,1,other,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,2,4,1,2,4,8,15,14,14
GP,F,17,U,LE3,T,2,2,services,services,course,father,1,4,0,no,no,yes,yes,yes,yes,yes,yes,3,4,1,1,1,2,0,10,9,0
GP,F,18,U,GT3,T,4,3,services,other,home,father,1,2,0,no,yes,yes,no,yes,yes,yes,yes,3,1,2,1,3,2,21,17,18,18
GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,3,2,8,8,8
GP,M,18,R,GT3,T,3,2,other,other,course,mother,1,3,0,no,no,no,yes,no,yes,no,no,5,3,2,1,1,3,1,13,12,12
GP,F,17,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,no,no,yes,no,yes,no,no,3,2,3,1,1,4,4,10,9,9
GP,F,18,U,GT3,T,2,2,at_home,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,3,1,1,3,0,9,10,0
GP,M,18,R,LE3,A,3,4,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,5,3,4,1,13,17,17,17
GP,M,17,U,GT3,T,3,1,services,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,4,4,5,4,5,2,9,9,10
GP,F,18,R,GT3,T,4,4,teacher,other,reputation,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,3,4,2,2,4,8,12,10,11
GP,M,18,U,GT3,T,4,2,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,3,5,10,10,9,10
GP,F,18,R,GT3,T,2,1,other,other,reputation,mother,2,2,0,no,yes,no,no,yes,no,yes,yes,4,3,5,1,2,3,0,6,0,0
GP,F,19,U,GT3,T,3,3,other,services,home,other,1,2,2,no,yes,yes,yes,yes,yes,yes,no,4,3,5,3,3,5,15,9,9,9
GP,F,18,U,GT3,T,2,3,other,services,reputation,father,1,4,0,no,yes,yes,yes,yes,yes,yes,yes,4,5,5,5,3,2,4,15,14,14
GP,F,18,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,no,yes,no,no,4,4,3,1,1,3,2,11,11,11
GP,M,17,R,GT3,T,1,2,at_home,at_home,home,mother,1,2,0,no,yes,yes,yes,no,yes,no,yes,3,5,2,2,2,1,2,15,14,14
GP,F,17,U,GT3,T,2,4,at_home,health,reputation,mother,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,1,1,2,10,10,10
GP,F,17,U,LE3,T,2,2,services,other,course,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,12,12,12
GP,F,18,R,GT3,A,3,2,other,services,home,mother,2,2,0,no,no,no,no,no,no,yes,yes,4,1,1,1,1,5,75,10,9,9
GP,M,18,U,GT3,T,4,4,teacher,services,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,3,2,4,1,4,3,22,9,9,9
GP,F,18,U,GT3,T,4,4,health,health,reputation,father,1,2,1,yes,yes,no,yes,yes,yes,yes,yes,2,4,4,1,1,4,15,9,8,8
GP,M,18,U,LE3,T,4,3,teacher,services,course,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,4,2,3,1,2,1,8,10,11,10
GP,M,17,U,LE3,A,4,1,services,other,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,yes,4,5,4,5,4,5,30,8,8,8
GP,M,17,U,LE3,A,3,2,teacher,services,home,mother,1,1,1,no,no,no,no,yes,yes,yes,no,4,4,4,5,4,3,19,11,9,10
GP,F,18,R,LE3,T,1,1,at_home,other,reputation,mother,2,4,0,no,yes,yes,yes,yes,yes,no,no,5,2,2,1,1,3,1,12,12,12
GP,F,18,U,GT3,T,1,1,other,other,home,mother,2,2,0,yes,no,no,yes,yes,yes,yes,no,5,4,4,1,1,4,4,8,9,10
GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,no,yes,yes,no,5,4,5,1,2,5,4,10,9,11
GP,M,17,U,GT3,T,1,1,other,other,reputation,father,1,2,0,no,no,yes,no,no,yes,yes,no,4,3,3,1,2,4,2,12,10,11
GP,F,18,U,GT3,T,2,2,at_home,at_home,other,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,2,5,18,18,19
GP,F,17,U,GT3,T,1,1,services,teacher,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,6,13,12,12
GP,M,18,U,GT3,T,2,1,services,services,reputation,mother,1,3,0,no,no,yes,yes,yes,yes,yes,no,4,2,4,1,3,2,6,15,14,14
GP,M,18,U,LE3,A,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,3,1,1,2,9,15,13,15
GP,M,18,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,4,5,11,12,11,11
GP,F,17,U,GT3,T,4,3,health,services,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,3,0,15,15,15
GP,F,18,U,LE3,T,2,1,services,at_home,reputation,mother,1,2,1,no,no,no,no,yes,yes,yes,yes,5,4,3,1,1,5,12,12,12,13
GP,F,17,R,LE3,T,3,1,services,other,reputation,mother,2,4,0,no,yes,yes,no,yes,yes,no,no,3,1,2,1,1,3,6,18,18,18
GP,M,18,R,LE3,T,3,2,services,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,1,4,8,14,13,14
GP,M,17,U,GT3,T,3,3,health,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,4,3,1,3,5,4,14,12,11
GP,F,19,U,GT3,T,4,4,health,other,reputation,other,2,2,0,no,yes,yes,yes,yes,yes,yes,no,2,3,4,2,3,2,0,10,9,0
GP,F,18,U,LE3,T,4,3,other,other,home,other,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,5,1,2,2,10,10,8,8
GP,F,18,U,GT3,T,4,3,other,other,reputation,father,1,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,14,13,14
GP,M,18,U,LE3,T,4,4,teacher,teacher,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,yes,1,4,2,2,2,1,5,16,15,16
GP,F,18,U,LE3,A,4,4,health,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,yes,4,2,4,1,1,4,14,12,10,11
GP,M,17,U,LE3,T,4,4,other,teacher,home,father,2,1,0,no,no,yes,no,yes,yes,yes,no,4,1,1,2,2,5,0,11,11,10
GP,F,17,U,GT3,T,4,2,other,other,reputation,mother,2,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,15,12,14
GP,F,17,U,GT3,T,3,2,health,health,reputation,father,1,4,0,no,yes,yes,yes,no,yes,yes,no,5,2,2,1,2,5,0,17,17,18
GP,M,19,U,GT3,T,3,3,other,other,home,other,1,2,1,no,yes,no,yes,yes,yes,yes,yes,4,4,4,1,1,3,20,15,14,13
GP,F,18,U,GT3,T,2,4,services,at_home,reputation,other,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,3,1,1,3,8,14,12,12
GP,M,20,U,GT3,A,3,2,services,other,course,other,1,1,0,no,no,no,yes,yes,yes,no,no,5,5,3,1,1,5,0,17,18,18
GP,M,19,U,GT3,T,4,4,teacher,services,reputation,other,2,1,1,no,yes,yes,no,yes,yes,yes,yes,4,3,4,1,1,4,38,8,9,8
GP,M,19,R,GT3,T,3,3,other,services,reputation,father,1,2,1,no,no,no,yes,yes,yes,no,yes,4,5,3,1,2,5,0,15,12,12
GP,F,19,U,LE3,T,1,1,at_home,other,reputation,other,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,3,1,3,3,18,12,10,10
GP,F,19,U,LE3,T,1,2,services,services,home,other,1,2,1,no,no,no,yes,no,yes,no,yes,4,2,4,2,2,3,0,9,9,0
GP,F,19,U,GT3,T,2,1,at_home,other,other,other,3,2,0,no,yes,no,no,yes,no,yes,yes,3,4,1,1,1,2,20,14,12,13
GP,M,19,U,GT3,T,1,2,other,services,course,other,1,2,1,no,no,no,no,no,yes,yes,no,4,5,2,2,2,4,3,13,11,11
GP,F,19,U,LE3,T,3,2,services,other,reputation,other,2,2,1,no,yes,yes,no,no,yes,yes,yes,4,2,2,1,2,1,22,13,10,11
GP,F,19,U,GT3,T,1,1,at_home,health,home,other,1,3,2,no,no,no,no,no,yes,yes,yes,4,1,2,1,1,3,14,15,13,13
GP,F,19,R,GT3,T,2,3,other,other,reputation,other,1,3,1,no,no,no,no,yes,yes,yes,yes,4,1,2,1,1,3,40,13,11,11
GP,F,18,U,GT3,T,2,1,services,other,course,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,2,1,0,8,8,0
GP,F,18,U,GT3,T,4,3,other,other,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,4,1,1,5,9,9,10,9
GP,F,17,R,GT3,T,3,4,at_home,services,course,father,1,3,0,no,yes,yes,yes,no,yes,yes,no,4,3,4,2,5,5,0,11,11,10
GP,F,18,U,GT3,T,4,4,teacher,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,3,3,5,2,11,11,11
GP,F,17,U,GT3,A,4,3,services,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,2,2,1,2,5,23,13,13,13
GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,yes,4,2,2,1,1,3,12,11,9,9
GP,F,17,R,LE3,T,2,2,services,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,3,2,2,2,3,3,11,11,11
GP,F,17,U,GT3,T,3,1,services,services,course,father,1,3,0,no,yes,no,no,no,yes,yes,no,3,4,3,2,3,5,1,12,14,15
GP,F,17,U,LE3,T,0,2,at_home,at_home,home,father,2,3,0,no,no,no,no,yes,yes,yes,no,3,3,3,2,3,2,0,16,15,15
GP,M,18,U,GT3,T,4,4,other,other,course,mother,1,3,0,no,no,no,yes,yes,yes,yes,no,4,3,3,2,2,3,3,9,12,11
GP,M,17,U,GT3,T,3,3,other,services,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,4,3,5,3,5,5,3,14,15,16
GP,M,17,R,GT3,T,2,2,services,other,course,mother,4,1,0,no,yes,no,no,yes,yes,yes,no,4,4,5,5,5,4,8,11,10,10
GP,F,17,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,3,4,7,10,9,9
GP,F,17,U,GT3,T,4,4,teacher,teacher,course,mother,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,3,3,1,2,4,4,14,14,14
GP,M,18,U,LE3,T,2,2,other,other,course,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,5,5,5,4,5,2,9,8,8
GP,F,17,R,GT3,T,2,4,at_home,other,course,father,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,4,3,1,1,5,7,12,14,14
GP,F,18,U,GT3,T,3,3,services,services,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,4,1,1,4,0,7,0,0
GP,F,18,U,LE3,T,2,2,other,other,home,other,1,2,0,no,no,no,yes,no,yes,yes,yes,4,3,3,1,1,2,0,8,8,0
GP,F,18,R,GT3,T,2,2,at_home,other,course,mother,2,4,0,no,no,no,yes,yes,yes,no,no,4,4,4,1,1,4,0,10,9,0
GP,F,17,U,GT3,T,3,4,services,other,course,mother,1,3,0,no,no,no,no,yes,yes,yes,no,4,4,5,1,3,5,16,16,15,15
GP,F,19,R,GT3,A,3,1,services,at_home,home,other,1,3,1,no,no,yes,no,yes,yes,no,no,5,4,3,1,2,5,12,14,13,13
GP,F,17,U,GT3,T,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,2,2,3,2,0,7,8,0
GP,F,18,U,LE3,T,3,3,services,services,home,mother,1,4,0,no,yes,no,no,yes,yes,yes,no,5,3,3,1,1,1,7,16,15,17
GP,F,17,R,GT3,A,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,2,4,9,10,10
GP,F,19,U,GT3,T,2,1,services,services,home,other,1,3,1,no,no,yes,yes,yes,yes,yes,yes,4,3,4,1,3,3,4,11,12,11
GP,M,18,U,GT3,T,4,4,teacher,services,home,father,1,2,1,no,yes,no,yes,yes,yes,yes,no,4,3,3,2,2,2,0,10,10,0
GP,M,18,U,LE3,T,3,4,services,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,3,1,3,5,11,16,15,15
GP,F,17,U,GT3,A,2,2,at_home,at_home,home,father,1,2,1,no,yes,no,no,yes,yes,yes,yes,3,3,1,1,2,4,0,9,8,0
GP,F,18,U,GT3,T,2,3,at_home,other,course,mother,1,3,0,no,yes,no,no,yes,yes,yes,no,4,3,3,1,2,3,4,11,10,10
GP,F,18,U,GT3,T,3,2,other,services,other,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,4,3,2,3,1,7,13,13,14
GP,M,18,R,GT3,T,4,3,teacher,services,course,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,3,2,1,2,4,9,16,15,16
GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,5,4,5,2,3,5,0,10,10,9
GP,F,17,U,GT3,T,4,3,health,other,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,1,3,4,0,13,15,15
MS,M,18,R,GT3,T,3,2,other,other,course,mother,2,1,1,no,yes,no,no,no,yes,yes,no,2,5,5,5,5,5,10,11,13,13
MS,M,19,R,GT3,T,1,1,other,services,home,other,3,2,3,no,no,no,no,yes,yes,yes,no,5,4,4,3,3,2,8,8,7,8
MS,M,17,U,GT3,T,3,3,health,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,4,5,3,3,2,13,13,13
MS,M,18,U,LE3,T,1,3,at_home,services,course,mother,1,1,1,no,no,no,no,yes,no,yes,yes,4,3,3,2,3,3,7,8,7,8
MS,M,19,R,GT3,T,1,1,other,other,home,other,3,1,1,no,yes,no,no,yes,yes,yes,no,4,4,4,3,3,5,4,8,8,8
MS,M,17,R,GT3,T,4,3,services,other,home,mother,2,2,0,no,yes,yes,yes,no,yes,yes,yes,4,5,5,4,5,2,4,13,11,11
MS,F,18,U,GT3,T,3,3,services,services,course,father,1,2,0,no,yes,no,no,yes,yes,no,yes,5,3,4,1,1,5,0,10,9,9
MS,F,17,R,GT3,T,4,4,teacher,services,other,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,5,4,12,13,13
MS,F,17,U,LE3,A,3,2,services,other,reputation,mother,2,2,0,no,no,no,no,yes,yes,no,yes,1,2,3,1,2,5,2,12,12,11
MS,M,18,U,LE3,T,1,1,other,services,home,father,2,1,0,no,no,no,no,no,yes,yes,yes,3,3,2,1,2,3,4,10,10,10
MS,F,18,U,LE3,T,1,1,at_home,services,course,father,2,3,0,no,no,no,no,yes,yes,yes,no,5,3,2,1,1,4,0,18,16,16
MS,F,18,R,LE3,A,1,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,yes,4,3,4,1,4,5,0,13,13,13
MS,M,18,R,LE3,T,1,1,at_home,other,other,mother,2,2,1,no,no,no,yes,no,no,no,no,4,4,3,2,3,5,2,13,12,12
MS,F,18,U,GT3,T,3,3,services,services,other,mother,2,2,0,no,yes,no,no,yes,yes,yes,yes,4,3,2,1,3,3,0,11,11,10
MS,F,17,U,LE3,T,4,4,at_home,at_home,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,2,3,4,1,1,1,0,16,15,15
MS,F,17,R,GT3,T,1,2,other,services,course,father,2,2,0,no,no,no,no,no,yes,no,no,3,2,2,1,2,3,0,12,11,12
MS,M,18,R,GT3,T,1,3,at_home,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,no,no,3,3,4,2,4,3,4,10,10,10
MS,M,18,U,LE3,T,4,4,teacher,services,other,mother,2,3,0,no,no,yes,no,yes,yes,yes,yes,4,2,2,2,2,5,0,13,13,13
MS,F,17,R,GT3,T,1,1,other,services,reputation,mother,3,1,1,no,yes,yes,no,yes,yes,yes,yes,5,2,1,1,2,1,0,7,6,0
MS,F,18,U,GT3,T,2,3,at_home,services,course,father,2,1,0,no,yes,yes,no,yes,yes,yes,yes,5,2,3,1,2,4,0,11,10,10
MS,F,18,R,GT3,T,4,4,other,teacher,other,father,3,2,0,no,yes,yes,no,no,yes,yes,yes,3,2,2,4,2,5,10,14,12,11
MS,F,19,U,LE3,T,3,2,services,services,home,other,2,2,2,no,no,no,yes,yes,yes,no,yes,3,2,2,1,1,3,4,7,7,9
MS,M,18,R,LE3,T,1,2,at_home,services,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3,14,12,12
MS,F,17,U,GT3,T,2,2,other,at_home,home,mother,1,3,0,no,no,no,yes,yes,yes,no,yes,3,4,3,1,1,3,8,13,11,11
MS,F,17,R,GT3,T,1,2,other,other,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,3,5,5,4,5,1,14,6,5,5
MS,F,18,R,LE3,T,4,4,other,other,reputation,mother,2,3,0,no,no,no,no,yes,yes,yes,no,5,4,4,1,1,1,0,19,18,19
MS,F,18,R,GT3,T,1,1,other,other,home,mother,4,3,0,no,no,no,no,yes,yes,yes,no,4,3,2,1,2,4,2,8,8,10
MS,F,20,U,GT3,T,4,2,health,other,course,other,2,3,2,no,yes,yes,no,no,yes,yes,yes,5,4,3,1,1,3,4,15,14,15
MS,F,18,R,LE3,T,4,4,teacher,services,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,no,5,4,3,3,4,2,4,8,9,10
MS,F,18,U,GT3,T,3,3,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,1,3,1,2,1,0,15,15,15
MS,F,17,R,GT3,T,3,1,at_home,other,reputation,mother,1,2,0,no,yes,yes,yes,no,yes,yes,no,4,5,4,4,5,1,17,10,10,10
MS,M,18,U,GT3,T,4,4,teacher,teacher,home,father,1,2,0,no,no,yes,yes,no,yes,yes,no,3,2,4,1,4,2,4,15,14,14
MS,M,18,R,GT3,T,2,1,other,other,other,mother,2,1,0,no,no,no,yes,no,yes,yes,yes,4,4,3,1,3,5,5,7,6,7
MS,M,17,U,GT3,T,2,3,other,services,home,father,2,2,0,no,no,no,yes,yes,yes,yes,no,4,4,3,1,1,3,2,11,11,10
MS,M,19,R,GT3,T,1,1,other,services,other,mother,2,1,1,no,no,no,no,yes,yes,no,no,4,3,2,1,3,5,0,6,5,0
MS,M,18,R,GT3,T,4,2,other,other,home,father,2,1,1,no,no,yes,no,yes,yes,no,no,5,4,3,4,3,3,14,6,5,5
MS,F,18,R,GT3,T,2,2,at_home,other,other,mother,2,3,0,no,no,yes,no,yes,yes,no,no,5,3,3,1,3,4,2,10,9,10
MS,F,18,R,GT3,T,4,4,teacher,at_home,reputation,mother,3,1,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,2,2,5,7,6,5,6
MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,0,7,5,0
MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,4,1,1,1,0,7,9,8
MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,1,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,0,6,5,0
MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,5,5,3,3,10,8,7
MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10
MS,M,19,U,LE3,T,1,1,other,at_home,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,3,3,3,5,5,8,9,9
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Luke Chang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore solutions
2 | Solutions/*
3 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/Notebooks/1_Introduction_to_Programming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction to programming\n",
8 | "_Written by Luke Chang_\n",
9 | "\n",
10 | "In this notebook we will begin to learn how to use Python. There are many different ways to install Python, but we recommend starting using Anaconda which is preconfigured for scientific computing. Start with installing [Python 3.7](https://www.anaconda.com/distribution/). For those who prefer a more configurable IDE, [Pycharm](https://www.jetbrains.com/pycharm/) is a nice option. Python is a modular interpreted language with an intuitive minimal syntax that is quickly becoming one of the most popular languages for [conducting research](http://www.talyarkoni.org/blog/2013/11/18/the-homogenization-of-scientific-computing-or-why-python-is-steadily-eating-other-languages-lunch/). You can use python for [stimulus presentation](http://www.psychopy.org/), [data analysis](http://statsmodels.sourceforge.net/), [machine-learning](http://scikit-learn.org/stable/), [scraping data](https://www.crummy.com/software/BeautifulSoup/), creating websites with [flask](http://flask.pocoo.org/) or [django](https://www.djangoproject.com/), or [neuroimaging data analysis](http://nipy.org/).\n",
11 | "\n",
12 | "There are lots of free useful resources to learn how to use python and various modules. See Yaroslav Halchenko's excellent [Dartmouth course](https://github.com/dartmouth-pbs/psyc161). [Codeacademy](https://www.codecademy.com/) is a great interactive tutorial. [Stack Overflow](http://stackoverflow.com/) is an incredibly useful resource for asking specific questions and seeing responses to others that have been rated by the development community. "
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Jupyter Notebooks\n",
20 | "We will primarily be using [Jupyter Notebooks](http://jupyter.org/) to interface with Python. A Jupyter notebook consists of **cells**. The two main types of cells you will use are code cells and markdown cells.\n",
21 | "\n",
 22 |     "A **_code cell_** contains actual code that you want to run. You can specify a cell as a code cell using the pulldown menu in the toolbar in your Jupyter notebook. Otherwise, you can hit esc and then y (denoted \"esc, y\") while a cell is selected to specify that it is a code cell. Note that you will have to hit enter after doing this to start editing it.\n",
 23 |     "If you want to execute the code in a code cell, hit \"shift + enter.\" Note that code cells are executed in the order you execute them. That is to say, the ordering of the cells for which you hit \"shift + enter\" is the order in which the code is executed. If you did not explicitly execute a cell early in the document, its results are not known to the Python interpreter.\n",
24 | "\n",
25 | "**_Markdown cells_** contain text. The text is written in markdown, a lightweight markup language. You can read about its syntax [here](http://daringfireball.net/projects/markdown/syntax). Note that you can also insert HTML into markdown cells, and this will be rendered properly. As you are typing the contents of these cells, the results appear as text. Hitting \"shift + enter\" renders the text in the formatting you specify. You can specify a cell as being a markdown cell in the Jupyter toolbar, or by hitting \"esc, m\" in the cell. Again, you have to hit enter after using the quick keys to bring the cell into edit mode.\n",
26 | "\n",
27 | "In general, when you want to add a new cell, you can use the \"Insert\" pulldown menu from the Jupyter toolbar. The shortcut to insert a cell below is \"esc, b\" and to insert a cell above is \"esc, a.\" Alternatively, you can execute a cell and automatically add a new one below it by hitting \"alt + enter.\""
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "print(\"Hello World\")"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Package Management\n",
 44 |     "Package management in Python has been dramatically improving. Anaconda has its own package manager called 'conda'. Use this if you would like to install a new module as it is optimized to work with anaconda. \n",
45 | "\n",
46 | "```\n",
47 | "!conda install *package*\n",
48 | "```\n",
49 | "\n",
50 | "However, sometimes conda doesn't have a particular package. In this case use the default python package manager called 'pip'. \n",
51 | "\n",
52 | "These commands can be run in your unix terminal or you can send them to the shell from a Jupyter notebook by starting the line with ```!```\n",
53 | "\n",
54 | "It is easy to get help on how to use the package managers\n",
55 | "\n",
56 | "```\n",
57 | "!pip help install\n",
58 | "```"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "!pip help install"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "!pip list --outdated"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "!pip install setuptools --upgrade"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## Variables\n",
93 | "* Built-in\n",
94 | " * Numeric types:\n",
95 | " * **int**, **float**, **long**, complex\n",
96 | " * **str**ing \n",
97 | " * **bool**ean\n",
98 | " * True / False\n",
99 | " * **NoneType**\n",
100 | "* User defined\n",
101 | "\n",
102 | "* Use the type() function to find the type for a value or variable\n",
103 | "\n",
104 | "* Data can be converted using cast commands"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Integer\n",
114 | "a = 1\n",
115 | "print(type(a))\n",
116 | "\n",
117 | "# Float\n",
118 | "b = 1.0\n",
119 | "print(type(b))\n",
120 | "\n",
121 | "# String\n",
122 | "c = 'hello'\n",
123 | "print(type(c))\n",
124 | "\n",
125 | "# Boolean\n",
126 | "d = True\n",
127 | "print(type(d))\n",
128 | "\n",
129 | "# None\n",
130 | "e = None\n",
131 | "print(type(e))\n",
132 | "\n",
133 | "# Cast integer to string\n",
134 | "print(type(str(a)))"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "## Math Operators\n",
142 | "* +, -, *, and /\n",
143 | "* Exponentiation **\n",
144 | "* Modulo %\n",
145 | "\n",
 146 |     "* Note that division with integers in Python 2.7 automatically floors (rounds toward negative infinity), which may not be intended. It is recommended to import the division module from python3 `from __future__ import division`"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {
153 | "ExecuteTime": {
154 | "end_time": "2019-03-27T14:34:18.317397Z",
155 | "start_time": "2019-03-27T14:34:18.312483Z"
156 | }
157 | },
158 | "outputs": [],
159 | "source": [
160 | "# Addition\n",
161 | "a = 2 + 7\n",
162 | "print(a)\n",
163 | "\n",
164 | "# Subtraction\n",
165 | "b = a - 5\n",
166 | "print(b)\n",
167 | "\n",
168 | "# Multiplication\n",
169 | "print(b*2)\n",
170 | "\n",
171 | "# Exponentiation\n",
172 | "print(b**2)\n",
173 | "\n",
174 | "# Modulo\n",
175 | "print(4%9)\n",
176 | "\n",
177 | "# Division\n",
178 | "print(4/9)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## String Operators\n",
186 | "* Some of the arithmetic operators also have meaning for strings. E.g. for string concatenation use `+` sign\n",
187 | "* String repetition: Use `*` sign with a number of repetitions"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "# Combine string\n",
197 | "a = 'Hello'\n",
198 | "b = 'World'\n",
199 | "print(a + b)\n",
200 | "\n",
201 | "# Repeat String\n",
202 | "print(a*5)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Logical Operators\n",
210 | "Perform a logical comparison and return a Boolean value\n",
211 | "\n",
212 | "```python\n",
213 | "x == y # x is equal to y\n",
214 | "x != y # x is not equal to y\n",
215 | "x > y # x is greater than y\n",
216 | "x < y # x is less than y\n",
217 | "x >= y # x is greater than or equal to y \n",
218 | "x <= y # x is less than or equal to y```\n",
219 | "\n",
220 | "| X | not X | \n",
221 | "|-------|--------|\n",
222 | "| True | False | \n",
223 | "| False | True | \n",
224 | "\n",
225 | "| X | Y | X AND Y | X OR Y | \n",
226 | "|------|------|---------|--------|\n",
227 | "|True | True | True | True | \n",
228 | "|True | False| False | True | \n",
229 | "|False | True | False | True | \n",
230 | "|False | False| False | False | "
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "# Works for string\n",
240 | "a = 'hello'\n",
241 | "b = 'world'\n",
242 | "c = 'Hello'\n",
243 | "print(a==b)\n",
244 | "print(a==c)\n",
245 | "print(a!=b)\n",
246 | "\n",
247 | "# Works for numeric\n",
248 | "d = 5\n",
249 | "e = 8\n",
250 | "print(d < e)"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Conditional Logic (if...)\n",
258 | "Unlike most other languages, Python uses indentation rather than closing statements (e.g., end) to delimit blocks.\n",
259 | "\n",
260 | "* Syntax:\n",
261 | "\n",
262 | "``` python \n",
263 | "if condition: \n",
264 | " do something\n",
265 | "```\n",
266 | "\n",
267 | "* Implicit conversion of the value to bool() happens if `condition` is of a different type than **bool**, thus all of the following should work:\n",
268 | "\n",
269 | "```python\n",
270 | "if condition:\n",
271 | " do_something\n",
272 | "elif condition:\n",
273 | " do_alternative1\n",
274 | "else:\n",
275 | " do_otherwise # often reserved to report an error\n",
276 | " # after a long list of options\n",
277 | " ```\n"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "n = 1\n",
287 | "\n",
288 | "if n:\n",
289 | " print(\"n is non-0\")\n",
290 | "\n",
291 | "if n is None:\n",
292 | " print(\"n is None\")\n",
293 | " \n",
294 | "if n is not None:\n",
295 | " print(\"n is not None\")"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "## Loops\n",
303 | "* **for** loop is probably the most popular loop construct in Python:\n",
304 | "\n",
305 | "```python\n",
306 | "for target in sequence:\n",
307 | " do_statements\n",
308 | "```\n",
309 | "\n",
310 | "* However, it's also possible to use a **while** loop to repeat statements while `condition` remains True:\n",
311 | "\n",
312 | "```python\n",
313 | "while condition:\n",
314 | " do_statements\n",
315 | "```"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "ExecuteTime": {
323 | "end_time": "2019-03-27T14:35:32.341248Z",
324 | "start_time": "2019-03-27T14:35:32.336945Z"
325 | }
326 | },
327 | "outputs": [],
328 | "source": [
329 | "string = \"Python is going to make conducting research easier\"\n",
330 | "for c in string:\n",
331 | " print(c)\n"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "ExecuteTime": {
339 | "end_time": "2019-03-27T14:35:49.426964Z",
340 | "start_time": "2019-03-27T14:35:49.421669Z"
341 | }
342 | },
343 | "outputs": [],
344 | "source": [
345 | "x = 0\n",
346 | "end = 10\n",
347 | "\n",
348 | "csum = 0\n",
349 | "while x < end:\n",
350 | " csum += x\n",
351 | " print(x, csum)\n",
352 | " x += 1\n",
353 | "print(\"Exited with x==%d\" % x )"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Functions\n",
361 | "A **function** is a named sequence of statements that performs a computation. You define the function by giving it a name, specify a sequence of statements, and optionally values to return. Later, you can “call” the function by name. \n",
362 | "```python\n",
363 | "def make_upper_case(text):\n",
364 | " return (text.upper())\n",
365 | "```\n",
366 | "* The expression in the parenthesis is the **argument**. \n",
367 | "* It is common to say that a function **“takes” an argument** and **“returns” a result**.\n",
368 | "* The result is called the **return value**.\n",
369 | "\n",
370 | "The first line of the function definition is called the **header**; the rest is called the **body**.\n",
371 | "\n",
372 | "The header has to end with a colon and the body has to be indented.\n",
373 | "It is a common practice to use 4 spaces for indentation, and to avoid mixing with tabs.\n",
374 | "\n",
375 | "A function body in Python ends whenever a statement begins at the original level of indentation. There is no **end** or any other keyword to signal the end of a function. Indentation is part of the language syntax in Python, making it more readable and less cluttered."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {
382 | "ExecuteTime": {
383 | "end_time": "2019-03-27T14:35:59.444165Z",
384 | "start_time": "2019-03-27T14:35:59.440980Z"
385 | }
386 | },
387 | "outputs": [],
388 | "source": [
389 | "def make_upper_case(text):\n",
390 | " return (text.upper())\n",
391 | "\n",
392 | "string = \"Python is going to make conducting research easier\"\n",
393 | "\n",
394 | "print(make_upper_case(string))"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "## Python Containers\n",
402 | "There are 4 main types of builtin containers for storing data in Python:\n",
403 | "* list\n",
404 | "* tuple\n",
405 | "* dict\n",
406 | "* set"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "### Lists\n",
414 | "In Python, a list is a mutable sequence of values. Mutable means that we can change separate entries within a list. For a more in depth tutorial on lists look [here](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/02d-Python-Fundamentals-Containers-Lists.ipynb)\n",
415 | "\n",
416 | "* Each value in the list is an element or item\n",
417 | "* Elements can be any Python data type\n",
418 | "* Lists can mix data types\n",
419 | "\n",
420 | "* Lists are initialized with ```[]``` or ```list()```\n",
421 | "```python\n",
422 | "l = [1,2,3]\n",
423 | "```\n",
424 | "* \n",
425 | "Elements within a list are indexed (**starting with 0**)\n",
426 | "```python\n",
427 | "l[0]\n",
428 | "```\n",
429 | "\n",
430 | "* \n",
431 | "Elements can be nested lists\n",
432 | "```python\n",
433 | "nested = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
434 | "```\n",
435 | "\n",
436 | "* \n",
437 | "Lists can be *sliced*.\n",
438 | "```python\n",
439 | "l[start:stop:stride]\n",
440 | "```\n",
441 | "\n",
442 | "* Like all python containers, lists have many useful methods that can be applied\n",
443 | "```python\n",
444 | "a.insert(index,new element)\n",
445 | "a.append(element to add at end)\n",
446 | "len(a)\n",
447 | "```\n",
448 | "\n",
449 | "* \n",
450 | "List comprehension is a *Very* powerful technique allowing for efficient construction of new lists.\n",
451 | "```python\n",
452 | "[a for a in l]\n",
453 | "```\n"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {
460 | "ExecuteTime": {
461 | "end_time": "2019-03-27T14:36:30.240853Z",
462 | "start_time": "2019-03-27T14:36:30.236249Z"
463 | }
464 | },
465 | "outputs": [],
466 | "source": [
467 | "# Indexing and Slicing\n",
468 | "a = ['lists','are','arrays']\n",
469 | "print(a[0])\n",
470 | "print(a[1:3])\n",
471 | "\n",
472 | "# List methods\n",
473 | "a.insert(2,'python')\n",
474 | "a.append('.')\n",
475 | "print(a)\n",
476 | "print(len(a))\n",
477 | "\n",
478 | "# List Comprehension\n",
479 | "print([x.upper() for x in a])"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "### Dictionaries\n",
487 | "\n",
488 | "* In Python, a dictionary (or `dict`) is a mapping between a set of\n",
489 | "indices (**keys**) and a set of **values**\n",
490 | "\n",
491 | "* The items in a dictionary are key-value pairs\n",
492 | "\n",
493 | "* Keys can be any Python data type\n",
494 | "\n",
495 | "* Dictionaries are unordered\n",
496 | "\n",
497 | "* Here is a more indepth tutorial on [dictionaries](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03c-Python-Fundamentals-Containers-Dicts.ipynb)"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {
504 | "ExecuteTime": {
505 | "end_time": "2019-03-27T14:36:42.674472Z",
506 | "start_time": "2019-03-27T14:36:42.670435Z"
507 | }
508 | },
509 | "outputs": [],
510 | "source": [
511 | "# Dictionaries\n",
512 | "eng2sp = {}\n",
513 | "eng2sp['one'] = 'uno'\n",
514 | "print(eng2sp)\n",
515 | "\n",
516 | "eng2sp = {'one': 'uno', 'two': 'dos', 'three': 'tres'}\n",
517 | "print(eng2sp)\n",
518 | "\n",
519 | "print(eng2sp.keys())\n",
520 | "print(eng2sp.values())"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "### Tuples\n",
528 | "In Python, a **tuple** is an immutable sequence of values, meaning they can't be changed\n",
529 | "\n",
530 | "* Each value in the tuple is an element or item\n",
531 | "\n",
532 | "* Elements can be any Python data type\n",
533 | "\n",
534 | "* Tuples can mix data types\n",
535 | "\n",
536 | "* Elements can be nested tuples\n",
537 | "\n",
538 | "* **Essentially tuples are immutable lists**\n",
539 | "\n",
540 | "Here is a nice tutorial on [tuples](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03b-Python-Fundamentals-Containers-Tuples.ipynb)"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {
547 | "ExecuteTime": {
548 | "end_time": "2019-03-27T14:36:51.403867Z",
549 | "start_time": "2019-03-27T14:36:51.400160Z"
550 | }
551 | },
552 | "outputs": [],
553 | "source": [
554 | "numbers = (1, 2, 3, 4)\n",
555 | "print(numbers)\n",
556 | "\n",
557 | "t2 = 1, 2\n",
558 | "print(t2)"
559 | ]
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "metadata": {},
564 | "source": [
565 | "### Sets\n",
566 | "In Python, a `set` is an efficient storage for \"membership\" checking\n",
567 | "\n",
568 | "* `set` is like a `dict` but only with keys and without values\n",
569 | "\n",
570 | "* a `set` can also perform set operations (e.g., union, intersection)\n",
571 | "\n",
572 | "* Here is more info on [sets](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03d-Python-Fundamentals-Containers-Sets.ipynb)\n"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": null,
578 | "metadata": {
579 | "ExecuteTime": {
580 | "end_time": "2019-03-27T14:37:02.025425Z",
581 | "start_time": "2019-03-27T14:37:02.021137Z"
582 | }
583 | },
584 | "outputs": [],
585 | "source": [
586 | "# Union\n",
587 | "print({1, 2, 3, 'mom', 'dad'} | {2, 3, 10})\n",
588 | "\n",
589 | "# Intersection\n",
590 | "print({1, 2, 3, 'mom', 'dad'} & {2, 3, 10})\n",
591 | "\n",
592 | "# Difference\n",
593 | "print({1, 2, 3, 'mom', 'dad'} - {2, 3, 10})"
594 | ]
595 | },
596 | {
597 | "cell_type": "markdown",
598 | "metadata": {},
599 | "source": [
600 | "## Modules\n",
601 | "A *Module* is a python file that contains a collection of related definitions. Python has *hundreds* of standard modules. These are organized into what is known as the [Python Standard Library](http://docs.python.org/library/). You can also create and use your own modules. To use functionality from a module, you first have to import the entire module or parts of it into your namespace\n",
602 | "\n",
603 | "* To import the entire module, use \n",
604 | "\n",
605 | " ```python\n",
606 | " import module_name```\n",
607 | "\n",
608 | "* You can also import a module using a specific name \n",
609 | "\n",
610 | " ```python\n",
611 | " import module_name as new_module_name```\n",
612 | "\n",
613 | "* To import specific definitions (e.g. functions, variables, etc) from the module into your local namespace, use\n",
614 | "\n",
615 | "```python\n",
616 | "from module_name import name1, name2\n",
617 | "```\n",
618 | " which will make those available directly in your ```namespace```"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": null,
624 | "metadata": {
625 | "ExecuteTime": {
626 | "end_time": "2017-01-09T09:18:26.329329",
627 | "start_time": "2017-01-09T09:18:26.324573"
628 | }
629 | },
630 | "outputs": [],
631 | "source": [
632 | "import os\n",
633 | "from glob import glob"
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "Here let's try and get the path of the current working directory using functions from the `os` module"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": null,
646 | "metadata": {},
647 | "outputs": [],
648 | "source": [
649 | "os.path.abspath(os.path.curdir)"
650 | ]
651 | },
652 | {
653 | "cell_type": "markdown",
654 | "metadata": {},
655 | "source": [
656 | "It looks like we are currently in the notebooks folder of the github repository. Let's use glob, a pattern matching function, to list all of the csv files in the Data folder. "
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {
663 | "ExecuteTime": {
664 | "end_time": "2019-03-27T14:37:10.021529Z",
665 | "start_time": "2019-03-27T14:37:09.940612Z"
666 | }
667 | },
668 | "outputs": [],
669 | "source": [
670 | "data_file_list = glob(os.path.join('..','Data','*csv'))\n",
671 | "print(data_file_list)"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "This gives us a list of the files including the relative path from the current directory. What if we wanted just the filenames? There are several different ways to do this. First, we can use the `os.path.basename` function. We loop over every file, grab the base file name and then append it to a new list. "
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": null,
684 | "metadata": {
685 | "ExecuteTime": {
686 | "end_time": "2017-01-09T09:26:35.521842",
687 | "start_time": "2017-01-09T09:26:35.517653"
688 | }
689 | },
690 | "outputs": [],
691 | "source": [
692 | "file_list = []\n",
693 | "for f in data_file_list:\n",
694 | " file_list.append(os.path.basename(f))\n",
695 | "\n",
696 | "print(file_list)"
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "metadata": {},
702 | "source": [
703 | "Alternatively, we could loop over all files and split on the `/` character. This will create a new list where each element is whatever characters are separated by the splitting character. We can then take the last element of each list."
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": null,
709 | "metadata": {
710 | "ExecuteTime": {
711 | "end_time": "2017-01-09T09:27:57.077454",
712 | "start_time": "2017-01-09T09:27:57.073701"
713 | }
714 | },
715 | "outputs": [],
716 | "source": [
717 | "file_list = []\n",
718 | "for f in data_file_list:\n",
719 | " file_list.append(f.split('/')[-1])\n",
720 | "\n",
721 | "print(file_list)"
722 | ]
723 | },
724 | {
725 | "cell_type": "markdown",
726 | "metadata": {},
727 | "source": [
728 | "It is also sometimes even cleaner to do this as a list comprehension"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {
735 | "ExecuteTime": {
736 | "end_time": "2017-01-09T09:28:39.734003",
737 | "start_time": "2017-01-09T09:28:39.729813"
738 | }
739 | },
740 | "outputs": [],
741 | "source": [
742 | "[os.path.basename(x) for x in data_file_list]"
743 | ]
744 | },
745 | {
746 | "cell_type": "markdown",
747 | "metadata": {},
748 | "source": [
749 | "# Exercises"
750 | ]
751 | },
752 | {
753 | "cell_type": "markdown",
754 | "metadata": {},
755 | "source": [
756 | "### Find Even Numbers\n",
757 | "Let’s say I give you a list saved in a variable: a = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]. Make a new list that has only the even elements of this list in it."
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {},
764 | "outputs": [],
765 | "source": []
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "### Find Maximal Range\n",
772 | "Given an array length 1 or more of ints, return the difference between the largest and smallest values in the array. "
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": null,
778 | "metadata": {},
779 | "outputs": [],
780 | "source": []
781 | },
782 | {
783 | "cell_type": "markdown",
784 | "metadata": {},
785 | "source": [
786 | "### Duplicated Numbers\n",
787 | "Find the numbers in list a that are also in list b\n",
788 | "\n",
789 | "a = [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361]\n",
790 | "\n",
791 | "b = [0, 4, 16, 36, 64, 100, 144, 196, 256, 324]"
792 | ]
793 | },
794 | {
795 | "cell_type": "code",
796 | "execution_count": null,
797 | "metadata": {},
798 | "outputs": [],
799 | "source": []
800 | },
801 | {
802 | "cell_type": "markdown",
803 | "metadata": {},
804 | "source": [
805 | "### Speeding Ticket Fine\n",
806 | "You are driving a little too fast on the highway, and a police officer stops you. Write a function that takes the speed as an input and returns the fine. Your function must use a dictionary.\n",
807 | "\n",
808 | "\n",
809 | "If speed is 60 or less, the result is `$0`. If speed is between 61 and 80 inclusive, the result is `$100`. If speed is 81 or more, the result is `$500`. \n"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": null,
815 | "metadata": {},
816 | "outputs": [],
817 | "source": []
818 | }
819 | ],
820 | "metadata": {
821 | "kernelspec": {
822 | "display_name": "Python 3",
823 | "language": "python",
824 | "name": "python3"
825 | },
826 | "language_info": {
827 | "codemirror_mode": {
828 | "name": "ipython",
829 | "version": 3
830 | },
831 | "file_extension": ".py",
832 | "mimetype": "text/x-python",
833 | "name": "python",
834 | "nbconvert_exporter": "python",
835 | "pygments_lexer": "ipython3",
836 | "version": "3.7.2"
837 | },
838 | "toc": {
839 | "base_numbering": 1,
840 | "nav_menu": {},
841 | "number_sections": true,
842 | "sideBar": true,
843 | "skip_h1_title": false,
844 | "title_cell": "Table of Contents",
845 | "title_sidebar": "Contents",
846 | "toc_cell": false,
847 | "toc_position": {},
848 | "toc_section_display": true,
849 | "toc_window_display": false
850 | }
851 | },
852 | "nbformat": 4,
853 | "nbformat_minor": 1
854 | }
855 |
--------------------------------------------------------------------------------
/Notebooks/2.1_MatplotlibTemplates.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-03-27T14:37:58.582789Z",
9 | "start_time": "2019-03-27T14:37:57.460651Z"
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "%matplotlib inline\n",
15 | "import numpy as np\n",
16 | "import pandas as pd\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "from scipy import stats\n",
19 | "\n",
20 | "def autolabel(rects):\n",
21 | " # attach some text labels\n",
22 | " for rect in rects:\n",
23 | " height = rect.get_height()\n",
24 | " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n",
25 | " '%2.2f' % float(height),\n",
26 | " ha='center', va='bottom')"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Simple Plots in Python\n",
34 | "\n",
35 | "In this tutorial we'll show you some basic templates of scientific plots using Python matplotlib.\n",
36 | "\n",
37 | "# Bar graphs with standard error bars for 1 group"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "ExecuteTime": {
45 | "end_time": "2019-03-27T14:38:02.687415Z",
46 | "start_time": "2019-03-27T14:38:02.318322Z"
47 | },
48 | "scrolled": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "# based on http://matplotlib.org/examples/api/barchart_demo.html\n",
53 | "\n",
54 | "# Make some fake data\n",
55 | "d = {'gender': np.hstack([np.ones(10), np.zeros(10)]), 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n",
56 | "df = pd.DataFrame(d)\n",
57 | "\n",
58 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n",
59 | "vals = ['scores'] # This is the column name of the variable to plot on Y axis\n",
60 | "group = ['gender'] # This is the grouping variable for the X axis\n",
61 | "\n",
62 | "# Get means for each group\n",
63 | "means = df[vals+group].groupby(group).mean().squeeze()\n",
64 | "# Get standard error of means for each group\n",
65 | "sems = df[vals+group].groupby(group).sem().squeeze()\n",
66 | "\n",
67 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n",
68 | "ind = np.arange(np.size(np.unique(df[group]),0)) # location of bars\n",
69 | "width = .5 # Width of bars\n",
70 | "# (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error,ecolor=errorbarcolor)\n",
71 | "rects1 = ax.bar(ind - width/2,means,width=.5,color='lightsalmon',yerr=sems,ecolor='blue') \n",
72 | "# Look up different colors here: http://stackoverflow.com/questions/22408237/named-colors-in-matplotlib\n",
73 | "\n",
74 | "# configure axes properties to make pretty\n",
75 | "ax.set_ylabel('scores')\n",
76 | "ax.set_xlabel('gender')\n",
77 | "ax.set_title('Scores by gender')\n",
78 | "ax.set_xticks(ind)\n",
79 | "ax.set_xticklabels(['Male','Female'])\n",
80 | "ax.set_xlim([-.5,1.5]) \n",
81 | "ax.set_ylim([0,2])\n",
82 | "\n",
83 | "# This part calls the function autolabel() defined above, and labels the bars with values\n",
84 | "autolabel(rects1)\n",
85 | "\n",
86 | "plt.show()"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "# Bar graphs with standard error bars for 2 groups"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "ExecuteTime": {
101 | "end_time": "2019-03-27T14:38:07.243834Z",
102 | "start_time": "2019-03-27T14:38:07.090095Z"
103 | }
104 | },
105 | "outputs": [],
106 | "source": [
107 | "# Make some fake data\n",
108 | "d = {'race': np.random.permutation(np.hstack([np.ones(10), np.zeros(10)])), \n",
109 | " 'gender': np.hstack([np.ones(10), np.zeros(10)]), \n",
110 | " 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n",
111 | "df = pd.DataFrame(d)\n",
112 | "\n",
113 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n",
114 | "val =['scores']\n",
115 | "group1 = ['gender']\n",
116 | "group2 = ['race']\n",
117 | "\n",
118 | "# Get means and sems for Gender group\n",
119 | "means1 = df[val+group1].groupby(group1).mean().squeeze()\n",
120 | "sems1 = df[val+group1].groupby(group1).sem().squeeze()\n",
121 | "# Get means and sems for Race group\n",
122 | "means2 = df[val+group2].groupby(group2).mean().squeeze()\n",
123 | "sems2 = df[val+group2].groupby(group2).sem().squeeze()\n",
124 | "\n",
125 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n",
126 | "ind = np.array([0.,1.]) # location of bars\n",
127 | "width = .4 # Width of bars\n",
128 | "\n",
129 | "# plot score by gender\n",
130 | "rects1 = ax.bar(ind - width,means1,width,color='lightcoral',yerr=sems1,ecolor='k') # (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error)\n",
131 | "# plot score by race \n",
132 | "rects2 = ax.bar(ind,means2,width,color='lightblue',yerr=sems2,ecolor='k')\n",
133 | "\n",
134 | "# configure axes properties to make pretty\n",
135 | "ax.set_ylabel('scores')\n",
136 | "ax.set_xlabel('gender')\n",
137 | "ax.set_title('Scores by gender and race')\n",
138 | "ax.set_xticks(ind)\n",
139 | "ax.set_xticklabels(['Male','Female'])\n",
140 | "ax.set_xlim([ind[0]-width*1.25,ind[-1]+width*1.25]) \n",
141 | "ax.set_ylim([0,1.8])\n",
142 | "\n",
143 | "ax.legend(['Race0','Race1'])\n",
144 | "\n",
145 | "autolabel(rects1)\n",
146 | "autolabel(rects2)\n"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "# Scatterplots of 1 group with jittered location\n",
154 | "\n",
155 | "If you try to plot something like discrete scale data, you won't be able to see how clustered the points are because they would just plot on top of each other. One way to avoid this is to jitter the x,y locations around the actual values."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "ExecuteTime": {
163 | "end_time": "2019-03-27T14:38:10.892018Z",
164 | "start_time": "2019-03-27T14:38:10.778229Z"
165 | },
166 | "scrolled": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "# Make some fake data\n",
171 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n",
172 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n",
173 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n",
174 | "df = pd.DataFrame(d)\n",
175 | "ax = df.plot(kind='scatter',x='gender',y='scores')\n",
176 | "ax.set_title('Values are stacked')\n",
177 | "plt.show()"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "Here is the fix. "
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "ExecuteTime": {
192 | "end_time": "2019-03-27T14:38:14.040412Z",
193 | "start_time": "2019-03-27T14:38:13.939597Z"
194 | }
195 | },
196 | "outputs": [],
197 | "source": [
198 | "# Set x,y values for each group\n",
199 | "gender0 = 0 # value of first group\n",
200 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze() # Grabs y values for Gender =0\n",
201 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
202 | "x0 = np.ones(len(y0))*gender0 +(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
203 | "\n",
204 | "gender1 = 1 # value of second group\n",
205 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n",
206 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n",
207 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n",
208 | "\n",
209 | "fig,ax = plt.subplots(figsize=(5,5))\n",
210 | "ax.scatter(x0,y0,color='lightcoral')\n",
211 | "ax.scatter(x1,y1,color='lightcoral')\n",
212 | "ax.set_ylabel('scores')\n",
213 | "ax.set_xlabel('gender')\n",
214 | "ax.set_title('Values are now dispersed')\n",
215 | "ax.set_xticks([0,1])\n",
216 | "ax.set_xticklabels(['Male','Female'])\n",
217 | "ax.set_xlim([-.5,1.5]) \n",
218 | "ax.grid() # puts grid on\n",
219 | "plt.show()"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "collapsed": true
226 | },
227 | "source": [
228 | "# Drawing trend line on a scatterplot"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "ExecuteTime": {
236 | "end_time": "2019-03-27T14:38:31.813333Z",
237 | "start_time": "2019-03-27T14:38:31.403344Z"
238 | }
239 | },
240 | "outputs": [],
241 | "source": [
242 | "import statsmodels.formula.api as smf\n",
243 | "import statsmodels.api as sm\n",
244 | "\n",
245 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n",
246 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n",
247 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n",
248 | "df = pd.DataFrame(d)\n",
249 | "lm = smf.ols(formula = \"scores ~ gender\",data=df).fit()\n",
250 | "print(lm.summary())\n",
251 | "\n",
252 | "# Save the slope for gender to b1 and intercept to b0\n",
253 | "b1 = lm.params[1] # This is slope\n",
254 | "b0 = lm.params[0] # This is intercept"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "ExecuteTime": {
262 | "end_time": "2019-03-27T14:38:34.895112Z",
263 | "start_time": "2019-03-27T14:38:34.792217Z"
264 | }
265 | },
266 | "outputs": [],
267 | "source": [
268 | "# Set x,y values for each group\n",
269 | "gender0 = 0 # value of first group\n",
270 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze()\n",
271 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
272 | "x0 = np.ones(len(y0))*gender0 + (np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
273 | "\n",
274 | "gender1 = 1 # value of second group\n",
275 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n",
276 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n",
277 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n",
278 | "\n",
279 | "fig,ax = plt.subplots(figsize=(5,5))\n",
280 | "ax.scatter(x0,y0,color='lightcoral')\n",
281 | "ax.scatter(x1,y1,color='lightcoral')\n",
282 | "\n",
283 | "# Part that adds the line\n",
284 | "spacing = 10\n",
285 | "minx = df[['gender']].min().squeeze()\n",
286 | "maxx = df[['gender']].max().squeeze()\n",
287 | "lx = np.linspace(minx,maxx,spacing) # make x coordinates \n",
288 | "ly = b0+lx*b1 # Estimate the y values using betas\n",
289 | "ax.plot(lx,ly,'-k')\n",
290 | "\n",
291 | "ax.set_ylabel('scores')\n",
292 | "ax.set_xlabel('gender')\n",
293 | "ax.set_title('Values are now dispersed')\n",
294 | "ax.set_xticks([0,1])\n",
295 | "ax.set_xticklabels(['Male','Female'])\n",
296 | "ax.set_xlim([-.5,1.5]) \n",
297 | "ax.grid()\n",
298 | "plt.show()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": []
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": []
314 | }
315 | ],
316 | "metadata": {
317 | "kernelspec": {
318 | "display_name": "Python 3",
319 | "language": "python",
320 | "name": "python3"
321 | },
322 | "language_info": {
323 | "codemirror_mode": {
324 | "name": "ipython",
325 | "version": 3
326 | },
327 | "file_extension": ".py",
328 | "mimetype": "text/x-python",
329 | "name": "python",
330 | "nbconvert_exporter": "python",
331 | "pygments_lexer": "ipython3",
332 | "version": "3.7.2"
333 | },
334 | "toc": {
335 | "base_numbering": 1,
336 | "nav_menu": {},
337 | "number_sections": true,
338 | "sideBar": true,
339 | "skip_h1_title": false,
340 | "title_cell": "Table of Contents",
341 | "title_sidebar": "Contents",
342 | "toc_cell": false,
343 | "toc_position": {},
344 | "toc_section_display": true,
345 | "toc_window_display": false
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 1
350 | }
351 |
--------------------------------------------------------------------------------
/Notebooks/3_Introduction_to_Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Linear Regression Analysis\n",
11 | "\n",
12 | "*Written by Jin Cheong & Luke Chang*\n",
13 | "\n",
14 | "In this lab we are going to learn how to do simple data analyses using python. \n",
15 | "This module includes learning how to run simple linear regression models, comparing models, & visualizing fits. - This notebook was adapted from material available [here](https://github.com/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb)\n",
16 | "\n",
17 | "After the tutorial you will have the chance to apply the methods to a new set of data. \n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {
24 | "ExecuteTime": {
25 | "end_time": "2017-01-18T14:44:10.530703",
26 | "start_time": "2017-01-18T14:44:09.731729"
27 | },
28 | "collapsed": true,
29 | "deletable": true,
30 | "editable": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "%matplotlib inline\n",
35 | "\n",
36 | "import numpy as np\n",
37 | "import pandas as pd\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import statsmodels.api as sm\n",
40 | "import statsmodels.formula.api as smf\n",
41 | "import statsmodels.stats.api as stats"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {
47 | "deletable": true,
48 | "editable": true
49 | },
50 | "source": [
51 | "We will load the data which is a comma delimited text file using the `read_csv()` method from pandas. \n",
52 | "The '../Data' indicates that we will move one folder up and then into the Data folder."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "ExecuteTime": {
60 | "end_time": "2017-01-18T14:44:10.536720",
61 | "start_time": "2017-01-18T14:44:10.532385"
62 | },
63 | "collapsed": false,
64 | "deletable": true,
65 | "editable": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "# Load the data\n",
70 | "df = pd.read_csv('../Data/salary.csv')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "ExecuteTime": {
78 | "end_time": "2017-01-18T14:44:10.543760",
79 | "start_time": "2017-01-18T14:44:10.537887"
80 | },
81 | "collapsed": false,
82 | "deletable": true,
83 | "editable": true
84 | },
85 | "outputs": [],
86 | "source": [
87 | "df = df.dropna()\n",
88 | "df = df[df.gender!=2]"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "ExecuteTime": {
96 | "end_time": "2017-01-18T14:44:10.554847",
97 | "start_time": "2017-01-18T14:44:10.545364"
98 | },
99 | "collapsed": false,
100 | "deletable": true,
101 | "editable": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "print('There are %i rows and %i columns in this data set' % df.shape)\n",
106 | "df.isnull().sum()"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {
113 | "ExecuteTime": {
114 | "end_time": "2017-01-18T14:44:10.765524",
115 | "start_time": "2017-01-18T14:44:10.556141"
116 | },
117 | "collapsed": false,
118 | "deletable": true,
119 | "editable": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "df[['salary','years']].plot(kind='scatter', x='years', y='salary')"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "ExecuteTime": {
131 | "end_time": "2017-01-18T14:44:10.810764",
132 | "start_time": "2017-01-18T14:44:10.767183"
133 | },
134 | "collapsed": false,
135 | "deletable": true,
136 | "editable": true
137 | },
138 | "outputs": [],
139 | "source": [
140 | "# Oneshot visualization\n",
141 | "## create a new numerical Series called dept_num !! Just for visualization !! \n",
142 | "df['dept_num'] = df.departm.map({'bio':0, 'chem':1,'geol':2,'neuro':3,'stat':4,'physics':5,'math':6})\n",
143 | "df.tail()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "ExecuteTime": {
151 | "end_time": "2017-01-18T14:44:11.455813",
152 | "start_time": "2017-01-18T14:44:10.828546"
153 | },
154 | "collapsed": false,
155 | "deletable": true,
156 | "editable": true
157 | },
158 | "outputs": [],
159 | "source": [
160 | "## Now plot all four categories\n",
161 | "fig, axs = plt.subplots(1, 4, sharey=True)\n",
162 | "df.plot(kind='scatter', x='gender', y='salary', ax=axs[0], figsize=(16, 4))\n",
163 | "df.plot(kind='scatter', x='dept_num', y='salary', ax=axs[1])\n",
164 | "df.plot(kind='scatter', x='years', y='salary', ax=axs[2])\n",
165 | "df.plot(kind='scatter', x='age', y='salary', ax=axs[3])\n",
166 | "# The problem is that it treats department as a continuous variable. "
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "deletable": true,
173 | "editable": true
174 | },
175 | "source": [
176 | "## Linear Regression\n",
177 | "\n",
178 | "Now let's move on to running an analysis.\n",
179 | "\n",
180 | "Simple linear regression is an approach for predicting a **quantitative response** using a **single feature** (or \"predictor\" or \"input variable\"). It takes the following form:\n",
181 | "\n",
182 | "$y = \\beta_0 + \\beta_1\\cdot x $\n",
183 | "\n",
184 | "What does each term represent?\n",
185 | "- $y$ is the response\n",
186 | "- $x$ is the feature\n",
187 | "- $\\beta_0$ is the intercept\n",
188 | "- $\\beta_1$ is the coefficient for x\n",
189 | "\n",
190 | "Together, $\\beta_0$ and $\\beta_1$ are called the **model coefficients**. To create your model, you must \"estimate\" the values of these coefficients. And once we've estimated these parameters, we can use the model to predict things!"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {
196 | "deletable": true,
197 | "editable": true
198 | },
199 | "source": [
200 | "## Estimating Model Coefficients\n",
201 | "\n",
202 | "Generally speaking, coefficients are estimated using the **least squares criterion**, which means we find the line (mathematically) that minimizes the **sum of squared residuals** (or \"sum of squared errors\"):\n",
203 | "\n",
204 | "
\n",
205 | "\n",
206 | "What elements are present in the diagram?\n",
207 | "- The black dots are the **observed values** of x and y.\n",
208 | "- The blue line is our **least squares line**.\n",
209 | "- The red lines are the **residuals**, which are the distances between the observed values and the least squares line.\n",
210 | "\n",
211 | "How do the model coefficients relate to the least squares line?\n",
212 | "- $\\beta_0$ is the **intercept** (the value of $y$ when $x$=0)\n",
213 | "- $\\beta_1$ is the **slope** (the change in $y$ divided by change in $x$)\n",
214 | "\n",
215 | "Here is a graphical depiction of those calculations:\n",
216 | "\n",
217 | "
"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "deletable": true,
224 | "editable": true
225 | },
226 | "source": [
227 | "To run a regression we will be using the statsmodels module that was loaded above. We will first initialize a model object with the regression equation and the data to use. The format for specifying the regression equation is similar to R. (y ~ x).\n",
228 | "\n",
229 | "Here we will estimate the effect of gender on salary. Gender is a \"dummy variable\", meaning that it is only ones and zeros.\n",
230 | "\n",
231 | "After we have initialized the object instance we can run the fit method (this can be chained so you only need to run one line). After the parameters have been estimated we can query attributes of the model such as the estimated model parameters."
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "ExecuteTime": {
239 | "end_time": "2017-01-18T14:44:11.484659",
240 | "start_time": "2017-01-18T14:44:11.473799"
241 | },
242 | "collapsed": false,
243 | "deletable": true,
244 | "editable": true,
245 | "scrolled": false
246 | },
247 | "outputs": [],
248 | "source": [
249 | "lm = smf.ols(formula = \"salary ~ gender\",data=df).fit()\n",
250 | "\n",
251 | "# print the coefficients\n",
252 | "print(lm.params)\n",
253 | "print(lm.params[1])"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {
259 | "deletable": true,
260 | "editable": true
261 | },
262 | "source": [
263 | "## Interpreting Model Coefficients\n",
264 | "\n",
265 | "How do we interpret the estimated coefficients for this model?\n",
266 | "- $\\beta_0$, or the intercept, reflects the average salary for the reference condition of the dummy code (i.e., when gender = 0). This means that males make an average of `$69108.5`\n",
267 | "- to get the estimated female salary we need to add $\\beta_1$ (-13388.83), which becomes `$55719.67`\n",
268 | "\n",
269 | "We can also get a summary of all of the model statistics using the `summary()` method."
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {
275 | "deletable": true,
276 | "editable": true
277 | },
278 | "source": [
279 | " "
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "ExecuteTime": {
287 | "end_time": "2017-01-18T14:44:11.965467",
288 | "start_time": "2017-01-18T14:44:11.954564"
289 | },
290 | "collapsed": false,
291 | "deletable": true,
292 | "editable": true
293 | },
294 | "outputs": [],
295 | "source": [
296 | "print(lm.summary())"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {
302 | "deletable": true,
303 | "editable": true
304 | },
305 | "source": [
306 | "## Confidence in our Model\n",
307 | "\n",
308 | "Now let's examine the output. To learn more about Statsmodels and how to interpret the output, DataRobot has some decent posts on [simple linear regression](http://www.datarobot.com/blog/ordinary-least-squares-in-python/) and [multiple linear regression](http://www.datarobot.com/blog/multiple-regression-using-statsmodels/).\n",
309 | "\n",
310 | "\n",
311 | "**Question:** Is linear regression a high bias/low variance model, or a low bias/high variance model?\n",
312 | "\n",
313 | "**Answer:** High bias/low variance. Under repeated sampling, the line will stay roughly in the same place (low variance), but the average of those models won't do a great job capturing the true relationship (high bias). Note that low variance is a useful characteristic when you don't have a lot of training data!\n",
314 | "\n",
315 | "A closely related concept is **confidence intervals**. Statsmodels calculates 95% confidence intervals for our model coefficients, which are interpreted as follows: If the population from which this sample was drawn was **sampled 100 times**, approximately **95 of those confidence intervals** would contain the \"true\" coefficient.\n",
316 | "\n",
317 | "## Hypothesis Testing and p-values\n",
318 | "\n",
319 | "Closely related to confidence intervals is **hypothesis testing**. Generally speaking, you start with a **null hypothesis** and an **alternative hypothesis** (that is opposite the null). Then, you check whether the data supports **rejecting the null hypothesis** or **failing to reject the null hypothesis**.\n",
320 | "\n",
321 | "(Note that \"failing to reject\" the null is not the same as \"accepting\" the null hypothesis. The alternative hypothesis may indeed be true, except that you just don't have enough data to show that.)\n",
322 | "\n",
323 | "As it relates to model coefficients, here is the conventional hypothesis test:\n",
324 | "- **null hypothesis:** There is no relationship between gender and salary (and thus $\\beta_1$ equals zero)\n",
325 | "- **alternative hypothesis:** There is a relationship between gender and salary (and thus $\\beta_1$ is not equal to zero)\n",
326 | "\n",
327 | "How do we test this hypothesis? Intuitively, we reject the null (and thus believe the alternative) if the 95% confidence interval **does not include zero**. Conversely, the **p-value** represents the probability of observing data at least this extreme if the coefficient were actually zero:\n",
328 | "\n",
329 | "## Overall Model Fit\n",
330 | "\n",
331 | "The overall goodness of how well the model fits the data can be described using the AIC or BIC which are information criterions that penalize for the number of free parameters in the model. A common way to evaluate the overall fit of a linear model is **$R^2$** value. $R^2$ is the **proportion of variance explained**, meaning the proportion of variance in the observed data that is explained by the model, or the reduction in error over the **null model**. (The null model just predicts the mean of the observed response, and thus it has an intercept and no slope.)\n",
332 | "\n",
333 | "$R^2$ is between 0 and 1, and higher is better because it means that more variance is explained by the model. Here's an example of what R-squared \"looks like\":\n",
334 | "\n",
335 | "In the model above, we see that $R^2$=0.09, which means that gender does not do a great job at predicting the overall salary."
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {
341 | "deletable": true,
342 | "editable": true
343 | },
344 | "source": [
345 | "## Multiple Linear Regression\n",
346 | "\n",
347 | "Simple linear regression can easily be extended to include multiple features. This is called **multiple linear regression**:\n",
348 | "\n",
349 | "$y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$\n",
350 | "\n",
351 | "Each $x$ represents a different feature, and each feature has its own coefficient. In this case:\n",
352 | "\n",
353 | "$Salary = \\beta_0 + \\beta_1 \\cdot Gender + \\beta_2 \\cdot Years$\n",
354 | "\n",
355 | "Let's use Statsmodels to estimate these coefficients:"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "ExecuteTime": {
363 | "end_time": "2017-01-18T14:44:12.437296",
364 | "start_time": "2017-01-18T14:44:12.419615"
365 | },
366 | "collapsed": false,
367 | "deletable": true,
368 | "editable": true
369 | },
370 | "outputs": [],
371 | "source": [
372 | "lm2 = smf.ols(formula = \"salary ~ gender + years\",data=df).fit()\n",
373 | "print(lm2.summary())"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {
379 | "deletable": true,
380 | "editable": true
381 | },
382 | "source": [
383 | "Here we see that `years` also significantly explains additional variance.\n",
384 | "\n",
385 | "This is reflected in the $R^2$, which has now increased from 0.09 to 0.143. It is important to note that **$R^2$ will always increase as you add more features to the model**, even if they are unrelated to the response. Thus, selecting the model with the highest R-squared is not a reliable approach for choosing the best linear model.\n",
386 | "\n",
387 | "There is an alternative to $R^2$ called **adjusted $R^2$** that penalizes model complexity (to control for overfitting), but it generally [under-penalizes complexity](http://scott.fortmann-roe.com/docs/MeasuringError.html).\n",
388 | "\n",
389 | "Next week we will explore another approach known as **Cross-validation.** Importantly, cross-validation can be applied to any model, whereas the methods described above only apply to linear models.\n",
390 | "\n",
391 | "Here, we will try and see if the new model significantly explains additional variance controlling for the extra free parameter using a nested model comparison."
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {
398 | "ExecuteTime": {
399 | "end_time": "2017-01-18T14:44:12.769250",
400 | "start_time": "2017-01-18T14:44:12.755716"
401 | },
402 | "collapsed": false,
403 | "deletable": true,
404 | "editable": true
405 | },
406 | "outputs": [],
407 | "source": [
408 | "print(stats.anova_lm(lm,lm2))"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {
414 | "deletable": true,
415 | "editable": true
416 | },
417 | "source": [
418 | "We see that the addition of `years` in model2 significantly explains additional variance when compared to model 1. Notice that this is similar to the results provided by the t-tests on the individual parameters. Using model comparison to test incrementally adding a single predictor is a feature selection method known as stepwise regression. However, you can also perform model comparison on models with many different features as long as one is nested within the other.\n",
419 | "\n",
420 | "When we only had one predictor we could investigate the relationship using a scatterplot. Now that we have 2 predictors, we need to make a 3 dimensional plot to see the effect of each predictor on salary. "
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {
427 | "ExecuteTime": {
428 | "end_time": "2017-01-18T14:44:13.989664",
429 | "start_time": "2017-01-18T14:44:13.091418"
430 | },
431 | "collapsed": false,
432 | "deletable": true,
433 | "editable": true
434 | },
435 | "outputs": [],
436 | "source": [
437 | "from mpl_toolkits.mplot3d import Axes3D\n",
438 | "\n",
439 | "fig = plt.figure(figsize=(18,6))\n",
440 | "idx = [131,132,133]\n",
441 | "pos = {0:(-35,20),1:(-90,0),2:(0,0)}\n",
442 | "for i,ii in enumerate(idx):\n",
443 | " ax = fig.add_subplot(ii, projection='3d')\n",
444 | "\n",
445 | " # generate a mesh\n",
446 | " x_surf = np.arange(0,11)/10.0\n",
447 | " y_surf = np.arange(0,35,3)\n",
448 | " x_surf, y_surf = np.meshgrid(x_surf, y_surf)\n",
449 | "\n",
450 | " exog = pd.core.frame.DataFrame({'gender': x_surf.ravel(), 'years': y_surf.ravel()})\n",
451 | " exog['intercept'] = np.ones(exog.shape[0])\n",
452 | " out = lm2.predict(exog = exog,transform=True)\n",
453 | "\n",
454 | " ax.plot_surface(x_surf, y_surf,\n",
455 | " out.reshape(x_surf.shape),\n",
456 | " rstride=1,\n",
457 | " cstride=1,\n",
458 | " color='None',\n",
459 | " alpha = 0.2)\n",
460 | "\n",
461 | " ax.scatter(df['gender'], df['years'], df['salary'],\n",
462 | " c='blue',\n",
463 | " marker='o',\n",
464 | " alpha=1)\n",
465 | "\n",
466 | " ax.set_xlabel('Gender')\n",
467 | " ax.set_ylabel('Years')\n",
468 | " ax.set_zlabel('Salary')\n",
469 | " ax.azim = pos[i][0]\n",
470 | " ax.elev = pos[i][1]\n",
471 | "\n",
472 | "plt.tight_layout()"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {
478 | "deletable": true,
479 | "editable": true
480 | },
481 | "source": [
482 | "Notice how when we rotate the 3D plot, we can make projections into 2 dimensional space. This is the effect of each predictor on salary controlling for the other. \n",
483 | "\n",
484 | "We can also generate additional diagnostic plots."
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {
491 | "ExecuteTime": {
492 | "end_time": "2017-01-18T14:44:35.826100",
493 | "start_time": "2017-01-18T14:44:35.156463"
494 | },
495 | "collapsed": false,
496 | "deletable": true,
497 | "editable": true,
498 | "scrolled": false
499 | },
500 | "outputs": [],
501 | "source": [
502 | "f = sm.graphics.plot_regress_exog(lm2, 'years')\n",
503 | "f.set_size_inches(10,5)"
504 | ]
505 | },
506 | {
507 | "cell_type": "markdown",
508 | "metadata": {
509 | "deletable": true,
510 | "editable": true
511 | },
512 | "source": [
513 | "Lastly, we can create a standard ANOVA table with F statistics. The F statistic tests whether at least one of the coefficients of a categorical variable is significantly different from 0.\n",
514 | "\n",
515 | "There are different types of Sum of Squared Error (SSQ) to consider. We recommend using Type 3 by default. See [here](https://mcfromnz.wordpress.com/2011/03/02/anova-type-iiiiii-ss-explained/) for more in depth discussion."
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "ExecuteTime": {
523 | "end_time": "2017-01-18T14:44:14.602669",
524 | "start_time": "2017-01-18T14:44:14.590320"
525 | },
526 | "collapsed": false,
527 | "deletable": true,
528 | "editable": true
529 | },
530 | "outputs": [],
531 | "source": [
532 | "# Type 3 SSQ: valid for models with significant interaction terms\n",
533 | "print('ANOVA table for Type 3 SSQ')\n",
534 | "print(stats.anova_lm(lm2,typ=3))"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {
540 | "deletable": true,
541 | "editable": true
542 | },
543 | "source": [
544 | "## Additional Resources for Learning about Regression\n",
545 | "\n",
546 | "- To go much more in-depth on linear regression, read Chapter 3 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/). Alternatively, watch the [related videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/) or read a [quick reference guide](http://www.dataschool.io/applying-and-interpreting-linear-regression/) to the key points in that chapter.\n",
547 | "- This [introduction to linear regression](http://people.duke.edu/~rnau/regintro.htm) is much more detailed and mathematically thorough, and includes lots of good advice.\n",
548 | "- This is a relatively quick post on the [assumptions of linear regression](http://pareonline.net/getvn.asp?n=2&v=8)."
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {
554 | "collapsed": false,
555 | "deletable": true,
556 | "editable": true
557 | },
558 | "source": [
559 | "# Exercises\n",
560 | "Use the 'salary_exercise.csv' to apply the analyses we learned today.\n",
561 | "This dataset was adapted from material available [here](http://data.princeton.edu/wws509/datasets/#salary)\n",
562 | "\n",
563 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n",
564 | "\n",
565 | "- sx = Sex, coded 1 for female and 0 for male\n",
566 | "- rk = Rank, coded\n",
567 | " - 1 for assistant professor,\n",
568 | " - 2 for associate professor, and\n",
569 | " - 3 for full professor\n",
570 | "- yr = Number of years in current rank\n",
571 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n",
572 | "- yd = Number of years since highest degree was earned\n",
573 | "- sl = Academic year salary, in dollars.\n",
574 | "\n",
575 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.\n",
576 | "##### Make sure you check the data for missing values or other errors before beginning your analysis.\n",
577 | "\n",
578 | "## Exercise 1\n",
579 | "Run a simple linear regression model in which you predict salary from sex of the professors. \n",
580 | "Is there a gender difference in salary? \n",
581 | "\n",
582 | "## Exercise 2\n",
583 | "Experiment with different combination of variables in your model to answer the following questions. \n",
584 | "What is your best model to explain salary from the given data? \n",
585 | "Is there a gender difference in salary?"
586 | ]
587 | }
588 | ],
589 | "metadata": {
590 | "anaconda-cloud": {},
591 | "kernelspec": {
592 | "display_name": "Python 2",
593 | "language": "python",
594 | "name": "python2"
595 | },
596 | "language_info": {
597 | "codemirror_mode": {
598 | "name": "ipython",
599 | "version": 2
600 | },
601 | "file_extension": ".py",
602 | "mimetype": "text/x-python",
603 | "name": "python",
604 | "nbconvert_exporter": "python",
605 | "pygments_lexer": "ipython2",
606 | "version": "2.7.13"
607 | },
608 | "toc": {
609 | "nav_menu": {
610 | "height": "262px",
611 | "width": "252px"
612 | },
613 | "navigate_menu": true,
614 | "number_sections": true,
615 | "sideBar": true,
616 | "threshold": 4,
617 | "toc_cell": false,
618 | "toc_section_display": "block",
619 | "toc_window_display": false
620 | }
621 | },
622 | "nbformat": 4,
623 | "nbformat_minor": 0
624 | }
625 |
--------------------------------------------------------------------------------
/Notebooks/4_Mediation_&_Moderation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Mediation Analysis\n",
11 | "Written by Jin Cheong & Luke Chang\n",
12 | "\n",
13 | "A mediation analysis is conducted when a researcher is interested in the **mechanism** underlying how variable X has an effect on variable Y. It attempts to make a causal inference that a direct effect might be better explained by an indirect effect through a mediating variable. \n",
14 | "\n",
15 | "Consider the instance below where X has an effect **c** on Y:\n",
16 | "\n",
17 | "### 1) $Y = \\beta_1 + c \\cdot X $\n",
18 | "\n",
19 | "In this model, there may be a third variable **M** which mediates the effect of X on Y. In other words, the variable M is partially responsible for the effect X has on Y.\n",
20 | "\n",
21 | "To conduct a mediation analysis one estimates two additional models: 1) the effect of X on M, and 2) the effect of X *and* M on Y. \n",
22 | "\n",
23 | "### 2) $M = \\beta_2 + a \\cdot X $\n",
24 | "\n",
25 | "### 3) $Y = \\beta_3 + c' \\cdot X + b \\cdot M $ \n",
26 | " \n",
27 | " \n",
28 | "Now the direct effect of X on Y, denoted as **c**, can be broken down into two parts:\n",
29 | "\n",
30 | "### $c = a \\cdot b + c' $\n",
31 | "\n",
32 | "$ a \\cdot b $ is the indirect effect of X on Y via the mediator. \n",
33 | "$c'$ is the remaining direct effect of X on Y controlling for M. \n",
34 | "\n",
35 | "This relationship is depicted below. Note that M and Y both also have error included in the model.\n",
36 | "\n",
37 | "***Question*** Why does X not have error included in the model?\n",
38 | "\n",
39 | "***Answer*** Because X is only a regressor not an outcome variable in the models and standard regression does not estimate error on regressors. See [orthogonal regression](http://davegiles.blogspot.com/2014/11/orthogonal-regression-first-steps.html) for a technique that models error on both X and Y.\n",
40 | "\n",
41 | "
\n",
42 | "\n",
43 | "\n",
44 | "### Here are a few examples of mediations. Can you think of more?\n",
45 | "\n",
46 | "1) The effect of failure on depressed feelings is mediated by internalization of failure. \n",
47 | "\n",
48 | "2) Effect of CBT treatment is mediated by changes in cognition. \n",
49 | "\n",
50 | "3) Effect of food intake on weight gain is mediated by metabolism.\n",
51 | "\n",
52 | "#### For more information there is a nice [tutorial](http://davidakenny.net/cm/mediate.htm) on mediation by David Kenny, one of the authors of the original mediation paper."
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "deletable": true,
59 | "editable": true
60 | },
61 | "source": [
62 | "## Simulate a mediation\n",
63 | "\n",
64 | "In this section we will simulate a mediation.\n",
65 | "\n",
66 | "This is a case in which the true effect of X on Y is positive, but appears negative without testing for mediation. "
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "ExecuteTime": {
74 | "end_time": "2017-02-08T11:04:14.080567",
75 | "start_time": "2017-02-08T11:04:14.071762"
76 | },
77 | "collapsed": false,
78 | "deletable": true,
79 | "editable": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "%matplotlib inline\n",
84 | "\n",
85 | "import numpy as np\n",
86 | "import pandas as pd\n",
87 | "import matplotlib.pyplot as plt\n",
88 | "import statsmodels.formula.api as smf\n",
89 | "import statsmodels.api as sm\n",
90 | "from scipy import stats\n",
91 | "\n",
92 | "def sobel_test(a, b, se_a, se_b):\n",
93 | " '''\n",
94 | " Sobel test for significance of mediation\n",
95 | " \n",
96 | " Args: \n",
97 | " a: coefficient from X to mediator variable, M\n",
98 | " b: coefficient from M to Y \n",
99 | " se_a: Standard error of A\n",
100 | "        se_b: Standard error of b\n",
101 | "\n",
102 | " Returns: \n",
103 | " t: Sobel's test statistic\n",
104 | " pval : Two-tailed probability assuming normal distribution\n",
105 | " '''\n",
106 | " \n",
107 | " SE = np.sqrt( (a**2)*(se_a**2) + (b**2)*(se_b**2))\n",
108 | " t = (a*b) / SE\n",
109 | " n = 100000000\n",
110 | " pval = stats.t.sf(np.abs(t), n-1)*2\n",
111 | " return t, pval"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "ExecuteTime": {
119 | "end_time": "2017-02-08T11:04:14.211563",
120 | "start_time": "2017-02-08T11:04:14.206080"
121 | },
122 | "collapsed": false,
123 | "deletable": true,
124 | "editable": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "# set random seed so everyone gets same results\n",
129 | "np.random.seed(1)\n",
130 | "\n",
131 | "# Determine effects\n",
132 | "a = -3 # effect of x to M\n",
133 | "b = 3 # effect of M to y\n",
134 | "cq = .1 # effect of x on y controlling for M\n",
135 | "\n",
136 | "# Create a random data x\n",
137 | "x = np.random.rand(100) \n",
138 | "m = x * a + np.random.rand(100)\n",
139 | "\n",
140 | "# Create Y\n",
141 | "y = np.dot(np.array([x,m]).T,[cq,b]) + np.random.rand(100)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "ExecuteTime": {
149 | "end_time": "2017-02-08T11:04:14.550299",
150 | "start_time": "2017-02-08T11:04:14.364989"
151 | },
152 | "collapsed": false,
153 | "deletable": true,
154 | "editable": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "plt.scatter(x,y)\n",
159 | "plt.xlabel('X')\n",
160 | "plt.ylabel('Y')\n",
161 | "plt.title('X -> Y')"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "ExecuteTime": {
169 | "end_time": "2017-02-08T11:04:14.728465",
170 | "start_time": "2017-02-08T11:04:14.551956"
171 | },
172 | "collapsed": false,
173 | "deletable": true,
174 | "editable": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "plt.scatter(x,m)\n",
179 | "plt.xlabel('X')\n",
180 | "plt.ylabel('M')\n",
181 | "plt.title('X -> M')"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {
188 | "ExecuteTime": {
189 | "end_time": "2017-02-08T11:04:15.133448",
190 | "start_time": "2017-02-08T11:04:14.954491"
191 | },
192 | "collapsed": false,
193 | "deletable": true,
194 | "editable": true
195 | },
196 | "outputs": [],
197 | "source": [
198 | "plt.scatter(m,y)\n",
199 | "plt.xlabel('M')\n",
200 | "plt.ylabel('Y')\n",
201 | "plt.title('M -> Y')"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "deletable": true,
208 | "editable": true
209 | },
210 | "source": [
211 | "### 1) Test effect of X on Y"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "ExecuteTime": {
219 | "end_time": "2017-02-08T11:04:15.148139",
220 | "start_time": "2017-02-08T11:04:15.135054"
221 | },
222 | "collapsed": false,
223 | "deletable": true,
224 | "editable": true
225 | },
226 | "outputs": [],
227 | "source": [
228 | "X = pd.DataFrame({'Intercept':np.ones(len(x)),'X':x})\n",
229 | "lm1 = smf.OLS(y,X).fit()\n",
230 | "print(lm1.summary())\n",
231 | "ec = lm1.params[1] # save total effect c to ec"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {
237 | "deletable": true,
238 | "editable": true
239 | },
240 | "source": [
241 | "### 2) Test effect of X on M"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "ExecuteTime": {
249 | "end_time": "2017-02-08T11:04:15.410553",
250 | "start_time": "2017-02-08T11:04:15.397131"
251 | },
252 | "collapsed": false,
253 | "deletable": true,
254 | "editable": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "lm2 = smf.OLS(m,X).fit()\n",
259 | "print(lm2.summary())\n",
260 | "ea = lm2.params[1] # Save the effect of X on M, a, to ea\n",
261 | "sea = lm2.bse[1]"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "deletable": true,
268 | "editable": true
269 | },
270 | "source": [
271 | "### 3) Test effect of X and M on Y "
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "ExecuteTime": {
279 | "end_time": "2017-02-08T11:04:15.737957",
280 | "start_time": "2017-02-08T11:04:15.724569"
281 | },
282 | "collapsed": false,
283 | "deletable": true,
284 | "editable": true
285 | },
286 | "outputs": [],
287 | "source": [
288 | "X['M'] = m\n",
289 | "lm3 = smf.OLS(y,X).fit()\n",
290 | "print lm3.summary()\n",
291 | "ecq,eb = lm3.params[1:3]\n",
292 | "seb = lm3.bse[2]"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {
298 | "deletable": true,
299 | "editable": true
300 | },
301 | "source": [
302 | "### Show how the effect is broken down to direct and indirect effects\n",
303 | "Recall how the overall effect C was decomposed to indirect effect (a*b) and direct effect (c')\n",
304 | "### $c = a \\cdot b + c' $"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "ExecuteTime": {
312 | "end_time": "2017-02-08T11:04:16.057799",
313 | "start_time": "2017-02-08T11:04:16.053220"
314 | },
315 | "collapsed": false,
316 | "deletable": true,
317 | "editable": true
318 | },
319 | "outputs": [],
320 | "source": [
321 | "print('c : %.2f') % ec\n",
322 | "print('a : %.2f') % ea\n",
323 | "print('b : %.2f') % eb\n",
324 | "print('c\\' : %.2f') % ecq"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "ExecuteTime": {
332 | "end_time": "2017-02-08T11:04:16.227940",
333 | "start_time": "2017-02-08T11:04:16.222887"
334 | },
335 | "collapsed": false,
336 | "deletable": true,
337 | "editable": true
338 | },
339 | "outputs": [],
340 | "source": [
341 | "print('Total effect C: %.2f') % ec\n",
342 | "print('is decomposed into the indirect(mediated) effect a*b: %.2f') % (ea*eb)\n",
343 | "print('plus the direct effect c\\': %.2f') % ecq\n",
344 | "print('which adds up to %.2f') % (ea*eb+ecq)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "deletable": true,
351 | "editable": true
352 | },
353 | "source": [
354 | "## Run a Sobel Test for Significance of Mediation\n",
355 | "\n",
356 |     "One way to test the significance of a mediation is to perform a Sobel test, where the indirect effect (a*b) is divided by an estimate of its standard error. This assumes that the product would be normally distributed, which may not always be the case. \n",
357 | "\n",
358 |     "An alternative method is to bootstrap with replacement on the observed data to generate a 95% confidence interval. You can try this by writing a for-loop that resamples from the data and generates a distribution of the indirect effects (a*b). If the confidence interval does not include 0, the mediation can be considered significant. "
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "ExecuteTime": {
366 | "end_time": "2017-02-08T11:04:16.587230",
367 | "start_time": "2017-02-08T11:04:16.582745"
368 | },
369 | "collapsed": false,
370 | "deletable": true,
371 | "editable": true
372 | },
373 | "outputs": [],
374 | "source": [
375 | "t,p = sobel_test(ea,eb,sea,seb)\n",
376 | "print('Sobel\\'s test of significance t = %2.2f') % t\n",
377 | "print('Two-tailed p-value p = %2.5f ') % p"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "deletable": true,
384 | "editable": true
385 | },
386 | "source": [
387 | "# Moderation Analysis\n",
388 | "\n",
389 | "In a moderation analysis, the moderator modifies or changes the relationship between two variables, akin to an interaction term. Moderation is slightly different from an interaction due to the additional constraint that there is a causal relationship from X to Y, BUT not from Z to Y. Therefore, a moderation implies an interaction exists but an interaction does not imply a moderation. \n",
390 | "\n",
391 | "Here is a schematic representation of a moderation relationship. \n",
392 |     "This diagram hypothesizes that Stress has a causal relationship to Depression \n",
393 | "but the effect of Stress is different for people with high or low Social Support \n",
394 | "\n",
395 |     "<img src='Figures/moderator1.gif'>\n",
396 | "\n",
397 |     "This can be represented by an interaction, \n",
398 | "\n",
399 | "\n",
400 | "\n",
401 |     "<img src='Figures/moderator2.gif'>\n",
402 | "\n",
403 | "\n",
404 | "The pictures have been retrieved from [here](http://www.victoria.ac.nz/psyc/paul-jose-files/helpcentre/help5_moderation_example.php)\n",
405 | "\n",
406 | "### Here are a few examples of moderations. Can you think of more?\n",
407 | "\n",
408 | "1) The effect of compliments on future grades is moderated by growth mindset [(Carol Dweck)](http://mindsetonline.com/)\n",
409 | "\n",
410 | "2) Effect of favorability on government behavior is moderated by political affiliation. \n",
411 | "\n",
412 | "3) Effect of pressure on performance is moderated by confidence (choking vs boosting). \n",
413 | "\n",
414 | "### For more information look at [homepage of Kenny](http://davidakenny.net/cm/moderation.htm) who started all this. "
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {
420 | "deletable": true,
421 | "editable": true
422 | },
423 | "source": [
424 | "## Example \n",
425 | "Here we examine whether the effect of buying books (**Buy**) on enjoyment of reading (**Enjoy**) is moderated by frequency of reading (**Read**).\n",
426 | "\n",
427 | "The moderation effect exists if there is an interaction of buying and reading on enjoyment."
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {
434 | "ExecuteTime": {
435 | "end_time": "2017-02-08T11:08:49.941382",
436 | "start_time": "2017-02-08T11:08:49.931106"
437 | },
438 | "collapsed": true
439 | },
440 | "outputs": [],
441 | "source": [
442 | "df = pd.DataFrame({\n",
443 | " 'Enjoy':[4, 16, 4, 12, 9, 5, 15, 21, 3, 4, 8, 11, 7, 5, 8, 19, 11, 9, 9, 13, 11, 21, 18, 12, 15, 3, 2, 10, 7, 9, 5, 6, 9, 12, 9, 5, 17, 15, 9, 7, 5, 10, 10, 6, 7, 9, 12, 2, 1, 5, 7, 5, 8, 5, 5, 11, 8, 9, 13, 9, 19, 8, 21, 1, 11, 8, 6, 23, 2, 9, 13, 4, 10, 12, 5, 7, 10, 11, 12, 13],\n",
444 | " 'Buy':[5, 8, 0, 8, 3, 0, 8, 9, 8, 1, 7, 2, 2, 9, 3, 8, 8, 9, 5, 7, 7, 9, 6, 7, 8, 4, 4, 3, 1, 4, 1, 5, 5, 2, 9, 5, 7, 8, 2, 4, 1, 4, 0, 1, 0, 8, 2, 4, 0, 0, 0, 1, 2, 2, 2, 7, 5, 1, 9, 9, 8, 1, 7, 2, 5, 2, 4, 9, 1, 6, 3, 0, 7, 5, 2, 3, 1, 8, 6, 4],\n",
445 | " 'Read':[0, 8, 0, 5, 4, 9, 6, 9, 1, 3, 3, 3, 8, 0, 9, 8, 6, 0, 5, 6, 1, 7, 7, 5, 7, 2, 0, 5, 1, 7, 7, 4, 6, 5, 3, 1, 7, 6, 0, 4, 0, 9, 5, 9, 2, 3, 5, 2, 5, 2, 9, 1, 1, 7, 9, 3, 0, 4, 4, 3, 8, 8, 8, 2, 3, 7, 1, 8, 6, 1, 7, 0, 3, 2, 5, 3, 8, 6, 9, 7] \n",
446 | "})"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {
452 | "deletable": true,
453 | "editable": true
454 | },
455 | "source": [
456 | "# Importance of centering variables for interaction\n",
457 | "The interaction effect can be **VERY** different if you don't center your variables"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {
464 | "ExecuteTime": {
465 | "end_time": "2017-02-08T11:08:51.761113",
466 | "start_time": "2017-02-08T11:08:51.752846"
467 | },
468 | "collapsed": false,
469 | "deletable": true,
470 | "editable": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "df['Interaction'] = df.Read*df.Buy\n",
475 | "np.corrcoef(df.Buy,df.Interaction)"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": null,
481 | "metadata": {
482 | "ExecuteTime": {
483 | "end_time": "2017-02-08T11:09:41.621877",
484 | "start_time": "2017-02-08T11:09:41.462656"
485 | },
486 | "collapsed": false,
487 | "deletable": true,
488 | "editable": true
489 | },
490 | "outputs": [],
491 | "source": [
492 | "plt.scatter(df.Buy,df.Interaction)\n",
493 |     "plt.xlabel('Buy')\n",
494 |     "plt.ylabel('Read * Buy')"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": null,
500 | "metadata": {
501 | "ExecuteTime": {
502 | "end_time": "2017-02-08T11:08:59.826515",
503 | "start_time": "2017-02-08T11:08:59.816012"
504 | },
505 | "collapsed": false,
506 | "deletable": true,
507 | "editable": true
508 | },
509 | "outputs": [],
510 | "source": [
511 | "center = lambda x: (x - x.mean())\n",
512 | "df[['Read_Centered','Buy_Centered']] = df[['Read','Buy']].apply(center)\n",
513 | "df['Interaction_Centered'] = df['Read_Centered'] * df['Buy_Centered']\n",
514 | "np.corrcoef(df.Buy,df.Interaction_Centered)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "ExecuteTime": {
522 | "end_time": "2017-02-08T11:09:01.142976",
523 | "start_time": "2017-02-08T11:09:00.976403"
524 | },
525 | "collapsed": false,
526 | "deletable": true,
527 | "editable": true
528 | },
529 | "outputs": [],
530 | "source": [
531 | "plt.scatter(df.Buy,df.Interaction_Centered)\n",
532 |     "plt.xlabel('Buy')\n",
533 |     "plt.ylabel('Read * Buy (centered)')"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {
540 | "ExecuteTime": {
541 | "end_time": "2017-02-08T11:09:05.225235",
542 | "start_time": "2017-02-08T11:09:05.208483"
543 | },
544 | "collapsed": false
545 | },
546 | "outputs": [],
547 | "source": [
548 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction\", data = df).fit()\n",
549 | "print mod.summary()"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": null,
555 | "metadata": {
556 | "ExecuteTime": {
557 | "end_time": "2017-02-08T11:09:05.834256",
558 | "start_time": "2017-02-08T11:09:05.816079"
559 | },
560 | "collapsed": false,
561 | "deletable": true,
562 | "editable": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction_Centered\", data = df).fit()\n",
567 | "print mod.summary()"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": null,
573 | "metadata": {
574 | "collapsed": true
575 | },
576 | "outputs": [],
577 | "source": []
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {
582 | "deletable": true,
583 | "editable": true
584 | },
585 | "source": [
586 | "# Exercises\n",
587 | "\n",
588 | "## Exercise 1 - Mediation\n",
589 | "\n",
590 | "For exercise 1, let's revisit the salary dataset. ('../Data/salary_exercise.csv') \n",
591 | "\n",
592 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n",
593 | "\n",
594 | "- sx = Sex, coded 1 for female and 0 for male\n",
595 | "- rk = Rank, coded\n",
596 | " - 1 for assistant professor,\n",
597 | " - 2 for associate professor, and\n",
598 | " - 3 for full professor\n",
599 | "- yr = Number of years in current rank\n",
600 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n",
601 | "- yd = Number of years since highest degree was earned\n",
602 | "- sl = Academic year salary, in dollars.\n",
603 | "\n",
604 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.\n",
605 | "\n",
606 | "**Make sure you check the data for missing values or other errors before beginning your analysis.**\n",
607 | "\n",
608 | "### Question:\n",
609 |     "Evaluate whether academic rank mediates the relationship between gender and salary"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": []
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "## Exercise 2 - Moderation\n",
626 | "\n",
627 | "For Exercise 2, we will use the alcohol dataset again ('../Data/student-mat.csv').\n",
628 | "\n",
629 | "Below is the information about the dataset. \n",
630 | "\n",
631 | "Attribute Information: \n",
632 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n",
633 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n",
634 | "3 age - student's age (numeric: from 15 to 22) \n",
635 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n",
636 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n",
637 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n",
638 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
639 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
640 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
641 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
642 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n",
643 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n",
644 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) \n",
645 | "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n",
646 | "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n",
647 | "16 schoolsup - extra educational support (binary: yes or no) \n",
648 | "17 famsup - family educational support (binary: yes or no) \n",
649 | "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n",
650 | "19 activities - extra-curricular activities (binary: yes or no) \n",
651 | "20 nursery - attended nursery school (binary: yes or no) \n",
652 | "21 higher - wants to take higher education (binary: yes or no) \n",
653 | "22 internet - Internet access at home (binary: yes or no) \n",
654 | "23 romantic - with a romantic relationship (binary: yes or no) \n",
655 | "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n",
656 | "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n",
657 | "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n",
658 | "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
659 | "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
660 | "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n",
661 | "30 absences - number of school absences (numeric: from 0 to 93) \n",
662 | "31 G1 - first period grade (numeric: from 0 to 20) \n",
663 |     "32 G2 - second period grade (numeric: from 0 to 20) \n",
664 |     "33 G3 - final grade (numeric: from 0 to 20, output target) \n",
665 | "\n",
666 | "### Question:\n",
667 | "\n",
668 | "The amount you go out seems to be associated with increased daily alcohol consumption. Does being in a relationship dampen this association? Are there any other variables that appear to moderate this relationship?\n"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": null,
674 | "metadata": {
675 | "collapsed": true
676 | },
677 | "outputs": [],
678 | "source": []
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": null,
683 | "metadata": {
684 | "collapsed": true
685 | },
686 | "outputs": [],
687 | "source": []
688 | }
689 | ],
690 | "metadata": {
691 | "anaconda-cloud": {},
692 | "kernelspec": {
693 | "display_name": "Python [default]",
694 | "language": "python",
695 | "name": "python2"
696 | },
697 | "language_info": {
698 | "codemirror_mode": {
699 | "name": "ipython",
700 | "version": 2
701 | },
702 | "file_extension": ".py",
703 | "mimetype": "text/x-python",
704 | "name": "python",
705 | "nbconvert_exporter": "python",
706 | "pygments_lexer": "ipython2",
707 | "version": "2.7.13"
708 | },
709 | "toc": {
710 | "nav_menu": {
711 | "height": "468px",
712 | "width": "252px"
713 | },
714 | "navigate_menu": true,
715 | "number_sections": true,
716 | "sideBar": true,
717 | "threshold": 4,
718 | "toc_cell": false,
719 | "toc_section_display": "block",
720 | "toc_window_display": false
721 | }
722 | },
723 | "nbformat": 4,
724 | "nbformat_minor": 0
725 | }
726 |
--------------------------------------------------------------------------------
/Notebooks/5_Prediction_&_Regularization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic Regression Analysis\n",
8 | "\n",
9 | "*Written by Jin Cheong & Luke Chang*\n",
10 | "\n",
11 | "In this lab we are going to learn about more advanced topics in regression analysis.\n",
12 | "- How to use link functions to perform regressions on other types of data such as categorical variables\n",
13 | "- How to perform cross-validation to assess degree of overfitting\n",
14 | "- How to deal with multicollinearity with regularization. \n",
15 | "\n",
16 | "After the tutorial you will have the chance to apply the methods on a separate dataset. \n",
17 | "\n",
18 | "\n",
19 | "## Introduction to logistic regression \n",
20 | "\n",
21 | "A logistic regression, also called a logit model, is used to model outcome variables that are binary. Examples of such variables are \"hit versus miss,\" \"success versus fail,\" or \"male versus female.\" We can use a logistic regression to calculate the probability of an item being one or the other.\n",
22 | "\n",
23 | "In the logit model, the log odds of the outcome is modeled as a linear combination of the predictor variables. \n",
24 | "Lets recall that the simple linear regression had the following form :\n",
25 | "### $y = \\beta_0 + \\beta_1\\cdot x $\n",
26 | "\n",
27 | "We can extend this to a multiple linear regression with multiple features into the following form : \n",
28 | "### $y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$ \n",
29 | "\n",
30 | " \n",
31 | "In logistic regression, instead of estimating the __value__ of Y from Xs, we estimate the __probability__ of Y. For instance if our model tries to predict whether a person is male(Y=0) or female(Y=1), then a Y value of .8 would mean it has 80% chance of being a female.\n",
32 | "Formally, the linear regression model is passed into a sigmoid function. Here is the simple form with one predictor variable.\n",
33 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n",
34 | "\n",
35 | "We can easily extend this to multiple regression with n-number of predictors.\n",
36 | "\n",
37 | "\n",
38 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n)}}$\n",
39 | "\n",
40 | "Now let's see how we can achieve this in code. \n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "ExecuteTime": {
48 | "end_time": "2017-01-27T10:48:11.684432",
49 | "start_time": "2017-01-27T10:48:11.679536"
50 | },
51 | "collapsed": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "%matplotlib inline\n",
56 | "\n",
57 | "import numpy as np\n",
58 | "import pandas as pd\n",
59 | "import matplotlib.pyplot as plt\n",
60 | "import statsmodels.formula.api as smf\n",
61 | "import seaborn as sns\n",
62 | "from IPython.display import display, HTML"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Logistic regression analysis\n",
70 | "\n",
71 | "Let's start off by plotting a sigmoid function and add necessary modules."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "ExecuteTime": {
79 | "end_time": "2017-01-27T10:34:03.578429",
80 | "start_time": "2017-01-27T10:34:03.408176"
81 | },
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "# Plot a sigmoid function\n",
87 | "plt.figure(figsize=(5,5)) # open a figure base and determine the size (width, height)\n",
88 | "\n",
89 | "def sigmoid(t): # Define the sigmoid function\n",
90 | " return (1/(1 + np.e**(-t))) \n",
91 | "\n",
92 | "plot_range = np.arange(-6, 6, 0.1) # range of x values to plot\n",
93 | "y_values = sigmoid(plot_range)\n",
94 | "\n",
95 | "# Plot curve\n",
96 | "plt.plot(plot_range, # X-axis range\n",
97 | " y_values, # Predicted values\n",
98 | " color=\"red\",linewidth=3)\n",
99 | "plt.title(\"Example of a sigmoid function\",fontsize=18)\n",
100 | "plt.ylabel(\"Probability\",fontsize=16)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 |     "Now we will load our sample data and create an 'Alcabuse' variable from 'Dalc' and 'Walc' (weekday and weekend alcohol consumption level, respectively) and make it a binary variable by calling it alcohol abuse if a student scores 8 or more on drinking alcohol in a week. The dataset was retrieved from [here](http://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION#) but slightly modified to better illustrate our purpose."
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "ExecuteTime": {
115 | "end_time": "2017-01-27T10:34:28.969833",
116 | "start_time": "2017-01-27T10:34:28.958171"
117 | },
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "# Load the data \n",
123 | "df = pd.read_csv('../Data/student-mat.csv')\n",
124 | "df['Alcabuse']=0\n",
125 | "df.loc[df['Dalc']+df['Walc'] >=8,'Alcabuse']=1 # Let's call it abuse if more than 8 alcohol consumption per week"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "The hypothesis we want to test is : __\"A student is more likely to abuse alcohol if he has more free time.\"__ \n",
133 |     "'Alcabuse' will be our response variable Y, a binary indicator of alcohol abuse (1 if weekly consumption is 8 or more). \n",
134 | "'freetime' will be our predictor variable X, measuring the degree of free time.\n",
135 | "\n",
136 | "We will use the logit function in the statsmodels package. "
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "ExecuteTime": {
144 | "end_time": "2017-01-27T10:34:38.993233",
145 | "start_time": "2017-01-27T10:34:38.969608"
146 | },
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "logm = smf.logit('Alcabuse ~ freetime',data=df).fit()\n",
152 | "print(logm.summary())"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Interpreting logistic model coefficients for prediction\n",
160 | "\n",
161 | "We ran our first logit model but we cannot interpret the coefficients like we did in the linear regression model. Because we applied a logit transform, the coefficients are in log-odds. \n",
162 | "To make them more interpretable we can exponentiate the coefficients and interpret them as either probability or odds-ratios. \n",
163 | "\n",
164 | "### Interpreting coefficients as probabilities\n",
165 | "To change our log-odds coefficients into probabilities we pass the log-odds value of Y into a sigmoid function.\n",
166 | "For instance, let's see what the probability is for a student to be an alcohol abuser if he had a level 5 of free time (x=5). \n",
167 | "\n",
168 | "$x = 5$ \n",
169 | "$\\beta_0 = -7.6262$ \n",
170 | "$\\beta_1 = 1.6416$ \n",
171 | "## $ Y = \\beta_0 + \\beta_1\\cdot x = -7.6262 + 1.6416 \\cdot 5$ \n",
172 | "## $P(Y) = \\frac{1}{1 + e^{-(Y)}} = \\frac{1}{1 + e^{-(-7.6262 + 1.6416 \\cdot 5)}}$\n",
173 | "\n",
174 | "We implement this with the following code by using the 'sigmoid' function as defined earlier in this module.\n",
175 | "We also plot the fit of the model."
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "ExecuteTime": {
183 | "end_time": "2017-01-27T10:35:53.052903",
184 | "start_time": "2017-01-27T10:35:53.043411"
185 | },
186 | "collapsed": false,
187 | "scrolled": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "x = 5\n",
192 | "print('x = %i' %x) # x\n",
193 | "print('b0 = %f' %logm.params[0]) # b0\n",
194 | "print('b1 = %f' %logm.params[1]) # b1\n",
195 | "palcabuse = sigmoid(logm.params[0] + logm.params[1] * x)\n",
196 | "print('palcabuse = %f' %palcabuse)\n",
197 | "print('If a student has %i hours of free time per week, then he has %.2f %% probability of alcohol abuse' % (x,palcabuse*100))\n",
198 | "# Note that we had to use double percent sign %% to print % \n",
199 | "# because a single % is reserved for printing inline numbers as in %i or %.2f"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "### Plotting the predictions of the model"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "ExecuteTime": {
214 | "end_time": "2017-01-27T10:37:07.982811",
215 | "start_time": "2017-01-27T10:37:07.767818"
216 | },
217 | "collapsed": false
218 | },
219 | "outputs": [],
220 | "source": [
221 | "# Plot model fit\n",
222 | "xs = np.arange(min(df.freetime)-1,max(df.freetime)+1,0.1)\n",
223 | "ys = np.array([sigmoid(logm.params[0] + logm.params[1] * x) for x in xs])\n",
224 | "plt.figure(figsize=(8,4))\n",
225 | "plt.plot(xs,ys,linewidth=3)\n",
226 | "jitter = np.random.random(len(df.Alcabuse))/5\n",
227 | "ys_outcomes = np.abs((df.Alcabuse==1)-0.01-jitter)\n",
228 | "alpha = 0.7\n",
229 | "plt.plot(df.freetime[df.Alcabuse == 1], ys_outcomes[df.Alcabuse ==1], 'b.', alpha=alpha)\n",
230 | "plt.plot(df.freetime[df.Alcabuse == 0], ys_outcomes[df.Alcabuse ==0], 'r.', alpha=alpha)\n",
231 | "plt.xlabel(\"Freetime\",fontsize=16)\n",
232 | "plt.ylabel(\"Probability of being an alcohol abuser\",fontsize=16)\n",
233 | "plt.title(\"Predicting alcohol abuse by free time\",fontsize=18)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### Interpreting coefficients as odds\n",
241 | "In some cases it could be more useful to consider the odds of an event happening. \n",
242 |     "For example, we might want to be able to make statements such as \"If a person smokes, he is 10 times more likely to get lung cancer compared to a person who doesn't smoke.\" The odds of an event happening are defined by\n",
243 | "\n",
244 | "## $odds = \\frac{P(\\text{event})}{P(\\text{no event})} = \\frac{P(\\text{Y})}{P(\\text{not Y})}$\n",
245 | "while $P(\\text{Y}) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n",
246 | "\n",
247 | "and $P(\\text{not Y}) = 1 - P(\\text{Y})$\n",
248 | "\n",
249 | "If we want to look at how much the odds change for a unit increase in free time, we can use the following \n",
250 | "\n",
251 | "## $\\triangle odds = \\frac{\\text{odds after a unit change in predictor}}{\\text{original odds}} $\n",
252 | "\n",
253 | "So, if we want to calculate the change in odds ratios of being an alcohol abuser when free time increases from 4 to 5, we can calculate the following: \n",
254 | "## $\\triangle odds = \\frac{\\text{odds(Y) when x = 5}}{\\text{odds(Y) when x = 4}} = \\frac{\\frac{\\text{P(Y) when x = 5}}{\\text{1 - (P(Y) when x = 5)}}}{\\frac{\\text{P(Y) when x = 4}}{\\text{1 - (P(Y) when x = 4)}}}$\n"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "ExecuteTime": {
262 | "end_time": "2017-01-27T10:39:10.389767",
263 | "start_time": "2017-01-27T10:39:10.383370"
264 | },
265 | "collapsed": false
266 | },
267 | "outputs": [],
268 | "source": [
269 | "odd1 = sigmoid(logm.params[0] + logm.params[1] * 5) / (1-sigmoid(logm.params[0] + logm.params[1] * 5))\n",
270 | "odd2 = sigmoid(logm.params[0] + logm.params[1] * 4) / (1-sigmoid(logm.params[0] + logm.params[1] * 4))\n",
271 | "dodds = odd1/odd2\n",
272 | "print('A student with 5hrs of free time compared to a student with 4 is %.2f times more likely to be an alcohol abuser' %dodds)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Cross-validation \n",
280 | "\n",
281 |     "The best way to compare accuracy of models and see how well your model generalizes is to train (or fit) your model on a training dataset then test it on a separate test dataset. If you train and test your model on the same dataset (as we did above) you are likely to get a positively biased result. The true generalizability can only be observed when you test your model on a separate hold-out test dataset on which the model was not trained. A simple way to do cross-validation is to split the dataset before you begin your analysis. \n",
282 | "\n",
283 | "There are many ways to split the dataset but for simplicity we will use the train_test_split function in the module sci-kit learn. \n",
284 |     "We will split the data so that the training dataset consists of 60% of the data and the test dataset consists of 40% of the data.\n",
285 | "\n",
286 | "### 1. Split the data into training and testing datasets"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "ExecuteTime": {
294 | "end_time": "2017-01-27T10:39:55.750103",
295 | "start_time": "2017-01-27T10:39:55.687554"
296 | },
297 | "collapsed": false
298 | },
299 | "outputs": [],
300 | "source": [
301 | "from sklearn.cross_validation import train_test_split\n",
302 | "np.random.seed(12345678) # set the random number generator seed to get consistent results\n",
303 | "trainratio = .6\n",
304 | "train, test = train_test_split(df, train_size=trainratio, random_state=1)\n",
305 | "print('train-test ratio %.2f' %(float(len(train))/(len(train)+len(test))))"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### 2. Fit the model on the training dataset\n",
313 | "Now we fit a new model and save it to logmt. \n",
314 | "We will try to predict 'Alcabuse' with 'freetime' and 'goout' \n",
315 | "Note that we specify what dataset we are using by setting data = train. "
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "ExecuteTime": {
323 | "end_time": "2017-01-27T10:42:00.202890",
324 | "start_time": "2017-01-27T10:42:00.184033"
325 | },
326 | "collapsed": false
327 | },
328 | "outputs": [],
329 | "source": [
330 | "# We run the model on the train dataset\n",
331 | "logmt = smf.logit('Alcabuse ~ freetime + goout',data=train).fit()\n",
332 | "print(logmt.summary())"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "### 3. Calculate predictions on the test dataset\n",
340 | "We now use the model with coefficients estimated from the training dataset (logmt) and plug in the predictor variable values from the test dataset by using the .predict method. This method outputs the probabilities for us so we don't have to apply separate sigmoid transform. Therefore the output are the probabilities predicted with our model on the test dataset."
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "ExecuteTime": {
348 | "end_time": "2017-01-27T10:42:10.419775",
349 | "start_time": "2017-01-27T10:42:10.410035"
350 | },
351 | "collapsed": false
352 | },
353 | "outputs": [],
354 | "source": [
355 | "ytestpred = logmt.predict(test)\n",
356 | "np.round(ytestpred,2)"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "### 4. Calculate accuracy of the predictions\n",
364 | "To calculate the accuracy of the model we need to specify the cutoff for the probabilities to say whether one should be classified as an alcohol abuser or not. For simplicity, let's say that we'll classify the person as an alcohol abuser if the estimated probability is greater than 50%. \n",
365 | "\n",
366 | "We will then calculate the accuracy of the model in classifying alcohol abuse for the training data and the test data separately"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {
373 | "ExecuteTime": {
374 | "end_time": "2017-01-27T10:42:29.512250",
375 | "start_time": "2017-01-27T10:42:29.479882"
376 | },
377 | "collapsed": false
378 | },
379 | "outputs": [],
380 | "source": [
381 | "cutoff = .5\n",
382 | "\n",
383 | "# Calculate accuracy for train\n",
384 | "ytrainpred = logmt.predict(train)\n",
385 | "ytrainpred[ytrainpred>=cutoff] =1 \n",
386 | "ytrainpred[ytrainpred=cutoff] =1 \n",
392 | "ytestpred[ytestpred=cutoff] =1 \n",
475 | "ytestpred[ytestpred\n",
501 | "\n",
502 | "Regularization attempts to solve this problem by introducting a loss function that penalizes the model for each additional features added to the model. The AIC / BIC values we mentioned in the first session is a form of regularization in comparing models. \n",
503 | "\n",
504 | "In this section we will focus on feature selection regularization methods Lasso and Ridge regressions.\n",
505 | "\n",
506 | "## Lasso regression\n",
507 | "In short, [Lasso](http://stats.stackexchange.com/questions/17251/what-is-the-lasso-in-regression-analysis) is a feature selection method that reduces the number of features to use in a regression. \n",
508 | "This is useful if you have a lot of variables that are correlated or you have more variables than observations. \n",
509 | "[Here](http://studentlife.cs.dartmouth.edu/smartgpa.pdf) is a study from Dartmouth that used Lasso on the Student Life Dataset.\n",
510 | "\n",
511 | "In our example, we will try to find which variables can predict the level of alcohol consumption 'AlcSum'. "
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "metadata": {
518 | "ExecuteTime": {
519 | "end_time": "2017-01-27T11:05:03.397385",
520 | "start_time": "2017-01-27T11:05:03.393882"
521 | },
522 | "collapsed": true
523 | },
524 | "outputs": [],
525 | "source": [
526 | "df['AlcSum']=df['Dalc']+df['Walc']"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {
533 | "ExecuteTime": {
534 | "end_time": "2017-01-27T11:05:06.581723",
535 | "start_time": "2017-01-27T11:05:06.577263"
536 | },
537 | "collapsed": false
538 | },
539 | "outputs": [],
540 | "source": [
541 | "# We have a lot of features to choose from\n",
542 | "df.keys()"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {
549 | "ExecuteTime": {
550 | "end_time": "2017-01-27T11:05:19.241902",
551 | "start_time": "2017-01-27T11:05:18.929428"
552 | },
553 | "collapsed": false
554 | },
555 | "outputs": [],
556 | "source": [
557 | "# Make a model with all the feature variables\n",
558 | "formula = 'AlcSum ~ school+sex+age+address+famsize+Pstatus+Medu+\\\n",
559 | " Fedu+Mjob+Fjob+reason+guardian+traveltime+\\\n",
560 | " studytime+failures+schoolsup+famsup+paid+activities+nursery+higher+\\\n",
561 | " internet+romantic+famrel+freetime+goout+health+absences+G1+G2+G3'\n",
562 | "\n",
563 | "# Generate a design matrix with the formula\n",
564 | "# This automatically generates separate regressors for categorical variables.\n",
565 | "from patsy import dmatrices\n",
566 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
567 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n",
568 | "penalty = .036\n",
569 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=1.0)\n",
570 | "print regm.summary()"
571 | ]
572 | },
573 | {
574 | "cell_type": "markdown",
575 | "metadata": {},
576 | "source": [
577 | "### How do we determine the penalty value? \n",
578 | "Essentially, we want to select the regularization parameter by identifying the one from a set of possible values (e.g. grid search) that results in the best fit of the model to the data. However, it is important to note that it is easy to introduce bias into this process by trying a bunch of alphas and selecting the one that works best. This can lead to optimistic evaluations of how well your model works.\n",
579 | "\n",
580 | "Cross-validation is an ideal method to deal with this. We can use cross-validation to select the alpha while adding minimal bias to the overall model prediction.\n",
581 | "\n",
582 | "There are several different metrics of model fit to use as a criterion variable. Log-likelihood or mean squared error are the two most common depending on how you are estimating the parameters (i.e., maximizing log-likelihood or minimizizing squared error). The Akaike information criterion (AIC) and the Bayes Information criterion (BIC) are two common metrics that apply different penalties to these metrices for the number of free parameters in your model.\n",
583 | "\n",
584 | "Here we will demonstrate using both to select an optimal value of the regularization parameter alpha of the Lasso estimator from an example provided by [scikit-learn](http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html). For cross-validation, we use 20-fold with 2 algorithms to compute the Lasso path: coordinate descent, as implemented by the LassoCV class.\n"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": null,
590 | "metadata": {
591 | "ExecuteTime": {
592 | "end_time": "2017-01-27T11:11:14.478267",
593 | "start_time": "2017-01-27T11:11:14.157680"
594 | },
595 | "collapsed": false
596 | },
597 | "outputs": [],
598 | "source": [
599 | "from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC\n",
600 | "import time\n",
601 | "\n",
602 | "# LassoLarsIC: least angle regression with BIC/AIC criterion\n",
603 | "model_bic = LassoLarsIC(criterion='bic')\n",
604 | "model_bic.fit(X, y)\n",
605 | "alpha_bic_ = model_bic.alpha_\n",
606 | "\n",
607 | "model_aic = LassoLarsIC(criterion='aic')\n",
608 | "model_aic.fit(X, y)\n",
609 | "alpha_aic_ = model_aic.alpha_\n",
610 | "\n",
611 | "def plot_ic_criterion(model, name, value, color):\n",
612 | " alpha_ = model.alpha_\n",
613 | " alphas_ = model.alphas_\n",
614 | " criterion_ = model.criterion_\n",
615 | " plt.plot(-np.log10(alphas_), criterion_, '--', color=color,\n",
616 | " linewidth=3, label='%s criterion' % name)\n",
617 | " plt.axvline(-np.log10(alpha_), color=color, linewidth=3,\n",
618 | " label='%s estimate alpha: %.3f' %(name,value) )\n",
619 | " plt.xlabel('-log(alpha)',fontsize=16)\n",
620 | " plt.ylabel('Model Fit Criterion',fontsize=16)\n",
621 | "\n",
622 | "plt.figure(figsize=(15,5))\n",
623 | "plot_ic_criterion(model_aic, 'AIC', alpha_aic_ , 'b')\n",
624 | "plot_ic_criterion(model_bic, 'BIC', alpha_bic_ , 'r')\n",
625 | "plt.legend()\n",
626 | "plt.title('Information-criterion for model selection',fontsize=18)"
627 | ]
628 | },
629 | {
630 | "cell_type": "markdown",
631 | "metadata": {},
632 | "source": [
633 | "Now let's create a plot showing how well each alpha value performs across cross-validation folds"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": null,
639 | "metadata": {
640 | "ExecuteTime": {
641 | "end_time": "2017-01-27T11:23:24.140077",
642 | "start_time": "2017-01-27T11:23:23.747748"
643 | },
644 | "collapsed": false
645 | },
646 | "outputs": [],
647 | "source": [
648 | "# LassoCV: coordinate descent\n",
649 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
650 | "\n",
651 | "# Compute paths\n",
652 | "print(\"Computing regularization path using the coordinate descent lasso...\")\n",
653 | "t1 = time.time()\n",
654 | "model = LassoCV(cv=5).fit(X, y)\n",
655 | "t_lasso_cv = time.time() - t1\n",
656 | "\n",
657 | "# Display results\n",
658 | "m_log_alphas = -np.log10(model.alphas_)\n",
659 | "\n",
660 | "# Display results\n",
661 | "plt.figure(figsize=(15,5))\n",
662 | "plt.plot(m_log_alphas, model.mse_path_, ':')\n",
663 | "plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',\n",
664 | " label='Average across the folds', linewidth=2)\n",
665 | "plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',\n",
666 | " label='CV estimate alpha: %.3f' %model.alpha_ )\n",
667 | "plt.legend()\n",
668 | "plt.xlabel('-log(alpha)',fontsize=16)\n",
669 | "plt.ylabel('Mean square error',fontsize=16)\n",
670 | "plt.title('Mean square error on each fold: coordinate descent',fontsize=18)\n",
671 | "plt.axis('tight')"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "## Ridge regression\n",
679 | "The goal of the ridge function is to choose a penalty λ for which the coefficients are not rapidly changing and have “sensible” signs. It is especially useful when data suffers from multicollinearity, that is some of your predictor variables are highly correlated. Unlike LASSO, ridge does not produce a sparse solution, but rather shrinks variables that are highly collinear towards zero."
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": null,
685 | "metadata": {
686 | "ExecuteTime": {
687 | "end_time": "2017-01-27T11:26:15.846904",
688 | "start_time": "2017-01-27T11:26:14.695671"
689 | },
690 | "collapsed": false,
691 | "scrolled": false
692 | },
693 | "outputs": [],
694 | "source": [
695 | "from sklearn import linear_model\n",
696 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
697 | "n_alphas = 200\n",
698 | "alphas = np.logspace(-2, 6, n_alphas)\n",
699 | "\n",
700 | "# Get optimal lambda\n",
701 | "clf = linear_model.RidgeCV(alphas=alphas)\n",
702 | "clf.fit(X,y)\n",
703 | "aestim = clf.alpha_ # Estimated regularization param\n",
704 | "\n",
705 | "# Get paths\n",
706 | "clf = linear_model.Ridge(fit_intercept=False)\n",
707 | "coefs = []\n",
708 | "for a in alphas:\n",
709 | " clf.set_params(alpha=a)\n",
710 | " clf.fit(X, y)\n",
711 | " coefs.append(clf.coef_)\n",
712 | "\n",
713 | "###############################################################################\n",
714 | "# Display results\n",
715 | "plt.figure(figsize=(15,8))\n",
716 | "\n",
717 | "ax = plt.gca()\n",
718 | "ax.plot(alphas, np.squeeze(coefs))\n",
719 | "ax.set_xscale('log')\n",
720 | "plt.axvline(aestim, linestyle='--', color='k', label='alpha: CV estimate')\n",
721 | "plt.legend()\n",
722 | "plt.xlabel('Lambda',fontsize=16)\n",
723 | "plt.ylabel('Weights',fontsize=16)\n",
724 | "plt.title('Ridge coefficients as a function of the regularization',fontsize=18)\n",
725 | "plt.axis('tight')"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {},
731 | "source": [
732 | "The points on the left vertical axis (the left ends of the lines) are the ordinary least squares regression values.\n",
733 | "These occur for k equal zero. As k is increased, the values of the regression estimates change, often wildly at first.\n",
734 | "At some point, the coefficients seem to settle down and then gradually drift towards zero.\n",
735 | "\n",
736 | "The task of the ridge regression analyst is to determine at what value of k these coefficients are at their stable\n",
737 | "values. A vertical line is drawn at the value selected for reporting purposes. It is anticipated that you would run\n",
738 | "the program several times until an appropriate value of k is determined. In our case, the selected alpha value is indicated by a vertical line at which a = 65.93. The smaller the value of k, the smaller the amount of bias that is included in the estimates."
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": null,
744 | "metadata": {
745 | "ExecuteTime": {
746 | "end_time": "2017-01-27T11:26:48.340880",
747 | "start_time": "2017-01-27T11:26:48.333706"
748 | },
749 | "collapsed": false
750 | },
751 | "outputs": [],
752 | "source": [
753 | "clf = linear_model.Ridge(fit_intercept=False)\n",
754 | "a = aestim\n",
755 | "clf.set_params(alpha=a)\n",
756 | "clf.fit(X, y)\n",
757 | "np.round(clf.coef_,3)"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {
764 | "ExecuteTime": {
765 | "end_time": "2017-01-27T11:26:56.928052",
766 | "start_time": "2017-01-27T11:26:56.884641"
767 | },
768 | "collapsed": false
769 | },
770 | "outputs": [],
771 | "source": [
772 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n",
773 | "penalty = aestim\n",
774 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=0)\n",
775 | "print regm.summary()"
776 | ]
777 | },
778 | {
779 | "cell_type": "markdown",
780 | "metadata": {
781 | "collapsed": true
782 | },
783 | "source": [
784 | "# Individual exercise\n",
785 | "Below is the information about the dataset. \n",
786 | "\n",
787 | "Attribute Information: \n",
788 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n",
789 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n",
790 | "3 age - student's age (numeric: from 15 to 22) \n",
791 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n",
792 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n",
793 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n",
794 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
795 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
796 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
797 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
798 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n",
799 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n",
800 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) \n",
801 | "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n",
802 | "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n",
803 | "16 schoolsup - extra educational support (binary: yes or no) \n",
804 | "17 famsup - family educational support (binary: yes or no) \n",
805 | "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n",
806 | "19 activities - extra-curricular activities (binary: yes or no) \n",
807 | "20 nursery - attended nursery school (binary: yes or no) \n",
808 | "21 higher - wants to take higher education (binary: yes or no) \n",
809 | "22 internet - Internet access at home (binary: yes or no) \n",
810 | "23 romantic - with a romantic relationship (binary: yes or no) \n",
811 | "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n",
812 | "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n",
813 | "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n",
814 | "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
815 | "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
816 | "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n",
817 | "30 absences - number of school absences (numeric: from 0 to 93) \n",
818 | "31 G1 - first period grade (numeric: from 0 to 20) \n",
819 | "31 G2 - second period grade (numeric: from 0 to 20) \n",
820 | "32 G3 - final grade (numeric: from 0 to 20, output target) \n",
821 | "\n",
822 | "## Exercise 1\n",
823 | "Before you begin, you may want to replace all the 'yes' or 'no' responses in the dataframe to numerical values such as 1 or 0\n",
824 | "using the df.replace() method. \n",
825 | "\n",
826 | "Run a logistic model to see whether studytime predicts 'wanting to take higher education'. \n",
827 | "\n",
828 | "Based on the model, what is the probability that John wants to pursue higher education if his\n",
829 | "studytime value is 2 (he studies 2 hours/week)?\n",
830 | "\n",
831 | "Based on the model what is the probability that Laura wants to pursue higher education if her\n",
832 | "studytime value is 3 (she studies 5 hours per week)? \n",
833 | "\n",
834 | "How much more likely (in odds) is Laura likely to want to pursue higher education than John ?\n",
835 | "\n",
836 | "## Exercise 2\n",
837 | "Run a LASSO regression to find what variables predict studytime. \n",
838 | "Use the penalty parameter (alpha) estimated through cross validation of coordinate descent. \n",
839 | "What is your interpretation of the results? "
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": null,
845 | "metadata": {
846 | "collapsed": true
847 | },
848 | "outputs": [],
849 | "source": []
850 | }
851 | ],
852 | "metadata": {
853 | "anaconda-cloud": {},
854 | "kernelspec": {
855 | "display_name": "Python 2",
856 | "language": "python",
857 | "name": "python2"
858 | },
859 | "language_info": {
860 | "codemirror_mode": {
861 | "name": "ipython",
862 | "version": 2
863 | },
864 | "file_extension": ".py",
865 | "mimetype": "text/x-python",
866 | "name": "python",
867 | "nbconvert_exporter": "python",
868 | "pygments_lexer": "ipython2",
869 | "version": "2.7.13"
870 | },
871 | "toc": {
872 | "nav_menu": {
873 | "height": "512px",
874 | "width": "252px"
875 | },
876 | "navigate_menu": true,
877 | "number_sections": true,
878 | "sideBar": true,
879 | "threshold": 4,
880 | "toc_cell": false,
881 | "toc_section_display": "block",
882 | "toc_window_display": false
883 | }
884 | },
885 | "nbformat": 4,
886 | "nbformat_minor": 0
887 | }
888 |
--------------------------------------------------------------------------------
/Notebooks/6_Introduction_to_SocialNetworkAnalysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Social Network Analysis \n",
11 | "Written by Jin Cheong & Luke Chang\n"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "ExecuteTime": {
19 | "end_time": "2017-02-13T09:24:16.143566",
20 | "start_time": "2017-02-13T09:24:15.172556"
21 | },
22 | "collapsed": false,
23 | "deletable": true,
24 | "editable": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "%matplotlib inline\n",
29 | "\n",
30 | "import numpy as np\n",
31 | "import matplotlib.pyplot as plt\n",
32 | "try:\n",
33 | " import networkx as nx\n",
34 | "except:\n",
35 | " # Install NetworkX\n",
36 | " !pip install networkx"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "deletable": true,
43 | "editable": true
44 | },
45 | "source": [
46 | "# Primer to Network Analysis\n",
47 | "\n",
48 | "A network is made up of two main components: nodes and edges. \n",
49 | "\n",
50 | "The nodes are the individuals, words, or entities that compose a network, and the edges are the links that connect one node to another. \n",
51 | "\n",
52 | "Here is an example of a node and an edge.\n",
53 | "\n",
54 | "
\n",
55 | "\n",
56 | "Now we can try drawing our own network using the NetworkX package.\n",
57 | "Let's start off by initializing a Network, and call it G. "
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "ExecuteTime": {
65 | "end_time": "2017-02-13T09:24:16.148155",
66 | "start_time": "2017-02-13T09:24:16.145064"
67 | },
68 | "collapsed": true,
69 | "deletable": true,
70 | "editable": true
71 | },
72 | "outputs": [],
73 | "source": [
74 | "# Initialize Graph object\n",
75 | "G = nx.Graph()"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {
81 | "deletable": true,
82 | "editable": true
83 | },
84 | "source": [
85 | "Now we can add a node using the .add_node('nodename') method"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "ExecuteTime": {
93 | "end_time": "2017-02-13T09:24:16.291054",
94 | "start_time": "2017-02-13T09:24:16.149742"
95 | },
96 | "collapsed": false,
97 | "deletable": true,
98 | "editable": true
99 | },
100 | "outputs": [],
101 | "source": [
102 | "G = nx.Graph()\n",
103 | "G.add_node('Jin')\n",
104 | "G.add_node('Luke')\n",
105 | "G.add_node('Eshin')\n",
106 | "plt.figure(figsize=(5,3))\n",
107 | "nx.draw(G,with_labels=True,node_size=5000,font_size=20,alpha=.5)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "deletable": true,
114 | "editable": true
115 | },
116 | "source": [
117 | "Notice there are no connections between the two nodes because we haven't added any edges yet.\n",
118 | "\n",
119 | "To add edges between nodes, for example node1 and node 2, we can use the .add_edge('node1','node2') method on our graph."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "ExecuteTime": {
127 | "end_time": "2017-02-13T09:24:16.452812",
128 | "start_time": "2017-02-13T09:24:16.292498"
129 | },
130 | "collapsed": false,
131 | "deletable": true,
132 | "editable": true,
133 | "scrolled": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "G = nx.Graph()\n",
138 | "G.add_edge('Jin','Luke',weight=1)\n",
139 | "G.add_edge('Jin','Eshin',weight=1)\n",
140 | "G.add_edge('Luke','Eshin',weight=1)\n",
141 | "plt.figure(figsize=(5,3))\n",
142 | "pos = nx.spring_layout(G) # One way of specifiying a layout for nodes.\n",
143 | "nx.draw(G,pos,with_labels=True,node_size=5000,font_size=20,alpha=.5)"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "deletable": true,
150 | "editable": true
151 | },
152 | "source": [
153 | "Now you can see that we have an edge connecting Jin and Luke. \n",
154 | "\n",
155 | "There are several different types of graphs.\n",
156 | "\n",
157 | "1. **weighted** vs **unweighted** graphs\n",
158 | " - a graph is said to be **unweighted** if every connection is either 1 or 0\n",
159 | " - it is **weighted** if the edges take on other values indicating the strength of the relationship.\n",
160 | "\n",
161 | "\n",
162 | "2. **directed** vs **undirected** graphs\n",
163 | " - a graphs is **undirected** if the edges are symmetrical\n",
164 | " - it is **directed** if the edges are asymmetrical, meaning that each edge can indicate the direction of the relationship\n",
165 | "\n",
166 | "For simplicity, today we will only cover unweighted and undirected graphs. "
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "There are multiple ways to indicate nodes and edges in a graph. \n",
174 | "As illustrated above, we can manually add each node and edge. This approach is fine for small graphs, but won't scale well. It is also possible to add nodes is to use a python dictionary.\n",
175 | "The key is the node and the values indicate what that key(node) connects to. "
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "ExecuteTime": {
183 | "end_time": "2017-02-13T09:24:16.903054",
184 | "start_time": "2017-02-13T09:24:16.454433"
185 | },
186 | "collapsed": false,
187 | "deletable": true,
188 | "editable": true
189 | },
190 | "outputs": [],
191 | "source": [
192 | "d = {'Jin':['Luke','Eshin','Antonia','Andy','Sophie','Rob','Charlie','Vanessa'], \n",
193 | " 'Luke':['Eshin','Antonia','Seth','Andy','Sophie','Rob','Charlie','Vanessa'],\n",
194 | " 'Antonia':['Luke','Jin','Eshin','Seth','Andy'],\n",
195 | " 'Eshin':['Heidi','Antonia','Jin','Sam','Andy'],\n",
196 | " 'Sophie':['Rob','Vanessa'],\n",
197 | " 'Rob':['Sophie','Vanessa']}\n",
198 | "G = nx.Graph(d)\n",
199 | "plt.figure(figsize=(15,8))\n",
200 | "np.random.seed(2) # Just to keep things same\n",
201 | "pos = pos = nx.fruchterman_reingold_layout(G) # Another way of specifiying a layout for nodes.\n",
202 | "nx.draw(G,with_labels=True,node_size=1500,font_size=20,alpha=.3,width=2)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "deletable": true,
209 | "editable": true
210 | },
211 | "source": [
212 | "# What should we look for in a network ? \n",
213 | "\n",
214 | "## Micromeasures\n",
215 | "Centrality measures track the importance of a node derived from its position in the network.\n",
216 | "\n",
217 | "There are four main groups of centrality depending on the type of statistics used:\n",
218 | "1. degree - how connected a node is\n",
219 | "2. betweenness - how important a node is in connecting other nodes\n",
220 | "3. closeness - how easily a node can reach other nodes\n",
221 | "4. neighbors' characteristics - how important, central, or influential a node's neighbors are. \n",
222 | "\n",
223 | "
\n",
224 | "\n",
225 | "Reference:\n",
226 | "Social and Economic Networks by Matthew O. Jackson. Princeton University Press. \n",
227 | "Picture: [Wikipedia article](https://en.wikipedia.org/wiki/Centrality) \n",
228 | "Other examples: http://www.orgnet.com/sna.html \n",
229 | "\n",
230 | "## Degree Centrality\n",
231 | "\n",
232 | "The degree centrality measures the number of edges of a given node. \n",
233 | "\n",
234 | "In other words, it measures how connected a node is."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {
241 | "ExecuteTime": {
242 | "end_time": "2017-02-13T09:24:17.296820",
243 | "start_time": "2017-02-13T09:24:16.904690"
244 | },
245 | "collapsed": false,
246 | "deletable": true,
247 | "editable": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "def print_network(val_map,title=None):\n",
252 | " print('\\x1b[1;31m'+title+'\\x1b[0m')\n",
253 | " for k, v in val_map.iteritems():\n",
254 | " print k.ljust(15), str(round(v,3)).ljust(30)\n",
255 | "\n",
256 | "def plot_network(val_map, title=None):\n",
257 | " values = [val_map.get(node, 0.25) for node in G.nodes()]\n",
258 | " plt.figure(figsize=(12,5))\n",
259 | " np.random.seed(2)\n",
260 | "# pos = nx.spring_layout(G)\n",
261 | " pos = nx.fruchterman_reingold_layout(G,dim=2)\n",
262 | " ec = nx.draw_networkx_edges(G,pos,alpha=.2)\n",
263 | " nc = nx.draw_networkx_nodes(G,pos,node_size=1000,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n",
264 | " nx.draw_networkx_labels(G,pos,font_size=18)\n",
265 | " # nx.draw(G,pos,node_size=400,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n",
266 | " plt.colorbar(nc)\n",
267 | " plt.axis('off')\n",
268 | " plt.suptitle(title,fontsize=18)\n",
269 | " plt.show()\n",
270 | "\n",
271 | "# Get degree centrality values\n",
272 | "d = nx.degree_centrality(G)\n",
273 | "title = 'Degree Centrality Map'\n",
274 | "# print centrality values\n",
275 | "print_network(d,title)\n",
276 | "# plot graph with values\n",
277 | "plot_network(d,title)"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {
283 | "deletable": true,
284 | "editable": true
285 | },
286 | "source": [
287 | "Luke has the highest number of connections and therefore has the highest degree centrality. Jin is next, and then Eshin.\n",
288 | "\n",
289 | "# Betweenness Centrality\n",
290 | "Think of Between Centrality as a bottleneck or a broker of two separate networks. \n",
291 | "\n",
292 | "The measure is the fraction of the total number of shortest paths a node lie on between two other nodes.\n",
293 | "\n",
294 | "Here Eshin has the highest betweeness centrality because he connects the most different people."
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {
301 | "ExecuteTime": {
302 | "end_time": "2017-02-13T09:24:17.659747",
303 | "start_time": "2017-02-13T09:24:17.298352"
304 | },
305 | "collapsed": false,
306 | "deletable": true,
307 | "editable": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "d = nx.betweenness_centrality(G)\n",
312 | "title = \"Betweenness Centrality\"\n",
313 | "print_network(d,title)\n",
314 | "plot_network(d,title)"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {
320 | "deletable": true,
321 | "editable": true
322 | },
323 | "source": [
324 | "# Closeness Centrality\n",
325 | "The closeness centrality measures how close a given node is to any other node. \n",
326 | "This is measured by the average length of the shortest path between a node and all other nodes. \n",
327 | "Thus you have higher closeness centrality if you can get to everyone faster than others. "
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "ExecuteTime": {
335 | "end_time": "2017-02-13T09:24:18.015038",
336 | "start_time": "2017-02-13T09:24:17.661151"
337 | },
338 | "collapsed": false,
339 | "deletable": true,
340 | "editable": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "d = nx.closeness_centrality(G)\n",
345 | "title = \"Closeness Centrality\"\n",
346 | "print_network(d,title)\n",
347 | "plot_network(d,title)"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {
353 | "deletable": true,
354 | "editable": true
355 | },
356 | "source": [
357 | "# Eigenvector Centrality\n",
358 | "\n",
359 | "The underlying assumption of the Eigenvector centrality is that a node's importance is determined by how important its neighbors are. \n",
360 | "\n",
361 | "For instance, having more influential friends (betweeness centrality) can be more important than just having more number of friends (degree centrality). \n",
362 | "\n",
363 | "Now we can observe that Luke is back to being more important than Eshin (who had higher betweeness centrality), because Luke is friends with Eshin who also has high centrality."
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "ExecuteTime": {
371 | "end_time": "2017-02-13T09:24:18.378309",
372 | "start_time": "2017-02-13T09:24:18.016469"
373 | },
374 | "collapsed": false,
375 | "deletable": true,
376 | "editable": true
377 | },
378 | "outputs": [],
379 | "source": [
380 | "d = nx.eigenvector_centrality(G)\n",
381 | "title = \"Eigenvector Centrality\"\n",
382 | "print_network(d,title)\n",
383 | "plot_network(d,title)"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {
389 | "deletable": true,
390 | "editable": true
391 | },
392 | "source": [
393 | "## Macromeasures\n",
394 | "\n",
395 | "Macromeasures are useful to understand the network as a whole or to compare networks. \n",
396 | "\n",
397 | "We will cover three fundamental measures of the network structure.\n",
398 | "1. Degree distribution\n",
399 | "2. Average shortest path \n",
400 | "3. Clustering Coefficients\n",
401 | "\n",
402 | "## Degree Distribution\n",
403 | "One fundamental characteristic is the distribution of degrees, number of edges for each node, which we can easily plot. \n",
404 | "From this distribution we can discern attributes such as whether everyone is connected to everybody are there many lonely nodes and such."
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {
411 | "ExecuteTime": {
412 | "end_time": "2017-02-13T09:24:19.263314",
413 | "start_time": "2017-02-13T09:24:18.379752"
414 | },
415 | "collapsed": false,
416 | "deletable": true,
417 | "editable": true
418 | },
419 | "outputs": [],
420 | "source": [
421 | "degree_sequence=sorted([d for n,d in G.degree().iteritems()], reverse=True) # degree sequence\n",
422 | "\n",
423 | "G2=nx.complete_graph(5)\n",
424 | "degree_sequence2=sorted([d for n,d in G2.degree().iteritems()], reverse=True) # degree sequence\n",
425 | "\n",
426 | "fig, [(ax1,ax2),(ax3,ax4)] = plt.subplots(2,2,figsize=(15,10))\n",
427 | "ax1.hist(degree_sequence,bins=range(0,7),rwidth=.8)\n",
428 | "ax1.set_title(\"Degree Histogram for Graph 1\",fontsize=16)\n",
429 | "ax1.set_ylabel(\"Count\",fontsize=14)\n",
430 | "ax1.set_xlabel(\"Degree\",fontsize=14)\n",
431 | "ax1.set_xticks([d+0.4 for d in degree_sequence])\n",
432 | "ax1.set_xticklabels([d for d in degree_sequence])\n",
433 | "ax1.set_ylim((0,6))\n",
434 | "\n",
435 | "nx.draw(G,pos=nx.circular_layout(G),ax=ax2)\n",
436 | "ax2.set_title('Network 1: Graph of our class',fontsize=16)\n",
437 | "\n",
438 | "ax3.hist(degree_sequence2,bins=range(0,7),rwidth=.8)\n",
439 | "ax3.set_title(\"Degree Histogram for Complete Graph\",fontsize=16)\n",
440 | "ax3.set_ylabel(\"Count\",fontsize=14)\n",
441 | "ax3.set_xlabel(\"Degree\",fontsize=14)\n",
442 | "ax3.set_xticks([d+0.4 for d in degree_sequence2])\n",
443 | "ax3.set_xticklabels([d for d in degree_sequence2])\n",
444 | "ax3.set_ylim((0,6))\n",
445 | "\n",
446 | "nx.draw(G2,pos=nx.circular_layout(G2),ax=ax4)\n",
447 | "ax4.set_title('Network 2: Graph of a Complete Graph',fontsize=16)\n",
448 | "\n",
449 | "plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)\n",
450 | "plt.show()"
451 | ]
452 | },
453 | {
454 | "cell_type": "markdown",
455 | "metadata": {
456 | "deletable": true,
457 | "editable": true
458 | },
459 | "source": [
460 | "## Average Shortest Path\n",
461 | "\n",
462 | "The average shortest path length is the average of calculating the shortest number of edges it takes for every pair of nodes in the network. \n",
463 | "\n",
464 | "This can be a mesure of how efficient/fast the network is in the distribution of information, rumors, disease, etc."
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {
471 | "ExecuteTime": {
472 | "end_time": "2017-02-13T09:24:19.269671",
473 | "start_time": "2017-02-13T09:24:19.265105"
474 | },
475 | "collapsed": false,
476 | "deletable": true,
477 | "editable": true,
478 | "scrolled": true
479 | },
480 | "outputs": [],
481 | "source": [
482 | "print 'Network 1 average shortest path: ' + str(round(nx.average_shortest_path_length(G),2))\n",
483 | "print 'Network 2 average shortest path: ' + str(nx.average_shortest_path_length(G2))"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {
489 | "deletable": true,
490 | "editable": true
491 | },
492 | "source": [
493 | "## Clustering coefficient. \n",
494 | "\n",
495 | "Another way to look at how tight-knit a network is to look at how clustered it is. \n",
496 | "To estimate clustering, we start from cliques, which can be considered as a subnetwork composed of three nodes forming a triangle. \n",
497 | "\n",
498 | "We can first calculate for each node, how many cliques(triangles) it is a member of. \n",
499 | "Then we calculate transitivity, which is simply the number of triangles in the network divided by the number of all possible traingles."
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {
506 | "ExecuteTime": {
507 | "end_time": "2017-02-13T09:24:19.633788",
508 | "start_time": "2017-02-13T09:24:19.271359"
509 | },
510 | "collapsed": false,
511 | "deletable": true,
512 | "editable": true,
513 | "scrolled": false
514 | },
515 | "outputs": [],
516 | "source": [
517 | "d = nx.triangles(G)\n",
518 | "print_network(d,'Number of cliques(triangles)')\n",
519 | "print \n",
520 | "print 'Transitivity : ' + str(nx.transitivity(G))\n",
521 | "\n",
522 | "plot_network(d,'Transitivity')"
523 | ]
524 | },
525 | {
526 | "cell_type": "markdown",
527 | "metadata": {
528 | "deletable": true,
529 | "editable": true
530 | },
531 | "source": [
532 | "A similar approach is to calculate clustering coefficient for each node and for the graph.\n",
533 | "Clustering is the measure of how likely it is if A has edges to B and C, that B and C related? \n",
534 | "\n",
535 | "For instance, Charlie form a triangle with Luke and Jin, but Sam and Heidi don't form a single triangle.\n",
536 | "\n",
537 | "We can calculate this for each node and then get an average value overall to get the average clustering coefficient."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {
544 | "ExecuteTime": {
545 | "end_time": "2017-02-13T09:24:19.642416",
546 | "start_time": "2017-02-13T09:24:19.635348"
547 | },
548 | "collapsed": false,
549 | "deletable": true,
550 | "editable": true
551 | },
552 | "outputs": [],
553 | "source": [
554 | "d = nx.clustering(G)\n",
555 | "print_network(d,'Clustering Coefficients per node')\n",
556 | "print \n",
557 | "print 'Average clustering coefficient : ' + str(nx.average_clustering(G))"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": null,
563 | "metadata": {
564 | "collapsed": true,
565 | "deletable": true,
566 | "editable": true
567 | },
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {
574 | "collapsed": true,
575 | "deletable": true,
576 | "editable": true
577 | },
578 | "source": [
579 | "# Exercise\n",
580 | "Your assignment this week is to construct your own social network. It could be based on a sports team, your close friends, or family. It could also reflect more abstract aspects of relationships such as social support, gossip, problem solver, etc.\n",
581 | "\n",
582 | "Calculate two different metrics of the network to provide insight into how different members play different roles."
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {
589 | "collapsed": true,
590 | "deletable": true,
591 | "editable": true
592 | },
593 | "outputs": [],
594 | "source": []
595 | }
596 | ],
597 | "metadata": {
598 | "anaconda-cloud": {},
599 | "kernelspec": {
600 | "display_name": "Python [default]",
601 | "language": "python",
602 | "name": "python2"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 2
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython2",
614 | "version": "2.7.13"
615 | },
616 | "toc": {
617 | "nav_menu": {
618 | "height": "260px",
619 | "width": "252px"
620 | },
621 | "navigate_menu": true,
622 | "number_sections": true,
623 | "sideBar": true,
624 | "threshold": 4,
625 | "toc_cell": false,
626 | "toc_section_display": "block",
627 | "toc_window_display": false
628 | }
629 | },
630 | "nbformat": 4,
631 | "nbformat_minor": 0
632 | }
633 |
--------------------------------------------------------------------------------
/Notebooks/7_Introduction_to_Scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Introduction to Web Scraping\n",
11 | "Often we are interested in getting data from a website. Modern websites are often built using a [REST](https://en.wikipedia.org/wiki/Representational_state_transfer) framework that has an Application Programming Interface ([API](https://en.wikipedia.org/wiki/Application_programming_interface)) to make [HTTP](https://www.tutorialspoint.com/http/http_requests.htm) requests to retrieve structured data in the form of [JSON](https://en.wikipedia.org/wiki/JSON) or XML.\n",
12 | "\n",
13 | "However, when there is not a clear API, we might need to perform web scraping by directly grabbing the data ourselves"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {
20 | "ExecuteTime": {
21 | "end_time": "2017-03-06T13:26:29.942689Z",
22 | "start_time": "2017-03-06T08:26:29.777081-05:00"
23 | },
24 | "collapsed": true,
25 | "deletable": true,
26 | "editable": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "try:\n",
31 | " import requests\n",
32 | "except:\n",
33 | " !pip install requests\n",
34 | "try:\n",
35 | " from bs4 import BeautifulSoup\n",
36 | "except:\n",
37 | " !pip install bs4"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "deletable": true,
44 | "editable": true
45 | },
46 | "source": [
47 | "## Getting data using requests\n",
48 | "[Requests](http://docs.python-requests.org/en/master/) is an excellent library for performing HTTP requests. \n",
49 | "\n",
50 | "In this simple example, we will scrape data from the PBS faculty webpage."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "ExecuteTime": {
58 | "end_time": "2017-03-06T13:26:32.826356Z",
59 | "start_time": "2017-03-06T08:26:32.720782-05:00"
60 | },
61 | "collapsed": false,
62 | "deletable": true,
63 | "editable": true
64 | },
65 | "outputs": [],
66 | "source": [
67 | "page = requests.get(\"http://pbs.dartmouth.edu/people\")\n",
68 | "print(page)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "deletable": true,
75 | "editable": true
76 | },
77 | "source": [
78 | "Here the response '200' indicates that the get request was successful. Now let's look at the actual text that was downloaded from the webpage."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "ExecuteTime": {
86 | "end_time": "2017-03-06T05:28:31.476250Z",
87 | "start_time": "2017-03-06T00:28:31.463432-05:00"
88 | },
89 | "collapsed": false,
90 | "deletable": true,
91 | "editable": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "print(page.content)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {
101 | "deletable": true,
102 | "editable": true
103 | },
104 | "source": [
105 | "Here you can see that we have downloaded all of the data from the PBS faculty page and that it is in the form of HTML. \n",
106 | "HTML is a markup language that tells a browser how to layout content. HTML consists of elements called tags. Each tag indicates a beginning and end. Here are a few examples: \n",
107 | "\n",
108 | " - `` - indicates hyperlink\n",
109 | " - `` - indicates paragraph\n",
110 | " - `` - indicates a division, or area, of the page.\n",
111 | " - `` - bolds any text inside.\n",
112 | " - `` - italicizes any text inside.\n",
113 | " - `` - indicates a header\n",
114 | " - `` - creates a table.\n",
115 | " - `
` - ordered list\n",
116 | " - `` - unordered list\n",
117 | " - `` - list item\n"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {
123 | "deletable": true,
124 | "editable": true
125 | },
126 | "source": [
127 | "## Parsing HTML using Beautiful Soup\n",
128 | "There are many libraries that can be helpful for quickly parsing structured text such as HTML. We will be using Beautiful Soup as an example."
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {
135 | "ExecuteTime": {
136 | "end_time": "2017-03-06T13:26:39.183625Z",
137 | "start_time": "2017-03-06T08:26:39.071917-05:00"
138 | },
139 | "collapsed": false,
140 | "deletable": true,
141 | "editable": true
142 | },
143 | "outputs": [],
144 | "source": [
145 | "soup = BeautifulSoup(page.content, 'html.parser')\n",
146 | "print(soup.prettify())"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {
152 | "deletable": true,
153 | "editable": true
154 | },
155 | "source": [
156 | "Here we are going to find the unordered list tagged with the id 'faculty-container'. We are then going to look for any nested tag that use the 'h4' header tag. This should give us all of the lines with the faculty names as a list."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "ExecuteTime": {
164 | "end_time": "2017-03-06T13:26:41.798326Z",
165 | "start_time": "2017-03-06T08:26:41.769082-05:00"
166 | },
167 | "collapsed": false,
168 | "deletable": true,
169 | "editable": true
170 | },
171 | "outputs": [],
172 | "source": [
173 | "names_html = soup.find_all('ul',id='faculty-container')[0].find_all('h4')\n",
174 | "names = [x.text for x in names_html]\n",
175 | "print(names)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {
181 | "deletable": true,
182 | "editable": true
183 | },
184 | "source": [
185 | "What if we wanted to get all of the faculty email addresses?"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "ExecuteTime": {
193 | "end_time": "2017-03-06T13:26:45.729051Z",
194 | "start_time": "2017-03-06T08:26:45.696816-05:00"
195 | },
196 | "collapsed": false,
197 | "deletable": true,
198 | "editable": true
199 | },
200 | "outputs": [],
201 | "source": [
202 | "email_html = soup.find_all('ul',id='faculty-container')[0].find_all('span',{'class' : 'contact'})\n",
203 | "email = [x.text for x in email_html]\n",
204 | "print(email)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "deletable": true,
211 | "editable": true
212 | },
213 | "source": [
214 | "## Parsing string data\n",
215 | "What if we wanted to grab the name from the list of email addresses?"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "ExecuteTime": {
223 | "end_time": "2017-03-06T06:16:47.161190Z",
224 | "start_time": "2017-03-06T01:16:47.154047-05:00"
225 | },
226 | "collapsed": false,
227 | "deletable": true,
228 | "editable": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "print([x.split('@')[0] for x in email])"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {
238 | "collapsed": true,
239 | "deletable": true,
240 | "editable": true
241 | },
242 | "source": [
243 | "One thing we might do with this data is create a dictionary with names and emails of all of the professors in the department. This could be useful if we wanted to send a bulk email to them. "
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "ExecuteTime": {
251 | "end_time": "2017-03-06T13:37:49.514990Z",
252 | "start_time": "2017-03-06T08:37:49.509404-05:00"
253 | },
254 | "collapsed": false,
255 | "deletable": true,
256 | "editable": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "email_dict = dict([(x.split('@')[0],x) for x in email])\n",
261 | "print(email_dict)"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "You can see that every name also includes an initial. Let's try to just pull out the first and last name."
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "ExecuteTime": {
276 | "end_time": "2017-03-06T13:37:51.752857Z",
277 | "start_time": "2017-03-06T08:37:51.747826-05:00"
278 | },
279 | "collapsed": false
280 | },
281 | "outputs": [],
282 | "source": [
283 | "for x in email_dict.keys():\n",
284 | " old = x.split('.')\n",
285 | " email_dict[\" \".join([i for i in old if len(i) > 2])] = email_dict[x]\n",
286 | " del email_dict[x]\n",
287 | "\n",
288 | "print(email_dict)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {
294 | "deletable": true,
295 | "editable": true
296 | },
297 | "source": [
298 | "## Interacting with web page using Selenium\n",
299 | "Sometimes we need to directly interact with aspects of the webpage. Maybe there is a form that needs to be submitted or a javascript button that needs to be pressed. [Selenium](http://selenium-python.readthedocs.io/) is a useful tool for these types of tasks."
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": true
307 | },
308 | "outputs": [],
309 | "source": []
310 | }
311 | ],
312 | "metadata": {
313 | "kernelspec": {
314 | "display_name": "Python 2",
315 | "language": "python",
316 | "name": "python2"
317 | },
318 | "language_info": {
319 | "codemirror_mode": {
320 | "name": "ipython",
321 | "version": 2
322 | },
323 | "file_extension": ".py",
324 | "mimetype": "text/x-python",
325 | "name": "python",
326 | "nbconvert_exporter": "python",
327 | "pygments_lexer": "ipython2",
328 | "version": "2.7.13"
329 | }
330 | },
331 | "nbformat": 4,
332 | "nbformat_minor": 2
333 | }
334 |
--------------------------------------------------------------------------------
/Notebooks/Figures/centrality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/centrality.png
--------------------------------------------------------------------------------
/Notebooks/Figures/estimating_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/estimating_coefficients.png
--------------------------------------------------------------------------------
/Notebooks/Figures/hw2-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-3.png
--------------------------------------------------------------------------------
/Notebooks/Figures/hw2-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-4.png
--------------------------------------------------------------------------------
/Notebooks/Figures/mediation1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/mediation1.png
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator1.gif
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator2.gif
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator3.jpeg
--------------------------------------------------------------------------------
/Notebooks/Figures/nodeedge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/nodeedge.png
--------------------------------------------------------------------------------
/Notebooks/Figures/slope_intercept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/slope_intercept.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # psyc63
2 | This GitHub repository provides lab materials that accompany Psyc63 Experimental Study of Behavior at Dartmouth College. Labs will consist of working through [Jupyter Notebooks](http://jupyter.org/) that contain exercises in Python. Jupyter Notebooks can be installed by downloading and installing [Anaconda Python 2.7](https://www.continuum.io/downloads).
3 |
4 | ## Jupyter Notebooks
5 | ### Introduction to Python
6 | - Installation
7 | - Notebooks
8 | - Variables
9 | - Loops and Conditional Statements
10 | - Functions
11 | - Containers
12 | - Modules
13 |
14 | ### Data Analysis I
15 | - Data Checking
16 | - Data Plotting
17 | - Linear Model Regression
18 | - Model Comparisons
19 |
20 | ### Data Analysis II
21 | - Logistic Regression
22 | - Classification
23 | - Prediction
24 | - Cross-validation
25 | - Regularization Techniques
26 | - Lasso & Ridge
27 |
28 | ### Data Analysis III
29 | - Mediation Analysis
30 | - Moderation Analysis
31 |
32 | ### Social Network Analysis
33 | - Primer to Social Network Analysis
34 | - NetworkX
35 | - Micromeasures
36 | - degree centrality
37 | - betweenness centrality
38 | - closeness centrality
39 | - Eigenvector centrality
40 | - Macromeasures
41 | - Degree Distribution
42 | - Average shortest path
43 | - Clustering coefficients
44 |
45 |
46 |
--------------------------------------------------------------------------------