├── .gitignore
├── Data
├── 13May16
│ ├── contains.txt
│ ├── subjects.p
│ └── users.p
├── CrowdstormingDataJuly1st.csv
├── love_those_books.csv
├── salary.csv
├── salary_exercise.csv
└── student-mat.csv
├── LICENSE
├── Notebooks
├── .gitignore
├── 1_Introduction_to_Programming.ipynb
├── 2.1_MatplotlibTemplates.ipynb
├── 2_Introduction_to_Dataframes_&_Plotting.ipynb
├── 3_Introduction_to_Regression.ipynb
├── 4_Mediation_&_Moderation.ipynb
├── 5_Prediction_&_Regularization.ipynb
├── 6_Introduction_to_SocialNetworkAnalysis.ipynb
├── 7_Introduction_to_Scraping.ipynb
├── Figures
│ ├── centrality.png
│ ├── estimating_coefficients.png
│ ├── hw2-3.png
│ ├── hw2-4.png
│ ├── mediation1.png
│ ├── moderator1.gif
│ ├── moderator2.gif
│ ├── moderator3.jpeg
│ ├── nodeedge.png
│ └── slope_intercept.png
└── Old
│ ├── Classification.ipynb
│ └── phacking.ipynb
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/Data/13May16/contains.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Data/13May16/contains.txt
--------------------------------------------------------------------------------
/Data/13May16/users.p:
--------------------------------------------------------------------------------
1 | (lp0
2 | (dp1
3 | Vservices
4 | p2
5 | (dp3
6 | Vpassword
7 | p4
8 | (dp5
9 | Vbcrypt
10 | p6
11 | V$2a$10$pbAb8K6OmsOmonCzbGRf0e1MrLp7ec1JnYIzEctMLyn9btOpzvAmG
12 | p7
13 | ssVresume
14 | p8
15 | (dp9
16 | VloginTokens
17 | p10
18 | (lp11
19 | sssV_id
20 | p12
21 | Vogrw4dL9mksHEKigC
22 | p13
23 | sVemails
24 | p14
25 | (lp15
26 | (dp16
27 | Vverified
28 | p17
29 | I00
30 | sVaddress
31 | p18
32 | Vejolly@test.com
33 | p19
34 | sasVcreatedAt
35 | p20
36 | cdatetime
37 | datetime
38 | p21
39 | (S'\x07\xe0\x05\x08\x17#\x1d\n\xcd\xa0'
40 | p22
41 | tp23
42 | Rp24
43 | sa(dp25
44 | Vservices
45 | p26
46 | (dp27
47 | Vpassword
48 | p28
49 | (dp29
50 | Vbcrypt
51 | p30
52 | V$2a$10$5P3UUGulQ7RACVROaU56l.H1gYvRhDbuvmuXM7UXHzC/6iyt3gOJ.
53 | p31
54 | ssVresume
55 | p32
56 | (dp33
57 | VloginTokens
58 | p34
59 | (lp35
60 | sssV_id
61 | p36
62 | VGwbhewAmNJ3nA8g4d
63 | p37
64 | sVemails
65 | p38
66 | (lp39
67 | (dp40
68 | Vverified
69 | p41
70 | I00
71 | sVaddress
72 | p42
73 | Vjames.e.carmichael.gr@dartmouth.edu
74 | p43
75 | sasVcreatedAt
76 | p44
77 | g21
78 | (S"\x07\xe0\x05\x08\x17'6\x0bZ@"
79 | p45
80 | tp46
81 | Rp47
82 | sa(dp48
83 | Vservices
84 | p49
85 | (dp50
86 | Vpassword
87 | p51
88 | (dp52
89 | Vbcrypt
90 | p53
91 | V$2a$10$ymYsiX7HMD4inQ20MHD/VutAr844y3JvmjeOcqLH4wu8krSX8fUXK
92 | p54
93 | ssVresume
94 | p55
95 | (dp56
96 | VloginTokens
97 | p57
98 | (lp58
99 | sssV_id
100 | p59
101 | VtksS632f4u9JtZ3LZ
102 | p60
103 | sVemails
104 | p61
105 | (lp62
106 | (dp63
107 | Vverified
108 | p64
109 | I00
110 | sVaddress
111 | p65
112 | Vcbentivoglio@middlebury.edu
113 | p66
114 | sasVcreatedAt
115 | p67
116 | g21
117 | (S'\x07\xe0\x05\t\x006;\x02Y\x90'
118 | p68
119 | tp69
120 | Rp70
121 | sa(dp71
122 | Vservices
123 | p72
124 | (dp73
125 | Vpassword
126 | p74
127 | (dp75
128 | Vbcrypt
129 | p76
130 | V$2a$10$uBmZiohM3ZdT/EpsZWAlUeffalyYaZAToT.wtqAmPEMTABfWk14wi
131 | p77
132 | ssVresume
133 | p78
134 | (dp79
135 | VloginTokens
136 | p80
137 | (lp81
138 | sssV_id
139 | p82
140 | VQa8shq3L9omL5xHKo
141 | p83
142 | sVemails
143 | p84
144 | (lp85
145 | (dp86
146 | Vverified
147 | p87
148 | I00
149 | sVaddress
150 | p88
151 | Vtest@test.com
152 | p89
153 | sasVcreatedAt
154 | p90
155 | g21
156 | (S'\x07\xe0\x05\t\x10\x1d#\x05\xcc`'
157 | p91
158 | tp92
159 | Rp93
160 | sa(dp94
161 | Vservices
162 | p95
163 | (dp96
164 | Vpassword
165 | p97
166 | (dp98
167 | Vbcrypt
168 | p99
169 | V$2a$10$0Bhtui2bp05B9GyS3KBhtubQz9xlVCXYhDvefDAW47TCS/iPxRNUG
170 | p100
171 | ssVresume
172 | p101
173 | (dp102
174 | VloginTokens
175 | p103
176 | (lp104
177 | (dp105
178 | VhashedToken
179 | p106
180 | VbyXZrVYdeasFKoHzaKN0kjoffxeHAEIdcJSJZttl2Ik=
181 | p107
182 | sVwhen
183 | p108
184 | g21
185 | (S"\x07\xe0\x05\n\x16\x18'\x07V\xe8"
186 | p109
187 | tp110
188 | Rp111
189 | sa(dp112
190 | VhashedToken
191 | p113
192 | V5deRxFJTT8QhMfCES+CpwsQ2R31GgE8/RYnVb2UHw7g=
193 | p114
194 | sVwhen
195 | p115
196 | g21
197 | (S'\x07\xe0\x05\n\x16\x19.\t\xa0\xd8'
198 | p116
199 | tp117
200 | Rp118
201 | sa(dp119
202 | VhashedToken
203 | p120
204 | V9ToUywNIWQBE9ZFr1ZK5hNBafsheg1zqOSs8IMCr9YE=
205 | p121
206 | sVwhen
207 | p122
208 | g21
209 | (S'\x07\xe0\x05\n\x17/\x01\x05\xcc`'
210 | p123
211 | tp124
212 | Rp125
213 | sasssV_id
214 | p126
215 | V3fcdRR5Y3hzdABSXn
216 | p127
217 | sVemails
218 | p128
219 | (lp129
220 | (dp130
221 | Vverified
222 | p131
223 | I00
224 | sVaddress
225 | p132
226 | Vdanrwhitcomb@gmail.com
227 | p133
228 | sasVcreatedAt
229 | p134
230 | g21
231 | (S'\x07\xe0\x05\t\x17#\x19\x01\xd0\xd8'
232 | p135
233 | tp136
234 | Rp137
235 | sa(dp138
236 | Vservices
237 | p139
238 | (dp140
239 | Vpassword
240 | p141
241 | (dp142
242 | Vbcrypt
243 | p143
244 | V$2a$10$eKtiz4QTXwGoGksJumGP0O6NQc9QhzlDHWC91gRop4ZKmbTbgpQ52
245 | p144
246 | ssVresume
247 | p145
248 | (dp146
249 | VloginTokens
250 | p147
251 | (lp148
252 | sssV_id
253 | p149
254 | Vm3q2uoH5waEFCdKLQ
255 | p150
256 | sVemails
257 | p151
258 | (lp152
259 | (dp153
260 | Vverified
261 | p154
262 | I00
263 | sVaddress
264 | p155
265 | Vanna.t.prescott.gr@dartmouth.edu
266 | p156
267 | sasVcreatedAt
268 | p157
269 | g21
270 | (S'\x07\xe0\x05\r\x0f\x1e\x03\x08)\xd8'
271 | p158
272 | tp159
273 | Rp160
274 | sa(dp161
275 | Vservices
276 | p162
277 | (dp163
278 | Vpassword
279 | p164
280 | (dp165
281 | Vbcrypt
282 | p166
283 | V$2a$10$URpEJIxBHRr4WHldL4jLkuGBnrIkLGXrEIvpwVeRcnU1Y17XgUOye
284 | p167
285 | ssVresume
286 | p168
287 | (dp169
288 | VloginTokens
289 | p170
290 | (lp171
291 | sssV_id
292 | p172
293 | VhZHxzvxjpzMzA4nJT
294 | p173
295 | sVemails
296 | p174
297 | (lp175
298 | (dp176
299 | Vverified
300 | p177
301 | I00
302 | sVaddress
303 | p178
304 | Varatidoo@gmail.com
305 | p179
306 | sasVcreatedAt
307 | p180
308 | g21
309 | (S'\x07\xe0\x05\r\x13&\x00\x08\x7f\xc8'
310 | p181
311 | tp182
312 | Rp183
313 | sa.
--------------------------------------------------------------------------------
/Data/love_those_books.csv:
--------------------------------------------------------------------------------
1 | enjoy,buy,read
4,16,6
15,19,13
1,0,1
11,19,13
13,25,12
19,24,11
6,22,7
10,21,8
15,13,12
3,7,4
11,28,15
20,31,14
7,4,7
11,26,14
10,11,9
6,12,5
7,14,7
18,16,12
8,20,10
2,13,6
7,12,9
12,23,13
13,22,9
15,19,13
4,12,9
3,10,5
9,7,7
7,22,8
10,7,8
2,0,2
15,16,7
1,17,6
3,11,9
6,5,9
13,29,15
15,29,11
16,20,9
14,16,7
1,3,2
8,8,10
--------------------------------------------------------------------------------
/Data/salary.csv:
--------------------------------------------------------------------------------
1 | salary,gender,departm,years,age,publications
86285,0,bio,26,64,72
77125,0,bio,28,58,43
71922,0,bio,10,38,23
70499,0,bio,16,46,64
66624,0,bio,11,41,23
64451,0,bio,23,60,44
64366,0,bio,23,53,22
59344,0,bio,5,40,11
58560,0,bio,8,38,8
58294,0,bio,20,50,12
56092,0,bio,2,40,4
54452,0,bio,13,43,7
54269,0,bio,26,56,12
55125,0,bio,8,38,9
97630,0,chem,34,64,43
82444,0,chem,31,61,42
76291,0,chem,29,65,33
75382,0,chem,26,56,39
64762,0,chem,25,,29
62607,0,chem,20,45,34
60373,0,chem,26,56,43
58892,0,chem,18,48,21
47021,0,chem,4,34,12
44687,0,chem,4,34,19
104828,0,geol,,50,44
71456,0,geol,11,41,32
65144,0,geol,7,37,12
52766,0,geol,4,38,32
112800,0,neuro,14,44,33
105761,0,neuro,9,39,30
92951,0,neuro,11,41,20
86621,0,neuro,19,49,10
85569,0,neuro,20,46,35
83896,0,neuro,10,40,22
79735,0,neuro,11,41,32
71518,0,neuro,7,37,34
68029,0,neuro,15,45,33
66482,0,neuro,14,44,42
61680,0,neuro,18,48,20
60455,0,neuro,8,38,49
58932,0,neuro,11,41,49
106412,0,stat,23,53,29
86980,0,stat,23,53,42
78114,0,stat,8,38,24
74085,0,stat,11,41,33
72250,0,stat,26,56,9
69596,0,stat,20,50,18
65285,0,stat,20,50,15
62557,0,stat,28,58,14
61947,0,stat,22,58,17
58565,0,stat,29,59,11
58365,0,stat,18,48,21
53656,0,stat,2,32,4
51391,0,stat,5,35,8
96936,0,physics,15,50,17
83216,0,physics,11,37,19
72044,0,physics,2,32,16
64048,0,physics,23,53,4
58888,0,physics,26,56,7
58744,0,physics,20,50,9
55944,0,physics,21,51,8
54076,0,physics,19,49,12
82142,0,math,9,39,9
70509,0,math,23,53,7
60320,0,math,14,44,7
55814,0,math,8,38,6
53638,0,math,4,42,8
53517,2,math,5,35,5
59139,1,bio,8,38,23
52968,1,bio,18,48,32
55949,1,chem,4,34,12
58893,1,neuro,10,35,4
53662,1,neuro,1,31,3
57185,1,stat,9,39,7
52254,1,stat,2,32,9
61885,1,math,23,60,9
49542,1,math,3,33,5
--------------------------------------------------------------------------------
/Data/salary_exercise.csv:
--------------------------------------------------------------------------------
1 | sx,rk,yr,dg,yd,sl
male,full,25,doctorate,35,36350
male,full,13,doctorate,22,35350
male,full,10,doctorate,23,28200
female,full,7,doctorate,27,26775
male,full,19,masters,30,33696
male,full,16,doctorate,21,28516
female,full,0,masters,32,24900
male,full,16,doctorate,18,31909
male,full,13,masters,30,31850
male,full,13,masters,31,32850
male,full,12,doctorate,22,27025
male,associate,15,doctorate,19,24750
male,full,9,doctorate,17,28200
male,associate,9,,27,23712
male,full,9,doctorate,24,25748
male,full,7,doctorate,15,29342
male,full,13,doctorate,20,31114
male,associate,11,masters,14,24742
male,associate,10,masters,15,22906
male,full,6,masters,21,24450
male,assistant,16,masters,23,19175
male,associate,8,masters,31,20525
male,full,7,doctorate,13,27959
female,full,8,doctorate,24,38045
male,associate,9,doctorate,12,
male,full,5,doctorate,18,25400
male,associate,11,doctorate,14,24800
female,full,5,doctorate,16,25500
male,associate,3,masters,7,26182
male,associate,3,masters,17,23725
female,assistant,10,masters,15,21600
male,associate,11,masters,31,23300
male,assistant,9,masters,14,23713
female,associate,4,masters,33,20690
female,associate,6,masters,29,22450
male,associate,1,doctorate,9,20850
female,assistant,8,doctorate,14,18304
male,assistant,4,doctorate,4,17095
male,assistant,4,doctorate,5,16700
male,assistant,4,doctorate,4,17600
,assistant,3,doctorate,4,18075
male,assistant,3,masters,11,18000
male,associate,0,doctorate,7,20999
female,assistant,3,doctorate,3,17250
male,assistant,2,doctorate,3,16500
male,assistant,2,doctorate,1,16094
female,assistant,2,doctorate,6,16150
female,assistant,2,doctorate,2,15350
male,assistant,1,doctorate,1,16244
female,assistant,1,doctorate,1,16686
female,assistant,1,doctorate,1,15000
female,assistant,0,doctorate,2,20300
--------------------------------------------------------------------------------
/Data/student-mat.csv:
--------------------------------------------------------------------------------
1 | school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,10,15,15,15
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,6,6,5,6
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,14,15,15
GP,F,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,1,2,2,0,10,8,9
GP,F,15,U,GT3,T,2,1,services,other,reputation,father,3,3,0,no,yes,no,yes,yes,yes,yes,no,5,2,2,1,1,4,4,10,12,12
GP,M,15,U,LE3,T,4,4,health,services,course,father,1,1,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,3,5,2,14,14,14
GP,M,15,U,GT3,T,4,3,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,5,4,3,1,2,3,2,10,10,11
GP,M,15,U,GT3,A,2,2,other,other,home,other,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,5,2,4,4,3,0,14,16,16
GP,F,16,U,GT3,T,4,4,health,other,home,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,4,4,4,1,2,2,4,14,14,14
GP,F,16,U,GT3,T,4,4,services,services,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,2,3,1,2,2,6,13,14,14
GP,F,16,U,GT3,T,3,3,other,other,reputation,mother,3,2,0,yes,yes,no,yes,yes,yes,no,no,5,3,2,1,1,4,4,8,10,10
GP,M,17,U,GT3,T,3,2,services,services,course,mother,1,1,3,no,yes,no,yes,yes,yes,yes,no,5,5,5,5,4,5,16,6,5,5
GP,M,16,U,LE3,T,4,3,health,other,home,father,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,1,3,1,3,5,4,8,10,10
GP,M,15,U,GT3,T,4,3,teacher,other,reputation,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,1,1,1,1,0,13,14,15
GP,M,15,U,GT3,T,4,4,health,health,other,father,1,1,0,no,yes,yes,no,yes,yes,yes,no,5,4,2,1,1,5,0,12,15,15
GP,M,16,U,LE3,T,4,2,teacher,other,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,4,5,1,5,3,5,2,15,15,16
GP,M,16,U,LE3,T,2,2,other,other,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,2,4,5,0,13,13,12
GP,F,15,R,GT3,T,2,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,10,9,8
GP,F,16,U,GT3,T,2,2,services,services,home,mother,1,1,2,no,yes,yes,no,no,yes,yes,no,1,2,2,1,3,5,14,6,9,8
GP,M,15,U,GT3,T,2,2,other,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,5,2,12,12,11
GP,M,15,U,GT3,T,4,2,health,services,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,2,2,4,2,4,1,4,15,16,15
GP,M,16,U,LE3,A,3,4,services,other,home,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,4,11,11,11
GP,M,16,U,GT3,T,4,4,teacher,teacher,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,5,5,5,5,16,10,12,11
GP,M,15,U,GT3,T,4,4,health,services,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,4,2,5,4,5,0,9,11,12
GP,M,15,U,GT3,T,4,4,services,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,1,1,1,5,0,17,16,17
GP,M,15,R,GT3,T,4,3,teacher,at_home,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,5,2,1,1,5,0,17,16,16
GP,M,15,U,LE3,T,3,3,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,2,1,1,2,0,8,10,12
GP,M,16,U,GT3,T,3,2,other,other,home,mother,1,1,0,no,yes,yes,no,no,yes,yes,no,5,4,3,1,1,5,0,12,14,15
GP,F,15,U,GT3,T,2,3,other,other,other,father,2,1,0,no,yes,no,yes,yes,yes,no,no,3,5,1,1,1,5,0,8,7,6
GP,M,15,U,LE3,T,4,3,teacher,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,1,4,2,15,16,18
GP,M,16,R,GT3,A,4,4,other,teacher,reputation,mother,2,3,0,no,yes,no,yes,yes,yes,yes,yes,2,4,3,1,1,5,7,15,16,15
GP,F,15,R,GT3,T,3,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,12,12,11
GP,F,15,R,GT3,T,2,2,at_home,other,reputation,mother,1,1,0,yes,yes,yes,yes,yes,yes,no,no,4,3,1,1,1,2,8,14,13,13
GP,F,16,U,LE3,T,2,2,other,other,home,mother,2,2,1,no,yes,no,yes,no,yes,yes,yes,3,3,3,1,2,3,25,7,10,11
GP,M,15,U,LE3,T,4,4,teacher,other,home,other,1,1,0,no,yes,no,no,no,yes,yes,yes,5,4,3,5,4,5,8,12,12,12
GP,M,15,U,GT3,T,4,4,services,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,3,1,1,5,2,19,18,18
GP,M,15,U,GT3,T,2,2,services,services,course,father,1,1,0,yes,yes,no,no,yes,yes,yes,no,5,4,1,1,1,1,0,8,8,11
GP,F,16,U,LE3,T,2,2,other,at_home,course,father,2,2,1,yes,no,no,yes,yes,yes,yes,no,4,3,3,2,2,5,14,10,10,9
GP,F,15,U,LE3,A,4,3,other,other,course,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,yes,5,2,2,1,1,5,8,8,8,6
GP,F,16,U,LE3,A,3,3,other,services,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,3,5,1,4,3,12,11,12,11
GP,M,16,U,GT3,T,4,3,health,services,reputation,mother,1,4,0,no,no,no,yes,yes,yes,yes,no,4,2,2,1,1,2,4,19,19,20
GP,M,15,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,3,2,2,5,2,15,15,14
GP,F,15,U,GT3,T,4,4,services,teacher,other,father,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,4,1,1,3,2,7,7,7
GP,F,16,U,LE3,T,2,2,services,services,course,mother,3,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,4,2,12,13,13
GP,F,15,U,LE3,T,4,2,health,other,other,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,5,2,11,13,13
GP,M,15,U,LE3,A,4,2,health,health,other,father,2,1,1,no,no,no,no,yes,yes,no,no,5,5,5,5,4,5,6,11,11,10
GP,F,15,U,GT3,T,4,4,services,services,course,mother,1,1,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,2,3,5,0,8,10,11
GP,F,15,U,LE3,A,3,3,other,other,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,5,3,4,4,4,1,6,10,13,13
GP,F,16,U,GT3,A,2,1,other,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,3,4,1,1,2,8,8,9,10
GP,F,15,U,GT3,A,4,3,services,services,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,1,0,14,15,15
GP,M,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,3,2,2,1,1,5,4,14,15,15
GP,M,15,U,LE3,T,1,2,other,at_home,home,father,1,2,0,yes,yes,no,yes,yes,yes,yes,no,4,3,2,1,1,5,2,9,10,9
GP,F,16,U,GT3,T,4,2,services,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,2,3,1,1,5,2,15,16,16
GP,F,16,R,GT3,T,4,4,health,teacher,other,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,2,4,4,5,3,4,6,10,11,11
GP,F,16,U,GT3,T,1,1,services,services,course,father,4,1,0,yes,yes,no,yes,no,yes,yes,yes,5,5,5,5,5,5,6,10,8,11
GP,F,16,U,LE3,T,1,2,other,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,no,4,4,3,1,1,1,4,8,10,9
GP,F,16,U,GT3,T,4,3,teacher,health,home,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,3,4,4,5,4,4,2,10,9,9
GP,F,15,U,LE3,T,4,3,services,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,yes,4,4,4,5,4,2,0,10,10,10
GP,F,16,U,LE3,T,4,3,teacher,services,course,mother,3,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,2,1,2,16,15,15
GP,M,15,U,GT3,A,4,4,other,services,reputation,mother,1,4,0,no,yes,no,yes,no,yes,yes,yes,1,3,3,5,5,3,4,13,13,12
GP,F,16,U,GT3,T,3,1,services,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,5,4,7,7,6
GP,F,15,R,LE3,T,2,2,health,services,reputation,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,1,3,1,3,4,2,8,9,8
GP,F,15,R,LE3,T,3,1,other,other,reputation,father,2,4,0,no,yes,no,no,no,yes,yes,no,4,4,2,5,3,3,12,16,16,16
GP,M,16,U,GT3,T,3,1,other,other,reputation,father,2,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,2,1,1,5,0,13,15,15
GP,M,15,U,GT3,T,4,2,other,other,course,mother,1,4,0,no,no,no,no,yes,yes,yes,no,3,3,3,1,1,3,0,10,10,10
GP,F,15,R,GT3,T,1,1,other,other,reputation,mother,1,2,2,yes,yes,no,no,no,yes,yes,yes,3,3,4,2,4,5,2,8,6,5
GP,M,16,U,GT3,T,3,1,other,other,reputation,mother,1,1,0,no,no,no,yes,yes,yes,no,no,5,3,2,2,2,5,2,12,12,14
GP,F,16,U,GT3,T,3,3,other,services,home,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,3,2,4,5,54,11,12,11
GP,M,15,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,2,3,5,6,9,9,10
GP,M,15,U,GT3,T,4,0,teacher,other,course,mother,2,4,0,no,no,no,yes,yes,yes,yes,no,3,4,3,1,1,1,8,11,11,10
GP,F,16,U,GT3,T,2,2,other,other,reputation,mother,1,4,0,no,no,yes,no,yes,yes,yes,yes,5,2,3,1,3,3,0,11,11,11
GP,M,17,U,GT3,T,2,1,other,other,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,5,5,3,2,8,8,10
GP,F,16,U,GT3,T,3,4,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,4,3,1,2,3,12,5,5,5
GP,M,15,U,GT3,T,2,3,other,services,course,father,1,1,0,yes,yes,yes,yes,no,yes,yes,yes,3,2,2,1,3,3,2,10,12,12
GP,M,15,U,GT3,T,2,3,other,other,home,mother,1,3,0,yes,no,yes,no,no,yes,yes,no,5,3,2,1,2,5,4,11,10,11
GP,F,15,U,LE3,T,3,2,services,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,1,1,5,10,7,6,6
GP,M,15,U,LE3,T,2,2,services,services,home,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,5,3,3,1,3,4,4,15,15,15
GP,F,15,U,GT3,T,1,1,other,other,home,father,1,2,0,no,yes,no,yes,no,yes,yes,no,4,3,2,2,3,4,2,9,10,10
GP,F,15,U,GT3,T,4,4,services,services,reputation,father,2,2,2,no,no,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,7,9,8
GP,F,16,U,LE3,T,2,2,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,4,1,2,2,4,8,7,6
GP,F,15,U,GT3,T,4,2,other,other,reputation,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,3,1,4,13,14,14
GP,M,16,U,GT3,T,2,2,services,other,reputation,father,2,2,1,no,no,yes,yes,no,yes,yes,no,4,4,2,1,1,3,12,11,10,10
GP,M,16,U,LE3,A,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,1,3,3,5,5,18,8,6,7
GP,F,16,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,3,4,0,7,7,8
GP,F,15,U,GT3,T,4,3,services,other,reputation,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,4,5,5,5,3,1,4,16,17,18
GP,F,16,U,LE3,T,3,1,other,other,home,father,1,2,0,yes,yes,no,no,yes,yes,no,no,3,3,3,2,3,2,4,7,6,6
GP,F,16,U,GT3,T,4,2,teacher,services,home,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,1,1,0,11,10,10
GP,M,15,U,LE3,T,2,2,services,health,reputation,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,3,4,1,1,4,6,11,13,14
GP,F,15,R,GT3,T,1,1,at_home,other,home,mother,2,4,1,yes,yes,yes,yes,yes,yes,yes,no,3,1,2,1,1,1,2,7,10,10
GP,M,16,R,GT3,T,4,3,services,other,reputation,mother,2,1,0,yes,yes,no,yes,no,yes,yes,no,3,3,3,1,1,4,2,11,15,15
GP,F,16,U,GT3,T,2,1,other,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,no,yes,4,3,5,1,1,5,2,8,9,10
GP,F,16,U,GT3,T,4,4,other,other,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,2,1,6,11,14,14
GP,F,16,U,GT3,T,4,3,other,at_home,course,mother,1,3,0,yes,yes,yes,no,yes,yes,yes,no,5,3,5,1,1,3,0,7,9,8
GP,M,16,U,GT3,T,4,4,services,services,other,mother,1,1,0,yes,yes,yes,yes,yes,yes,yes,no,4,5,5,5,5,4,14,7,7,5
GP,M,16,U,GT3,T,4,4,services,teacher,other,father,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,4,3,1,1,4,0,16,17,17
GP,M,15,U,GT3,T,4,4,services,other,course,mother,1,1,0,no,yes,no,yes,no,yes,yes,no,5,3,3,1,1,5,4,10,13,14
GP,F,15,U,GT3,T,3,2,services,other,home,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,3,5,1,1,2,26,7,6,6
GP,M,15,U,GT3,A,3,4,services,other,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,1,1,0,16,18,18
GP,F,15,U,GT3,A,3,3,other,health,reputation,father,1,4,0,yes,no,no,no,yes,yes,no,no,4,3,3,1,1,4,10,10,11,11
GP,F,15,U,GT3,T,2,2,other,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,5,1,2,1,1,3,8,7,8,8
GP,M,16,U,GT3,T,3,3,services,other,home,father,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,2,16,18,18
GP,M,15,R,GT3,T,4,4,other,other,home,father,4,4,0,no,yes,yes,yes,yes,yes,yes,yes,1,3,5,3,5,1,6,10,13,13
GP,F,16,U,LE3,T,4,4,health,health,other,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,1,4,4,14,15,16
GP,M,15,U,LE3,A,4,4,teacher,teacher,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,5,3,5,5,4,6,18,19,19
GP,F,16,R,GT3,T,3,3,services,other,reputation,father,1,3,1,yes,yes,no,yes,yes,yes,yes,no,4,1,2,1,1,2,0,7,10,10
GP,F,16,U,GT3,T,2,2,at_home,other,home,mother,1,2,1,yes,no,no,yes,yes,yes,yes,no,3,1,2,1,1,5,6,10,13,13
GP,M,15,U,LE3,T,4,2,teacher,other,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,5,2,5,5,3,10,18,19,19
GP,M,15,R,GT3,T,2,1,health,services,reputation,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,5,4,2,1,1,5,8,9,9,9
GP,M,16,U,GT3,T,4,4,teacher,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,1,2,5,2,15,15,16
GP,M,15,U,GT3,T,4,4,other,teacher,reputation,father,2,2,0,no,yes,no,yes,yes,yes,no,no,4,4,3,1,1,2,2,11,13,14
GP,M,16,U,GT3,T,3,3,other,services,home,father,2,1,0,no,no,no,yes,yes,yes,yes,no,5,4,2,1,1,5,0,13,14,13
GP,M,17,R,GT3,T,1,3,other,other,course,father,3,2,1,no,yes,no,yes,yes,yes,yes,no,5,2,4,1,4,5,20,9,7,8
GP,M,15,U,GT3,T,3,4,other,other,reputation,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,3,1,2,4,6,14,13,13
GP,F,15,U,GT3,T,1,2,at_home,services,course,mother,1,2,0,no,no,no,no,no,yes,yes,no,3,2,3,1,2,1,2,16,15,15
GP,M,15,U,GT3,T,2,2,services,services,home,father,1,4,0,no,yes,yes,yes,yes,yes,yes,no,5,5,4,1,2,5,6,16,14,15
GP,F,16,U,LE3,T,2,4,other,health,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,2,2,1,2,5,2,13,13,13
GP,M,16,U,GT3,T,4,4,health,other,course,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,3,4,4,5,4,5,18,14,11,13
GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,5,4,4,1,1,5,0,8,7,8
GP,M,15,U,GT3,T,3,4,services,services,home,father,1,1,0,yes,no,no,no,yes,yes,yes,no,5,5,5,3,2,5,0,13,13,12
GP,F,15,U,LE3,A,3,4,other,other,home,mother,1,2,0,yes,no,no,yes,yes,yes,yes,yes,5,3,2,1,1,1,0,7,10,11
GP,F,19,U,GT3,T,0,1,at_home,other,course,other,1,2,3,no,yes,no,no,no,no,no,no,3,4,2,1,1,5,2,7,8,9
GP,M,18,R,GT3,T,2,2,services,other,reputation,mother,1,1,2,no,yes,no,yes,yes,yes,yes,no,3,3,3,1,2,4,0,7,4,0
GP,M,16,R,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,5,5,4,5,4,8,18,18,18
GP,F,15,R,GT3,T,3,4,services,teacher,course,father,2,3,2,no,yes,no,no,yes,yes,yes,yes,4,2,2,2,2,5,0,12,0,0
GP,F,15,U,GT3,T,1,1,at_home,other,course,mother,3,1,0,no,yes,no,yes,no,yes,yes,yes,4,3,3,1,2,4,0,8,0,0
GP,F,17,U,LE3,T,2,2,other,other,course,father,1,1,0,no,yes,no,no,yes,yes,yes,yes,3,4,4,1,3,5,12,10,13,12
GP,F,16,U,GT3,A,3,4,services,other,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,1,1,4,5,16,12,11,11
GP,M,15,R,GT3,T,3,4,at_home,teacher,course,mother,4,2,0,no,yes,no,no,yes,yes,no,yes,5,3,3,1,1,5,0,9,0,0
GP,F,15,U,GT3,T,4,4,services,at_home,course,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,1,1,5,0,11,0,0
GP,M,17,R,GT3,T,3,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,no,5,4,5,5,4,5,0,10,0,0
GP,F,16,U,GT3,A,3,3,other,other,course,other,2,1,2,no,yes,no,yes,no,yes,yes,yes,4,3,2,1,1,5,0,4,0,0
GP,M,16,U,LE3,T,1,1,services,other,course,mother,1,2,1,no,no,no,no,yes,yes,no,yes,4,4,4,5,3,5,0,14,12,12
GP,F,15,U,GT3,T,4,4,teacher,teacher,course,mother,2,1,0,no,no,no,yes,yes,yes,yes,no,4,3,2,1,1,5,0,16,16,15
GP,M,15,U,GT3,T,4,3,teacher,services,course,father,2,4,0,yes,yes,no,no,yes,yes,yes,no,2,2,2,1,1,3,0,7,9,0
GP,M,16,U,LE3,T,2,2,services,services,reputation,father,2,1,2,no,yes,no,yes,yes,yes,yes,no,2,3,3,2,2,2,8,9,9,9
GP,F,15,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,2,2,1,1,5,2,9,11,11
GP,F,16,U,LE3,T,1,1,at_home,at_home,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,4,5,3,1,2,14,14,13
GP,M,17,U,GT3,T,2,1,other,other,home,mother,1,1,3,no,yes,no,no,yes,yes,yes,no,5,4,5,1,2,5,0,5,0,0
GP,F,15,U,GT3,T,1,1,other,services,course,father,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,2,1,2,5,0,8,11,11
GP,F,15,U,GT3,T,3,2,health,services,home,father,1,2,3,no,yes,no,no,yes,yes,yes,no,3,3,2,1,1,3,0,6,7,0
GP,F,15,U,GT3,T,1,2,at_home,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,5,2,10,11,11
GP,M,16,U,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,yes,no,no,yes,no,yes,yes,3,3,2,2,1,5,0,7,6,0
GP,M,15,U,LE3,A,2,1,services,other,course,mother,4,1,3,no,no,no,no,yes,yes,yes,no,4,5,5,4,5,5,0,8,9,10
GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,3,no,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,0,6,5,0
GP,M,16,U,LE3,T,2,1,at_home,other,course,mother,1,1,1,no,no,no,yes,yes,yes,no,yes,4,4,4,5,5,5,6,12,13,14
GP,F,15,R,GT3,T,3,3,services,services,reputation,other,2,3,2,no,yes,yes,yes,yes,yes,yes,yes,4,2,1,2,3,3,8,10,10,10
GP,M,19,U,GT3,T,3,2,services,at_home,home,mother,1,1,3,no,yes,no,no,yes,no,yes,yes,4,5,4,1,1,4,0,5,0,0
GP,F,17,U,GT3,T,4,4,other,teacher,course,mother,1,1,0,yes,yes,no,no,yes,yes,no,yes,4,2,1,1,1,4,0,11,11,12
GP,M,15,R,GT3,T,2,3,at_home,services,course,mother,1,2,0,yes,no,yes,yes,yes,yes,no,no,4,4,4,1,1,1,2,11,8,8
GP,M,17,R,LE3,T,1,2,other,other,reputation,mother,1,1,0,no,no,no,no,yes,yes,no,no,2,2,2,3,3,5,8,16,12,13
GP,F,18,R,GT3,T,1,1,at_home,other,course,mother,3,1,3,no,yes,no,yes,no,yes,no,no,5,2,5,1,5,4,6,9,8,10
GP,M,16,R,GT3,T,2,2,at_home,other,course,mother,3,1,0,no,no,no,no,no,yes,no,no,4,2,2,1,2,3,2,17,15,15
GP,M,16,U,GT3,T,3,3,other,services,course,father,1,2,1,no,yes,yes,no,yes,yes,yes,yes,4,5,5,4,4,5,4,10,12,12
GP,M,17,R,LE3,T,2,1,at_home,other,course,mother,2,1,2,no,no,no,yes,yes,no,yes,yes,3,3,2,2,2,5,0,7,6,0
GP,M,15,R,GT3,T,3,2,other,other,course,mother,2,2,2,yes,yes,no,no,yes,yes,yes,yes,4,4,4,5,4,3,6,5,9,7
GP,M,16,U,LE3,T,1,2,other,other,course,mother,2,1,1,no,no,no,yes,yes,yes,no,no,4,4,4,5,4,5,0,7,0,0
GP,M,17,U,GT3,T,1,3,at_home,services,course,father,1,1,0,no,no,no,no,yes,no,yes,no,5,3,3,1,4,2,2,10,10,10
GP,M,17,R,LE3,T,1,1,other,services,course,mother,4,2,3,no,no,no,yes,yes,no,no,yes,5,3,5,1,5,5,0,5,8,7
GP,M,16,U,GT3,T,3,2,services,services,course,mother,2,1,1,no,yes,no,yes,no,no,no,no,4,5,2,1,1,2,16,12,11,12
GP,M,16,U,GT3,T,2,2,other,other,course,father,1,2,0,no,no,no,no,yes,no,yes,no,4,3,5,2,4,4,4,10,10,10
GP,F,16,U,GT3,T,4,2,health,services,home,father,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,2,3,1,1,3,0,14,15,16
GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,1,5,1,1,4,0,6,7,0
GP,F,16,U,GT3,T,4,4,health,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,2,1,1,3,0,14,14,14
GP,M,16,U,GT3,T,3,4,other,other,course,father,3,1,2,no,yes,no,yes,no,yes,yes,no,3,4,5,5,4,2,0,6,5,0
GP,M,16,U,GT3,T,1,0,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,1,3,2,13,15,16
GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,3,5,0,13,11,10
GP,F,16,U,GT3,T,1,3,at_home,services,home,mother,1,2,3,no,no,no,yes,no,yes,yes,yes,4,3,5,1,1,3,0,8,7,0
GP,F,16,U,LE3,T,3,3,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,5,1,1,4,4,10,11,9
GP,M,17,U,LE3,T,4,3,teacher,other,course,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,4,4,4,4,4,4,10,9,9
GP,F,16,U,GT3,T,2,2,services,other,reputation,mother,2,2,0,no,no,yes,yes,no,yes,yes,no,3,4,4,1,4,5,2,13,13,11
GP,M,17,U,GT3,T,3,3,other,other,reputation,father,1,2,0,no,no,no,yes,no,yes,yes,no,4,3,4,1,4,4,4,6,5,6
GP,M,16,R,GT3,T,4,2,teacher,services,other,mother,1,1,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,3,4,3,10,10,8,9
GP,M,17,U,GT3,T,4,3,other,other,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,2,3,1,1,2,4,10,10,11
GP,M,16,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,3,4,3,5,3,3,10,9,8,8
GP,M,16,U,GT3,T,3,3,services,other,home,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,3,2,12,13,12
GP,F,17,U,GT3,T,2,4,services,services,reputation,father,1,2,0,no,yes,no,yes,yes,yes,no,no,5,4,2,5,3,5,0,16,17,17
GP,F,17,U,LE3,T,3,3,other,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,3,3,2,3,1,56,9,9,8
GP,F,16,U,GT3,T,3,2,other,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,1,2,2,1,2,1,14,12,13,12
GP,M,17,U,GT3,T,3,3,services,services,other,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,3,4,2,3,4,12,12,12,11
GP,M,16,U,GT3,T,1,2,services,services,other,mother,1,1,0,no,yes,yes,yes,yes,yes,yes,yes,3,3,3,1,2,3,2,11,12,11
GP,M,16,U,LE3,T,2,1,other,other,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,5,0,15,15,15
GP,F,17,U,GT3,A,3,3,health,other,reputation,mother,1,2,0,no,yes,no,no,no,yes,yes,yes,3,3,3,1,3,3,6,8,7,9
GP,M,17,R,GT3,T,1,2,at_home,other,home,mother,1,2,0,no,no,no,no,yes,yes,no,no,3,1,3,1,5,3,4,8,9,10
GP,F,16,U,GT3,T,2,3,services,services,course,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,2,10,11,12,13
GP,F,17,U,GT3,T,1,1,at_home,services,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,3,1,1,3,0,8,8,9
GP,M,17,U,GT3,T,1,2,at_home,services,other,other,2,2,0,no,no,yes,yes,no,yes,yes,no,4,4,4,4,5,5,12,7,8,8
GP,M,16,R,GT3,T,3,3,services,services,reputation,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,4,3,2,3,4,5,8,8,9,10
GP,M,16,U,GT3,T,2,3,other,other,home,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,3,3,1,1,3,0,13,14,14
GP,F,17,U,LE3,T,2,4,services,services,course,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,2,1,1,5,0,14,15,15
GP,M,17,U,GT3,T,4,4,services,teacher,home,mother,1,1,0,no,no,no,no,yes,yes,yes,no,5,2,3,1,2,5,4,17,15,16
GP,M,16,R,LE3,T,3,3,teacher,other,home,father,3,1,0,no,yes,yes,yes,yes,yes,yes,no,3,3,4,3,5,3,8,9,9,10
GP,F,17,U,GT3,T,4,4,services,teacher,home,mother,2,1,1,no,yes,no,no,yes,yes,yes,no,4,2,4,2,3,2,24,18,18,18
GP,F,16,U,LE3,T,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,2,1,2,3,0,9,9,10
GP,F,16,U,GT3,T,4,3,health,other,home,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,5,1,5,2,2,16,16,16
GP,F,16,U,GT3,T,2,3,other,other,reputation,mother,1,2,0,yes,yes,yes,yes,yes,yes,no,no,4,4,3,1,3,4,6,8,10,10
GP,F,17,U,GT3,T,1,1,other,other,course,mother,1,2,0,no,yes,yes,no,no,yes,no,no,4,4,4,1,3,1,4,9,9,10
GP,F,17,R,GT3,T,2,2,other,other,reputation,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,5,3,2,1,2,3,18,7,6,6
GP,F,16,R,GT3,T,2,2,services,services,reputation,mother,2,4,0,no,yes,yes,yes,no,yes,yes,no,5,3,5,1,1,5,6,10,10,11
GP,F,17,U,GT3,T,3,4,at_home,services,home,mother,1,3,1,no,yes,yes,no,yes,yes,yes,yes,4,4,3,5,4,5,28,10,9,9
GP,F,16,U,GT3,A,3,1,services,other,course,mother,1,2,3,no,yes,yes,no,yes,yes,yes,no,2,3,3,2,2,4,5,7,7,7
GP,F,16,U,GT3,T,4,3,teacher,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,1,3,2,1,1,1,10,11,12,13
GP,F,16,U,GT3,T,1,1,at_home,other,home,mother,2,1,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,4,5,6,9,9,10
GP,F,17,R,GT3,T,4,3,teacher,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,2,1,1,4,6,7,7,7
GP,F,19,U,GT3,T,3,3,other,other,reputation,other,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,3,10,8,8,8
GP,M,17,U,LE3,T,4,4,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,3,5,4,5,3,13,12,12,13
GP,F,16,U,GT3,A,2,2,other,other,reputation,mother,1,2,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,1,1,4,0,12,13,14
GP,M,18,U,GT3,T,2,2,services,other,home,mother,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,4,5,15,6,7,8
GP,F,17,R,LE3,T,4,4,services,other,other,mother,1,1,0,no,yes,yes,no,yes,yes,no,no,5,2,1,1,2,3,12,8,10,10
GP,F,17,U,LE3,T,3,2,other,other,reputation,mother,2,2,0,no,no,yes,no,yes,yes,yes,no,4,4,4,1,3,1,2,14,15,15
GP,F,17,U,GT3,T,4,3,other,other,reputation,mother,1,2,2,no,no,yes,no,yes,yes,yes,yes,3,4,5,5,4,1,22,6,6,4
GP,M,18,U,LE3,T,3,3,services,health,home,father,1,2,1,no,yes,yes,no,yes,yes,yes,no,3,2,4,2,4,4,13,6,6,8
GP,F,17,U,GT3,T,2,3,at_home,other,home,father,2,1,0,no,yes,yes,no,yes,yes,no,no,3,3,3,1,4,3,3,7,7,8
GP,F,17,U,GT3,T,2,2,at_home,at_home,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,1,4,4,9,10,10
GP,F,17,R,GT3,T,2,1,at_home,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,2,5,1,2,5,2,6,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,reputation,mother,1,3,1,no,yes,no,yes,yes,yes,no,yes,4,3,4,1,1,5,0,6,5,0
GP,F,16,U,GT3,T,2,3,services,teacher,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,2,3,1,1,1,3,2,16,16,17
GP,M,18,U,GT3,T,2,2,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,5,5,4,0,12,13,13
GP,F,16,U,GT3,T,4,4,teacher,services,home,mother,1,3,0,no,yes,no,yes,no,yes,yes,no,5,3,2,1,1,5,0,13,13,14
GP,F,18,R,GT3,T,3,1,other,other,reputation,mother,1,2,1,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,1,4,16,9,8,7
GP,F,17,U,GT3,T,3,2,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,3,3,10,16,15,15
GP,M,17,U,LE3,T,2,3,services,services,reputation,father,1,2,0,no,yes,yes,no,no,yes,yes,no,5,3,3,1,3,3,2,12,11,12
GP,M,18,U,LE3,T,2,1,at_home,other,course,mother,4,2,0,yes,yes,yes,yes,yes,yes,yes,yes,4,3,2,4,5,3,14,10,8,9
GP,F,17,U,GT3,A,2,1,other,other,course,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,10,12,10,12
GP,F,17,U,LE3,T,4,3,health,other,reputation,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,14,13,13,14
GP,M,17,R,GT3,T,2,2,other,other,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,5,2,1,1,1,4,11,11,11
GP,M,17,U,GT3,T,4,4,teacher,teacher,reputation,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,yes,4,5,5,1,3,2,14,11,9,9
GP,M,16,U,GT3,T,4,4,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,4,2,4,1,2,14,13,13
GP,M,16,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,4,2,1,1,5,18,9,7,6
GP,M,16,U,GT3,T,3,2,at_home,other,reputation,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,3,2,10,11,9,10
GP,M,17,U,LE3,T,2,2,other,other,home,father,1,2,0,no,no,yes,yes,no,yes,yes,yes,4,4,2,5,5,4,4,14,13,13
GP,F,16,U,GT3,T,2,1,other,other,home,mother,1,1,0,no,no,no,no,yes,yes,yes,yes,4,5,2,1,1,5,20,13,12,12
GP,F,17,R,GT3,T,2,1,at_home,services,course,mother,3,2,0,no,no,no,yes,yes,yes,no,no,2,1,1,1,1,3,2,13,11,11
GP,M,18,U,GT3,T,2,2,other,services,reputation,father,1,2,1,no,no,no,no,yes,no,yes,no,5,5,4,5,5,2,0,7,7,0
GP,M,17,U,LE3,T,4,3,health,other,course,mother,2,2,0,no,no,no,yes,yes,yes,yes,yes,2,5,5,5,4,5,14,12,12,12
GP,M,17,R,LE3,A,4,4,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,2,3,4,2,10,11,12
GP,M,16,U,LE3,T,4,3,teacher,other,course,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,4,5,1,1,3,0,6,0,0
GP,M,16,U,GT3,T,4,4,services,services,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,3,2,1,2,5,0,13,12,12
GP,F,18,U,GT3,T,2,1,other,other,course,other,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,4,4,1,1,3,0,7,0,0
GP,M,16,U,GT3,T,2,1,other,other,course,mother,3,1,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,4,6,18,18,18
GP,M,17,U,GT3,T,2,3,other,other,course,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,2,2,1,1,2,4,12,12,13
GP,M,22,U,GT3,T,3,1,services,services,other,mother,1,1,3,no,no,no,no,no,no,yes,yes,5,4,5,5,5,1,16,6,8,8
GP,M,18,R,LE3,T,3,3,other,services,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8,3,5,5
GP,M,16,U,GT3,T,0,2,other,other,other,mother,1,1,0,no,no,yes,no,no,yes,yes,no,4,3,2,2,4,5,0,13,15,15
GP,M,18,U,GT3,T,3,2,services,other,course,mother,2,1,1,no,no,no,no,yes,no,yes,no,4,4,5,5,4,5,0,6,8,8
GP,M,16,U,GT3,T,3,3,at_home,other,reputation,other,3,2,0,yes,yes,no,no,no,yes,yes,no,5,3,3,1,3,2,6,7,10,10
GP,M,18,U,GT3,T,2,1,services,services,other,mother,1,1,1,no,no,no,no,no,no,yes,no,3,2,5,2,5,5,4,6,9,8
GP,M,16,R,GT3,T,2,1,other,other,course,mother,2,1,0,no,no,no,yes,no,yes,no,no,3,3,2,1,3,3,0,8,9,8
GP,M,17,R,GT3,T,2,1,other,other,course,mother,1,1,0,no,no,no,no,no,yes,yes,no,4,4,2,5,4,5,0,8,12,12
GP,M,17,U,LE3,T,1,1,health,other,course,mother,2,1,1,no,yes,no,yes,yes,yes,yes,no,4,4,4,1,2,5,2,7,9,8
GP,F,17,U,LE3,T,4,2,teacher,services,reputation,mother,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,2,3,1,1,4,6,14,12,13
GP,M,19,U,LE3,A,4,3,services,at_home,reputation,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,1,1,1,1,12,11,11,11
GP,M,18,U,GT3,T,2,1,other,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,2,4,1,2,4,8,15,14,14
GP,F,17,U,LE3,T,2,2,services,services,course,father,1,4,0,no,no,yes,yes,yes,yes,yes,yes,3,4,1,1,1,2,0,10,9,0
GP,F,18,U,GT3,T,4,3,services,other,home,father,1,2,0,no,yes,yes,no,yes,yes,yes,yes,3,1,2,1,3,2,21,17,18,18
GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,3,2,8,8,8
GP,M,18,R,GT3,T,3,2,other,other,course,mother,1,3,0,no,no,no,yes,no,yes,no,no,5,3,2,1,1,3,1,13,12,12
GP,F,17,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,no,no,yes,no,yes,no,no,3,2,3,1,1,4,4,10,9,9
GP,F,18,U,GT3,T,2,2,at_home,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,3,1,1,3,0,9,10,0
GP,M,18,R,LE3,A,3,4,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,5,3,4,1,13,17,17,17
GP,M,17,U,GT3,T,3,1,services,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,4,4,5,4,5,2,9,9,10
GP,F,18,R,GT3,T,4,4,teacher,other,reputation,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,3,4,2,2,4,8,12,10,11
GP,M,18,U,GT3,T,4,2,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,3,5,10,10,9,10
GP,F,18,R,GT3,T,2,1,other,other,reputation,mother,2,2,0,no,yes,no,no,yes,no,yes,yes,4,3,5,1,2,3,0,6,0,0
GP,F,19,U,GT3,T,3,3,other,services,home,other,1,2,2,no,yes,yes,yes,yes,yes,yes,no,4,3,5,3,3,5,15,9,9,9
GP,F,18,U,GT3,T,2,3,other,services,reputation,father,1,4,0,no,yes,yes,yes,yes,yes,yes,yes,4,5,5,5,3,2,4,15,14,14
GP,F,18,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,no,yes,no,no,4,4,3,1,1,3,2,11,11,11
GP,M,17,R,GT3,T,1,2,at_home,at_home,home,mother,1,2,0,no,yes,yes,yes,no,yes,no,yes,3,5,2,2,2,1,2,15,14,14
GP,F,17,U,GT3,T,2,4,at_home,health,reputation,mother,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,1,1,2,10,10,10
GP,F,17,U,LE3,T,2,2,services,other,course,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,12,12,12
GP,F,18,R,GT3,A,3,2,other,services,home,mother,2,2,0,no,no,no,no,no,no,yes,yes,4,1,1,1,1,5,75,10,9,9
GP,M,18,U,GT3,T,4,4,teacher,services,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,3,2,4,1,4,3,22,9,9,9
GP,F,18,U,GT3,T,4,4,health,health,reputation,father,1,2,1,yes,yes,no,yes,yes,yes,yes,yes,2,4,4,1,1,4,15,9,8,8
GP,M,18,U,LE3,T,4,3,teacher,services,course,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,4,2,3,1,2,1,8,10,11,10
GP,M,17,U,LE3,A,4,1,services,other,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,yes,4,5,4,5,4,5,30,8,8,8
GP,M,17,U,LE3,A,3,2,teacher,services,home,mother,1,1,1,no,no,no,no,yes,yes,yes,no,4,4,4,5,4,3,19,11,9,10
GP,F,18,R,LE3,T,1,1,at_home,other,reputation,mother,2,4,0,no,yes,yes,yes,yes,yes,no,no,5,2,2,1,1,3,1,12,12,12
GP,F,18,U,GT3,T,1,1,other,other,home,mother,2,2,0,yes,no,no,yes,yes,yes,yes,no,5,4,4,1,1,4,4,8,9,10
GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,no,yes,yes,no,5,4,5,1,2,5,4,10,9,11
GP,M,17,U,GT3,T,1,1,other,other,reputation,father,1,2,0,no,no,yes,no,no,yes,yes,no,4,3,3,1,2,4,2,12,10,11
GP,F,18,U,GT3,T,2,2,at_home,at_home,other,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,2,5,18,18,19
GP,F,17,U,GT3,T,1,1,services,teacher,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,6,13,12,12
GP,M,18,U,GT3,T,2,1,services,services,reputation,mother,1,3,0,no,no,yes,yes,yes,yes,yes,no,4,2,4,1,3,2,6,15,14,14
GP,M,18,U,LE3,A,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,3,1,1,2,9,15,13,15
GP,M,18,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,4,5,11,12,11,11
GP,F,17,U,GT3,T,4,3,health,services,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,3,0,15,15,15
GP,F,18,U,LE3,T,2,1,services,at_home,reputation,mother,1,2,1,no,no,no,no,yes,yes,yes,yes,5,4,3,1,1,5,12,12,12,13
GP,F,17,R,LE3,T,3,1,services,other,reputation,mother,2,4,0,no,yes,yes,no,yes,yes,no,no,3,1,2,1,1,3,6,18,18,18
GP,M,18,R,LE3,T,3,2,services,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,1,4,8,14,13,14
GP,M,17,U,GT3,T,3,3,health,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,4,3,1,3,5,4,14,12,11
GP,F,19,U,GT3,T,4,4,health,other,reputation,other,2,2,0,no,yes,yes,yes,yes,yes,yes,no,2,3,4,2,3,2,0,10,9,0
GP,F,18,U,LE3,T,4,3,other,other,home,other,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,5,1,2,2,10,10,8,8
GP,F,18,U,GT3,T,4,3,other,other,reputation,father,1,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,14,13,14
GP,M,18,U,LE3,T,4,4,teacher,teacher,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,yes,1,4,2,2,2,1,5,16,15,16
GP,F,18,U,LE3,A,4,4,health,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,yes,4,2,4,1,1,4,14,12,10,11
GP,M,17,U,LE3,T,4,4,other,teacher,home,father,2,1,0,no,no,yes,no,yes,yes,yes,no,4,1,1,2,2,5,0,11,11,10
GP,F,17,U,GT3,T,4,2,other,other,reputation,mother,2,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,15,12,14
GP,F,17,U,GT3,T,3,2,health,health,reputation,father,1,4,0,no,yes,yes,yes,no,yes,yes,no,5,2,2,1,2,5,0,17,17,18
GP,M,19,U,GT3,T,3,3,other,other,home,other,1,2,1,no,yes,no,yes,yes,yes,yes,yes,4,4,4,1,1,3,20,15,14,13
GP,F,18,U,GT3,T,2,4,services,at_home,reputation,other,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,3,1,1,3,8,14,12,12
GP,M,20,U,GT3,A,3,2,services,other,course,other,1,1,0,no,no,no,yes,yes,yes,no,no,5,5,3,1,1,5,0,17,18,18
GP,M,19,U,GT3,T,4,4,teacher,services,reputation,other,2,1,1,no,yes,yes,no,yes,yes,yes,yes,4,3,4,1,1,4,38,8,9,8
GP,M,19,R,GT3,T,3,3,other,services,reputation,father,1,2,1,no,no,no,yes,yes,yes,no,yes,4,5,3,1,2,5,0,15,12,12
GP,F,19,U,LE3,T,1,1,at_home,other,reputation,other,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,3,1,3,3,18,12,10,10
GP,F,19,U,LE3,T,1,2,services,services,home,other,1,2,1,no,no,no,yes,no,yes,no,yes,4,2,4,2,2,3,0,9,9,0
GP,F,19,U,GT3,T,2,1,at_home,other,other,other,3,2,0,no,yes,no,no,yes,no,yes,yes,3,4,1,1,1,2,20,14,12,13
GP,M,19,U,GT3,T,1,2,other,services,course,other,1,2,1,no,no,no,no,no,yes,yes,no,4,5,2,2,2,4,3,13,11,11
GP,F,19,U,LE3,T,3,2,services,other,reputation,other,2,2,1,no,yes,yes,no,no,yes,yes,yes,4,2,2,1,2,1,22,13,10,11
GP,F,19,U,GT3,T,1,1,at_home,health,home,other,1,3,2,no,no,no,no,no,yes,yes,yes,4,1,2,1,1,3,14,15,13,13
GP,F,19,R,GT3,T,2,3,other,other,reputation,other,1,3,1,no,no,no,no,yes,yes,yes,yes,4,1,2,1,1,3,40,13,11,11
GP,F,18,U,GT3,T,2,1,services,other,course,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,2,1,0,8,8,0
GP,F,18,U,GT3,T,4,3,other,other,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,4,1,1,5,9,9,10,9
GP,F,17,R,GT3,T,3,4,at_home,services,course,father,1,3,0,no,yes,yes,yes,no,yes,yes,no,4,3,4,2,5,5,0,11,11,10
GP,F,18,U,GT3,T,4,4,teacher,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,3,3,5,2,11,11,11
GP,F,17,U,GT3,A,4,3,services,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,2,2,1,2,5,23,13,13,13
GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,yes,4,2,2,1,1,3,12,11,9,9
GP,F,17,R,LE3,T,2,2,services,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,3,2,2,2,3,3,11,11,11
GP,F,17,U,GT3,T,3,1,services,services,course,father,1,3,0,no,yes,no,no,no,yes,yes,no,3,4,3,2,3,5,1,12,14,15
GP,F,17,U,LE3,T,0,2,at_home,at_home,home,father,2,3,0,no,no,no,no,yes,yes,yes,no,3,3,3,2,3,2,0,16,15,15
GP,M,18,U,GT3,T,4,4,other,other,course,mother,1,3,0,no,no,no,yes,yes,yes,yes,no,4,3,3,2,2,3,3,9,12,11
GP,M,17,U,GT3,T,3,3,other,services,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,4,3,5,3,5,5,3,14,15,16
GP,M,17,R,GT3,T,2,2,services,other,course,mother,4,1,0,no,yes,no,no,yes,yes,yes,no,4,4,5,5,5,4,8,11,10,10
GP,F,17,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,3,4,7,10,9,9
GP,F,17,U,GT3,T,4,4,teacher,teacher,course,mother,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,3,3,1,2,4,4,14,14,14
GP,M,18,U,LE3,T,2,2,other,other,course,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,5,5,5,4,5,2,9,8,8
GP,F,17,R,GT3,T,2,4,at_home,other,course,father,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,4,3,1,1,5,7,12,14,14
GP,F,18,U,GT3,T,3,3,services,services,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,4,1,1,4,0,7,0,0
GP,F,18,U,LE3,T,2,2,other,other,home,other,1,2,0,no,no,no,yes,no,yes,yes,yes,4,3,3,1,1,2,0,8,8,0
GP,F,18,R,GT3,T,2,2,at_home,other,course,mother,2,4,0,no,no,no,yes,yes,yes,no,no,4,4,4,1,1,4,0,10,9,0
GP,F,17,U,GT3,T,3,4,services,other,course,mother,1,3,0,no,no,no,no,yes,yes,yes,no,4,4,5,1,3,5,16,16,15,15
GP,F,19,R,GT3,A,3,1,services,at_home,home,other,1,3,1,no,no,yes,no,yes,yes,no,no,5,4,3,1,2,5,12,14,13,13
GP,F,17,U,GT3,T,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,2,2,3,2,0,7,8,0
GP,F,18,U,LE3,T,3,3,services,services,home,mother,1,4,0,no,yes,no,no,yes,yes,yes,no,5,3,3,1,1,1,7,16,15,17
GP,F,17,R,GT3,A,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,2,4,9,10,10
GP,F,19,U,GT3,T,2,1,services,services,home,other,1,3,1,no,no,yes,yes,yes,yes,yes,yes,4,3,4,1,3,3,4,11,12,11
GP,M,18,U,GT3,T,4,4,teacher,services,home,father,1,2,1,no,yes,no,yes,yes,yes,yes,no,4,3,3,2,2,2,0,10,10,0
GP,M,18,U,LE3,T,3,4,services,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,3,1,3,5,11,16,15,15
GP,F,17,U,GT3,A,2,2,at_home,at_home,home,father,1,2,1,no,yes,no,no,yes,yes,yes,yes,3,3,1,1,2,4,0,9,8,0
GP,F,18,U,GT3,T,2,3,at_home,other,course,mother,1,3,0,no,yes,no,no,yes,yes,yes,no,4,3,3,1,2,3,4,11,10,10
GP,F,18,U,GT3,T,3,2,other,services,other,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,4,3,2,3,1,7,13,13,14
GP,M,18,R,GT3,T,4,3,teacher,services,course,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,3,2,1,2,4,9,16,15,16
GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,5,4,5,2,3,5,0,10,10,9
GP,F,17,U,GT3,T,4,3,health,other,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,1,3,4,0,13,15,15
MS,M,18,R,GT3,T,3,2,other,other,course,mother,2,1,1,no,yes,no,no,no,yes,yes,no,2,5,5,5,5,5,10,11,13,13
MS,M,19,R,GT3,T,1,1,other,services,home,other,3,2,3,no,no,no,no,yes,yes,yes,no,5,4,4,3,3,2,8,8,7,8
MS,M,17,U,GT3,T,3,3,health,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,4,5,3,3,2,13,13,13
MS,M,18,U,LE3,T,1,3,at_home,services,course,mother,1,1,1,no,no,no,no,yes,no,yes,yes,4,3,3,2,3,3,7,8,7,8
MS,M,19,R,GT3,T,1,1,other,other,home,other,3,1,1,no,yes,no,no,yes,yes,yes,no,4,4,4,3,3,5,4,8,8,8
MS,M,17,R,GT3,T,4,3,services,other,home,mother,2,2,0,no,yes,yes,yes,no,yes,yes,yes,4,5,5,4,5,2,4,13,11,11
MS,F,18,U,GT3,T,3,3,services,services,course,father,1,2,0,no,yes,no,no,yes,yes,no,yes,5,3,4,1,1,5,0,10,9,9
MS,F,17,R,GT3,T,4,4,teacher,services,other,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,5,4,12,13,13
MS,F,17,U,LE3,A,3,2,services,other,reputation,mother,2,2,0,no,no,no,no,yes,yes,no,yes,1,2,3,1,2,5,2,12,12,11
MS,M,18,U,LE3,T,1,1,other,services,home,father,2,1,0,no,no,no,no,no,yes,yes,yes,3,3,2,1,2,3,4,10,10,10
MS,F,18,U,LE3,T,1,1,at_home,services,course,father,2,3,0,no,no,no,no,yes,yes,yes,no,5,3,2,1,1,4,0,18,16,16
MS,F,18,R,LE3,A,1,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,yes,4,3,4,1,4,5,0,13,13,13
MS,M,18,R,LE3,T,1,1,at_home,other,other,mother,2,2,1,no,no,no,yes,no,no,no,no,4,4,3,2,3,5,2,13,12,12
MS,F,18,U,GT3,T,3,3,services,services,other,mother,2,2,0,no,yes,no,no,yes,yes,yes,yes,4,3,2,1,3,3,0,11,11,10
MS,F,17,U,LE3,T,4,4,at_home,at_home,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,2,3,4,1,1,1,0,16,15,15
MS,F,17,R,GT3,T,1,2,other,services,course,father,2,2,0,no,no,no,no,no,yes,no,no,3,2,2,1,2,3,0,12,11,12
MS,M,18,R,GT3,T,1,3,at_home,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,no,no,3,3,4,2,4,3,4,10,10,10
MS,M,18,U,LE3,T,4,4,teacher,services,other,mother,2,3,0,no,no,yes,no,yes,yes,yes,yes,4,2,2,2,2,5,0,13,13,13
MS,F,17,R,GT3,T,1,1,other,services,reputation,mother,3,1,1,no,yes,yes,no,yes,yes,yes,yes,5,2,1,1,2,1,0,7,6,0
MS,F,18,U,GT3,T,2,3,at_home,services,course,father,2,1,0,no,yes,yes,no,yes,yes,yes,yes,5,2,3,1,2,4,0,11,10,10
MS,F,18,R,GT3,T,4,4,other,teacher,other,father,3,2,0,no,yes,yes,no,no,yes,yes,yes,3,2,2,4,2,5,10,14,12,11
MS,F,19,U,LE3,T,3,2,services,services,home,other,2,2,2,no,no,no,yes,yes,yes,no,yes,3,2,2,1,1,3,4,7,7,9
MS,M,18,R,LE3,T,1,2,at_home,services,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3,14,12,12
MS,F,17,U,GT3,T,2,2,other,at_home,home,mother,1,3,0,no,no,no,yes,yes,yes,no,yes,3,4,3,1,1,3,8,13,11,11
MS,F,17,R,GT3,T,1,2,other,other,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,3,5,5,4,5,1,14,6,5,5
MS,F,18,R,LE3,T,4,4,other,other,reputation,mother,2,3,0,no,no,no,no,yes,yes,yes,no,5,4,4,1,1,1,0,19,18,19
MS,F,18,R,GT3,T,1,1,other,other,home,mother,4,3,0,no,no,no,no,yes,yes,yes,no,4,3,2,1,2,4,2,8,8,10
MS,F,20,U,GT3,T,4,2,health,other,course,other,2,3,2,no,yes,yes,no,no,yes,yes,yes,5,4,3,1,1,3,4,15,14,15
MS,F,18,R,LE3,T,4,4,teacher,services,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,no,5,4,3,3,4,2,4,8,9,10
MS,F,18,U,GT3,T,3,3,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,1,3,1,2,1,0,15,15,15
MS,F,17,R,GT3,T,3,1,at_home,other,reputation,mother,1,2,0,no,yes,yes,yes,no,yes,yes,no,4,5,4,4,5,1,17,10,10,10
MS,M,18,U,GT3,T,4,4,teacher,teacher,home,father,1,2,0,no,no,yes,yes,no,yes,yes,no,3,2,4,1,4,2,4,15,14,14
MS,M,18,R,GT3,T,2,1,other,other,other,mother,2,1,0,no,no,no,yes,no,yes,yes,yes,4,4,3,1,3,5,5,7,6,7
MS,M,17,U,GT3,T,2,3,other,services,home,father,2,2,0,no,no,no,yes,yes,yes,yes,no,4,4,3,1,1,3,2,11,11,10
MS,M,19,R,GT3,T,1,1,other,services,other,mother,2,1,1,no,no,no,no,yes,yes,no,no,4,3,2,1,3,5,0,6,5,0
MS,M,18,R,GT3,T,4,2,other,other,home,father,2,1,1,no,no,yes,no,yes,yes,no,no,5,4,3,4,3,3,14,6,5,5
MS,F,18,R,GT3,T,2,2,at_home,other,other,mother,2,3,0,no,no,yes,no,yes,yes,no,no,5,3,3,1,3,4,2,10,9,10
MS,F,18,R,GT3,T,4,4,teacher,at_home,reputation,mother,3,1,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,2,2,5,7,6,5,6
MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,0,7,5,0
MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,4,1,1,1,0,7,9,8
MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,1,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,0,6,5,0
MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,5,5,3,3,10,8,7
MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10
MS,M,19,U,LE3,T,1,1,other,at_home,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,3,3,3,5,5,8,9,9
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Luke Chang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore solutions
2 | Solutions/*
3 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/Notebooks/1_Introduction_to_Programming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction to programming\n",
8 | "_Written by Luke Chang_\n",
9 | "\n",
10 | "In this notebook we will begin to learn how to use Python. There are many different ways to install Python, but we recommend starting using Anaconda which is preconfigured for scientific computing. Start with installing [Python 3.7](https://www.anaconda.com/distribution/). For those who prefer a more configurable IDE, [Pycharm](https://www.jetbrains.com/pycharm/) is a nice option. Python is a modular interpreted language with an intuitive minimal syntax that is quickly becoming one of the most popular languages for [conducting research](http://www.talyarkoni.org/blog/2013/11/18/the-homogenization-of-scientific-computing-or-why-python-is-steadily-eating-other-languages-lunch/). You can use python for [stimulus presentation](http://www.psychopy.org/), [data analysis](http://statsmodels.sourceforge.net/), [machine-learning](http://scikit-learn.org/stable/), [scraping data](https://www.crummy.com/software/BeautifulSoup/), creating websites with [flask](http://flask.pocoo.org/) or [django](https://www.djangoproject.com/), or [neuroimaging data analysis](http://nipy.org/).\n",
11 | "\n",
12 | "There are lots of free useful resources to learn how to use python and various modules. See Yaroslav Halchenko's excellent [Dartmouth course](https://github.com/dartmouth-pbs/psyc161). [Codeacademy](https://www.codecademy.com/) is a great interactive tutorial. [Stack Overflow](http://stackoverflow.com/) is an incredibly useful resource for asking specific questions and seeing responses to others that have been rated by the development community. "
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Jupyter Notebooks\n",
20 | "We will primarily be using [Jupyter Notebooks](http://jupyter.org/) to interface with Python. A Jupyter notebook consists of **cells**. The two main types of cells you will use are code cells and markdown cells.\n",
21 | "\n",
 22 |     "A **_code cell_** contains actual code that you want to run. You can specify a cell as a code cell using the pulldown menu in the toolbar in your Jupyter notebook. Otherwise, you can hit esc and then y (denoted \"esc, y\") while a cell is selected to specify that it is a code cell. Note that you will have to hit enter after doing this to start editing it.\n",
 23 |     "If you want to execute the code in a code cell, hit \"shift + enter.\" Note that code cells are executed in the order you execute them. That is to say, the ordering of the cells for which you hit \"shift + enter\" is the order in which the code is executed. If you did not explicitly execute a cell early in the document, its results are not known to the Python interpreter.\n",
24 | "\n",
25 | "**_Markdown cells_** contain text. The text is written in markdown, a lightweight markup language. You can read about its syntax [here](http://daringfireball.net/projects/markdown/syntax). Note that you can also insert HTML into markdown cells, and this will be rendered properly. As you are typing the contents of these cells, the results appear as text. Hitting \"shift + enter\" renders the text in the formatting you specify. You can specify a cell as being a markdown cell in the Jupyter toolbar, or by hitting \"esc, m\" in the cell. Again, you have to hit enter after using the quick keys to bring the cell into edit mode.\n",
26 | "\n",
27 | "In general, when you want to add a new cell, you can use the \"Insert\" pulldown menu from the Jupyter toolbar. The shortcut to insert a cell below is \"esc, b\" and to insert a cell above is \"esc, a.\" Alternatively, you can execute a cell and automatically add a new one below it by hitting \"alt + enter.\""
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "print(\"Hello World\")"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Package Management\n",
 44 |     "Package management in Python has been dramatically improving. Anaconda has its own package manager called 'conda'. Use this if you would like to install a new module as it is optimized to work with anaconda. \n",
45 | "\n",
46 | "```\n",
47 | "!conda install *package*\n",
48 | "```\n",
49 | "\n",
50 | "However, sometimes conda doesn't have a particular package. In this case use the default python package manager called 'pip'. \n",
51 | "\n",
52 | "These commands can be run in your unix terminal or you can send them to the shell from a Jupyter notebook by starting the line with ```!```\n",
53 | "\n",
54 | "It is easy to get help on how to use the package managers\n",
55 | "\n",
56 | "```\n",
57 | "!pip help install\n",
58 | "```"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "!pip help install"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "!pip list --outdated"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "!pip install setuptools --upgrade"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## Variables\n",
93 | "* Built-in\n",
94 | " * Numeric types:\n",
95 | " * **int**, **float**, **long**, complex\n",
96 | " * **str**ing \n",
97 | " * **bool**ean\n",
98 | " * True / False\n",
99 | " * **NoneType**\n",
100 | "* User defined\n",
101 | "\n",
102 | "* Use the type() function to find the type for a value or variable\n",
103 | "\n",
104 | "* Data can be converted using cast commands"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Integer\n",
114 | "a = 1\n",
115 | "print(type(a))\n",
116 | "\n",
117 | "# Float\n",
118 | "b = 1.0\n",
119 | "print(type(b))\n",
120 | "\n",
121 | "# String\n",
122 | "c = 'hello'\n",
123 | "print(type(c))\n",
124 | "\n",
125 | "# Boolean\n",
126 | "d = True\n",
127 | "print(type(d))\n",
128 | "\n",
129 | "# None\n",
130 | "e = None\n",
131 | "print(type(e))\n",
132 | "\n",
133 | "# Cast integer to string\n",
134 | "print(type(str(a)))"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "## Math Operators\n",
142 | "* +, -, *, and /\n",
143 | "* Exponentiation **\n",
144 | "* Modulo %\n",
145 | "\n",
 146 |     "* Note that division with integers in Python 2.7 automatically floors (rounds toward negative infinity), which may not be intended. It is recommended to import the division module from python3 `from __future__ import division`"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {
153 | "ExecuteTime": {
154 | "end_time": "2019-03-27T14:34:18.317397Z",
155 | "start_time": "2019-03-27T14:34:18.312483Z"
156 | }
157 | },
158 | "outputs": [],
159 | "source": [
160 | "# Addition\n",
161 | "a = 2 + 7\n",
162 | "print(a)\n",
163 | "\n",
164 | "# Subtraction\n",
165 | "b = a - 5\n",
166 | "print(b)\n",
167 | "\n",
168 | "# Multiplication\n",
169 | "print(b*2)\n",
170 | "\n",
171 | "# Exponentiation\n",
172 | "print(b**2)\n",
173 | "\n",
174 | "# Modulo\n",
175 | "print(4%9)\n",
176 | "\n",
177 | "# Division\n",
178 | "print(4/9)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## String Operators\n",
186 | "* Some of the arithmetic operators also have meaning for strings. E.g. for string concatenation use `+` sign\n",
187 | "* String repetition: Use `*` sign with a number of repetitions"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "# Combine string\n",
197 | "a = 'Hello'\n",
198 | "b = 'World'\n",
199 | "print(a + b)\n",
200 | "\n",
201 | "# Repeat String\n",
202 | "print(a*5)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Logical Operators\n",
210 | "Perform a logical comparison and return a Boolean value\n",
211 | "\n",
212 | "```python\n",
213 | "x == y # x is equal to y\n",
214 | "x != y # x is not equal to y\n",
215 | "x > y # x is greater than y\n",
216 | "x < y # x is less than y\n",
217 | "x >= y # x is greater than or equal to y \n",
218 | "x <= y # x is less than or equal to y```\n",
219 | "\n",
220 | "| X | not X | \n",
221 | "|-------|--------|\n",
222 | "| True | False | \n",
223 | "| False | True | \n",
224 | "\n",
225 | "| X | Y | X AND Y | X OR Y | \n",
226 | "|------|------|---------|--------|\n",
227 | "|True | True | True | True | \n",
228 | "|True | False| False | True | \n",
229 | "|False | True | False | True | \n",
230 | "|False | False| False | False | "
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "# Works for string\n",
240 | "a = 'hello'\n",
241 | "b = 'world'\n",
242 | "c = 'Hello'\n",
243 | "print(a==b)\n",
244 | "print(a==c)\n",
245 | "print(a!=b)\n",
246 | "\n",
247 | "# Works for numeric\n",
248 | "d = 5\n",
249 | "e = 8\n",
250 | "print(d < e)"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Conditional Logic (if...)\n",
258 | "Unlike most other languages, Python uses indentation rather than closing statements (e.g., end) to delimit blocks.\n",
259 | "\n",
260 | "* Syntax:\n",
261 | "\n",
262 | "``` python \n",
263 | "if condition: \n",
264 | " do something\n",
265 | "```\n",
266 | "\n",
267 | "* Implicit conversion of the value to bool() happens if `condition` is of a different type than **bool**, thus all of the following should work:\n",
268 | "\n",
269 | "```python\n",
270 | "if condition:\n",
271 | " do_something\n",
272 | "elif condition:\n",
273 | " do_alternative1\n",
274 | "else:\n",
275 | " do_otherwise # often reserved to report an error\n",
276 | " # after a long list of options\n",
277 | " ```\n"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "n = 1\n",
287 | "\n",
288 | "if n:\n",
289 | " print(\"n is non-0\")\n",
290 | "\n",
291 | "if n is None:\n",
292 | " print(\"n is None\")\n",
293 | " \n",
294 | "if n is not None:\n",
295 | " print(\"n is not None\")"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "## Loops\n",
303 | "* **for** loop is probably the most popular loop construct in Python:\n",
304 | "\n",
305 | "```python\n",
306 | "for target in sequence:\n",
307 | " do_statements\n",
308 | "```\n",
309 | "\n",
310 | "* However, it's also possible to use a **while** loop to repeat statements while `condition` remains True:\n",
311 | "\n",
312 | "```python\n",
313 | "while condition:\n",
314 | " do_statements\n",
315 | "```"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "ExecuteTime": {
323 | "end_time": "2019-03-27T14:35:32.341248Z",
324 | "start_time": "2019-03-27T14:35:32.336945Z"
325 | }
326 | },
327 | "outputs": [],
328 | "source": [
329 | "string = \"Python is going to make conducting research easier\"\n",
330 | "for c in string:\n",
331 | " print(c)\n"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "ExecuteTime": {
339 | "end_time": "2019-03-27T14:35:49.426964Z",
340 | "start_time": "2019-03-27T14:35:49.421669Z"
341 | }
342 | },
343 | "outputs": [],
344 | "source": [
345 | "x = 0\n",
346 | "end = 10\n",
347 | "\n",
348 | "csum = 0\n",
349 | "while x < end:\n",
350 | " csum += x\n",
351 | " print(x, csum)\n",
352 | " x += 1\n",
353 | "print(\"Exited with x==%d\" % x )"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Functions\n",
361 | "A **function** is a named sequence of statements that performs a computation. You define the function by giving it a name, specify a sequence of statements, and optionally values to return. Later, you can “call” the function by name. \n",
362 | "```python\n",
363 | "def make_upper_case(text):\n",
364 | " return (text.upper())\n",
365 | "```\n",
366 | "* The expression in the parenthesis is the **argument**. \n",
367 | "* It is common to say that a function **“takes” an argument** and **“returns” a result**.\n",
368 | "* The result is called the **return value**.\n",
369 | "\n",
370 | "The first line of the function definition is called the **header**; the rest is called the **body**.\n",
371 | "\n",
372 | "The header has to end with a colon and the body has to be indented.\n",
373 | "It is a common practice to use 4 spaces for indentation, and to avoid mixing with tabs.\n",
374 | "\n",
375 | "A function body in Python ends whenever a statement begins at the original level of indentation. There is no **end** or any other keyword to signal the end of a function. Indentation is part of the language syntax in Python, making it more readable and less cluttered."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {
382 | "ExecuteTime": {
383 | "end_time": "2019-03-27T14:35:59.444165Z",
384 | "start_time": "2019-03-27T14:35:59.440980Z"
385 | }
386 | },
387 | "outputs": [],
388 | "source": [
389 | "def make_upper_case(text):\n",
390 | " return (text.upper())\n",
391 | "\n",
392 | "string = \"Python is going to make conducting research easier\"\n",
393 | "\n",
394 | "print(make_upper_case(string))"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "## Python Containers\n",
402 | "There are 4 main types of builtin containers for storing data in Python:\n",
403 | "* list\n",
404 | "* tuple\n",
405 | "* dict\n",
406 | "* set"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "### Lists\n",
414 | "In Python, a list is a mutable sequence of values. Mutable means that we can change separate entries within a list. For a more in depth tutorial on lists look [here](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/02d-Python-Fundamentals-Containers-Lists.ipynb)\n",
415 | "\n",
416 | "* Each value in the list is an element or item\n",
417 | "* Elements can be any Python data type\n",
418 | "* Lists can mix data types\n",
419 | "\n",
420 | "* Lists are initialized with ```[]``` or ```list()```\n",
421 | "```python\n",
422 | "l = [1,2,3]\n",
423 | "```\n",
424 | "* \n",
425 | "Elements within a list are indexed (**starting with 0**)\n",
426 | "```python\n",
427 | "l[0]\n",
428 | "```\n",
429 | "\n",
430 | "* \n",
431 | "Elements can be nested lists\n",
432 | "```python\n",
433 | "nested = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
434 | "```\n",
435 | "\n",
436 | "* \n",
437 | "Lists can be *sliced*.\n",
438 | "```python\n",
439 | "l[start:stop:stride]\n",
440 | "```\n",
441 | "\n",
442 | "* Like all python containers, lists have many useful methods that can be applied\n",
443 | "```python\n",
444 | "a.insert(index,new element)\n",
445 | "a.append(element to add at end)\n",
446 | "len(a)\n",
447 | "```\n",
448 | "\n",
449 | "* \n",
450 | "List comprehension is a *Very* powerful technique allowing for efficient construction of new lists.\n",
451 | "```python\n",
452 | "[a for a in l]\n",
453 | "```\n"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {
460 | "ExecuteTime": {
461 | "end_time": "2019-03-27T14:36:30.240853Z",
462 | "start_time": "2019-03-27T14:36:30.236249Z"
463 | }
464 | },
465 | "outputs": [],
466 | "source": [
467 | "# Indexing and Slicing\n",
468 | "a = ['lists','are','arrays']\n",
469 | "print(a[0])\n",
470 | "print(a[1:3])\n",
471 | "\n",
472 | "# List methods\n",
473 | "a.insert(2,'python')\n",
474 | "a.append('.')\n",
475 | "print(a)\n",
476 | "print(len(a))\n",
477 | "\n",
478 | "# List Comprehension\n",
479 | "print([x.upper() for x in a])"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "### Dictionaries\n",
487 | "\n",
488 | "* In Python, a dictionary (or `dict`) is a mapping between a set of\n",
489 | "indices (**keys**) and a set of **values**\n",
490 | "\n",
491 | "* The items in a dictionary are key-value pairs\n",
492 | "\n",
493 | "* Keys can be any Python data type\n",
494 | "\n",
495 | "* Dictionaries are unordered\n",
496 | "\n",
497 | "* Here is a more indepth tutorial on [dictionaries](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03c-Python-Fundamentals-Containers-Dicts.ipynb)"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {
504 | "ExecuteTime": {
505 | "end_time": "2019-03-27T14:36:42.674472Z",
506 | "start_time": "2019-03-27T14:36:42.670435Z"
507 | }
508 | },
509 | "outputs": [],
510 | "source": [
511 | "# Dictionaries\n",
512 | "eng2sp = {}\n",
513 | "eng2sp['one'] = 'uno'\n",
514 | "print(eng2sp)\n",
515 | "\n",
516 | "eng2sp = {'one': 'uno', 'two': 'dos', 'three': 'tres'}\n",
517 | "print(eng2sp)\n",
518 | "\n",
519 | "print(eng2sp.keys())\n",
520 | "print(eng2sp.values())"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "### Tuples\n",
528 | "In Python, a **tuple** is an immutable sequence of values, meaning they can't be changed\n",
529 | "\n",
530 | "* Each value in the tuple is an element or item\n",
531 | "\n",
532 | "* Elements can be any Python data type\n",
533 | "\n",
534 | "* Tuples can mix data types\n",
535 | "\n",
536 | "* Elements can be nested tuples\n",
537 | "\n",
538 | "* **Essentially tuples are immutable lists**\n",
539 | "\n",
540 | "Here is a nice tutorial on [tuples](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03b-Python-Fundamentals-Containers-Tuples.ipynb)"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {
547 | "ExecuteTime": {
548 | "end_time": "2019-03-27T14:36:51.403867Z",
549 | "start_time": "2019-03-27T14:36:51.400160Z"
550 | }
551 | },
552 | "outputs": [],
553 | "source": [
554 | "numbers = (1, 2, 3, 4)\n",
555 | "print(numbers)\n",
556 | "\n",
557 | "t2 = 1, 2\n",
558 | "print(t2)"
559 | ]
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "metadata": {},
564 | "source": [
565 | "### Sets\n",
566 | "In Python, a `set` is an efficient storage for \"membership\" checking\n",
567 | "\n",
568 | "* `set` is like a `dict` but only with keys and without values\n",
569 | "\n",
570 | "* a `set` can also perform set operations (e.g., union, intersection)\n",
571 | "\n",
572 | "* Here is more info on [sets](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03d-Python-Fundamentals-Containers-Sets.ipynb)\n"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": null,
578 | "metadata": {
579 | "ExecuteTime": {
580 | "end_time": "2019-03-27T14:37:02.025425Z",
581 | "start_time": "2019-03-27T14:37:02.021137Z"
582 | }
583 | },
584 | "outputs": [],
585 | "source": [
586 | "# Union\n",
587 | "print({1, 2, 3, 'mom', 'dad'} | {2, 3, 10})\n",
588 | "\n",
589 | "# Intersection\n",
590 | "print({1, 2, 3, 'mom', 'dad'} & {2, 3, 10})\n",
591 | "\n",
592 | "# Difference\n",
593 | "print({1, 2, 3, 'mom', 'dad'} - {2, 3, 10})"
594 | ]
595 | },
596 | {
597 | "cell_type": "markdown",
598 | "metadata": {},
599 | "source": [
600 | "## Modules\n",
601 | "A *Module* is a python file that contains a collection of related definitions. Python has *hundreds* of standard modules. These are organized into what is known as the [Python Standard Library](http://docs.python.org/library/). You can also create and use your own modules. To use functionality from a module, you first have to import the entire module or parts of it into your namespace\n",
602 | "\n",
603 | "* To import the entire module, use \n",
604 | "\n",
605 | " ```python\n",
606 | " import module_name```\n",
607 | "\n",
608 | "* You can also import a module using a specific name \n",
609 | "\n",
610 | " ```python\n",
611 | " import module_name as new_module_name```\n",
612 | "\n",
613 | "* To import specific definitions (e.g. functions, variables, etc) from the module into your local namespace, use\n",
614 | "\n",
615 | "```python\n",
616 | "from module_name import name1, name2\n",
617 | "```\n",
618 | " which will make those available directly in your ```namespace```"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": null,
624 | "metadata": {
625 | "ExecuteTime": {
626 | "end_time": "2017-01-09T09:18:26.329329",
627 | "start_time": "2017-01-09T09:18:26.324573"
628 | }
629 | },
630 | "outputs": [],
631 | "source": [
632 | "import os\n",
633 | "from glob import glob"
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "Here let's try and get the path of the current working directory using functions from the `os` module"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": null,
646 | "metadata": {},
647 | "outputs": [],
648 | "source": [
649 | "os.path.abspath(os.path.curdir)"
650 | ]
651 | },
652 | {
653 | "cell_type": "markdown",
654 | "metadata": {},
655 | "source": [
656 | "It looks like we are currently in the notebooks folder of the github repository. Let's use glob, a pattern matching function, to list all of the csv files in the Data folder. "
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {
663 | "ExecuteTime": {
664 | "end_time": "2019-03-27T14:37:10.021529Z",
665 | "start_time": "2019-03-27T14:37:09.940612Z"
666 | }
667 | },
668 | "outputs": [],
669 | "source": [
670 | "data_file_list = glob(os.path.join('..','Data','*csv'))\n",
671 | "print(data_file_list)"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "This gives us a list of the files including the relative path from the current directory. What if we wanted just the filenames? There are several different ways to do this. First, we can use the `os.path.basename` function. We loop over every file, grab the base file name and then append it to a new list. "
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": null,
684 | "metadata": {
685 | "ExecuteTime": {
686 | "end_time": "2017-01-09T09:26:35.521842",
687 | "start_time": "2017-01-09T09:26:35.517653"
688 | }
689 | },
690 | "outputs": [],
691 | "source": [
692 | "file_list = []\n",
693 | "for f in data_file_list:\n",
694 | " file_list.append(os.path.basename(f))\n",
695 | "\n",
696 | "print(file_list)"
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "metadata": {},
702 | "source": [
703 | "Alternatively, we could loop over all files and split on the `/` character. This will create a new list where each element is whatever characters are separated by the splitting character. We can then take the last element of each list."
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": null,
709 | "metadata": {
710 | "ExecuteTime": {
711 | "end_time": "2017-01-09T09:27:57.077454",
712 | "start_time": "2017-01-09T09:27:57.073701"
713 | }
714 | },
715 | "outputs": [],
716 | "source": [
717 | "file_list = []\n",
718 | "for f in data_file_list:\n",
719 | " file_list.append(f.split('/')[-1])\n",
720 | "\n",
721 | "print(file_list)"
722 | ]
723 | },
724 | {
725 | "cell_type": "markdown",
726 | "metadata": {},
727 | "source": [
728 | "It is also sometimes even cleaner to do this as a list comprehension"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {
735 | "ExecuteTime": {
736 | "end_time": "2017-01-09T09:28:39.734003",
737 | "start_time": "2017-01-09T09:28:39.729813"
738 | }
739 | },
740 | "outputs": [],
741 | "source": [
742 | "[os.path.basename(x) for x in data_file_list]"
743 | ]
744 | },
745 | {
746 | "cell_type": "markdown",
747 | "metadata": {},
748 | "source": [
749 | "# Exercises"
750 | ]
751 | },
752 | {
753 | "cell_type": "markdown",
754 | "metadata": {},
755 | "source": [
756 | "### Find Even Numbers\n",
757 | "Let’s say I give you a list saved in a variable: a = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]. Make a new list that has only the even elements of this list in it."
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {},
764 | "outputs": [],
765 | "source": []
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "### Find Maximal Range\n",
772 | "Given an array length 1 or more of ints, return the difference between the largest and smallest values in the array. "
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": null,
778 | "metadata": {},
779 | "outputs": [],
780 | "source": []
781 | },
782 | {
783 | "cell_type": "markdown",
784 | "metadata": {},
785 | "source": [
786 | "### Duplicated Numbers\n",
787 | "Find the numbers in list a that are also in list b\n",
788 | "\n",
789 | "a = [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361]\n",
790 | "\n",
791 | "b = [0, 4, 16, 36, 64, 100, 144, 196, 256, 324]"
792 | ]
793 | },
794 | {
795 | "cell_type": "code",
796 | "execution_count": null,
797 | "metadata": {},
798 | "outputs": [],
799 | "source": []
800 | },
801 | {
802 | "cell_type": "markdown",
803 | "metadata": {},
804 | "source": [
805 | "### Speeding Ticket Fine\n",
806 | "You are driving a little too fast on the highway, and a police officer stops you. Write a function that takes the speed as an input and returns the fine. Your function must use a dictionary.\n",
807 | "\n",
808 | "\n",
809 | "If speed is 60 or less, the result is `$0`. If speed is between 61 and 80 inclusive, the result is `$100`. If speed is 81 or more, the result is `$500`. \n"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": null,
815 | "metadata": {},
816 | "outputs": [],
817 | "source": []
818 | }
819 | ],
820 | "metadata": {
821 | "kernelspec": {
822 | "display_name": "Python 3",
823 | "language": "python",
824 | "name": "python3"
825 | },
826 | "language_info": {
827 | "codemirror_mode": {
828 | "name": "ipython",
829 | "version": 3
830 | },
831 | "file_extension": ".py",
832 | "mimetype": "text/x-python",
833 | "name": "python",
834 | "nbconvert_exporter": "python",
835 | "pygments_lexer": "ipython3",
836 | "version": "3.7.2"
837 | },
838 | "toc": {
839 | "base_numbering": 1,
840 | "nav_menu": {},
841 | "number_sections": true,
842 | "sideBar": true,
843 | "skip_h1_title": false,
844 | "title_cell": "Table of Contents",
845 | "title_sidebar": "Contents",
846 | "toc_cell": false,
847 | "toc_position": {},
848 | "toc_section_display": true,
849 | "toc_window_display": false
850 | }
851 | },
852 | "nbformat": 4,
853 | "nbformat_minor": 1
854 | }
855 |
--------------------------------------------------------------------------------
/Notebooks/2.1_MatplotlibTemplates.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-03-27T14:37:58.582789Z",
9 | "start_time": "2019-03-27T14:37:57.460651Z"
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "%matplotlib inline\n",
15 | "import numpy as np\n",
16 | "import pandas as pd\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "from scipy import stats\n",
19 | "\n",
20 | "def autolabel(rects):\n",
21 | " # attach some text labels\n",
22 | " for rect in rects:\n",
23 | " height = rect.get_height()\n",
24 | " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n",
25 | " '%2.2f' % float(height),\n",
26 | " ha='center', va='bottom')"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Simple Plots in Python\n",
34 | "\n",
35 | "In this tutorial we'll show you some basic templates of scientific plots using Python matplotlib.\n",
36 | "\n",
37 | "# Bar graphs with standard error bars for 1 group"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "ExecuteTime": {
45 | "end_time": "2019-03-27T14:38:02.687415Z",
46 | "start_time": "2019-03-27T14:38:02.318322Z"
47 | },
48 | "scrolled": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "# based on http://matplotlib.org/examples/api/barchart_demo.html\n",
53 | "\n",
54 | "# Make some fake data\n",
55 | "d = {'gender': np.hstack([np.ones(10), np.zeros(10)]), 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n",
56 | "df = pd.DataFrame(d)\n",
57 | "\n",
58 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n",
59 | "vals = ['scores'] # This is the column name of the variable to plot on Y axis\n",
60 | "group = ['gender'] # This is the grouping variable for the X axis\n",
61 | "\n",
62 | "# Get means for each group\n",
63 | "means = df[vals+group].groupby(group).mean().squeeze()\n",
64 | "# Get standard error of means for each group\n",
65 | "sems = df[vals+group].groupby(group).sem().squeeze()\n",
66 | "\n",
67 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n",
68 | "ind = np.arange(np.size(np.unique(df[group]),0)) # location of bars\n",
69 | "width = .5 # Width of bars\n",
70 | "# (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error,ecolor=errorbarcolor)\n",
71 | "rects1 = ax.bar(ind - width/2,means,width=.5,color='lightsalmon',yerr=sems,ecolor='blue') \n",
72 | "# Look up different colors here: http://stackoverflow.com/questions/22408237/named-colors-in-matplotlib\n",
73 | "\n",
74 | "# configure axes properties to make pretty\n",
75 | "ax.set_ylabel('scores')\n",
76 | "ax.set_xlabel('gender')\n",
77 | "ax.set_title('Scores by gender')\n",
78 | "ax.set_xticks(ind)\n",
79 | "ax.set_xticklabels(['Male','Female'])\n",
80 | "ax.set_xlim([-.5,1.5]) \n",
81 | "ax.set_ylim([0,2])\n",
82 | "\n",
83 | "# This part calls the function autolabel() defined above, and labels the bars with values\n",
84 | "autolabel(rects1)\n",
85 | "\n",
86 | "plt.show()"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "# Bar graphs with standard error bars for 2 groups"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "ExecuteTime": {
101 | "end_time": "2019-03-27T14:38:07.243834Z",
102 | "start_time": "2019-03-27T14:38:07.090095Z"
103 | }
104 | },
105 | "outputs": [],
106 | "source": [
107 | "# Make some fake data\n",
108 | "d = {'race': np.random.permutation(np.hstack([np.ones(10), np.zeros(10)])), \n",
109 | " 'gender': np.hstack([np.ones(10), np.zeros(10)]), \n",
110 | " 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n",
111 | "df = pd.DataFrame(d)\n",
112 | "\n",
113 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n",
114 | "val =['scores']\n",
115 | "group1 = ['gender']\n",
116 | "group2 = ['race']\n",
117 | "\n",
118 | "# Get means and sems for Gender group\n",
119 | "means1 = df[val+group1].groupby(group1).mean().squeeze()\n",
120 | "sems1 = df[val+group1].groupby(group1).sem().squeeze()\n",
121 | "# Get means and sems for Race group\n",
122 | "means2 = df[val+group2].groupby(group2).mean().squeeze()\n",
123 | "sems2 = df[val+group2].groupby(group2).sem().squeeze()\n",
124 | "\n",
125 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n",
126 | "ind = np.array([0.,1.]) # location of bars\n",
127 | "width = .4 # Width of bars\n",
128 | "\n",
129 | "# plot score by gender\n",
130 | "rects1 = ax.bar(ind - width,means1,width,color='lightcoral',yerr=sems1,ecolor='k') # (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error)\n",
131 | "# plot score by race \n",
132 | "rects2 = ax.bar(ind,means2,width,color='lightblue',yerr=sems2,ecolor='k')\n",
133 | "\n",
134 | "# configure axes properties to make pretty\n",
135 | "ax.set_ylabel('scores')\n",
136 | "ax.set_xlabel('gender')\n",
137 | "ax.set_title('Scores by gender and race')\n",
138 | "ax.set_xticks(ind)\n",
139 | "ax.set_xticklabels(['Male','Female'])\n",
140 | "ax.set_xlim([ind[0]-width*1.25,ind[-1]+width*1.25]) \n",
141 | "ax.set_ylim([0,1.8])\n",
142 | "\n",
143 | "ax.legend(['Race0','Race1'])\n",
144 | "\n",
145 | "autolabel(rects1)\n",
146 | "autolabel(rects2)\n"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "# Scatterplots of 1 group with jittered location\n",
154 | "\n",
155 | "If you try to plot something like discrete scale data, you won't be able to see how clustered the points are because they would just plot on top of each other. One way to avoid this is to jitter the x,y locations around the actual values."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "ExecuteTime": {
163 | "end_time": "2019-03-27T14:38:10.892018Z",
164 | "start_time": "2019-03-27T14:38:10.778229Z"
165 | },
166 | "scrolled": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "# Make some fake data\n",
171 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n",
172 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n",
173 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n",
174 | "df = pd.DataFrame(d)\n",
175 | "ax = df.plot(kind='scatter',x='gender',y='scores')\n",
176 | "ax.set_title('Values are stacked')\n",
177 | "plt.show()"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "Here is the fix. "
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "ExecuteTime": {
192 | "end_time": "2019-03-27T14:38:14.040412Z",
193 | "start_time": "2019-03-27T14:38:13.939597Z"
194 | }
195 | },
196 | "outputs": [],
197 | "source": [
198 | "# Set x,y values for each group\n",
199 | "gender0 = 0 # value of first group\n",
200 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze() # Grabs y values for Gender =0\n",
201 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
202 | "x0 = np.ones(len(y0))*gender0 +(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
203 | "\n",
204 | "gender1 = 1 # value of second group\n",
205 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n",
206 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n",
207 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n",
208 | "\n",
209 | "fig,ax = plt.subplots(figsize=(5,5))\n",
210 | "ax.scatter(x0,y0,color='lightcoral')\n",
211 | "ax.scatter(x1,y1,color='lightcoral')\n",
212 | "ax.set_ylabel('scores')\n",
213 | "ax.set_xlabel('gender')\n",
214 | "ax.set_title('Values are now dispersed')\n",
215 | "ax.set_xticks([0,1])\n",
216 | "ax.set_xticklabels(['Male','Female'])\n",
217 | "ax.set_xlim([-.5,1.5]) \n",
218 | "ax.grid() # puts grid on\n",
219 | "plt.show()"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "collapsed": true
226 | },
227 | "source": [
228 | "# Drawing trend line on a scatterplot"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "ExecuteTime": {
236 | "end_time": "2019-03-27T14:38:31.813333Z",
237 | "start_time": "2019-03-27T14:38:31.403344Z"
238 | }
239 | },
240 | "outputs": [],
241 | "source": [
242 | "import statsmodels.formula.api as smf\n",
243 | "import statsmodels.api as sm\n",
244 | "\n",
245 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n",
246 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n",
247 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n",
248 | "df = pd.DataFrame(d)\n",
249 | "lm = smf.ols(formula = \"scores ~ gender\",data=df).fit()\n",
250 | "print(lm.summary())\n",
251 | "\n",
252 | "# Save the slope for gender to b1 and intercept to b0\n",
253 | "b1 = lm.params[1] # This is slope\n",
254 | "b0 = lm.params[0] # This is intercept"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "ExecuteTime": {
262 | "end_time": "2019-03-27T14:38:34.895112Z",
263 | "start_time": "2019-03-27T14:38:34.792217Z"
264 | }
265 | },
266 | "outputs": [],
267 | "source": [
268 | "# Set x,y values for each group\n",
269 | "gender0 = 0 # value of first group\n",
270 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze()\n",
271 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
272 | "x0 = np.ones(len(y0))*gender0 + (np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n",
273 | "\n",
274 | "gender1 = 1 # value of second group\n",
275 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n",
276 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n",
277 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n",
278 | "\n",
279 | "fig,ax = plt.subplots(figsize=(5,5))\n",
280 | "ax.scatter(x0,y0,color='lightcoral')\n",
281 | "ax.scatter(x1,y1,color='lightcoral')\n",
282 | "\n",
283 | "# Part that adds the line\n",
284 | "spacing = 10\n",
285 | "minx = df[['gender']].min().squeeze()\n",
286 | "maxx = df[['gender']].max().squeeze()\n",
287 | "lx = np.linspace(minx,maxx,spacing) # make x coordinates \n",
288 | "ly = b0+lx*b1 # Estimate the y values using betas\n",
289 | "ax.plot(lx,ly,'-k')\n",
290 | "\n",
291 | "ax.set_ylabel('scores')\n",
292 | "ax.set_xlabel('gender')\n",
293 | "ax.set_title('Values are now dispersed')\n",
294 | "ax.set_xticks([0,1])\n",
295 | "ax.set_xticklabels(['Male','Female'])\n",
296 | "ax.set_xlim([-.5,1.5]) \n",
297 | "ax.grid()\n",
298 | "plt.show()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": []
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": []
314 | }
315 | ],
316 | "metadata": {
317 | "kernelspec": {
318 | "display_name": "Python 3",
319 | "language": "python",
320 | "name": "python3"
321 | },
322 | "language_info": {
323 | "codemirror_mode": {
324 | "name": "ipython",
325 | "version": 3
326 | },
327 | "file_extension": ".py",
328 | "mimetype": "text/x-python",
329 | "name": "python",
330 | "nbconvert_exporter": "python",
331 | "pygments_lexer": "ipython3",
332 | "version": "3.7.2"
333 | },
334 | "toc": {
335 | "base_numbering": 1,
336 | "nav_menu": {},
337 | "number_sections": true,
338 | "sideBar": true,
339 | "skip_h1_title": false,
340 | "title_cell": "Table of Contents",
341 | "title_sidebar": "Contents",
342 | "toc_cell": false,
343 | "toc_position": {},
344 | "toc_section_display": true,
345 | "toc_window_display": false
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 1
350 | }
351 |
--------------------------------------------------------------------------------
/Notebooks/3_Introduction_to_Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Linear Regression Analysis\n",
11 | "\n",
12 | "*Written by Jin Cheong & Luke Chang*\n",
13 | "\n",
14 | "In this lab we are going to learn how to do simple data analyses using python. \n",
15 | "This module includes learning how to run simple linear regression models, comparing models, & visualizing fits. - This notebook was adapted from material available [here](https://github.com/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb)\n",
16 | "\n",
17 | "After the tutorial you will have the chance to apply the methods to a new set of data. \n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {
24 | "ExecuteTime": {
25 | "end_time": "2017-01-18T14:44:10.530703",
26 | "start_time": "2017-01-18T14:44:09.731729"
27 | },
28 | "collapsed": true,
29 | "deletable": true,
30 | "editable": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "%matplotlib inline\n",
35 | "\n",
36 | "import numpy as np\n",
37 | "import pandas as pd\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import statsmodels.api as sm\n",
40 | "import statsmodels.formula.api as smf\n",
41 | "import statsmodels.stats.api as stats"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {
47 | "deletable": true,
48 | "editable": true
49 | },
50 | "source": [
51 | "We will load the data which is a comma delimited text file using the `read_csv()` method from pandas. \n",
52 | "The '../Data' indicates that we will move one folder up and then into the Data folder."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "ExecuteTime": {
60 | "end_time": "2017-01-18T14:44:10.536720",
61 | "start_time": "2017-01-18T14:44:10.532385"
62 | },
63 | "collapsed": false,
64 | "deletable": true,
65 | "editable": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "# Load the data\n",
70 | "df = pd.read_csv('../Data/salary.csv')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "ExecuteTime": {
78 | "end_time": "2017-01-18T14:44:10.543760",
79 | "start_time": "2017-01-18T14:44:10.537887"
80 | },
81 | "collapsed": false,
82 | "deletable": true,
83 | "editable": true
84 | },
85 | "outputs": [],
86 | "source": [
87 | "df = df.dropna()\n",
88 | "df = df[df.gender!=2]"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "ExecuteTime": {
96 | "end_time": "2017-01-18T14:44:10.554847",
97 | "start_time": "2017-01-18T14:44:10.545364"
98 | },
99 | "collapsed": false,
100 | "deletable": true,
101 | "editable": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "print('There are %i rows and %i columns in this data set' % df.shape)\n",
106 | "df.isnull().sum()"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {
113 | "ExecuteTime": {
114 | "end_time": "2017-01-18T14:44:10.765524",
115 | "start_time": "2017-01-18T14:44:10.556141"
116 | },
117 | "collapsed": false,
118 | "deletable": true,
119 | "editable": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "df[['salary','years']].plot(kind='scatter', x='years', y='salary')"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "ExecuteTime": {
131 | "end_time": "2017-01-18T14:44:10.810764",
132 | "start_time": "2017-01-18T14:44:10.767183"
133 | },
134 | "collapsed": false,
135 | "deletable": true,
136 | "editable": true
137 | },
138 | "outputs": [],
139 | "source": [
140 | "# Oneshot visualization\n",
141 | "## create a new numerical Series called dept_num !! Just for visualization !! \n",
142 | "df['dept_num'] = df.departm.map({'bio':0, 'chem':1,'geol':2,'neuro':3,'stat':4,'physics':5,'math':6})\n",
143 | "df.tail()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "ExecuteTime": {
151 | "end_time": "2017-01-18T14:44:11.455813",
152 | "start_time": "2017-01-18T14:44:10.828546"
153 | },
154 | "collapsed": false,
155 | "deletable": true,
156 | "editable": true
157 | },
158 | "outputs": [],
159 | "source": [
160 | "## Now plot all four categories\n",
161 | "fig, axs = plt.subplots(1, 4, sharey=True)\n",
162 | "df.plot(kind='scatter', x='gender', y='salary', ax=axs[0], figsize=(16, 4))\n",
163 | "df.plot(kind='scatter', x='dept_num', y='salary', ax=axs[1])\n",
164 | "df.plot(kind='scatter', x='years', y='salary', ax=axs[2])\n",
165 | "df.plot(kind='scatter', x='age', y='salary', ax=axs[3])\n",
166 | "# The problem is that it treats department as a continuous variable. "
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "deletable": true,
173 | "editable": true
174 | },
175 | "source": [
176 | "## Linear Regression\n",
177 | "\n",
178 | "Now let's move on to running an analysis.\n",
179 | "\n",
180 | "Simple linear regression is an approach for predicting a **quantitative response** using a **single feature** (or \"predictor\" or \"input variable\"). It takes the following form:\n",
181 | "\n",
182 | "$y = \\beta_0 + \\beta_1\\cdot x $\n",
183 | "\n",
184 | "What does each term represent?\n",
185 | "- $y$ is the response\n",
186 | "- $x$ is the feature\n",
187 | "- $\\beta_0$ is the intercept\n",
188 | "- $\\beta_1$ is the coefficient for x\n",
189 | "\n",
190 | "Together, $\\beta_0$ and $\\beta_1$ are called the **model coefficients**. To create your model, you must \"estimate\" the values of these coefficients. And once we've estimated these parameters, we can use the model to predict things!"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {
196 | "deletable": true,
197 | "editable": true
198 | },
199 | "source": [
200 | "## Estimating Model Coefficients\n",
201 | "\n",
202 | "Generally speaking, coefficients are estimated using the **least squares criterion**, which means we find the line (mathematically) that minimizes the **sum of squared residuals** (or \"sum of squared errors\"):\n",
203 | "\n",
204 | "
\n",
205 | "\n",
206 | "What elements are present in the diagram?\n",
207 | "- The black dots are the **observed values** of x and y.\n",
208 | "- The blue line is our **least squares line**.\n",
209 | "- The red lines are the **residuals**, which are the distances between the observed values and the least squares line.\n",
210 | "\n",
211 | "How do the model coefficients relate to the least squares line?\n",
212 | "- $\\beta_0$ is the **intercept** (the value of $y$ when $x$=0)\n",
213 | "- $\\beta_1$ is the **slope** (the change in $y$ divided by change in $x$)\n",
214 | "\n",
215 | "Here is a graphical depiction of those calculations:\n",
216 | "\n",
217 | "
"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "deletable": true,
224 | "editable": true
225 | },
226 | "source": [
227 | "To run a regression we will be using the statsmodels module that was loaded above. We will first initialize a model object with the regression equation and the data to use. The format for specifying the regression equation is similar to R. (y ~ x).\n",
228 | "\n",
229 | "Here we will estimate the effect of gender on salary. Gender is a \"dummy variable\", meaning that it is only ones and zeros.\n",
230 | "\n",
231 | "After we have initialized the object instance we can run the fit method (this can be chained so you only need to run one line). After the parameters have been estimated we can query attributes of the model such as the estimated model parameters."
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "ExecuteTime": {
239 | "end_time": "2017-01-18T14:44:11.484659",
240 | "start_time": "2017-01-18T14:44:11.473799"
241 | },
242 | "collapsed": false,
243 | "deletable": true,
244 | "editable": true,
245 | "scrolled": false
246 | },
247 | "outputs": [],
248 | "source": [
249 | "lm = smf.ols(formula = \"salary ~ gender\",data=df).fit()\n",
250 | "\n",
251 | "# print the coefficients\n",
252 | "print(lm.params)\n",
253 | "print(lm.params[1])"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {
259 | "deletable": true,
260 | "editable": true
261 | },
262 | "source": [
263 | "## Interpreting Model Coefficients\n",
264 | "\n",
265 | "How do we interpret the estimated coefficients for this model?\n",
266 | "- $\\beta_0$, or the intercept, reflects the average salary for the reference condition of the dummy code (i.e., when gender = 0). This means that males make an average of `$69108.5`\n",
267 | "- to get the estimated female salary we need to add $\\beta_1$ (-13388.83), which becomes `$55719.67`\n",
268 | "\n",
269 | "We can also get a summary of all of the model statistics using the `summary()` method."
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {
275 | "deletable": true,
276 | "editable": true
277 | },
278 | "source": [
279 | " "
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "ExecuteTime": {
287 | "end_time": "2017-01-18T14:44:11.965467",
288 | "start_time": "2017-01-18T14:44:11.954564"
289 | },
290 | "collapsed": false,
291 | "deletable": true,
292 | "editable": true
293 | },
294 | "outputs": [],
295 | "source": [
296 | "print(lm.summary())"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {
302 | "deletable": true,
303 | "editable": true
304 | },
305 | "source": [
306 | "## Confidence in our Model\n",
307 | "\n",
308 | "Now let's examine the output. To learn more about Statsmodels and how to interpret the output, DataRobot has some decent posts on [simple linear regression](http://www.datarobot.com/blog/ordinary-least-squares-in-python/) and [multiple linear regression](http://www.datarobot.com/blog/multiple-regression-using-statsmodels/).\n",
309 | "\n",
310 | "\n",
311 | "**Question:** Is linear regression a high bias/low variance model, or a low bias/high variance model?\n",
312 | "\n",
313 | "**Answer:** High bias/low variance. Under repeated sampling, the line will stay roughly in the same place (low variance), but the average of those models won't do a great job capturing the true relationship (high bias). Note that low variance is a useful characteristic when you don't have a lot of training data!\n",
314 | "\n",
315 | "A closely related concept is **confidence intervals**. Statsmodels calculates 95% confidence intervals for our model coefficients, which are interpreted as follows: If the population from which this sample was drawn was **sampled 100 times**, approximately **95 of those confidence intervals** would contain the \"true\" coefficient.\n",
316 | "\n",
317 | "## Hypothesis Testing and p-values\n",
318 | "\n",
319 | "Closely related to confidence intervals is **hypothesis testing**. Generally speaking, you start with a **null hypothesis** and an **alternative hypothesis** (that is opposite the null). Then, you check whether the data supports **rejecting the null hypothesis** or **failing to reject the null hypothesis**.\n",
320 | "\n",
321 | "(Note that \"failing to reject\" the null is not the same as \"accepting\" the null hypothesis. The alternative hypothesis may indeed be true, except that you just don't have enough data to show that.)\n",
322 | "\n",
323 | "As it relates to model coefficients, here is the conventional hypothesis test:\n",
324 | "- **null hypothesis:** There is no relationship between gender and salary (and thus $\\beta_1$ equals zero)\n",
325 | "- **alternative hypothesis:** There is a relationship between gender and salary (and thus $\\beta_1$ is not equal to zero)\n",
326 | "\n",
327 | "How do we test this hypothesis? Intuitively, we reject the null (and thus believe the alternative) if the 95% confidence interval **does not include zero**. Conversely, the **p-value** represents the probability of observing data at least this extreme if the coefficient were actually zero:\n",
328 | "\n",
329 | "## Overall Model Fit\n",
330 | "\n",
331 | "The overall goodness of how well the model fits the data can be described using the AIC or BIC which are information criterions that penalize for the number of free parameters in the model. A common way to evaluate the overall fit of a linear model is **$R^2$** value. $R^2$ is the **proportion of variance explained**, meaning the proportion of variance in the observed data that is explained by the model, or the reduction in error over the **null model**. (The null model just predicts the mean of the observed response, and thus it has an intercept and no slope.)\n",
332 | "\n",
333 | "$R^2$ is between 0 and 1, and higher is better because it means that more variance is explained by the model. Here's an example of what R-squared \"looks like\":\n",
334 | "\n",
335 | "In the model above, we see that $R^2$=0.09, which means that gender does not do a great job at predicting the overall salary."
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {
341 | "deletable": true,
342 | "editable": true
343 | },
344 | "source": [
345 | "## Multiple Linear Regression\n",
346 | "\n",
347 | "Simple linear regression can easily be extended to include multiple features. This is called **multiple linear regression**:\n",
348 | "\n",
349 | "$y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$\n",
350 | "\n",
351 | "Each $x$ represents a different feature, and each feature has its own coefficient. In this case:\n",
352 | "\n",
353 | "$Salary = \\beta_0 + \\beta_1 \\cdot Gender + \\beta_2 \\cdot Years$\n",
354 | "\n",
355 | "Let's use Statsmodels to estimate these coefficients:"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "ExecuteTime": {
363 | "end_time": "2017-01-18T14:44:12.437296",
364 | "start_time": "2017-01-18T14:44:12.419615"
365 | },
366 | "collapsed": false,
367 | "deletable": true,
368 | "editable": true
369 | },
370 | "outputs": [],
371 | "source": [
372 | "lm2 = smf.ols(formula = \"salary ~ gender + years\",data=df).fit()\n",
373 | "print(lm2.summary())"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {
379 | "deletable": true,
380 | "editable": true
381 | },
382 | "source": [
383 | "Here we see that `years` also significantly explains additional variance.\n",
384 | "\n",
385 | "This is reflected in the $R^2$, which has now increased from 0.09 to 0.143. It is important to note that **$R^2$ will always increase as you add more features to the model**, even if they are unrelated to the response. Thus, selecting the model with the highest R-squared is not a reliable approach for choosing the best linear model.\n",
386 | "\n",
387 | "There is an alternative to $R^2$ called **adjusted $R^2$** that penalizes model complexity (to control for overfitting), but it generally [under-penalizes complexity](http://scott.fortmann-roe.com/docs/MeasuringError.html).\n",
388 | "\n",
389 | "Next week we will explore another approach known as **Cross-validation.** Importantly, cross-validation can be applied to any model, whereas the methods described above only apply to linear models.\n",
390 | "\n",
391 | "Here, we will try and see if the new model significantly explains additional variance controlling for the extra free parameter using a nested model comparison."
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {
398 | "ExecuteTime": {
399 | "end_time": "2017-01-18T14:44:12.769250",
400 | "start_time": "2017-01-18T14:44:12.755716"
401 | },
402 | "collapsed": false,
403 | "deletable": true,
404 | "editable": true
405 | },
406 | "outputs": [],
407 | "source": [
408 | "print(stats.anova_lm(lm,lm2))"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {
414 | "deletable": true,
415 | "editable": true
416 | },
417 | "source": [
418 | "We see that the addition of `years` in model2 significantly explains additional variance when compared to model 1. Notice that this is similar to the results provided by the t-tests on the individual parameters. Using model comparison to test incrementally adding a single predictor is a feature selection method known as stepwise regression. However, you can also perform model comparison on models with many different features as long as one is nested within the other.\n",
419 | "\n",
420 | "When we only had one predictor we could investigate the relationship using a scatterplot. Now that we have 2 predictors, we need to make a 3 dimensional plot to see the effect of each predictor on salary. "
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {
427 | "ExecuteTime": {
428 | "end_time": "2017-01-18T14:44:13.989664",
429 | "start_time": "2017-01-18T14:44:13.091418"
430 | },
431 | "collapsed": false,
432 | "deletable": true,
433 | "editable": true
434 | },
435 | "outputs": [],
436 | "source": [
437 | "from mpl_toolkits.mplot3d import Axes3D\n",
438 | "\n",
439 | "fig = plt.figure(figsize=(18,6))\n",
440 | "idx = [131,132,133]\n",
441 | "pos = {0:(-35,20),1:(-90,0),2:(0,0)}\n",
442 | "for i,ii in enumerate(idx):\n",
443 | " ax = fig.add_subplot(ii, projection='3d')\n",
444 | "\n",
445 | " # generate a mesh\n",
446 | " x_surf = np.arange(0,11)/10.0\n",
447 | " y_surf = np.arange(0,35,3)\n",
448 | " x_surf, y_surf = np.meshgrid(x_surf, y_surf)\n",
449 | "\n",
450 | " exog = pd.core.frame.DataFrame({'gender': x_surf.ravel(), 'years': y_surf.ravel()})\n",
451 | " exog['intercept'] = np.ones(exog.shape[0])\n",
452 | " out = lm2.predict(exog = exog,transform=True)\n",
453 | "\n",
454 | " ax.plot_surface(x_surf, y_surf,\n",
455 | " out.reshape(x_surf.shape),\n",
456 | " rstride=1,\n",
457 | " cstride=1,\n",
458 | " color='None',\n",
459 | " alpha = 0.2)\n",
460 | "\n",
461 | " ax.scatter(df['gender'], df['years'], df['salary'],\n",
462 | " c='blue',\n",
463 | " marker='o',\n",
464 | " alpha=1)\n",
465 | "\n",
466 | " ax.set_xlabel('Gender')\n",
467 | " ax.set_ylabel('Years')\n",
468 | " ax.set_zlabel('Salary')\n",
469 | " ax.azim = pos[i][0]\n",
470 | " ax.elev = pos[i][1]\n",
471 | "\n",
472 | "plt.tight_layout()"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {
478 | "deletable": true,
479 | "editable": true
480 | },
481 | "source": [
482 | "Notice how when we rotate the 3D plot, we can make projections into 2 dimensional space. This is the effect of each predictor on salary controlling for the other. \n",
483 | "\n",
484 | "We can also generate additional diagnostic plots."
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {
491 | "ExecuteTime": {
492 | "end_time": "2017-01-18T14:44:35.826100",
493 | "start_time": "2017-01-18T14:44:35.156463"
494 | },
495 | "collapsed": false,
496 | "deletable": true,
497 | "editable": true,
498 | "scrolled": false
499 | },
500 | "outputs": [],
501 | "source": [
502 | "f = sm.graphics.plot_regress_exog(lm2, 'years')\n",
503 | "f.set_size_inches(10,5)"
504 | ]
505 | },
506 | {
507 | "cell_type": "markdown",
508 | "metadata": {
509 | "deletable": true,
510 | "editable": true
511 | },
512 | "source": [
513 | "Lastly, we can create a standard ANOVA table with F statistics. The F statistic tests whether at least one of the coefficients of a categorical variable is significantly different from 0.\n",
514 | "\n",
515 | "There are different types of Sum of Squared Error (SSQ) to consider. We recommend using Type 3 by default. See [here](https://mcfromnz.wordpress.com/2011/03/02/anova-type-iiiiii-ss-explained/) for more in depth discussion."
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "ExecuteTime": {
523 | "end_time": "2017-01-18T14:44:14.602669",
524 | "start_time": "2017-01-18T14:44:14.590320"
525 | },
526 | "collapsed": false,
527 | "deletable": true,
528 | "editable": true
529 | },
530 | "outputs": [],
531 | "source": [
532 | "# Type 3 SSQ: valid for models with significant interaction terms\n",
533 | "print('ANOVA table for Type 3 SSQ')\n",
534 | "print(stats.anova_lm(lm2,typ=3))"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {
540 | "deletable": true,
541 | "editable": true
542 | },
543 | "source": [
544 | "## Additional Resources for Learning about Regression\n",
545 | "\n",
546 | "- To go much more in-depth on linear regression, read Chapter 3 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/). Alternatively, watch the [related videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/) or read a [quick reference guide](http://www.dataschool.io/applying-and-interpreting-linear-regression/) to the key points in that chapter.\n",
547 | "- This [introduction to linear regression](http://people.duke.edu/~rnau/regintro.htm) is much more detailed and mathematically thorough, and includes lots of good advice.\n",
548 | "- This is a relatively quick post on the [assumptions of linear regression](http://pareonline.net/getvn.asp?n=2&v=8)."
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {
554 | "collapsed": false,
555 | "deletable": true,
556 | "editable": true
557 | },
558 | "source": [
559 | "# Exercises\n",
560 | "Use the 'salary_exercise.csv' to apply the analyses we learned today.\n",
561 | "This dataset was adapted from material available [here](http://data.princeton.edu/wws509/datasets/#salary)\n",
562 | "\n",
563 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n",
564 | "\n",
565 | "- sx = Sex, coded 1 for female and 0 for male\n",
566 | "- rk = Rank, coded\n",
567 | " - 1 for assistant professor,\n",
568 | " - 2 for associate professor, and\n",
569 | " - 3 for full professor\n",
570 | "- yr = Number of years in current rank\n",
571 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n",
572 | "- yd = Number of years since highest degree was earned\n",
573 | "- sl = Academic year salary, in dollars.\n",
574 | "\n",
575 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.\n",
576 | "##### Make sure you check the data for missing values or other errors before beginning your analysis.\n",
577 | "\n",
578 | "## Exercise 1\n",
579 | "Run a simple linear regression model in which you predict salary from sex of the professors. \n",
580 | "Is there a gender difference in salary? \n",
581 | "\n",
582 | "## Exercise 2\n",
583 | "Experiment with different combination of variables in your model to answer the following questions. \n",
584 | "What is your best model to explain salary from the given data? \n",
585 | "Is there a gender difference in salary?"
586 | ]
587 | }
588 | ],
589 | "metadata": {
590 | "anaconda-cloud": {},
591 | "kernelspec": {
592 | "display_name": "Python 2",
593 | "language": "python",
594 | "name": "python2"
595 | },
596 | "language_info": {
597 | "codemirror_mode": {
598 | "name": "ipython",
599 | "version": 2
600 | },
601 | "file_extension": ".py",
602 | "mimetype": "text/x-python",
603 | "name": "python",
604 | "nbconvert_exporter": "python",
605 | "pygments_lexer": "ipython2",
606 | "version": "2.7.13"
607 | },
608 | "toc": {
609 | "nav_menu": {
610 | "height": "262px",
611 | "width": "252px"
612 | },
613 | "navigate_menu": true,
614 | "number_sections": true,
615 | "sideBar": true,
616 | "threshold": 4,
617 | "toc_cell": false,
618 | "toc_section_display": "block",
619 | "toc_window_display": false
620 | }
621 | },
622 | "nbformat": 4,
623 | "nbformat_minor": 0
624 | }
625 |
--------------------------------------------------------------------------------
/Notebooks/4_Mediation_&_Moderation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Mediation Analysis\n",
11 | "Written by Jin Cheong & Luke Chang\n",
12 | "\n",
13 | "A mediation analysis is conducted when a researcher is interested in the **mechanism** underlying how variable X has an effect on variable Y. It attempts to make a causal inference that a direct effect might be better explained by an indirect effect through a mediating variable. \n",
14 | "\n",
15 | "Consider the instance below where X has an effect **c** on Y:\n",
16 | "\n",
17 | "### 1) $Y = \\beta_1 + c \\cdot X $\n",
18 | "\n",
19 | "In this model, there may be a third variable **M** which mediates the effect of X on Y. In other words, the variable M is partially responsible for the effect X has on Y.\n",
20 | "\n",
21 | "To conduct a mediation analysis one estimates two additional models: 1) the effect of X on M, and 2) the effect of X *and* M on Y. \n",
22 | "\n",
23 | "### 2) $M = \\beta_2 + a \\cdot X $\n",
24 | "\n",
25 | "### 3) $Y = \\beta_3 + c' \\cdot X + b \\cdot M $ \n",
26 | " \n",
27 | " \n",
28 | "Now the direct effect of X on Y, denoted as **c**, can be broken down into two parts:\n",
29 | "\n",
30 | "### $c = a \\cdot b + c' $\n",
31 | "\n",
32 | "$ a \\cdot b $ is the indirect effect of X on Y via the mediator. \n",
33 | "$c'$ is the remaining direct effect of X on Y controlling for M. \n",
34 | "\n",
35 | "This relationship is depicted below. Note that M and Y both also have error included in the model.\n",
36 | "\n",
37 | "***Question*** Why does X not have error included in the model?\n",
38 | "\n",
39 | "***Answer*** Because X is only a regressor not an outcome variable in the models and standard regression does not estimate error on regressors. See [orthogonal regression](http://davegiles.blogspot.com/2014/11/orthogonal-regression-first-steps.html) for a technique that models error on both X and Y.\n",
40 | "\n",
41 | "
\n",
42 | "\n",
43 | "\n",
44 | "### Here are a few examples of mediations. Can you think of more?\n",
45 | "\n",
46 | "1) The effect of failure on depressed feelings is mediated by internalization of failure. \n",
47 | "\n",
48 | "2) Effect of CBT treatment is mediated by changes in cognition. \n",
49 | "\n",
50 | "3) Effect of food intake on weight gain is mediated by metabolism.\n",
51 | "\n",
52 | "#### For more information there is a nice [tutorial](http://davidakenny.net/cm/mediate.htm) on mediation by David Kenny, one of the authors of the original mediation paper."
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "deletable": true,
59 | "editable": true
60 | },
61 | "source": [
62 | "## Simulate a mediation\n",
63 | "\n",
64 | "In this section we will simulate a mediation.\n",
65 | "\n",
66 | "This is a case in which the true effect of X on Y is positive, but appears negative without testing for mediation. "
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "ExecuteTime": {
74 | "end_time": "2017-02-08T11:04:14.080567",
75 | "start_time": "2017-02-08T11:04:14.071762"
76 | },
77 | "collapsed": false,
78 | "deletable": true,
79 | "editable": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "%matplotlib inline\n",
84 | "\n",
85 | "import numpy as np\n",
86 | "import pandas as pd\n",
87 | "import matplotlib.pyplot as plt\n",
88 | "import statsmodels.formula.api as smf\n",
89 | "import statsmodels.api as sm\n",
90 | "from scipy import stats\n",
91 | "\n",
92 | "def sobel_test(a, b, se_a, se_b):\n",
93 | " '''\n",
94 | " Sobel test for significance of mediation\n",
95 | " \n",
96 | " Args: \n",
97 | " a: coefficient from X to mediator variable, M\n",
98 | " b: coefficient from M to Y \n",
99 | " se_a: Standard error of A\n",
100 | "        se_b: Standard error of b\n",
101 | "\n",
102 | " Returns: \n",
103 | " t: Sobel's test statistic\n",
104 | " pval : Two-tailed probability assuming normal distribution\n",
105 | " '''\n",
106 | " \n",
107 | " SE = np.sqrt( (a**2)*(se_a**2) + (b**2)*(se_b**2))\n",
108 | " t = (a*b) / SE\n",
109 | " n = 100000000\n",
110 | " pval = stats.t.sf(np.abs(t), n-1)*2\n",
111 | " return t, pval"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "ExecuteTime": {
119 | "end_time": "2017-02-08T11:04:14.211563",
120 | "start_time": "2017-02-08T11:04:14.206080"
121 | },
122 | "collapsed": false,
123 | "deletable": true,
124 | "editable": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "# set random seed so everyone gets same results\n",
129 | "np.random.seed(1)\n",
130 | "\n",
131 | "# Determine effects\n",
132 | "a = -3 # effect of x to M\n",
133 | "b = 3 # effect of M to y\n",
134 | "cq = .1 # effect of x on y controlling for M\n",
135 | "\n",
136 | "# Create a random data x\n",
137 | "x = np.random.rand(100) \n",
138 | "m = x * a + np.random.rand(100)\n",
139 | "\n",
140 | "# Create Y\n",
141 | "y = np.dot(np.array([x,m]).T,[cq,b]) + np.random.rand(100)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "ExecuteTime": {
149 | "end_time": "2017-02-08T11:04:14.550299",
150 | "start_time": "2017-02-08T11:04:14.364989"
151 | },
152 | "collapsed": false,
153 | "deletable": true,
154 | "editable": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "plt.scatter(x,y)\n",
159 | "plt.xlabel('X')\n",
160 | "plt.ylabel('Y')\n",
161 | "plt.title('X -> Y')"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "ExecuteTime": {
169 | "end_time": "2017-02-08T11:04:14.728465",
170 | "start_time": "2017-02-08T11:04:14.551956"
171 | },
172 | "collapsed": false,
173 | "deletable": true,
174 | "editable": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "plt.scatter(x,m)\n",
179 | "plt.xlabel('X')\n",
180 | "plt.ylabel('M')\n",
181 | "plt.title('X -> M')"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {
188 | "ExecuteTime": {
189 | "end_time": "2017-02-08T11:04:15.133448",
190 | "start_time": "2017-02-08T11:04:14.954491"
191 | },
192 | "collapsed": false,
193 | "deletable": true,
194 | "editable": true
195 | },
196 | "outputs": [],
197 | "source": [
198 | "plt.scatter(m,y)\n",
199 | "plt.xlabel('M')\n",
200 | "plt.ylabel('Y')\n",
201 | "plt.title('M -> Y')"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "deletable": true,
208 | "editable": true
209 | },
210 | "source": [
211 | "### 1) Test effect of X on Y"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "ExecuteTime": {
219 | "end_time": "2017-02-08T11:04:15.148139",
220 | "start_time": "2017-02-08T11:04:15.135054"
221 | },
222 | "collapsed": false,
223 | "deletable": true,
224 | "editable": true
225 | },
226 | "outputs": [],
227 | "source": [
228 | "X = pd.DataFrame({'Intercept':np.ones(len(x)),'X':x})\n",
229 | "lm1 = smf.OLS(y,X).fit()\n",
230 | "print(lm1.summary())\n",
231 | "ec = lm1.params[1] # save total effect c to ec"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {
237 | "deletable": true,
238 | "editable": true
239 | },
240 | "source": [
241 | "### 2) Test effect of X on M"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "ExecuteTime": {
249 | "end_time": "2017-02-08T11:04:15.410553",
250 | "start_time": "2017-02-08T11:04:15.397131"
251 | },
252 | "collapsed": false,
253 | "deletable": true,
254 | "editable": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "lm2 = smf.OLS(m,X).fit()\n",
259 | "print(lm2.summary())\n",
260 | "ea = lm2.params[1] # Save the effect of X on M, a, to ea\n",
261 | "sea = lm2.bse[1]"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "deletable": true,
268 | "editable": true
269 | },
270 | "source": [
271 | "### 3) Test effect of X and M on Y "
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "ExecuteTime": {
279 | "end_time": "2017-02-08T11:04:15.737957",
280 | "start_time": "2017-02-08T11:04:15.724569"
281 | },
282 | "collapsed": false,
283 | "deletable": true,
284 | "editable": true
285 | },
286 | "outputs": [],
287 | "source": [
288 | "X['M'] = m\n",
289 | "lm3 = smf.OLS(y,X).fit()\n",
290 | "print lm3.summary()\n",
291 | "ecq,eb = lm3.params[1:3]\n",
292 | "seb = lm3.bse[2]"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {
298 | "deletable": true,
299 | "editable": true
300 | },
301 | "source": [
302 | "### Show how the effect is broken down to direct and indirect effects\n",
303 | "Recall how the overall effect C was decomposed to indirect effect (a*b) and direct effect (c')\n",
304 | "### $c = a \\cdot b + c' $"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "ExecuteTime": {
312 | "end_time": "2017-02-08T11:04:16.057799",
313 | "start_time": "2017-02-08T11:04:16.053220"
314 | },
315 | "collapsed": false,
316 | "deletable": true,
317 | "editable": true
318 | },
319 | "outputs": [],
320 | "source": [
321 | "print('c : %.2f') % ec\n",
322 | "print('a : %.2f') % ea\n",
323 | "print('b : %.2f') % eb\n",
324 | "print('c\\' : %.2f') % ecq"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "ExecuteTime": {
332 | "end_time": "2017-02-08T11:04:16.227940",
333 | "start_time": "2017-02-08T11:04:16.222887"
334 | },
335 | "collapsed": false,
336 | "deletable": true,
337 | "editable": true
338 | },
339 | "outputs": [],
340 | "source": [
341 | "print('Total effect C: %.2f') % ec\n",
342 | "print('is decomposed into the indirect(mediated) effect a*b: %.2f') % (ea*eb)\n",
343 | "print('plus the direct effect c\\': %.2f') % ecq\n",
344 | "print('which adds up to %.2f') % (ea*eb+ecq)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "deletable": true,
351 | "editable": true
352 | },
353 | "source": [
354 | "## Run a Sobel Test for Significance of Mediation\n",
355 | "\n",
356 |     "One way to test the significance of a mediation is to perform a Sobel test, where the indirect effect (a*b) is divided by an estimate of its standard error. This assumes that the product would be normally distributed, which may not always be the case. \n",
357 | "\n",
358 |     "An alternative method is to bootstrap with replacement on the observed data to generate a 95% confidence interval. You can try this by writing a for-loop that resamples from the data and generates a distribution of the indirect effects (a*b). If the confidence interval does not include 0, the mediation can be considered significant. "
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "ExecuteTime": {
366 | "end_time": "2017-02-08T11:04:16.587230",
367 | "start_time": "2017-02-08T11:04:16.582745"
368 | },
369 | "collapsed": false,
370 | "deletable": true,
371 | "editable": true
372 | },
373 | "outputs": [],
374 | "source": [
375 | "t,p = sobel_test(ea,eb,sea,seb)\n",
376 | "print('Sobel\\'s test of significance t = %2.2f') % t\n",
377 | "print('Two-tailed p-value p = %2.5f ') % p"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "deletable": true,
384 | "editable": true
385 | },
386 | "source": [
387 | "# Moderation Analysis\n",
388 | "\n",
389 | "In a moderation analysis, the moderator modifies or changes the relationship between two variables, akin to an interaction term. Moderation is slightly different from an interaction due to the additional constraint that there is a causal relationship from X to Y, BUT not from Z to Y. Therefore, a moderation implies an interaction exists but an interaction does not imply a moderation. \n",
390 | "\n",
391 | "Here is a schematic representation of a moderation relationship. \n",
392 |     "This diagram hypothesizes that Stress has a causal relationship to Depression \n",
393 | "but the effect of Stress is different for people with high or low Social Support \n",
394 | "\n",
395 |     "<img src='Figures/moderator1.gif'>\n",
396 | "\n",
397 |     "This can be represented by an interaction, \n",
398 | "\n",
399 | "\n",
400 | "\n",
401 |     "<img src='Figures/moderator2.gif'>\n",
402 | "\n",
403 | "\n",
404 | "The pictures have been retrieved from [here](http://www.victoria.ac.nz/psyc/paul-jose-files/helpcentre/help5_moderation_example.php)\n",
405 | "\n",
406 | "### Here are a few examples of moderations. Can you think of more?\n",
407 | "\n",
408 | "1) The effect of compliments on future grades is moderated by growth mindset [(Carol Dweck)](http://mindsetonline.com/)\n",
409 | "\n",
410 | "2) Effect of favorability on government behavior is moderated by political affiliation. \n",
411 | "\n",
412 | "3) Effect of pressure on performance is moderated by confidence (choking vs boosting). \n",
413 | "\n",
414 | "### For more information look at [homepage of Kenny](http://davidakenny.net/cm/moderation.htm) who started all this. "
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {
420 | "deletable": true,
421 | "editable": true
422 | },
423 | "source": [
424 | "## Example \n",
425 | "Here we examine whether the effect of buying books (**Buy**) on enjoyment of reading (**Enjoy**) is moderated by frequency of reading (**Read**).\n",
426 | "\n",
427 | "The moderation effect exists if there is an interaction of buying and reading on enjoyment."
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {
434 | "ExecuteTime": {
435 | "end_time": "2017-02-08T11:08:49.941382",
436 | "start_time": "2017-02-08T11:08:49.931106"
437 | },
438 | "collapsed": true
439 | },
440 | "outputs": [],
441 | "source": [
442 | "df = pd.DataFrame({\n",
443 | " 'Enjoy':[4, 16, 4, 12, 9, 5, 15, 21, 3, 4, 8, 11, 7, 5, 8, 19, 11, 9, 9, 13, 11, 21, 18, 12, 15, 3, 2, 10, 7, 9, 5, 6, 9, 12, 9, 5, 17, 15, 9, 7, 5, 10, 10, 6, 7, 9, 12, 2, 1, 5, 7, 5, 8, 5, 5, 11, 8, 9, 13, 9, 19, 8, 21, 1, 11, 8, 6, 23, 2, 9, 13, 4, 10, 12, 5, 7, 10, 11, 12, 13],\n",
444 | " 'Buy':[5, 8, 0, 8, 3, 0, 8, 9, 8, 1, 7, 2, 2, 9, 3, 8, 8, 9, 5, 7, 7, 9, 6, 7, 8, 4, 4, 3, 1, 4, 1, 5, 5, 2, 9, 5, 7, 8, 2, 4, 1, 4, 0, 1, 0, 8, 2, 4, 0, 0, 0, 1, 2, 2, 2, 7, 5, 1, 9, 9, 8, 1, 7, 2, 5, 2, 4, 9, 1, 6, 3, 0, 7, 5, 2, 3, 1, 8, 6, 4],\n",
445 | " 'Read':[0, 8, 0, 5, 4, 9, 6, 9, 1, 3, 3, 3, 8, 0, 9, 8, 6, 0, 5, 6, 1, 7, 7, 5, 7, 2, 0, 5, 1, 7, 7, 4, 6, 5, 3, 1, 7, 6, 0, 4, 0, 9, 5, 9, 2, 3, 5, 2, 5, 2, 9, 1, 1, 7, 9, 3, 0, 4, 4, 3, 8, 8, 8, 2, 3, 7, 1, 8, 6, 1, 7, 0, 3, 2, 5, 3, 8, 6, 9, 7] \n",
446 | "})"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {
452 | "deletable": true,
453 | "editable": true
454 | },
455 | "source": [
456 | "# Importance of centering variables for interaction\n",
457 | "The interaction effect can be **VERY** different if you don't center your variables"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {
464 | "ExecuteTime": {
465 | "end_time": "2017-02-08T11:08:51.761113",
466 | "start_time": "2017-02-08T11:08:51.752846"
467 | },
468 | "collapsed": false,
469 | "deletable": true,
470 | "editable": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "df['Interaction'] = df.Read*df.Buy\n",
475 | "np.corrcoef(df.Buy,df.Interaction)"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": null,
481 | "metadata": {
482 | "ExecuteTime": {
483 | "end_time": "2017-02-08T11:09:41.621877",
484 | "start_time": "2017-02-08T11:09:41.462656"
485 | },
486 | "collapsed": false,
487 | "deletable": true,
488 | "editable": true
489 | },
490 | "outputs": [],
491 | "source": [
492 | "plt.scatter(df.Buy,df.Interaction)\n",
493 |     "plt.xlabel('Buy')\n",
494 |     "plt.ylabel('Read * Buy')"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": null,
500 | "metadata": {
501 | "ExecuteTime": {
502 | "end_time": "2017-02-08T11:08:59.826515",
503 | "start_time": "2017-02-08T11:08:59.816012"
504 | },
505 | "collapsed": false,
506 | "deletable": true,
507 | "editable": true
508 | },
509 | "outputs": [],
510 | "source": [
511 | "center = lambda x: (x - x.mean())\n",
512 | "df[['Read_Centered','Buy_Centered']] = df[['Read','Buy']].apply(center)\n",
513 | "df['Interaction_Centered'] = df['Read_Centered'] * df['Buy_Centered']\n",
514 | "np.corrcoef(df.Buy,df.Interaction_Centered)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "ExecuteTime": {
522 | "end_time": "2017-02-08T11:09:01.142976",
523 | "start_time": "2017-02-08T11:09:00.976403"
524 | },
525 | "collapsed": false,
526 | "deletable": true,
527 | "editable": true
528 | },
529 | "outputs": [],
530 | "source": [
531 | "plt.scatter(df.Buy,df.Interaction_Centered)\n",
532 |     "plt.xlabel('Buy')\n",
533 |     "plt.ylabel('Read * Buy (centered)')"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {
540 | "ExecuteTime": {
541 | "end_time": "2017-02-08T11:09:05.225235",
542 | "start_time": "2017-02-08T11:09:05.208483"
543 | },
544 | "collapsed": false
545 | },
546 | "outputs": [],
547 | "source": [
548 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction\", data = df).fit()\n",
549 | "print mod.summary()"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": null,
555 | "metadata": {
556 | "ExecuteTime": {
557 | "end_time": "2017-02-08T11:09:05.834256",
558 | "start_time": "2017-02-08T11:09:05.816079"
559 | },
560 | "collapsed": false,
561 | "deletable": true,
562 | "editable": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction_Centered\", data = df).fit()\n",
567 | "print mod.summary()"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": null,
573 | "metadata": {
574 | "collapsed": true
575 | },
576 | "outputs": [],
577 | "source": []
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {
582 | "deletable": true,
583 | "editable": true
584 | },
585 | "source": [
586 | "# Exercises\n",
587 | "\n",
588 | "## Exercise 1 - Mediation\n",
589 | "\n",
590 | "For exercise 1, let's revisit the salary dataset. ('../Data/salary_exercise.csv') \n",
591 | "\n",
592 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n",
593 | "\n",
594 | "- sx = Sex, coded 1 for female and 0 for male\n",
595 | "- rk = Rank, coded\n",
596 | " - 1 for assistant professor,\n",
597 | " - 2 for associate professor, and\n",
598 | " - 3 for full professor\n",
599 | "- yr = Number of years in current rank\n",
600 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n",
601 | "- yd = Number of years since highest degree was earned\n",
602 | "- sl = Academic year salary, in dollars.\n",
603 | "\n",
604 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.\n",
605 | "\n",
606 | "**Make sure you check the data for missing values or other errors before beginning your analysis.**\n",
607 | "\n",
608 | "### Question:\n",
609 |     "Evaluate whether academic rank mediates the relationship between gender and salary"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": []
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "## Exercise 2 - Moderation\n",
626 | "\n",
627 | "For Exercise 2, we will use the alcohol dataset again ('../Data/student-mat.csv').\n",
628 | "\n",
629 | "Below is the information about the dataset. \n",
630 | "\n",
631 | "Attribute Information: \n",
632 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n",
633 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n",
634 | "3 age - student's age (numeric: from 15 to 22) \n",
635 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n",
636 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n",
637 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n",
638 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
639 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
640 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
641 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
642 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n",
643 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n",
644 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) \n",
645 | "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n",
646 | "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n",
647 | "16 schoolsup - extra educational support (binary: yes or no) \n",
648 | "17 famsup - family educational support (binary: yes or no) \n",
649 | "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n",
650 | "19 activities - extra-curricular activities (binary: yes or no) \n",
651 | "20 nursery - attended nursery school (binary: yes or no) \n",
652 | "21 higher - wants to take higher education (binary: yes or no) \n",
653 | "22 internet - Internet access at home (binary: yes or no) \n",
654 | "23 romantic - with a romantic relationship (binary: yes or no) \n",
655 | "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n",
656 | "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n",
657 | "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n",
658 | "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
659 | "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
660 | "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n",
661 | "30 absences - number of school absences (numeric: from 0 to 93) \n",
662 | "31 G1 - first period grade (numeric: from 0 to 20) \n",
663 |     "32 G2 - second period grade (numeric: from 0 to 20) \n",
664 |     "33 G3 - final grade (numeric: from 0 to 20, output target) \n",
665 | "\n",
666 | "### Question:\n",
667 | "\n",
668 | "The amount you go out seems to be associated with increased daily alcohol consumption. Does being in a relationship dampen this association? Are there any other variables that appear to moderate this relationship?\n"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": null,
674 | "metadata": {
675 | "collapsed": true
676 | },
677 | "outputs": [],
678 | "source": []
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": null,
683 | "metadata": {
684 | "collapsed": true
685 | },
686 | "outputs": [],
687 | "source": []
688 | }
689 | ],
690 | "metadata": {
691 | "anaconda-cloud": {},
692 | "kernelspec": {
693 | "display_name": "Python [default]",
694 | "language": "python",
695 | "name": "python2"
696 | },
697 | "language_info": {
698 | "codemirror_mode": {
699 | "name": "ipython",
700 | "version": 2
701 | },
702 | "file_extension": ".py",
703 | "mimetype": "text/x-python",
704 | "name": "python",
705 | "nbconvert_exporter": "python",
706 | "pygments_lexer": "ipython2",
707 | "version": "2.7.13"
708 | },
709 | "toc": {
710 | "nav_menu": {
711 | "height": "468px",
712 | "width": "252px"
713 | },
714 | "navigate_menu": true,
715 | "number_sections": true,
716 | "sideBar": true,
717 | "threshold": 4,
718 | "toc_cell": false,
719 | "toc_section_display": "block",
720 | "toc_window_display": false
721 | }
722 | },
723 | "nbformat": 4,
724 | "nbformat_minor": 0
725 | }
726 |
--------------------------------------------------------------------------------
/Notebooks/5_Prediction_&_Regularization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic Regression Analysis\n",
8 | "\n",
9 | "*Written by Jin Cheong & Luke Chang*\n",
10 | "\n",
11 | "In this lab we are going to learn about more advanced topics in regression analysis.\n",
12 | "- How to use link functions to perform regressions on other types of data such as categorical variables\n",
13 | "- How to perform cross-validation to assess degree of overfitting\n",
14 | "- How to deal with multicollinearity with regularization. \n",
15 | "\n",
16 | "After the tutorial you will have the chance to apply the methods on a separate dataset. \n",
17 | "\n",
18 | "\n",
19 | "## Introduction to logistic regression \n",
20 | "\n",
21 | "A logistic regression, also called a logit model, is used to model outcome variables that are binary. Examples of such variables are \"hit versus miss,\" \"success versus fail,\" or \"male versus female.\" We can use a logistic regression to calculate the probability of an item being one or the other.\n",
22 | "\n",
23 | "In the logit model, the log odds of the outcome is modeled as a linear combination of the predictor variables. \n",
24 | "Lets recall that the simple linear regression had the following form :\n",
25 | "### $y = \\beta_0 + \\beta_1\\cdot x $\n",
26 | "\n",
27 | "We can extend this to a multiple linear regression with multiple features into the following form : \n",
28 | "### $y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$ \n",
29 | "\n",
30 | " \n",
31 | "In logistic regression, instead of estimating the __value__ of Y from Xs, we estimate the __probability__ of Y. For instance if our model tries to predict whether a person is male(Y=0) or female(Y=1), then a Y value of .8 would mean it has 80% chance of being a female.\n",
32 | "Formally, the linear regression model is passed into a sigmoid function. Here is the simple form with one predictor variable.\n",
33 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n",
34 | "\n",
35 | "We can easily extend this to multiple regression with n-number of predictors.\n",
36 | "\n",
37 | "\n",
38 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n)}}$\n",
39 | "\n",
40 | "Now let's see how we can achieve this in code. \n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "ExecuteTime": {
48 | "end_time": "2017-01-27T10:48:11.684432",
49 | "start_time": "2017-01-27T10:48:11.679536"
50 | },
51 | "collapsed": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "%matplotlib inline\n",
56 | "\n",
57 | "import numpy as np\n",
58 | "import pandas as pd\n",
59 | "import matplotlib.pyplot as plt\n",
60 | "import statsmodels.formula.api as smf\n",
61 | "import seaborn as sns\n",
62 | "from IPython.display import display, HTML"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Logistic regression analysis\n",
70 | "\n",
71 | "Let's start off by plotting a sigmoid function and add necessary modules."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "ExecuteTime": {
79 | "end_time": "2017-01-27T10:34:03.578429",
80 | "start_time": "2017-01-27T10:34:03.408176"
81 | },
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "# Plot a sigmoid function\n",
87 | "plt.figure(figsize=(5,5)) # open a figure base and determine the size (width, height)\n",
88 | "\n",
89 | "def sigmoid(t): # Define the sigmoid function\n",
90 | " return (1/(1 + np.e**(-t))) \n",
91 | "\n",
92 | "plot_range = np.arange(-6, 6, 0.1) # range of x values to plot\n",
93 | "y_values = sigmoid(plot_range)\n",
94 | "\n",
95 | "# Plot curve\n",
96 | "plt.plot(plot_range, # X-axis range\n",
97 | " y_values, # Predicted values\n",
98 | " color=\"red\",linewidth=3)\n",
99 | "plt.title(\"Example of a sigmoid function\",fontsize=18)\n",
100 | "plt.ylabel(\"Probability\",fontsize=16)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 |     "Now we will load our sample data and create an 'Alcabuse' variable from 'Dalc' and 'Walc' (weekday and weekend alcohol consumption level, respectively) and make it a binary variable by calling it alcohol abuse if a student scores 8 or more on drinking alcohol in a week. The dataset was retrieved from [here](http://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION#) but slightly modified to better illustrate our purpose."
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "ExecuteTime": {
115 | "end_time": "2017-01-27T10:34:28.969833",
116 | "start_time": "2017-01-27T10:34:28.958171"
117 | },
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "# Load the data \n",
123 | "df = pd.read_csv('../Data/student-mat.csv')\n",
124 | "df['Alcabuse']=0\n",
125 | "df.loc[df['Dalc']+df['Walc'] >=8,'Alcabuse']=1 # Let's call it abuse if more than 8 alcohol consumption per week"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "The hypothesis we want to test is : __\"A student is more likely to abuse alcohol if he has more free time.\"__ \n",
133 |     "'Alcabuse' will be our response variable Y, a binary indicator of alcohol abuse (1 if weekly consumption is 8 or more). \n",
134 | "'freetime' will be our predictor variable X, measuring the degree of free time.\n",
135 | "\n",
136 | "We will use the logit function in the statsmodels package. "
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "ExecuteTime": {
144 | "end_time": "2017-01-27T10:34:38.993233",
145 | "start_time": "2017-01-27T10:34:38.969608"
146 | },
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "logm = smf.logit('Alcabuse ~ freetime',data=df).fit()\n",
152 | "print(logm.summary())"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Interpreting logistic model coefficients for prediction\n",
160 | "\n",
161 | "We ran our first logit model but we cannot interpret the coefficients like we did in the linear regression model. Because we applied a logit transform, the coefficients are in log-odds. \n",
162 | "To make them more interpretable we can exponentiate the coefficients and interpret them as either probability or odds-ratios. \n",
163 | "\n",
164 | "### Interpreting coefficients as probabilities\n",
165 | "To change our log-odds coefficients into probabilities we pass the log-odds value of Y into a sigmoid function.\n",
166 | "For instance, let's see what the probability is for a student to be an alcohol abuser if he had a level 5 of free time (x=5). \n",
167 | "\n",
168 | "$x = 5$ \n",
169 | "$\\beta_0 = -7.6262$ \n",
170 | "$\\beta_1 = 1.6416$ \n",
171 | "## $ Y = \\beta_0 + \\beta_1\\cdot x = -7.6262 + 1.6416 \\cdot 5$ \n",
172 | "## $P(Y) = \\frac{1}{1 + e^{-(Y)}} = \\frac{1}{1 + e^{-(-7.6262 + 1.6416 \\cdot 5)}}$\n",
173 | "\n",
174 | "We implement this with the following code by using the 'sigmoid' function as defined earlier in this module.\n",
175 | "We also plot the fit of the model."
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "ExecuteTime": {
183 | "end_time": "2017-01-27T10:35:53.052903",
184 | "start_time": "2017-01-27T10:35:53.043411"
185 | },
186 | "collapsed": false,
187 | "scrolled": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "x = 5\n",
192 | "print('x = %i' %x) # x\n",
193 | "print('b0 = %f' %logm.params[0]) # b0\n",
194 | "print('b1 = %f' %logm.params[1]) # b1\n",
195 | "palcabuse = sigmoid(logm.params[0] + logm.params[1] * x)\n",
196 | "print('palcabuse = %f' %palcabuse)\n",
197 | "print('If a student has %i hours of free time per week, then he has %.2f %% probability of alcohol abuse' % (x,palcabuse*100))\n",
198 | "# Note that we had to use double percent sign %% to print % \n",
199 | "# because a single % is reserved for printing inline numbers as in %i or %.2f"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "### Plotting the predictions of the model"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "ExecuteTime": {
214 | "end_time": "2017-01-27T10:37:07.982811",
215 | "start_time": "2017-01-27T10:37:07.767818"
216 | },
217 | "collapsed": false
218 | },
219 | "outputs": [],
220 | "source": [
221 | "# Plot model fit\n",
222 | "xs = np.arange(min(df.freetime)-1,max(df.freetime)+1,0.1)\n",
223 | "ys = np.array([sigmoid(logm.params[0] + logm.params[1] * x) for x in xs])\n",
224 | "plt.figure(figsize=(8,4))\n",
225 | "plt.plot(xs,ys,linewidth=3)\n",
226 | "jitter = np.random.random(len(df.Alcabuse))/5\n",
227 | "ys_outcomes = np.abs((df.Alcabuse==1)-0.01-jitter)\n",
228 | "alpha = 0.7\n",
229 | "plt.plot(df.freetime[df.Alcabuse == 1], ys_outcomes[df.Alcabuse ==1], 'b.', alpha=alpha)\n",
230 | "plt.plot(df.freetime[df.Alcabuse == 0], ys_outcomes[df.Alcabuse ==0], 'r.', alpha=alpha)\n",
231 | "plt.xlabel(\"Freetime\",fontsize=16)\n",
232 | "plt.ylabel(\"Probability of being an alcohol abuser\",fontsize=16)\n",
233 | "plt.title(\"Predicting alcohol abuse by free time\",fontsize=18)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### Interpreting coefficients as odds\n",
241 | "In some cases it could be more useful to consider the odds of an event happening. \n",
242 |     "For example, we might want to be able to make statements such as \"If a person smokes, he is 10 times more likely to get lung cancer compared to a person who doesn't smoke.\" The odds of an event happening are defined by\n",
243 | "\n",
244 | "## $odds = \\frac{P(\\text{event})}{P(\\text{no event})} = \\frac{P(\\text{Y})}{P(\\text{not Y})}$\n",
245 | "while $P(\\text{Y}) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n",
246 | "\n",
247 | "and $P(\\text{not Y}) = 1 - P(\\text{Y})$\n",
248 | "\n",
249 | "If we want to look at how much the odds change for a unit increase in free time, we can use the following \n",
250 | "\n",
251 | "## $\\triangle odds = \\frac{\\text{odds after a unit change in predictor}}{\\text{original odds}} $\n",
252 | "\n",
253 | "So, if we want to calculate the change in odds ratios of being an alcohol abuser when free time increases from 4 to 5, we can calculate the following: \n",
254 | "## $\\triangle odds = \\frac{\\text{odds(Y) when x = 5}}{\\text{odds(Y) when x = 4}} = \\frac{\\frac{\\text{P(Y) when x = 5}}{\\text{1 - (P(Y) when x = 5)}}}{\\frac{\\text{P(Y) when x = 4}}{\\text{1 - (P(Y) when x = 4)}}}$\n"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "ExecuteTime": {
262 | "end_time": "2017-01-27T10:39:10.389767",
263 | "start_time": "2017-01-27T10:39:10.383370"
264 | },
265 | "collapsed": false
266 | },
267 | "outputs": [],
268 | "source": [
269 | "odd1 = sigmoid(logm.params[0] + logm.params[1] * 5) / (1-sigmoid(logm.params[0] + logm.params[1] * 5))\n",
270 | "odd2 = sigmoid(logm.params[0] + logm.params[1] * 4) / (1-sigmoid(logm.params[0] + logm.params[1] * 4))\n",
271 | "dodds = odd1/odd2\n",
272 | "print('A student with 5hrs of free time compared to a student with 4 is %.2f times more likely to be an alcohol abuser' %dodds)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Cross-validation \n",
280 | "\n",
281 |     "The best way to compare accuracy of models and see how well your model generalizes is to train (or fit) your model on a training dataset then test it on a separate test dataset. If you train and test your model on the same dataset (as we did above) you are likely to get a positively biased result. The true generalizability can only be observed when you test your model on a separate hold-out test dataset on which the model was not trained. A simple way to do cross-validation is to split the dataset before you begin your analysis. \n",
282 | "\n",
283 | "There are many ways to split the dataset but for simplicity we will use the train_test_split function in the module sci-kit learn. \n",
284 |     "We will split the data so that the training dataset consists of 60% of the data and the test dataset consists of 40% of the data.\n",
285 | "\n",
286 | "### 1. Split the data into training and testing datasets"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "ExecuteTime": {
294 | "end_time": "2017-01-27T10:39:55.750103",
295 | "start_time": "2017-01-27T10:39:55.687554"
296 | },
297 | "collapsed": false
298 | },
299 | "outputs": [],
300 | "source": [
301 | "from sklearn.cross_validation import train_test_split\n",
302 | "np.random.seed(12345678) # set the random number generator seed to get consistent results\n",
303 | "trainratio = .6\n",
304 | "train, test = train_test_split(df, train_size=trainratio, random_state=1)\n",
305 | "print('train-test ratio %.2f' %(float(len(train))/(len(train)+len(test))))"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### 2. Fit the model on the training dataset\n",
313 | "Now we fit a new model and save it to logmt. \n",
314 | "We will try to predict 'Alcabuse' with 'freetime' and 'goout' \n",
315 | "Note that we specify what dataset we are using by setting data = train. "
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "ExecuteTime": {
323 | "end_time": "2017-01-27T10:42:00.202890",
324 | "start_time": "2017-01-27T10:42:00.184033"
325 | },
326 | "collapsed": false
327 | },
328 | "outputs": [],
329 | "source": [
330 | "# We run the model on the train dataset\n",
331 | "logmt = smf.logit('Alcabuse ~ freetime + goout',data=train).fit()\n",
332 | "print(logmt.summary())"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "### 3. Calculate predictions on the test dataset\n",
340 | "We now use the model with coefficients estimated from the training dataset (logmt) and plug in the predictor variable values from the test dataset by using the .predict method. This method outputs the probabilities for us so we don't have to apply separate sigmoid transform. Therefore the output are the probabilities predicted with our model on the test dataset."
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "ExecuteTime": {
348 | "end_time": "2017-01-27T10:42:10.419775",
349 | "start_time": "2017-01-27T10:42:10.410035"
350 | },
351 | "collapsed": false
352 | },
353 | "outputs": [],
354 | "source": [
355 | "ytestpred = logmt.predict(test)\n",
356 | "np.round(ytestpred,2)"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "### 4. Calculate accuracy of the predictions\n",
364 | "To calculate the accuracy of the model we need to specify the cutoff for the probabilities to say whether one should be classified as an alcohol abuser or not. For simplicity, let's say that we'll classify the person as an alcohol abuser if the estimated probability is greater than 50%. \n",
365 | "\n",
366 | "We will then calculate the accuracy of the model in classifying alcohol abuse for the training data and the test data separately"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {
373 | "ExecuteTime": {
374 | "end_time": "2017-01-27T10:42:29.512250",
375 | "start_time": "2017-01-27T10:42:29.479882"
376 | },
377 | "collapsed": false
378 | },
379 | "outputs": [],
380 | "source": [
381 | "cutoff = .5\n",
382 | "\n",
383 | "# Calculate accuracy for train\n",
384 | "ytrainpred = logmt.predict(train)\n",
385 | "ytrainpred[ytrainpred>=cutoff] =1 \n",
386 | "ytrainpred[ytrainpred=cutoff] =1 \n",
392 | "ytestpred[ytestpred=cutoff] =1 \n",
475 | "ytestpred[ytestpred\n",
501 | "\n",
502 | "Regularization attempts to solve this problem by introducting a loss function that penalizes the model for each additional features added to the model. The AIC / BIC values we mentioned in the first session is a form of regularization in comparing models. \n",
503 | "\n",
504 | "In this section we will focus on feature selection regularization methods Lasso and Ridge regressions.\n",
505 | "\n",
506 | "## Lasso regression\n",
507 | "In short, [Lasso](http://stats.stackexchange.com/questions/17251/what-is-the-lasso-in-regression-analysis) is a feature selection method that reduces the number of features to use in a regression. \n",
508 | "This is useful if you have a lot of variables that are correlated or you have more variables than observations. \n",
509 | "[Here](http://studentlife.cs.dartmouth.edu/smartgpa.pdf) is a study from Dartmouth that used Lasso on the Student Life Dataset.\n",
510 | "\n",
511 | "In our example, we will try to find which variables can predict the level of alcohol consumption 'AlcSum'. "
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "metadata": {
518 | "ExecuteTime": {
519 | "end_time": "2017-01-27T11:05:03.397385",
520 | "start_time": "2017-01-27T11:05:03.393882"
521 | },
522 | "collapsed": true
523 | },
524 | "outputs": [],
525 | "source": [
526 | "df['AlcSum']=df['Dalc']+df['Walc']"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {
533 | "ExecuteTime": {
534 | "end_time": "2017-01-27T11:05:06.581723",
535 | "start_time": "2017-01-27T11:05:06.577263"
536 | },
537 | "collapsed": false
538 | },
539 | "outputs": [],
540 | "source": [
541 | "# We have a lot of features to choose from\n",
542 | "df.keys()"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {
549 | "ExecuteTime": {
550 | "end_time": "2017-01-27T11:05:19.241902",
551 | "start_time": "2017-01-27T11:05:18.929428"
552 | },
553 | "collapsed": false
554 | },
555 | "outputs": [],
556 | "source": [
557 | "# Make a model with all the feature variables\n",
558 | "formula = 'AlcSum ~ school+sex+age+address+famsize+Pstatus+Medu+\\\n",
559 | " Fedu+Mjob+Fjob+reason+guardian+traveltime+\\\n",
560 | " studytime+failures+schoolsup+famsup+paid+activities+nursery+higher+\\\n",
561 | " internet+romantic+famrel+freetime+goout+health+absences+G1+G2+G3'\n",
562 | "\n",
563 | "# Generate a design matrix with the formula\n",
564 | "# This automatically generates separate regressors for categorical variables.\n",
565 | "from patsy import dmatrices\n",
566 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
567 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n",
568 | "penalty = .036\n",
569 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=1.0)\n",
570 | "print regm.summary()"
571 | ]
572 | },
573 | {
574 | "cell_type": "markdown",
575 | "metadata": {},
576 | "source": [
577 | "### How do we determine the penalty value? \n",
578 | "Essentially, we want to select the regularization parameter by identifying the one from a set of possible values (e.g. grid search) that results in the best fit of the model to the data. However, it is important to note that it is easy to introduce bias into this process by trying a bunch of alphas and selecting the one that works best. This can lead to optimistic evaluations of how well your model works.\n",
579 | "\n",
580 | "Cross-validation is an ideal method to deal with this. We can use cross-validation to select the alpha while adding minimal bias to the overall model prediction.\n",
581 | "\n",
582 | "There are several different metrics of model fit to use as a criterion variable. Log-likelihood or mean squared error are the two most common depending on how you are estimating the parameters (i.e., maximizing log-likelihood or minimizizing squared error). The Akaike information criterion (AIC) and the Bayes Information criterion (BIC) are two common metrics that apply different penalties to these metrices for the number of free parameters in your model.\n",
583 | "\n",
584 | "Here we will demonstrate using both to select an optimal value of the regularization parameter alpha of the Lasso estimator from an example provided by [scikit-learn](http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html). For cross-validation, we use 20-fold with 2 algorithms to compute the Lasso path: coordinate descent, as implemented by the LassoCV class.\n"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": null,
590 | "metadata": {
591 | "ExecuteTime": {
592 | "end_time": "2017-01-27T11:11:14.478267",
593 | "start_time": "2017-01-27T11:11:14.157680"
594 | },
595 | "collapsed": false
596 | },
597 | "outputs": [],
598 | "source": [
599 | "from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC\n",
600 | "import time\n",
601 | "\n",
602 | "# LassoLarsIC: least angle regression with BIC/AIC criterion\n",
603 | "model_bic = LassoLarsIC(criterion='bic')\n",
604 | "model_bic.fit(X, y)\n",
605 | "alpha_bic_ = model_bic.alpha_\n",
606 | "\n",
607 | "model_aic = LassoLarsIC(criterion='aic')\n",
608 | "model_aic.fit(X, y)\n",
609 | "alpha_aic_ = model_aic.alpha_\n",
610 | "\n",
611 | "def plot_ic_criterion(model, name, value, color):\n",
612 | " alpha_ = model.alpha_\n",
613 | " alphas_ = model.alphas_\n",
614 | " criterion_ = model.criterion_\n",
615 | " plt.plot(-np.log10(alphas_), criterion_, '--', color=color,\n",
616 | " linewidth=3, label='%s criterion' % name)\n",
617 | " plt.axvline(-np.log10(alpha_), color=color, linewidth=3,\n",
618 | " label='%s estimate alpha: %.3f' %(name,value) )\n",
619 | " plt.xlabel('-log(alpha)',fontsize=16)\n",
620 | " plt.ylabel('Model Fit Criterion',fontsize=16)\n",
621 | "\n",
622 | "plt.figure(figsize=(15,5))\n",
623 | "plot_ic_criterion(model_aic, 'AIC', alpha_aic_ , 'b')\n",
624 | "plot_ic_criterion(model_bic, 'BIC', alpha_bic_ , 'r')\n",
625 | "plt.legend()\n",
626 | "plt.title('Information-criterion for model selection',fontsize=18)"
627 | ]
628 | },
629 | {
630 | "cell_type": "markdown",
631 | "metadata": {},
632 | "source": [
633 | "Now let's create a plot showing how well each alpha value performs across cross-validation folds"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": null,
639 | "metadata": {
640 | "ExecuteTime": {
641 | "end_time": "2017-01-27T11:23:24.140077",
642 | "start_time": "2017-01-27T11:23:23.747748"
643 | },
644 | "collapsed": false
645 | },
646 | "outputs": [],
647 | "source": [
648 | "# LassoCV: coordinate descent\n",
649 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
650 | "\n",
651 | "# Compute paths\n",
652 | "print(\"Computing regularization path using the coordinate descent lasso...\")\n",
653 | "t1 = time.time()\n",
654 | "model = LassoCV(cv=5).fit(X, y)\n",
655 | "t_lasso_cv = time.time() - t1\n",
656 | "\n",
657 | "# Display results\n",
658 | "m_log_alphas = -np.log10(model.alphas_)\n",
659 | "\n",
660 | "# Display results\n",
661 | "plt.figure(figsize=(15,5))\n",
662 | "plt.plot(m_log_alphas, model.mse_path_, ':')\n",
663 | "plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',\n",
664 | " label='Average across the folds', linewidth=2)\n",
665 | "plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',\n",
666 | " label='CV estimate alpha: %.3f' %model.alpha_ )\n",
667 | "plt.legend()\n",
668 | "plt.xlabel('-log(alpha)',fontsize=16)\n",
669 | "plt.ylabel('Mean square error',fontsize=16)\n",
670 | "plt.title('Mean square error on each fold: coordinate descent',fontsize=18)\n",
671 | "plt.axis('tight')"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "## Ridge regression\n",
679 | "The goal of the ridge function is to choose a penalty λ for which the coefficients are not rapidly changing and have “sensible” signs. It is especially useful when data suffers from multicollinearity, that is some of your predictor variables are highly correlated. Unlike LASSO, ridge does not produce a sparse solution, but rather shrinks variables that are highly collinear towards zero."
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": null,
685 | "metadata": {
686 | "ExecuteTime": {
687 | "end_time": "2017-01-27T11:26:15.846904",
688 | "start_time": "2017-01-27T11:26:14.695671"
689 | },
690 | "collapsed": false,
691 | "scrolled": false
692 | },
693 | "outputs": [],
694 | "source": [
695 | "from sklearn import linear_model\n",
696 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n",
697 | "n_alphas = 200\n",
698 | "alphas = np.logspace(-2, 6, n_alphas)\n",
699 | "\n",
700 | "# Get optimal lambda\n",
701 | "clf = linear_model.RidgeCV(alphas=alphas)\n",
702 | "clf.fit(X,y)\n",
703 | "aestim = clf.alpha_ # Estimated regularization param\n",
704 | "\n",
705 | "# Get paths\n",
706 | "clf = linear_model.Ridge(fit_intercept=False)\n",
707 | "coefs = []\n",
708 | "for a in alphas:\n",
709 | " clf.set_params(alpha=a)\n",
710 | " clf.fit(X, y)\n",
711 | " coefs.append(clf.coef_)\n",
712 | "\n",
713 | "###############################################################################\n",
714 | "# Display results\n",
715 | "plt.figure(figsize=(15,8))\n",
716 | "\n",
717 | "ax = plt.gca()\n",
718 | "ax.plot(alphas, np.squeeze(coefs))\n",
719 | "ax.set_xscale('log')\n",
720 | "plt.axvline(aestim, linestyle='--', color='k', label='alpha: CV estimate')\n",
721 | "plt.legend()\n",
722 | "plt.xlabel('Lambda',fontsize=16)\n",
723 | "plt.ylabel('Weights',fontsize=16)\n",
724 | "plt.title('Ridge coefficients as a function of the regularization',fontsize=18)\n",
725 | "plt.axis('tight')"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {},
731 | "source": [
732 | "The points on the left vertical axis (the left ends of the lines) are the ordinary least squares regression values.\n",
733 | "These occur for k equal zero. As k is increased, the values of the regression estimates change, often wildly at first.\n",
734 | "At some point, the coefficients seem to settle down and then gradually drift towards zero.\n",
735 | "\n",
736 | "The task of the ridge regression analyst is to determine at what value of k these coefficients are at their stable\n",
737 | "values. A vertical line is drawn at the value selected for reporting purposes. It is anticipated that you would run\n",
738 | "the program several times until an appropriate value of k is determined. In our case, the selected alpha value is indicated by a vertical line at which a = 65.93. The smaller the value of k, the smaller the amount of bias that is included in the estimates."
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": null,
744 | "metadata": {
745 | "ExecuteTime": {
746 | "end_time": "2017-01-27T11:26:48.340880",
747 | "start_time": "2017-01-27T11:26:48.333706"
748 | },
749 | "collapsed": false
750 | },
751 | "outputs": [],
752 | "source": [
753 | "clf = linear_model.Ridge(fit_intercept=False)\n",
754 | "a = aestim\n",
755 | "clf.set_params(alpha=a)\n",
756 | "clf.fit(X, y)\n",
757 | "np.round(clf.coef_,3)"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {
764 | "ExecuteTime": {
765 | "end_time": "2017-01-27T11:26:56.928052",
766 | "start_time": "2017-01-27T11:26:56.884641"
767 | },
768 | "collapsed": false
769 | },
770 | "outputs": [],
771 | "source": [
772 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n",
773 | "penalty = aestim\n",
774 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=0)\n",
775 | "print regm.summary()"
776 | ]
777 | },
778 | {
779 | "cell_type": "markdown",
780 | "metadata": {
781 | "collapsed": true
782 | },
783 | "source": [
784 | "# Individual exercise\n",
785 | "Below is the information about the dataset. \n",
786 | "\n",
787 | "Attribute Information: \n",
788 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n",
789 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n",
790 | "3 age - student's age (numeric: from 15 to 22) \n",
791 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n",
792 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n",
793 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n",
794 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
795 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n",
796 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
797 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n",
798 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n",
799 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n",
800 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) \n",
801 | "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n",
802 | "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n",
803 | "16 schoolsup - extra educational support (binary: yes or no) \n",
804 | "17 famsup - family educational support (binary: yes or no) \n",
805 | "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n",
806 | "19 activities - extra-curricular activities (binary: yes or no) \n",
807 | "20 nursery - attended nursery school (binary: yes or no) \n",
808 | "21 higher - wants to take higher education (binary: yes or no) \n",
809 | "22 internet - Internet access at home (binary: yes or no) \n",
810 | "23 romantic - with a romantic relationship (binary: yes or no) \n",
811 | "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n",
812 | "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n",
813 | "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n",
814 | "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
815 | "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n",
816 | "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n",
817 | "30 absences - number of school absences (numeric: from 0 to 93) \n",
818 | "31 G1 - first period grade (numeric: from 0 to 20) \n",
819 | "31 G2 - second period grade (numeric: from 0 to 20) \n",
820 | "32 G3 - final grade (numeric: from 0 to 20, output target) \n",
821 | "\n",
822 | "## Exercise 1\n",
823 | "Before you begin, you may want to replace all the 'yes' or 'no' responses in the dataframe to numerical values such as 1 or 0\n",
824 | "using the df.replace() method. \n",
825 | "\n",
826 | "Run a logistic model to see whether studytime predicts 'wanting to take higher education'. \n",
827 | "\n",
828 | "Based on the model, what is the probability that John wants to pursue higher education if his\n",
829 | "studytime value is 2 (he studies 2 hours/week)?\n",
830 | "\n",
831 | "Based on the model what is the probability that Laura wants to pursue higher education if her\n",
832 | "studytime value is 3 (she studies 5 hours per week)? \n",
833 | "\n",
834 | "How much more likely (in odds) is Laura likely to want to pursue higher education than John ?\n",
835 | "\n",
836 | "## Exercise 2\n",
837 | "Run a LASSO regression to find what variables predict studytime. \n",
838 | "Use the penalty parameter (alpha) estimated through cross validation of coordinate descent. \n",
839 | "What is your interpretation of the results? "
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": null,
845 | "metadata": {
846 | "collapsed": true
847 | },
848 | "outputs": [],
849 | "source": []
850 | }
851 | ],
852 | "metadata": {
853 | "anaconda-cloud": {},
854 | "kernelspec": {
855 | "display_name": "Python 2",
856 | "language": "python",
857 | "name": "python2"
858 | },
859 | "language_info": {
860 | "codemirror_mode": {
861 | "name": "ipython",
862 | "version": 2
863 | },
864 | "file_extension": ".py",
865 | "mimetype": "text/x-python",
866 | "name": "python",
867 | "nbconvert_exporter": "python",
868 | "pygments_lexer": "ipython2",
869 | "version": "2.7.13"
870 | },
871 | "toc": {
872 | "nav_menu": {
873 | "height": "512px",
874 | "width": "252px"
875 | },
876 | "navigate_menu": true,
877 | "number_sections": true,
878 | "sideBar": true,
879 | "threshold": 4,
880 | "toc_cell": false,
881 | "toc_section_display": "block",
882 | "toc_window_display": false
883 | }
884 | },
885 | "nbformat": 4,
886 | "nbformat_minor": 0
887 | }
888 |
--------------------------------------------------------------------------------
/Notebooks/6_Introduction_to_SocialNetworkAnalysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Social Network Analysis \n",
11 | "Written by Jin Cheong & Luke Chang\n"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "ExecuteTime": {
19 | "end_time": "2017-02-13T09:24:16.143566",
20 | "start_time": "2017-02-13T09:24:15.172556"
21 | },
22 | "collapsed": false,
23 | "deletable": true,
24 | "editable": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "%matplotlib inline\n",
29 | "\n",
30 | "import numpy as np\n",
31 | "import matplotlib.pyplot as plt\n",
32 | "try:\n",
33 | " import networkx as nx\n",
34 | "except:\n",
35 | " # Install NetworkX\n",
36 | " !pip install networkx"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "deletable": true,
43 | "editable": true
44 | },
45 | "source": [
46 | "# Primer to Network Analysis\n",
47 | "\n",
48 | "A network is made up of two main components: nodes and edges. \n",
49 | "\n",
50 | "The nodes are the individuals, words, or entities that compose a network, and the edges are the links that connect one node to another. \n",
51 | "\n",
52 | "Here is an example of a node and an edge.\n",
53 | "\n",
54 | "
\n",
55 | "\n",
56 | "Now we can try drawing our own network using the NetworkX package.\n",
57 | "Let's start off by initializing a Network, and call it G. "
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "ExecuteTime": {
65 | "end_time": "2017-02-13T09:24:16.148155",
66 | "start_time": "2017-02-13T09:24:16.145064"
67 | },
68 | "collapsed": true,
69 | "deletable": true,
70 | "editable": true
71 | },
72 | "outputs": [],
73 | "source": [
74 | "# Initialize Graph object\n",
75 | "G = nx.Graph()"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {
81 | "deletable": true,
82 | "editable": true
83 | },
84 | "source": [
85 | "Now we can add a node using the .add_node('nodename') method"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "ExecuteTime": {
93 | "end_time": "2017-02-13T09:24:16.291054",
94 | "start_time": "2017-02-13T09:24:16.149742"
95 | },
96 | "collapsed": false,
97 | "deletable": true,
98 | "editable": true
99 | },
100 | "outputs": [],
101 | "source": [
102 | "G = nx.Graph()\n",
103 | "G.add_node('Jin')\n",
104 | "G.add_node('Luke')\n",
105 | "G.add_node('Eshin')\n",
106 | "plt.figure(figsize=(5,3))\n",
107 | "nx.draw(G,with_labels=True,node_size=5000,font_size=20,alpha=.5)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "deletable": true,
114 | "editable": true
115 | },
116 | "source": [
117 | "Notice there are no connections between the two nodes because we haven't added any edges yet.\n",
118 | "\n",
119 | "To add edges between nodes, for example node1 and node 2, we can use the .add_edge('node1','node2') method on our graph."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "ExecuteTime": {
127 | "end_time": "2017-02-13T09:24:16.452812",
128 | "start_time": "2017-02-13T09:24:16.292498"
129 | },
130 | "collapsed": false,
131 | "deletable": true,
132 | "editable": true,
133 | "scrolled": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "G = nx.Graph()\n",
138 | "G.add_edge('Jin','Luke',weight=1)\n",
139 | "G.add_edge('Jin','Eshin',weight=1)\n",
140 | "G.add_edge('Luke','Eshin',weight=1)\n",
141 | "plt.figure(figsize=(5,3))\n",
142 | "pos = nx.spring_layout(G) # One way of specifiying a layout for nodes.\n",
143 | "nx.draw(G,pos,with_labels=True,node_size=5000,font_size=20,alpha=.5)"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "deletable": true,
150 | "editable": true
151 | },
152 | "source": [
153 | "Now you can see that we have an edge connecting Jin and Luke. \n",
154 | "\n",
155 | "There are several different types of graphs.\n",
156 | "\n",
157 | "1. **weighted** vs **unweighted** graphs\n",
158 | " - a graph is said to be **unweighted** if every connection is either 1 or 0\n",
159 | " - it is **weighted** if the edges take on other values indicating the strength of the relationship.\n",
160 | "\n",
161 | "\n",
162 | "2. **directed** vs **undirected** graphs\n",
163 | " - a graphs is **undirected** if the edges are symmetrical\n",
164 | " - it is **directed** if the edges are asymmetrical, meaning that each edge can indicate the direction of the relationship\n",
165 | "\n",
166 | "For simplicity, today we will only cover unweighted and undirected graphs. "
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "There are multiple ways to indicate nodes and edges in a graph. \n",
174 | "As illustrated above, we can manually add each node and edge. This approach is fine for small graphs, but won't scale well. It is also possible to add nodes is to use a python dictionary.\n",
175 | "The key is the node and the values indicate what that key(node) connects to. "
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "ExecuteTime": {
183 | "end_time": "2017-02-13T09:24:16.903054",
184 | "start_time": "2017-02-13T09:24:16.454433"
185 | },
186 | "collapsed": false,
187 | "deletable": true,
188 | "editable": true
189 | },
190 | "outputs": [],
191 | "source": [
192 | "d = {'Jin':['Luke','Eshin','Antonia','Andy','Sophie','Rob','Charlie','Vanessa'], \n",
193 | " 'Luke':['Eshin','Antonia','Seth','Andy','Sophie','Rob','Charlie','Vanessa'],\n",
194 | " 'Antonia':['Luke','Jin','Eshin','Seth','Andy'],\n",
195 | " 'Eshin':['Heidi','Antonia','Jin','Sam','Andy'],\n",
196 | " 'Sophie':['Rob','Vanessa'],\n",
197 | " 'Rob':['Sophie','Vanessa']}\n",
198 | "G = nx.Graph(d)\n",
199 | "plt.figure(figsize=(15,8))\n",
200 | "np.random.seed(2) # Just to keep things same\n",
201 | "pos = pos = nx.fruchterman_reingold_layout(G) # Another way of specifiying a layout for nodes.\n",
202 | "nx.draw(G,with_labels=True,node_size=1500,font_size=20,alpha=.3,width=2)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "deletable": true,
209 | "editable": true
210 | },
211 | "source": [
212 | "# What should we look for in a network ? \n",
213 | "\n",
214 | "## Micromeasures\n",
215 | "Centrality measures track the importance of a node derived from its position in the network.\n",
216 | "\n",
217 | "There are four main groups of centrality depending on the type of statistics used:\n",
218 | "1. degree - how connected a node is\n",
219 | "2. betweenness - how important a node is in connecting other nodes\n",
220 | "3. closeness - how easily a node can reach other nodes\n",
221 | "4. neighbors' characteristics - how important, central, or influential a node's neighbors are. \n",
222 | "\n",
223 | "
\n",
224 | "\n",
225 | "Reference:\n",
226 | "Social and Economic Networks by Matthew O. Jackson. Princeton University Press. \n",
227 | "Picture: [Wikipedia article](https://en.wikipedia.org/wiki/Centrality) \n",
228 | "Other examples: http://www.orgnet.com/sna.html \n",
229 | "\n",
230 | "## Degree Centrality\n",
231 | "\n",
232 | "The degree centrality measures the number of edges of a given node. \n",
233 | "\n",
234 | "In other words, it measures how connected a node is."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {
241 | "ExecuteTime": {
242 | "end_time": "2017-02-13T09:24:17.296820",
243 | "start_time": "2017-02-13T09:24:16.904690"
244 | },
245 | "collapsed": false,
246 | "deletable": true,
247 | "editable": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "def print_network(val_map,title=None):\n",
252 | " print('\\x1b[1;31m'+title+'\\x1b[0m')\n",
253 | " for k, v in val_map.iteritems():\n",
254 | " print k.ljust(15), str(round(v,3)).ljust(30)\n",
255 | "\n",
256 | "def plot_network(val_map, title=None):\n",
257 | " values = [val_map.get(node, 0.25) for node in G.nodes()]\n",
258 | " plt.figure(figsize=(12,5))\n",
259 | " np.random.seed(2)\n",
260 | "# pos = nx.spring_layout(G)\n",
261 | " pos = nx.fruchterman_reingold_layout(G,dim=2)\n",
262 | " ec = nx.draw_networkx_edges(G,pos,alpha=.2)\n",
263 | " nc = nx.draw_networkx_nodes(G,pos,node_size=1000,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n",
264 | " nx.draw_networkx_labels(G,pos,font_size=18)\n",
265 | " # nx.draw(G,pos,node_size=400,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n",
266 | " plt.colorbar(nc)\n",
267 | " plt.axis('off')\n",
268 | " plt.suptitle(title,fontsize=18)\n",
269 | " plt.show()\n",
270 | "\n",
271 | "# Get degree centrality values\n",
272 | "d = nx.degree_centrality(G)\n",
273 | "title = 'Degree Centrality Map'\n",
274 | "# print centrality values\n",
275 | "print_network(d,title)\n",
276 | "# plot graph with values\n",
277 | "plot_network(d,title)"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {
283 | "deletable": true,
284 | "editable": true
285 | },
286 | "source": [
287 | "Luke has the highest number of connections and therefore has the highest degree centrality. Jin is next, and then Eshin.\n",
288 | "\n",
289 | "# Betweenness Centrality\n",
290 | "Think of Between Centrality as a bottleneck or a broker of two separate networks. \n",
291 | "\n",
292 | "The measure is the fraction of the total number of shortest paths a node lie on between two other nodes.\n",
293 | "\n",
294 | "Here Eshin has the highest betweeness centrality because he connects the most different people."
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {
301 | "ExecuteTime": {
302 | "end_time": "2017-02-13T09:24:17.659747",
303 | "start_time": "2017-02-13T09:24:17.298352"
304 | },
305 | "collapsed": false,
306 | "deletable": true,
307 | "editable": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "d = nx.betweenness_centrality(G)\n",
312 | "title = \"Betweenness Centrality\"\n",
313 | "print_network(d,title)\n",
314 | "plot_network(d,title)"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {
320 | "deletable": true,
321 | "editable": true
322 | },
323 | "source": [
324 | "# Closeness Centrality\n",
325 | "The closeness centrality measures how close a given node is to any other node. \n",
326 | "This is measured by the average length of the shortest path between a node and all other nodes. \n",
327 | "Thus you have higher closeness centrality if you can get to everyone faster than others. "
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "ExecuteTime": {
335 | "end_time": "2017-02-13T09:24:18.015038",
336 | "start_time": "2017-02-13T09:24:17.661151"
337 | },
338 | "collapsed": false,
339 | "deletable": true,
340 | "editable": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "d = nx.closeness_centrality(G)\n",
345 | "title = \"Closeness Centrality\"\n",
346 | "print_network(d,title)\n",
347 | "plot_network(d,title)"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {
353 | "deletable": true,
354 | "editable": true
355 | },
356 | "source": [
357 | "# Eigenvector Centrality\n",
358 | "\n",
359 | "The underlying assumption of the Eigenvector centrality is that a node's importance is determined by how important its neighbors are. \n",
360 | "\n",
361 | "For instance, having more influential friends (betweeness centrality) can be more important than just having more number of friends (degree centrality). \n",
362 | "\n",
363 | "Now we can observe that Luke is back to being more important than Eshin (who had higher betweeness centrality), because Luke is friends with Eshin who also has high centrality."
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "ExecuteTime": {
371 | "end_time": "2017-02-13T09:24:18.378309",
372 | "start_time": "2017-02-13T09:24:18.016469"
373 | },
374 | "collapsed": false,
375 | "deletable": true,
376 | "editable": true
377 | },
378 | "outputs": [],
379 | "source": [
380 | "d = nx.eigenvector_centrality(G)\n",
381 | "title = \"Eigenvector Centrality\"\n",
382 | "print_network(d,title)\n",
383 | "plot_network(d,title)"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {
389 | "deletable": true,
390 | "editable": true
391 | },
392 | "source": [
393 | "## Macromeasures\n",
394 | "\n",
395 | "Macromeasures are useful to understand the network as a whole or to compare networks. \n",
396 | "\n",
397 | "We will cover three fundamental measures of the network structure.\n",
398 | "1. Degree distribution\n",
399 | "2. Average shortest path \n",
400 | "3. Clustering Coefficients\n",
401 | "\n",
402 | "## Degree Distribution\n",
403 | "One fundamental characteristic is the distribution of degrees, number of edges for each node, which we can easily plot. \n",
404 | "From this distribution we can discern attributes such as whether everyone is connected to everybody are there many lonely nodes and such."
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {
411 | "ExecuteTime": {
412 | "end_time": "2017-02-13T09:24:19.263314",
413 | "start_time": "2017-02-13T09:24:18.379752"
414 | },
415 | "collapsed": false,
416 | "deletable": true,
417 | "editable": true
418 | },
419 | "outputs": [],
420 | "source": [
421 | "degree_sequence=sorted([d for n,d in G.degree().iteritems()], reverse=True) # degree sequence\n",
422 | "\n",
423 | "G2=nx.complete_graph(5)\n",
424 | "degree_sequence2=sorted([d for n,d in G2.degree().iteritems()], reverse=True) # degree sequence\n",
425 | "\n",
426 | "fig, [(ax1,ax2),(ax3,ax4)] = plt.subplots(2,2,figsize=(15,10))\n",
427 | "ax1.hist(degree_sequence,bins=range(0,7),rwidth=.8)\n",
428 | "ax1.set_title(\"Degree Histogram for Graph 1\",fontsize=16)\n",
429 | "ax1.set_ylabel(\"Count\",fontsize=14)\n",
430 | "ax1.set_xlabel(\"Degree\",fontsize=14)\n",
431 | "ax1.set_xticks([d+0.4 for d in degree_sequence])\n",
432 | "ax1.set_xticklabels([d for d in degree_sequence])\n",
433 | "ax1.set_ylim((0,6))\n",
434 | "\n",
435 | "nx.draw(G,pos=nx.circular_layout(G),ax=ax2)\n",
436 | "ax2.set_title('Network 1: Graph of our class',fontsize=16)\n",
437 | "\n",
438 | "ax3.hist(degree_sequence2,bins=range(0,7),rwidth=.8)\n",
439 | "ax3.set_title(\"Degree Histogram for Complete Graph\",fontsize=16)\n",
440 | "ax3.set_ylabel(\"Count\",fontsize=14)\n",
441 | "ax3.set_xlabel(\"Degree\",fontsize=14)\n",
442 | "ax3.set_xticks([d+0.4 for d in degree_sequence2])\n",
443 | "ax3.set_xticklabels([d for d in degree_sequence2])\n",
444 | "ax3.set_ylim((0,6))\n",
445 | "\n",
446 | "nx.draw(G2,pos=nx.circular_layout(G2),ax=ax4)\n",
447 | "ax4.set_title('Network 2: Graph of a Complete Graph',fontsize=16)\n",
448 | "\n",
449 | "plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)\n",
450 | "plt.show()"
451 | ]
452 | },
453 | {
454 | "cell_type": "markdown",
455 | "metadata": {
456 | "deletable": true,
457 | "editable": true
458 | },
459 | "source": [
460 | "## Average Shortest Path\n",
461 | "\n",
462 | "The average shortest path length is the average of calculating the shortest number of edges it takes for every pair of nodes in the network. \n",
463 | "\n",
464 | "This can be a mesure of how efficient/fast the network is in the distribution of information, rumors, disease, etc."
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {
471 | "ExecuteTime": {
472 | "end_time": "2017-02-13T09:24:19.269671",
473 | "start_time": "2017-02-13T09:24:19.265105"
474 | },
475 | "collapsed": false,
476 | "deletable": true,
477 | "editable": true,
478 | "scrolled": true
479 | },
480 | "outputs": [],
481 | "source": [
482 | "print 'Network 1 average shortest path: ' + str(round(nx.average_shortest_path_length(G),2))\n",
483 | "print 'Network 2 average shortest path: ' + str(nx.average_shortest_path_length(G2))"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {
489 | "deletable": true,
490 | "editable": true
491 | },
492 | "source": [
493 | "## Clustering coefficient. \n",
494 | "\n",
495 | "Another way to look at how tight-knit a network is to look at how clustered it is. \n",
496 | "To estimate clustering, we start from cliques, which can be considered as a subnetwork composed of three nodes forming a triangle. \n",
497 | "\n",
498 | "We can first calculate for each node, how many cliques(triangles) it is a member of. \n",
499 | "Then we calculate transitivity, which is simply the number of triangles in the network divided by the number of all possible traingles."
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {
506 | "ExecuteTime": {
507 | "end_time": "2017-02-13T09:24:19.633788",
508 | "start_time": "2017-02-13T09:24:19.271359"
509 | },
510 | "collapsed": false,
511 | "deletable": true,
512 | "editable": true,
513 | "scrolled": false
514 | },
515 | "outputs": [],
516 | "source": [
517 | "d = nx.triangles(G)\n",
518 | "print_network(d,'Number of cliques(triangles)')\n",
519 | "print \n",
520 | "print 'Transitivity : ' + str(nx.transitivity(G))\n",
521 | "\n",
522 | "plot_network(d,'Transitivity')"
523 | ]
524 | },
525 | {
526 | "cell_type": "markdown",
527 | "metadata": {
528 | "deletable": true,
529 | "editable": true
530 | },
531 | "source": [
532 | "A similar approach is to calculate clustering coefficient for each node and for the graph.\n",
533 | "Clustering is the measure of how likely it is if A has edges to B and C, that B and C related? \n",
534 | "\n",
535 | "For instance, Charlie form a triangle with Luke and Jin, but Sam and Heidi don't form a single triangle.\n",
536 | "\n",
537 | "We can calculate this for each node and then get an average value overall to get the average clustering coefficient."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {
544 | "ExecuteTime": {
545 | "end_time": "2017-02-13T09:24:19.642416",
546 | "start_time": "2017-02-13T09:24:19.635348"
547 | },
548 | "collapsed": false,
549 | "deletable": true,
550 | "editable": true
551 | },
552 | "outputs": [],
553 | "source": [
554 | "d = nx.clustering(G)\n",
555 | "print_network(d,'Clustering Coefficients per node')\n",
556 | "print \n",
557 | "print 'Average clustering coefficient : ' + str(nx.average_clustering(G))"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": null,
563 | "metadata": {
564 | "collapsed": true,
565 | "deletable": true,
566 | "editable": true
567 | },
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {
574 | "collapsed": true,
575 | "deletable": true,
576 | "editable": true
577 | },
578 | "source": [
579 | "# Exercise\n",
580 | "Your assignment this week is to construct your own social network. It could be based on a sports team, your close friends, or family. It could also reflect more abstract aspects of relationships such as social support, gossip, problem solver, etc.\n",
581 | "\n",
582 | "Calculate two different metrics of the network to provide insight into how different members play different roles."
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {
589 | "collapsed": true,
590 | "deletable": true,
591 | "editable": true
592 | },
593 | "outputs": [],
594 | "source": []
595 | }
596 | ],
597 | "metadata": {
598 | "anaconda-cloud": {},
599 | "kernelspec": {
600 | "display_name": "Python [default]",
601 | "language": "python",
602 | "name": "python2"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 2
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython2",
614 | "version": "2.7.13"
615 | },
616 | "toc": {
617 | "nav_menu": {
618 | "height": "260px",
619 | "width": "252px"
620 | },
621 | "navigate_menu": true,
622 | "number_sections": true,
623 | "sideBar": true,
624 | "threshold": 4,
625 | "toc_cell": false,
626 | "toc_section_display": "block",
627 | "toc_window_display": false
628 | }
629 | },
630 | "nbformat": 4,
631 | "nbformat_minor": 0
632 | }
633 |
--------------------------------------------------------------------------------
/Notebooks/7_Introduction_to_Scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Introduction to Web Scraping\n",
11 | "Often we are interested in getting data from a website. Modern websites are often built using a [REST](https://en.wikipedia.org/wiki/Representational_state_transfer) framework that has an Application Programming Interface ([API](https://en.wikipedia.org/wiki/Application_programming_interface)) to make [HTTP](https://www.tutorialspoint.com/http/http_requests.htm) requests to retrieve structured data in the form of [JSON](https://en.wikipedia.org/wiki/JSON) or XML.\n",
12 | "\n",
13 | "However, when there is not a clear API, we might need to perform web scraping by directly grabbing the data ourselves"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {
20 | "ExecuteTime": {
21 | "end_time": "2017-03-06T13:26:29.942689Z",
22 | "start_time": "2017-03-06T08:26:29.777081-05:00"
23 | },
24 | "collapsed": true,
25 | "deletable": true,
26 | "editable": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "try:\n",
31 | " import requests\n",
32 | "except:\n",
33 | " !pip install requests\n",
34 | "try:\n",
35 | " from bs4 import BeautifulSoup\n",
36 | "except:\n",
37 | " !pip install bs4"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "deletable": true,
44 | "editable": true
45 | },
46 | "source": [
47 | "## Getting data using requests\n",
48 | "[Requests](http://docs.python-requests.org/en/master/) is an excellent library for performing HTTP requests. \n",
49 | "\n",
50 | "In this simple example, we will scrape data from the PBS faculty webpage."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "ExecuteTime": {
58 | "end_time": "2017-03-06T13:26:32.826356Z",
59 | "start_time": "2017-03-06T08:26:32.720782-05:00"
60 | },
61 | "collapsed": false,
62 | "deletable": true,
63 | "editable": true
64 | },
65 | "outputs": [],
66 | "source": [
67 | "page = requests.get(\"http://pbs.dartmouth.edu/people\")\n",
68 | "print(page)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "deletable": true,
75 | "editable": true
76 | },
77 | "source": [
78 | "Here the response '200' indicates that the get request was successful. Now let's look at the actual text that was downloaded from the webpage."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "ExecuteTime": {
86 | "end_time": "2017-03-06T05:28:31.476250Z",
87 | "start_time": "2017-03-06T00:28:31.463432-05:00"
88 | },
89 | "collapsed": false,
90 | "deletable": true,
91 | "editable": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "print(page.content)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {
101 | "deletable": true,
102 | "editable": true
103 | },
104 | "source": [
105 | "Here you can see that we have downloaded all of the data from the PBS faculty page and that it is in the form of HTML. \n",
106 | "HTML is a markup language that tells a browser how to layout content. HTML consists of elements called tags. Each tag indicates a beginning and end. Here are a few examples: \n",
107 | "\n",
108 | " - `` - indicates hyperlink\n",
109 | " - `` - indicates paragraph\n",
110 | " - `` - indicates a division, or area, of the page.\n",
111 | " - `` - bolds any text inside.\n",
112 | " - `` - italicizes any text inside.\n",
113 | " - `` - indicates a header\n",
114 | " - `` - creates a table.\n",
115 | " - `
` - ordered list\n",
116 | " - `` - unordered list\n",
117 | " - `` - list item\n"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {
123 | "deletable": true,
124 | "editable": true
125 | },
126 | "source": [
127 | "## Parsing HTML using Beautiful Soup\n",
128 | "There are many libraries that can be helpful for quickly parsing structured text such as HTML. We will be using Beautiful Soup as an example."
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {
135 | "ExecuteTime": {
136 | "end_time": "2017-03-06T13:26:39.183625Z",
137 | "start_time": "2017-03-06T08:26:39.071917-05:00"
138 | },
139 | "collapsed": false,
140 | "deletable": true,
141 | "editable": true
142 | },
143 | "outputs": [],
144 | "source": [
145 | "soup = BeautifulSoup(page.content, 'html.parser')\n",
146 | "print(soup.prettify())"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {
152 | "deletable": true,
153 | "editable": true
154 | },
155 | "source": [
156 | "Here we are going to find the unordered list tagged with the id 'faculty-container'. We are then going to look for any nested tag that use the 'h4' header tag. This should give us all of the lines with the faculty names as a list."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "ExecuteTime": {
164 | "end_time": "2017-03-06T13:26:41.798326Z",
165 | "start_time": "2017-03-06T08:26:41.769082-05:00"
166 | },
167 | "collapsed": false,
168 | "deletable": true,
169 | "editable": true
170 | },
171 | "outputs": [],
172 | "source": [
173 | "names_html = soup.find_all('ul',id='faculty-container')[0].find_all('h4')\n",
174 | "names = [x.text for x in names_html]\n",
175 | "print(names)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {
181 | "deletable": true,
182 | "editable": true
183 | },
184 | "source": [
185 | "What if we wanted to get all of the faculty email addresses?"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "ExecuteTime": {
193 | "end_time": "2017-03-06T13:26:45.729051Z",
194 | "start_time": "2017-03-06T08:26:45.696816-05:00"
195 | },
196 | "collapsed": false,
197 | "deletable": true,
198 | "editable": true
199 | },
200 | "outputs": [],
201 | "source": [
202 | "email_html = soup.find_all('ul',id='faculty-container')[0].find_all('span',{'class' : 'contact'})\n",
203 | "email = [x.text for x in email_html]\n",
204 | "print(email)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "deletable": true,
211 | "editable": true
212 | },
213 | "source": [
214 | "## Parsing string data\n",
215 | "What if we wanted to grab the name from the list of email addresses?"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "ExecuteTime": {
223 | "end_time": "2017-03-06T06:16:47.161190Z",
224 | "start_time": "2017-03-06T01:16:47.154047-05:00"
225 | },
226 | "collapsed": false,
227 | "deletable": true,
228 | "editable": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "print([x.split('@')[0] for x in email])"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {
238 | "collapsed": true,
239 | "deletable": true,
240 | "editable": true
241 | },
242 | "source": [
243 | "One thing we might do with this data is create a dictionary with names and emails of all of the professors in the department. This could be useful if we wanted to send a bulk email to them. "
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "ExecuteTime": {
251 | "end_time": "2017-03-06T13:37:49.514990Z",
252 | "start_time": "2017-03-06T08:37:49.509404-05:00"
253 | },
254 | "collapsed": false,
255 | "deletable": true,
256 | "editable": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "email_dict = dict([(x.split('@')[0],x) for x in email])\n",
261 | "print(email_dict)"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "You can see that every name also includes an initial. Let's try to just pull out the first and last name."
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "ExecuteTime": {
276 | "end_time": "2017-03-06T13:37:51.752857Z",
277 | "start_time": "2017-03-06T08:37:51.747826-05:00"
278 | },
279 | "collapsed": false
280 | },
281 | "outputs": [],
282 | "source": [
283 | "for x in email_dict.keys():\n",
284 | " old = x.split('.')\n",
285 | " email_dict[\" \".join([i for i in old if len(i) > 2])] = email_dict[x]\n",
286 | " del email_dict[x]\n",
287 | "\n",
288 | "print(email_dict)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {
294 | "deletable": true,
295 | "editable": true
296 | },
297 | "source": [
298 | "## Interacting with web page using Selenium\n",
299 | "Sometimes we need to directly interact with aspects of the webpage. Maybe there is a form that needs to be submitted or a javascript button that needs to be pressed. [Selenium](http://selenium-python.readthedocs.io/) is a useful tool for these types of tasks."
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": true
307 | },
308 | "outputs": [],
309 | "source": []
310 | }
311 | ],
312 | "metadata": {
313 | "kernelspec": {
314 | "display_name": "Python 2",
315 | "language": "python",
316 | "name": "python2"
317 | },
318 | "language_info": {
319 | "codemirror_mode": {
320 | "name": "ipython",
321 | "version": 2
322 | },
323 | "file_extension": ".py",
324 | "mimetype": "text/x-python",
325 | "name": "python",
326 | "nbconvert_exporter": "python",
327 | "pygments_lexer": "ipython2",
328 | "version": "2.7.13"
329 | }
330 | },
331 | "nbformat": 4,
332 | "nbformat_minor": 2
333 | }
334 |
--------------------------------------------------------------------------------
/Notebooks/Figures/centrality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/centrality.png
--------------------------------------------------------------------------------
/Notebooks/Figures/estimating_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/estimating_coefficients.png
--------------------------------------------------------------------------------
/Notebooks/Figures/hw2-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-3.png
--------------------------------------------------------------------------------
/Notebooks/Figures/hw2-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-4.png
--------------------------------------------------------------------------------
/Notebooks/Figures/mediation1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/mediation1.png
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator1.gif
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator2.gif
--------------------------------------------------------------------------------
/Notebooks/Figures/moderator3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator3.jpeg
--------------------------------------------------------------------------------
/Notebooks/Figures/nodeedge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/nodeedge.png
--------------------------------------------------------------------------------
/Notebooks/Figures/slope_intercept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/slope_intercept.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # psyc63
2 | This GitHub repository provides lab materials that accompany Psyc63 Experimental Study of Behavior at Dartmouth College. Labs will consist of working through [Jupyter Notebooks](http://jupyter.org/) that contain exercises in Python. Jupyter Notebooks can be installed by downloading and installing [Anaconda Python 2.7](https://www.continuum.io/downloads).
3 |
4 | ## Jupyter Notebooks
5 | ### Introduction to Python
6 | - Installation
7 | - Notebooks
8 | - Variables
9 | - Loops and Conditional Statements
10 | - Functions
11 | - Containers
12 | - Modules
13 |
14 | ### Data Analysis I
15 | - Data Checking
16 | - Data Plotting
17 | - Linear Model Regression
18 | - Model Comparisons
19 |
20 | ### Data Analysis II
21 | - Logistic Regression
22 | - Classification
23 | - Prediction
24 | - Cross-validation
25 | - Regularization Techniques
26 | - Lasso & Ridge
27 |
28 | ### Data Analysis III
29 | - Mediation Analysis
30 | - Moderation Analysis
31 |
32 | ### Social Network Analysis
33 | - Primer to Social Network Analysis
34 | - NetworkX
35 | - Micromeasures
36 | - degree centrality
37 | - betweenness centrality
38 | - closeness centrality
39 | - Eigenvector centrality
40 | - Macromeasures
41 | - Degree Distribution
42 | - Average shortest path
43 | - Clustering coefficients
44 |
45 |
46 |
--------------------------------------------------------------------------------