├── .gitignore ├── Data ├── 13May16 │ ├── contains.txt │ ├── subjects.p │ └── users.p ├── CrowdstormingDataJuly1st.csv ├── love_those_books.csv ├── salary.csv ├── salary_exercise.csv └── student-mat.csv ├── LICENSE ├── Notebooks ├── .gitignore ├── 1_Introduction_to_Programming.ipynb ├── 2.1_MatplotlibTemplates.ipynb ├── 2_Introduction_to_Dataframes_&_Plotting.ipynb ├── 3_Introduction_to_Regression.ipynb ├── 4_Mediation_&_Moderation.ipynb ├── 5_Prediction_&_Regularization.ipynb ├── 6_Introduction_to_SocialNetworkAnalysis.ipynb ├── 7_Introduction_to_Scraping.ipynb ├── Figures │ ├── centrality.png │ ├── estimating_coefficients.png │ ├── hw2-3.png │ ├── hw2-4.png │ ├── mediation1.png │ ├── moderator1.gif │ ├── moderator2.gif │ ├── moderator3.jpeg │ ├── nodeedge.png │ └── slope_intercept.png └── Old │ ├── Classification.ipynb │ └── phacking.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /Data/13May16/contains.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Data/13May16/contains.txt -------------------------------------------------------------------------------- /Data/13May16/users.p: -------------------------------------------------------------------------------- 1 | (lp0 2 | (dp1 3 | Vservices 4 | p2 5 | (dp3 6 | Vpassword 7 | p4 8 | (dp5 9 | Vbcrypt 10 | p6 11 | V$2a$10$pbAb8K6OmsOmonCzbGRf0e1MrLp7ec1JnYIzEctMLyn9btOpzvAmG 12 | p7 13 | ssVresume 14 | p8 15 | (dp9 16 | VloginTokens 17 | p10 18 | (lp11 19 | sssV_id 20 | p12 21 | Vogrw4dL9mksHEKigC 22 | p13 23 | sVemails 24 | p14 25 | (lp15 26 | (dp16 27 | Vverified 28 | p17 29 | I00 30 | sVaddress 31 | p18 32 | Vejolly@test.com 33 | p19 34 | sasVcreatedAt 35 | p20 36 | cdatetime 37 | datetime 38 | p21 39 | (S'\x07\xe0\x05\x08\x17#\x1d\n\xcd\xa0' 40 | p22 41 | tp23 42 | Rp24 43 | sa(dp25 44 | 
Vservices 45 | p26 46 | (dp27 47 | Vpassword 48 | p28 49 | (dp29 50 | Vbcrypt 51 | p30 52 | V$2a$10$5P3UUGulQ7RACVROaU56l.H1gYvRhDbuvmuXM7UXHzC/6iyt3gOJ. 53 | p31 54 | ssVresume 55 | p32 56 | (dp33 57 | VloginTokens 58 | p34 59 | (lp35 60 | sssV_id 61 | p36 62 | VGwbhewAmNJ3nA8g4d 63 | p37 64 | sVemails 65 | p38 66 | (lp39 67 | (dp40 68 | Vverified 69 | p41 70 | I00 71 | sVaddress 72 | p42 73 | Vjames.e.carmichael.gr@dartmouth.edu 74 | p43 75 | sasVcreatedAt 76 | p44 77 | g21 78 | (S"\x07\xe0\x05\x08\x17'6\x0bZ@" 79 | p45 80 | tp46 81 | Rp47 82 | sa(dp48 83 | Vservices 84 | p49 85 | (dp50 86 | Vpassword 87 | p51 88 | (dp52 89 | Vbcrypt 90 | p53 91 | V$2a$10$ymYsiX7HMD4inQ20MHD/VutAr844y3JvmjeOcqLH4wu8krSX8fUXK 92 | p54 93 | ssVresume 94 | p55 95 | (dp56 96 | VloginTokens 97 | p57 98 | (lp58 99 | sssV_id 100 | p59 101 | VtksS632f4u9JtZ3LZ 102 | p60 103 | sVemails 104 | p61 105 | (lp62 106 | (dp63 107 | Vverified 108 | p64 109 | I00 110 | sVaddress 111 | p65 112 | Vcbentivoglio@middlebury.edu 113 | p66 114 | sasVcreatedAt 115 | p67 116 | g21 117 | (S'\x07\xe0\x05\t\x006;\x02Y\x90' 118 | p68 119 | tp69 120 | Rp70 121 | sa(dp71 122 | Vservices 123 | p72 124 | (dp73 125 | Vpassword 126 | p74 127 | (dp75 128 | Vbcrypt 129 | p76 130 | V$2a$10$uBmZiohM3ZdT/EpsZWAlUeffalyYaZAToT.wtqAmPEMTABfWk14wi 131 | p77 132 | ssVresume 133 | p78 134 | (dp79 135 | VloginTokens 136 | p80 137 | (lp81 138 | sssV_id 139 | p82 140 | VQa8shq3L9omL5xHKo 141 | p83 142 | sVemails 143 | p84 144 | (lp85 145 | (dp86 146 | Vverified 147 | p87 148 | I00 149 | sVaddress 150 | p88 151 | Vtest@test.com 152 | p89 153 | sasVcreatedAt 154 | p90 155 | g21 156 | (S'\x07\xe0\x05\t\x10\x1d#\x05\xcc`' 157 | p91 158 | tp92 159 | Rp93 160 | sa(dp94 161 | Vservices 162 | p95 163 | (dp96 164 | Vpassword 165 | p97 166 | (dp98 167 | Vbcrypt 168 | p99 169 | V$2a$10$0Bhtui2bp05B9GyS3KBhtubQz9xlVCXYhDvefDAW47TCS/iPxRNUG 170 | p100 171 | ssVresume 172 | p101 173 | (dp102 174 | VloginTokens 175 | p103 176 | (lp104 177 | 
(dp105 178 | VhashedToken 179 | p106 180 | VbyXZrVYdeasFKoHzaKN0kjoffxeHAEIdcJSJZttl2Ik= 181 | p107 182 | sVwhen 183 | p108 184 | g21 185 | (S"\x07\xe0\x05\n\x16\x18'\x07V\xe8" 186 | p109 187 | tp110 188 | Rp111 189 | sa(dp112 190 | VhashedToken 191 | p113 192 | V5deRxFJTT8QhMfCES+CpwsQ2R31GgE8/RYnVb2UHw7g= 193 | p114 194 | sVwhen 195 | p115 196 | g21 197 | (S'\x07\xe0\x05\n\x16\x19.\t\xa0\xd8' 198 | p116 199 | tp117 200 | Rp118 201 | sa(dp119 202 | VhashedToken 203 | p120 204 | V9ToUywNIWQBE9ZFr1ZK5hNBafsheg1zqOSs8IMCr9YE= 205 | p121 206 | sVwhen 207 | p122 208 | g21 209 | (S'\x07\xe0\x05\n\x17/\x01\x05\xcc`' 210 | p123 211 | tp124 212 | Rp125 213 | sasssV_id 214 | p126 215 | V3fcdRR5Y3hzdABSXn 216 | p127 217 | sVemails 218 | p128 219 | (lp129 220 | (dp130 221 | Vverified 222 | p131 223 | I00 224 | sVaddress 225 | p132 226 | Vdanrwhitcomb@gmail.com 227 | p133 228 | sasVcreatedAt 229 | p134 230 | g21 231 | (S'\x07\xe0\x05\t\x17#\x19\x01\xd0\xd8' 232 | p135 233 | tp136 234 | Rp137 235 | sa(dp138 236 | Vservices 237 | p139 238 | (dp140 239 | Vpassword 240 | p141 241 | (dp142 242 | Vbcrypt 243 | p143 244 | V$2a$10$eKtiz4QTXwGoGksJumGP0O6NQc9QhzlDHWC91gRop4ZKmbTbgpQ52 245 | p144 246 | ssVresume 247 | p145 248 | (dp146 249 | VloginTokens 250 | p147 251 | (lp148 252 | sssV_id 253 | p149 254 | Vm3q2uoH5waEFCdKLQ 255 | p150 256 | sVemails 257 | p151 258 | (lp152 259 | (dp153 260 | Vverified 261 | p154 262 | I00 263 | sVaddress 264 | p155 265 | Vanna.t.prescott.gr@dartmouth.edu 266 | p156 267 | sasVcreatedAt 268 | p157 269 | g21 270 | (S'\x07\xe0\x05\r\x0f\x1e\x03\x08)\xd8' 271 | p158 272 | tp159 273 | Rp160 274 | sa(dp161 275 | Vservices 276 | p162 277 | (dp163 278 | Vpassword 279 | p164 280 | (dp165 281 | Vbcrypt 282 | p166 283 | V$2a$10$URpEJIxBHRr4WHldL4jLkuGBnrIkLGXrEIvpwVeRcnU1Y17XgUOye 284 | p167 285 | ssVresume 286 | p168 287 | (dp169 288 | VloginTokens 289 | p170 290 | (lp171 291 | sssV_id 292 | p172 293 | VhZHxzvxjpzMzA4nJT 294 | p173 295 | sVemails 296 | p174 297 
| (lp175 298 | (dp176 299 | Vverified 300 | p177 301 | I00 302 | sVaddress 303 | p178 304 | Varatidoo@gmail.com 305 | p179 306 | sasVcreatedAt 307 | p180 308 | g21 309 | (S'\x07\xe0\x05\r\x13&\x00\x08\x7f\xc8' 310 | p181 311 | tp182 312 | Rp183 313 | sa. -------------------------------------------------------------------------------- /Data/love_those_books.csv: -------------------------------------------------------------------------------- 1 | enjoy,buy,read 4,16,6 15,19,13 1,0,1 11,19,13 13,25,12 19,24,11 6,22,7 10,21,8 15,13,12 3,7,4 11,28,15 20,31,14 7,4,7 11,26,14 10,11,9 6,12,5 7,14,7 18,16,12 8,20,10 2,13,6 7,12,9 12,23,13 13,22,9 15,19,13 4,12,9 3,10,5 9,7,7 7,22,8 10,7,8 2,0,2 15,16,7 1,17,6 3,11,9 6,5,9 13,29,15 15,29,11 16,20,9 14,16,7 1,3,2 8,8,10 -------------------------------------------------------------------------------- /Data/salary.csv: -------------------------------------------------------------------------------- 1 | salary,gender,departm,years,age,publications 86285,0,bio,26,64,72 77125,0,bio,28,58,43 71922,0,bio,10,38,23 70499,0,bio,16,46,64 66624,0,bio,11,41,23 64451,0,bio,23,60,44 64366,0,bio,23,53,22 59344,0,bio,5,40,11 58560,0,bio,8,38,8 58294,0,bio,20,50,12 56092,0,bio,2,40,4 54452,0,bio,13,43,7 54269,0,bio,26,56,12 55125,0,bio,8,38,9 97630,0,chem,34,64,43 82444,0,chem,31,61,42 76291,0,chem,29,65,33 75382,0,chem,26,56,39 64762,0,chem,25,,29 62607,0,chem,20,45,34 60373,0,chem,26,56,43 58892,0,chem,18,48,21 47021,0,chem,4,34,12 44687,0,chem,4,34,19 104828,0,geol,,50,44 71456,0,geol,11,41,32 65144,0,geol,7,37,12 52766,0,geol,4,38,32 112800,0,neuro,14,44,33 105761,0,neuro,9,39,30 92951,0,neuro,11,41,20 86621,0,neuro,19,49,10 85569,0,neuro,20,46,35 83896,0,neuro,10,40,22 79735,0,neuro,11,41,32 71518,0,neuro,7,37,34 68029,0,neuro,15,45,33 66482,0,neuro,14,44,42 61680,0,neuro,18,48,20 60455,0,neuro,8,38,49 58932,0,neuro,11,41,49 106412,0,stat,23,53,29 86980,0,stat,23,53,42 78114,0,stat,8,38,24 74085,0,stat,11,41,33 72250,0,stat,26,56,9 
69596,0,stat,20,50,18 65285,0,stat,20,50,15 62557,0,stat,28,58,14 61947,0,stat,22,58,17 58565,0,stat,29,59,11 58365,0,stat,18,48,21 53656,0,stat,2,32,4 51391,0,stat,5,35,8 96936,0,physics,15,50,17 83216,0,physics,11,37,19 72044,0,physics,2,32,16 64048,0,physics,23,53,4 58888,0,physics,26,56,7 58744,0,physics,20,50,9 55944,0,physics,21,51,8 54076,0,physics,19,49,12 82142,0,math,9,39,9 70509,0,math,23,53,7 60320,0,math,14,44,7 55814,0,math,8,38,6 53638,0,math,4,42,8 53517,2,math,5,35,5 59139,1,bio,8,38,23 52968,1,bio,18,48,32 55949,1,chem,4,34,12 58893,1,neuro,10,35,4 53662,1,neuro,1,31,3 57185,1,stat,9,39,7 52254,1,stat,2,32,9 61885,1,math,23,60,9 49542,1,math,3,33,5 -------------------------------------------------------------------------------- /Data/salary_exercise.csv: -------------------------------------------------------------------------------- 1 | sx,rk,yr,dg,yd,sl male,full,25,doctorate,35,36350 male,full,13,doctorate,22,35350 male,full,10,doctorate,23,28200 female,full,7,doctorate,27,26775 male,full,19,masters,30,33696 male,full,16,doctorate,21,28516 female,full,0,masters,32,24900 male,full,16,doctorate,18,31909 male,full,13,masters,30,31850 male,full,13,masters,31,32850 male,full,12,doctorate,22,27025 male,associate,15,doctorate,19,24750 male,full,9,doctorate,17,28200 male,associate,9,,27,23712 male,full,9,doctorate,24,25748 male,full,7,doctorate,15,29342 male,full,13,doctorate,20,31114 male,associate,11,masters,14,24742 male,associate,10,masters,15,22906 male,full,6,masters,21,24450 male,assistant,16,masters,23,19175 male,associate,8,masters,31,20525 male,full,7,doctorate,13,27959 female,full,8,doctorate,24,38045 male,associate,9,doctorate,12, male,full,5,doctorate,18,25400 male,associate,11,doctorate,14,24800 female,full,5,doctorate,16,25500 male,associate,3,masters,7,26182 male,associate,3,masters,17,23725 female,assistant,10,masters,15,21600 male,associate,11,masters,31,23300 male,assistant,9,masters,14,23713 female,associate,4,masters,33,20690 
female,associate,6,masters,29,22450 male,associate,1,doctorate,9,20850 female,assistant,8,doctorate,14,18304 male,assistant,4,doctorate,4,17095 male,assistant,4,doctorate,5,16700 male,assistant,4,doctorate,4,17600 ,assistant,3,doctorate,4,18075 male,assistant,3,masters,11,18000 male,associate,0,doctorate,7,20999 female,assistant,3,doctorate,3,17250 male,assistant,2,doctorate,3,16500 male,assistant,2,doctorate,1,16094 female,assistant,2,doctorate,6,16150 female,assistant,2,doctorate,2,15350 male,assistant,1,doctorate,1,16244 female,assistant,1,doctorate,1,16686 female,assistant,1,doctorate,1,15000 female,assistant,0,doctorate,2,20300 -------------------------------------------------------------------------------- /Data/student-mat.csv: -------------------------------------------------------------------------------- 1 | school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3 GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6 GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6 GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10 GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15 GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10 GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,10,15,15,15 GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11 GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,6,6,5,6 GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19 
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,14,15,15 GP,F,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,1,2,2,0,10,8,9 GP,F,15,U,GT3,T,2,1,services,other,reputation,father,3,3,0,no,yes,no,yes,yes,yes,yes,no,5,2,2,1,1,4,4,10,12,12 GP,M,15,U,LE3,T,4,4,health,services,course,father,1,1,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,3,5,2,14,14,14 GP,M,15,U,GT3,T,4,3,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,5,4,3,1,2,3,2,10,10,11 GP,M,15,U,GT3,A,2,2,other,other,home,other,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,5,2,4,4,3,0,14,16,16 GP,F,16,U,GT3,T,4,4,health,other,home,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,4,4,4,1,2,2,4,14,14,14 GP,F,16,U,GT3,T,4,4,services,services,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,2,3,1,2,2,6,13,14,14 GP,F,16,U,GT3,T,3,3,other,other,reputation,mother,3,2,0,yes,yes,no,yes,yes,yes,no,no,5,3,2,1,1,4,4,8,10,10 GP,M,17,U,GT3,T,3,2,services,services,course,mother,1,1,3,no,yes,no,yes,yes,yes,yes,no,5,5,5,5,4,5,16,6,5,5 GP,M,16,U,LE3,T,4,3,health,other,home,father,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,1,3,1,3,5,4,8,10,10 GP,M,15,U,GT3,T,4,3,teacher,other,reputation,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,1,1,1,1,0,13,14,15 GP,M,15,U,GT3,T,4,4,health,health,other,father,1,1,0,no,yes,yes,no,yes,yes,yes,no,5,4,2,1,1,5,0,12,15,15 GP,M,16,U,LE3,T,4,2,teacher,other,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,4,5,1,5,3,5,2,15,15,16 GP,M,16,U,LE3,T,2,2,other,other,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,2,4,5,0,13,13,12 GP,F,15,R,GT3,T,2,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,10,9,8 GP,F,16,U,GT3,T,2,2,services,services,home,mother,1,1,2,no,yes,yes,no,no,yes,yes,no,1,2,2,1,3,5,14,6,9,8 GP,M,15,U,GT3,T,2,2,other,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,5,2,12,12,11 
GP,M,15,U,GT3,T,4,2,health,services,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,2,2,4,2,4,1,4,15,16,15 GP,M,16,U,LE3,A,3,4,services,other,home,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,4,11,11,11 GP,M,16,U,GT3,T,4,4,teacher,teacher,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,5,5,5,5,16,10,12,11 GP,M,15,U,GT3,T,4,4,health,services,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,4,2,5,4,5,0,9,11,12 GP,M,15,U,GT3,T,4,4,services,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,1,1,1,5,0,17,16,17 GP,M,15,R,GT3,T,4,3,teacher,at_home,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,5,2,1,1,5,0,17,16,16 GP,M,15,U,LE3,T,3,3,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,2,1,1,2,0,8,10,12 GP,M,16,U,GT3,T,3,2,other,other,home,mother,1,1,0,no,yes,yes,no,no,yes,yes,no,5,4,3,1,1,5,0,12,14,15 GP,F,15,U,GT3,T,2,3,other,other,other,father,2,1,0,no,yes,no,yes,yes,yes,no,no,3,5,1,1,1,5,0,8,7,6 GP,M,15,U,LE3,T,4,3,teacher,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,1,4,2,15,16,18 GP,M,16,R,GT3,A,4,4,other,teacher,reputation,mother,2,3,0,no,yes,no,yes,yes,yes,yes,yes,2,4,3,1,1,5,7,15,16,15 GP,F,15,R,GT3,T,3,4,services,health,course,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,2,12,12,11 GP,F,15,R,GT3,T,2,2,at_home,other,reputation,mother,1,1,0,yes,yes,yes,yes,yes,yes,no,no,4,3,1,1,1,2,8,14,13,13 GP,F,16,U,LE3,T,2,2,other,other,home,mother,2,2,1,no,yes,no,yes,no,yes,yes,yes,3,3,3,1,2,3,25,7,10,11 GP,M,15,U,LE3,T,4,4,teacher,other,home,other,1,1,0,no,yes,no,no,no,yes,yes,yes,5,4,3,5,4,5,8,12,12,12 GP,M,15,U,GT3,T,4,4,services,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,3,1,1,5,2,19,18,18 GP,M,15,U,GT3,T,2,2,services,services,course,father,1,1,0,yes,yes,no,no,yes,yes,yes,no,5,4,1,1,1,1,0,8,8,11 GP,F,16,U,LE3,T,2,2,other,at_home,course,father,2,2,1,yes,no,no,yes,yes,yes,yes,no,4,3,3,2,2,5,14,10,10,9 
GP,F,15,U,LE3,A,4,3,other,other,course,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,yes,5,2,2,1,1,5,8,8,8,6 GP,F,16,U,LE3,A,3,3,other,services,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,3,5,1,4,3,12,11,12,11 GP,M,16,U,GT3,T,4,3,health,services,reputation,mother,1,4,0,no,no,no,yes,yes,yes,yes,no,4,2,2,1,1,2,4,19,19,20 GP,M,15,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,3,2,2,5,2,15,15,14 GP,F,15,U,GT3,T,4,4,services,teacher,other,father,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,4,1,1,3,2,7,7,7 GP,F,16,U,LE3,T,2,2,services,services,course,mother,3,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,4,2,12,13,13 GP,F,15,U,LE3,T,4,2,health,other,other,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,5,2,11,13,13 GP,M,15,U,LE3,A,4,2,health,health,other,father,2,1,1,no,no,no,no,yes,yes,no,no,5,5,5,5,4,5,6,11,11,10 GP,F,15,U,GT3,T,4,4,services,services,course,mother,1,1,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,2,3,5,0,8,10,11 GP,F,15,U,LE3,A,3,3,other,other,other,mother,1,1,0,no,no,yes,no,yes,yes,yes,no,5,3,4,4,4,1,6,10,13,13 GP,F,16,U,GT3,A,2,1,other,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,3,4,1,1,2,8,8,9,10 GP,F,15,U,GT3,A,4,3,services,services,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,1,0,14,15,15 GP,M,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,3,2,2,1,1,5,4,14,15,15 GP,M,15,U,LE3,T,1,2,other,at_home,home,father,1,2,0,yes,yes,no,yes,yes,yes,yes,no,4,3,2,1,1,5,2,9,10,9 GP,F,16,U,GT3,T,4,2,services,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,2,3,1,1,5,2,15,16,16 GP,F,16,R,GT3,T,4,4,health,teacher,other,mother,1,2,0,no,yes,no,yes,yes,yes,no,no,2,4,4,5,3,4,6,10,11,11 GP,F,16,U,GT3,T,1,1,services,services,course,father,4,1,0,yes,yes,no,yes,no,yes,yes,yes,5,5,5,5,5,5,6,10,8,11 GP,F,16,U,LE3,T,1,2,other,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,no,4,4,3,1,1,1,4,8,10,9 
GP,F,16,U,GT3,T,4,3,teacher,health,home,mother,1,3,0,yes,yes,yes,yes,yes,yes,yes,no,3,4,4,5,4,4,2,10,9,9 GP,F,15,U,LE3,T,4,3,services,services,reputation,father,1,2,0,yes,no,no,yes,yes,yes,yes,yes,4,4,4,5,4,2,0,10,10,10 GP,F,16,U,LE3,T,4,3,teacher,services,course,mother,3,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,3,1,2,1,2,16,15,15 GP,M,15,U,GT3,A,4,4,other,services,reputation,mother,1,4,0,no,yes,no,yes,no,yes,yes,yes,1,3,3,5,5,3,4,13,13,12 GP,F,16,U,GT3,T,3,1,services,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,5,4,7,7,6 GP,F,15,R,LE3,T,2,2,health,services,reputation,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,1,3,1,3,4,2,8,9,8 GP,F,15,R,LE3,T,3,1,other,other,reputation,father,2,4,0,no,yes,no,no,no,yes,yes,no,4,4,2,5,3,3,12,16,16,16 GP,M,16,U,GT3,T,3,1,other,other,reputation,father,2,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,2,1,1,5,0,13,15,15 GP,M,15,U,GT3,T,4,2,other,other,course,mother,1,4,0,no,no,no,no,yes,yes,yes,no,3,3,3,1,1,3,0,10,10,10 GP,F,15,R,GT3,T,1,1,other,other,reputation,mother,1,2,2,yes,yes,no,no,no,yes,yes,yes,3,3,4,2,4,5,2,8,6,5 GP,M,16,U,GT3,T,3,1,other,other,reputation,mother,1,1,0,no,no,no,yes,yes,yes,no,no,5,3,2,2,2,5,2,12,12,14 GP,F,16,U,GT3,T,3,3,other,services,home,mother,1,2,0,yes,yes,yes,yes,yes,yes,yes,no,4,3,3,2,4,5,54,11,12,11 GP,M,15,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,2,3,5,6,9,9,10 GP,M,15,U,GT3,T,4,0,teacher,other,course,mother,2,4,0,no,no,no,yes,yes,yes,yes,no,3,4,3,1,1,1,8,11,11,10 GP,F,16,U,GT3,T,2,2,other,other,reputation,mother,1,4,0,no,no,yes,no,yes,yes,yes,yes,5,2,3,1,3,3,0,11,11,11 GP,M,17,U,GT3,T,2,1,other,other,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,5,5,3,2,8,8,10 GP,F,16,U,GT3,T,3,4,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,2,4,3,1,2,3,12,5,5,5 GP,M,15,U,GT3,T,2,3,other,services,course,father,1,1,0,yes,yes,yes,yes,no,yes,yes,yes,3,2,2,1,3,3,2,10,12,12 
GP,M,15,U,GT3,T,2,3,other,other,home,mother,1,3,0,yes,no,yes,no,no,yes,yes,no,5,3,2,1,2,5,4,11,10,11 GP,F,15,U,LE3,T,3,2,services,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,1,1,5,10,7,6,6 GP,M,15,U,LE3,T,2,2,services,services,home,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,5,3,3,1,3,4,4,15,15,15 GP,F,15,U,GT3,T,1,1,other,other,home,father,1,2,0,no,yes,no,yes,no,yes,yes,no,4,3,2,2,3,4,2,9,10,10 GP,F,15,U,GT3,T,4,4,services,services,reputation,father,2,2,2,no,no,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,7,9,8 GP,F,16,U,LE3,T,2,2,at_home,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,4,1,2,2,4,8,7,6 GP,F,15,U,GT3,T,4,2,other,other,reputation,mother,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,3,1,4,13,14,14 GP,M,16,U,GT3,T,2,2,services,other,reputation,father,2,2,1,no,no,yes,yes,no,yes,yes,no,4,4,2,1,1,3,12,11,10,10 GP,M,16,U,LE3,A,4,4,teacher,health,reputation,mother,1,2,0,no,yes,no,no,yes,yes,no,no,4,1,3,3,5,5,18,8,6,7 GP,F,16,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,3,4,0,7,7,8 GP,F,15,U,GT3,T,4,3,services,other,reputation,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,4,5,5,5,3,1,4,16,17,18 GP,F,16,U,LE3,T,3,1,other,other,home,father,1,2,0,yes,yes,no,no,yes,yes,no,no,3,3,3,2,3,2,4,7,6,6 GP,F,16,U,GT3,T,4,2,teacher,services,home,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,1,1,0,11,10,10 GP,M,15,U,LE3,T,2,2,services,health,reputation,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,3,4,1,1,4,6,11,13,14 GP,F,15,R,GT3,T,1,1,at_home,other,home,mother,2,4,1,yes,yes,yes,yes,yes,yes,yes,no,3,1,2,1,1,1,2,7,10,10 GP,M,16,R,GT3,T,4,3,services,other,reputation,mother,2,1,0,yes,yes,no,yes,no,yes,yes,no,3,3,3,1,1,4,2,11,15,15 GP,F,16,U,GT3,T,2,1,other,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,no,yes,4,3,5,1,1,5,2,8,9,10 GP,F,16,U,GT3,T,4,4,other,other,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,2,1,6,11,14,14 
GP,F,16,U,GT3,T,4,3,other,at_home,course,mother,1,3,0,yes,yes,yes,no,yes,yes,yes,no,5,3,5,1,1,3,0,7,9,8 GP,M,16,U,GT3,T,4,4,services,services,other,mother,1,1,0,yes,yes,yes,yes,yes,yes,yes,no,4,5,5,5,5,4,14,7,7,5 GP,M,16,U,GT3,T,4,4,services,teacher,other,father,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,4,3,1,1,4,0,16,17,17 GP,M,15,U,GT3,T,4,4,services,other,course,mother,1,1,0,no,yes,no,yes,no,yes,yes,no,5,3,3,1,1,5,4,10,13,14 GP,F,15,U,GT3,T,3,2,services,other,home,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,no,4,3,5,1,1,2,26,7,6,6 GP,M,15,U,GT3,A,3,4,services,other,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,1,1,0,16,18,18 GP,F,15,U,GT3,A,3,3,other,health,reputation,father,1,4,0,yes,no,no,no,yes,yes,no,no,4,3,3,1,1,4,10,10,11,11 GP,F,15,U,GT3,T,2,2,other,other,course,mother,1,4,0,yes,yes,yes,no,yes,yes,yes,no,5,1,2,1,1,3,8,7,8,8 GP,M,16,U,GT3,T,3,3,services,other,home,father,1,3,0,no,yes,no,yes,yes,yes,yes,no,5,3,3,1,1,5,2,16,18,18 GP,M,15,R,GT3,T,4,4,other,other,home,father,4,4,0,no,yes,yes,yes,yes,yes,yes,yes,1,3,5,3,5,1,6,10,13,13 GP,F,16,U,LE3,T,4,4,health,health,other,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,1,4,4,14,15,16 GP,M,15,U,LE3,A,4,4,teacher,teacher,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,5,3,5,5,4,6,18,19,19 GP,F,16,R,GT3,T,3,3,services,other,reputation,father,1,3,1,yes,yes,no,yes,yes,yes,yes,no,4,1,2,1,1,2,0,7,10,10 GP,F,16,U,GT3,T,2,2,at_home,other,home,mother,1,2,1,yes,no,no,yes,yes,yes,yes,no,3,1,2,1,1,5,6,10,13,13 GP,M,15,U,LE3,T,4,2,teacher,other,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,5,2,5,5,3,10,18,19,19 GP,M,15,R,GT3,T,2,1,health,services,reputation,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,5,4,2,1,1,5,8,9,9,9 GP,M,16,U,GT3,T,4,4,teacher,teacher,course,father,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,4,1,2,5,2,15,15,16 GP,M,15,U,GT3,T,4,4,other,teacher,reputation,father,2,2,0,no,yes,no,yes,yes,yes,no,no,4,4,3,1,1,2,2,11,13,14 
GP,M,16,U,GT3,T,3,3,other,services,home,father,2,1,0,no,no,no,yes,yes,yes,yes,no,5,4,2,1,1,5,0,13,14,13 GP,M,17,R,GT3,T,1,3,other,other,course,father,3,2,1,no,yes,no,yes,yes,yes,yes,no,5,2,4,1,4,5,20,9,7,8 GP,M,15,U,GT3,T,3,4,other,other,reputation,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,3,1,2,4,6,14,13,13 GP,F,15,U,GT3,T,1,2,at_home,services,course,mother,1,2,0,no,no,no,no,no,yes,yes,no,3,2,3,1,2,1,2,16,15,15 GP,M,15,U,GT3,T,2,2,services,services,home,father,1,4,0,no,yes,yes,yes,yes,yes,yes,no,5,5,4,1,2,5,6,16,14,15 GP,F,16,U,LE3,T,2,4,other,health,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,2,2,1,2,5,2,13,13,13 GP,M,16,U,GT3,T,4,4,health,other,course,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,3,4,4,5,4,5,18,14,11,13 GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,5,4,4,1,1,5,0,8,7,8 GP,M,15,U,GT3,T,3,4,services,services,home,father,1,1,0,yes,no,no,no,yes,yes,yes,no,5,5,5,3,2,5,0,13,13,12 GP,F,15,U,LE3,A,3,4,other,other,home,mother,1,2,0,yes,no,no,yes,yes,yes,yes,yes,5,3,2,1,1,1,0,7,10,11 GP,F,19,U,GT3,T,0,1,at_home,other,course,other,1,2,3,no,yes,no,no,no,no,no,no,3,4,2,1,1,5,2,7,8,9 GP,M,18,R,GT3,T,2,2,services,other,reputation,mother,1,1,2,no,yes,no,yes,yes,yes,yes,no,3,3,3,1,2,4,0,7,4,0 GP,M,16,R,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,no,yes,yes,yes,yes,yes,no,3,5,5,4,5,4,8,18,18,18 GP,F,15,R,GT3,T,3,4,services,teacher,course,father,2,3,2,no,yes,no,no,yes,yes,yes,yes,4,2,2,2,2,5,0,12,0,0 GP,F,15,U,GT3,T,1,1,at_home,other,course,mother,3,1,0,no,yes,no,yes,no,yes,yes,yes,4,3,3,1,2,4,0,8,0,0 GP,F,17,U,LE3,T,2,2,other,other,course,father,1,1,0,no,yes,no,no,yes,yes,yes,yes,3,4,4,1,3,5,12,10,13,12 GP,F,16,U,GT3,A,3,4,services,other,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,1,1,4,5,16,12,11,11 GP,M,15,R,GT3,T,3,4,at_home,teacher,course,mother,4,2,0,no,yes,no,no,yes,yes,no,yes,5,3,3,1,1,5,0,9,0,0 GP,F,15,U,GT3,T,4,4,services,at_home,course,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,1,1,5,0,11,0,0 
GP,M,17,R,GT3,T,3,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,no,5,4,5,5,4,5,0,10,0,0 GP,F,16,U,GT3,A,3,3,other,other,course,other,2,1,2,no,yes,no,yes,no,yes,yes,yes,4,3,2,1,1,5,0,4,0,0 GP,M,16,U,LE3,T,1,1,services,other,course,mother,1,2,1,no,no,no,no,yes,yes,no,yes,4,4,4,5,3,5,0,14,12,12 GP,F,15,U,GT3,T,4,4,teacher,teacher,course,mother,2,1,0,no,no,no,yes,yes,yes,yes,no,4,3,2,1,1,5,0,16,16,15 GP,M,15,U,GT3,T,4,3,teacher,services,course,father,2,4,0,yes,yes,no,no,yes,yes,yes,no,2,2,2,1,1,3,0,7,9,0 GP,M,16,U,LE3,T,2,2,services,services,reputation,father,2,1,2,no,yes,no,yes,yes,yes,yes,no,2,3,3,2,2,2,8,9,9,9 GP,F,15,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,2,2,1,1,5,2,9,11,11 GP,F,16,U,LE3,T,1,1,at_home,at_home,course,mother,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,4,5,3,1,2,14,14,13 GP,M,17,U,GT3,T,2,1,other,other,home,mother,1,1,3,no,yes,no,no,yes,yes,yes,no,5,4,5,1,2,5,0,5,0,0 GP,F,15,U,GT3,T,1,1,other,services,course,father,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,2,1,2,5,0,8,11,11 GP,F,15,U,GT3,T,3,2,health,services,home,father,1,2,3,no,yes,no,no,yes,yes,yes,no,3,3,2,1,1,3,0,6,7,0 GP,F,15,U,GT3,T,1,2,at_home,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,5,2,10,11,11 GP,M,16,U,GT3,T,4,4,teacher,teacher,course,mother,1,1,0,no,yes,no,no,yes,no,yes,yes,3,3,2,2,1,5,0,7,6,0 GP,M,15,U,LE3,A,2,1,services,other,course,mother,4,1,3,no,no,no,no,yes,yes,yes,no,4,5,5,4,5,5,0,8,9,10 GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,3,no,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,0,6,5,0 GP,M,16,U,LE3,T,2,1,at_home,other,course,mother,1,1,1,no,no,no,yes,yes,yes,no,yes,4,4,4,5,5,5,6,12,13,14 GP,F,15,R,GT3,T,3,3,services,services,reputation,other,2,3,2,no,yes,yes,yes,yes,yes,yes,yes,4,2,1,2,3,3,8,10,10,10 GP,M,19,U,GT3,T,3,2,services,at_home,home,mother,1,1,3,no,yes,no,no,yes,no,yes,yes,4,5,4,1,1,4,0,5,0,0 GP,F,17,U,GT3,T,4,4,other,teacher,course,mother,1,1,0,yes,yes,no,no,yes,yes,no,yes,4,2,1,1,1,4,0,11,11,12 
GP,M,15,R,GT3,T,2,3,at_home,services,course,mother,1,2,0,yes,no,yes,yes,yes,yes,no,no,4,4,4,1,1,1,2,11,8,8 GP,M,17,R,LE3,T,1,2,other,other,reputation,mother,1,1,0,no,no,no,no,yes,yes,no,no,2,2,2,3,3,5,8,16,12,13 GP,F,18,R,GT3,T,1,1,at_home,other,course,mother,3,1,3,no,yes,no,yes,no,yes,no,no,5,2,5,1,5,4,6,9,8,10 GP,M,16,R,GT3,T,2,2,at_home,other,course,mother,3,1,0,no,no,no,no,no,yes,no,no,4,2,2,1,2,3,2,17,15,15 GP,M,16,U,GT3,T,3,3,other,services,course,father,1,2,1,no,yes,yes,no,yes,yes,yes,yes,4,5,5,4,4,5,4,10,12,12 GP,M,17,R,LE3,T,2,1,at_home,other,course,mother,2,1,2,no,no,no,yes,yes,no,yes,yes,3,3,2,2,2,5,0,7,6,0 GP,M,15,R,GT3,T,3,2,other,other,course,mother,2,2,2,yes,yes,no,no,yes,yes,yes,yes,4,4,4,5,4,3,6,5,9,7 GP,M,16,U,LE3,T,1,2,other,other,course,mother,2,1,1,no,no,no,yes,yes,yes,no,no,4,4,4,5,4,5,0,7,0,0 GP,M,17,U,GT3,T,1,3,at_home,services,course,father,1,1,0,no,no,no,no,yes,no,yes,no,5,3,3,1,4,2,2,10,10,10 GP,M,17,R,LE3,T,1,1,other,services,course,mother,4,2,3,no,no,no,yes,yes,no,no,yes,5,3,5,1,5,5,0,5,8,7 GP,M,16,U,GT3,T,3,2,services,services,course,mother,2,1,1,no,yes,no,yes,no,no,no,no,4,5,2,1,1,2,16,12,11,12 GP,M,16,U,GT3,T,2,2,other,other,course,father,1,2,0,no,no,no,no,yes,no,yes,no,4,3,5,2,4,4,4,10,10,10 GP,F,16,U,GT3,T,4,2,health,services,home,father,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,2,3,1,1,3,0,14,15,16 GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,5,1,5,1,1,4,0,6,7,0 GP,F,16,U,GT3,T,4,4,health,health,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,2,1,1,3,0,14,14,14 GP,M,16,U,GT3,T,3,4,other,other,course,father,3,1,2,no,yes,no,yes,no,yes,yes,no,3,4,5,5,4,2,0,6,5,0 GP,M,16,U,GT3,T,1,0,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,1,3,2,13,15,16 GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,3,5,0,13,11,10 GP,F,16,U,GT3,T,1,3,at_home,services,home,mother,1,2,3,no,no,no,yes,no,yes,yes,yes,4,3,5,1,1,3,0,8,7,0 
GP,F,16,U,LE3,T,3,3,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,5,1,1,4,4,10,11,9 GP,M,17,U,LE3,T,4,3,teacher,other,course,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,4,4,4,4,4,4,10,9,9 GP,F,16,U,GT3,T,2,2,services,other,reputation,mother,2,2,0,no,no,yes,yes,no,yes,yes,no,3,4,4,1,4,5,2,13,13,11 GP,M,17,U,GT3,T,3,3,other,other,reputation,father,1,2,0,no,no,no,yes,no,yes,yes,no,4,3,4,1,4,4,4,6,5,6 GP,M,16,R,GT3,T,4,2,teacher,services,other,mother,1,1,0,no,yes,no,yes,yes,yes,yes,yes,4,3,3,3,4,3,10,10,8,9 GP,M,17,U,GT3,T,4,3,other,other,course,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,2,3,1,1,2,4,10,10,11 GP,M,16,U,GT3,T,4,3,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,3,4,3,5,3,3,10,9,8,8 GP,M,16,U,GT3,T,3,3,services,other,home,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,3,2,12,13,12 GP,F,17,U,GT3,T,2,4,services,services,reputation,father,1,2,0,no,yes,no,yes,yes,yes,no,no,5,4,2,5,3,5,0,16,17,17 GP,F,17,U,LE3,T,3,3,other,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,5,3,3,2,3,1,56,9,9,8 GP,F,16,U,GT3,T,3,2,other,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,1,2,2,1,2,1,14,12,13,12 GP,M,17,U,GT3,T,3,3,services,services,other,mother,1,2,0,no,yes,no,yes,yes,yes,yes,yes,4,3,4,2,3,4,12,12,12,11 GP,M,16,U,GT3,T,1,2,services,services,other,mother,1,1,0,no,yes,yes,yes,yes,yes,yes,yes,3,3,3,1,2,3,2,11,12,11 GP,M,16,U,LE3,T,2,1,other,other,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,4,2,3,1,2,5,0,15,15,15 GP,F,17,U,GT3,A,3,3,health,other,reputation,mother,1,2,0,no,yes,no,no,no,yes,yes,yes,3,3,3,1,3,3,6,8,7,9 GP,M,17,R,GT3,T,1,2,at_home,other,home,mother,1,2,0,no,no,no,no,yes,yes,no,no,3,1,3,1,5,3,4,8,9,10 GP,F,16,U,GT3,T,2,3,services,services,course,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,2,10,11,12,13 GP,F,17,U,GT3,T,1,1,at_home,services,course,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,3,1,1,3,0,8,8,9 
GP,M,17,U,GT3,T,1,2,at_home,services,other,other,2,2,0,no,no,yes,yes,no,yes,yes,no,4,4,4,4,5,5,12,7,8,8 GP,M,16,R,GT3,T,3,3,services,services,reputation,mother,1,1,0,no,yes,no,yes,yes,yes,yes,no,4,3,2,3,4,5,8,8,9,10 GP,M,16,U,GT3,T,2,3,other,other,home,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,3,3,1,1,3,0,13,14,14 GP,F,17,U,LE3,T,2,4,services,services,course,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,2,1,1,5,0,14,15,15 GP,M,17,U,GT3,T,4,4,services,teacher,home,mother,1,1,0,no,no,no,no,yes,yes,yes,no,5,2,3,1,2,5,4,17,15,16 GP,M,16,R,LE3,T,3,3,teacher,other,home,father,3,1,0,no,yes,yes,yes,yes,yes,yes,no,3,3,4,3,5,3,8,9,9,10 GP,F,17,U,GT3,T,4,4,services,teacher,home,mother,2,1,1,no,yes,no,no,yes,yes,yes,no,4,2,4,2,3,2,24,18,18,18 GP,F,16,U,LE3,T,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,2,1,2,3,0,9,9,10 GP,F,16,U,GT3,T,4,3,health,other,home,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,3,5,1,5,2,2,16,16,16 GP,F,16,U,GT3,T,2,3,other,other,reputation,mother,1,2,0,yes,yes,yes,yes,yes,yes,no,no,4,4,3,1,3,4,6,8,10,10 GP,F,17,U,GT3,T,1,1,other,other,course,mother,1,2,0,no,yes,yes,no,no,yes,no,no,4,4,4,1,3,1,4,9,9,10 GP,F,17,R,GT3,T,2,2,other,other,reputation,mother,1,1,0,no,yes,no,no,yes,yes,yes,no,5,3,2,1,2,3,18,7,6,6 GP,F,16,R,GT3,T,2,2,services,services,reputation,mother,2,4,0,no,yes,yes,yes,no,yes,yes,no,5,3,5,1,1,5,6,10,10,11 GP,F,17,U,GT3,T,3,4,at_home,services,home,mother,1,3,1,no,yes,yes,no,yes,yes,yes,yes,4,4,3,5,4,5,28,10,9,9 GP,F,16,U,GT3,A,3,1,services,other,course,mother,1,2,3,no,yes,yes,no,yes,yes,yes,no,2,3,3,2,2,4,5,7,7,7 GP,F,16,U,GT3,T,4,3,teacher,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,1,3,2,1,1,1,10,11,12,13 GP,F,16,U,GT3,T,1,1,at_home,other,home,mother,2,1,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,4,5,6,9,9,10 GP,F,17,R,GT3,T,4,3,teacher,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,2,1,1,4,6,7,7,7 
GP,F,19,U,GT3,T,3,3,other,other,reputation,other,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,3,10,8,8,8 GP,M,17,U,LE3,T,4,4,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,3,5,4,5,3,13,12,12,13 GP,F,16,U,GT3,A,2,2,other,other,reputation,mother,1,2,0,yes,yes,yes,no,yes,yes,yes,no,3,3,4,1,1,4,0,12,13,14 GP,M,18,U,GT3,T,2,2,services,other,home,mother,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,4,5,4,5,15,6,7,8 GP,F,17,R,LE3,T,4,4,services,other,other,mother,1,1,0,no,yes,yes,no,yes,yes,no,no,5,2,1,1,2,3,12,8,10,10 GP,F,17,U,LE3,T,3,2,other,other,reputation,mother,2,2,0,no,no,yes,no,yes,yes,yes,no,4,4,4,1,3,1,2,14,15,15 GP,F,17,U,GT3,T,4,3,other,other,reputation,mother,1,2,2,no,no,yes,no,yes,yes,yes,yes,3,4,5,5,4,1,22,6,6,4 GP,M,18,U,LE3,T,3,3,services,health,home,father,1,2,1,no,yes,yes,no,yes,yes,yes,no,3,2,4,2,4,4,13,6,6,8 GP,F,17,U,GT3,T,2,3,at_home,other,home,father,2,1,0,no,yes,yes,no,yes,yes,no,no,3,3,3,1,4,3,3,7,7,8 GP,F,17,U,GT3,T,2,2,at_home,at_home,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,1,4,4,9,10,10 GP,F,17,R,GT3,T,2,1,at_home,services,reputation,mother,2,2,0,no,yes,no,yes,yes,yes,yes,no,4,2,5,1,2,5,2,6,6,6 GP,F,17,U,GT3,T,1,1,at_home,other,reputation,mother,1,3,1,no,yes,no,yes,yes,yes,no,yes,4,3,4,1,1,5,0,6,5,0 GP,F,16,U,GT3,T,2,3,services,teacher,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,2,3,1,1,1,3,2,16,16,17 GP,M,18,U,GT3,T,2,2,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,5,5,4,0,12,13,13 GP,F,16,U,GT3,T,4,4,teacher,services,home,mother,1,3,0,no,yes,no,yes,no,yes,yes,no,5,3,2,1,1,5,0,13,13,14 GP,F,18,R,GT3,T,3,1,other,other,reputation,mother,1,2,1,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,1,4,16,9,8,7 GP,F,17,U,GT3,T,3,2,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5,3,4,1,3,3,10,16,15,15 GP,M,17,U,LE3,T,2,3,services,services,reputation,father,1,2,0,no,yes,yes,no,no,yes,yes,no,5,3,3,1,3,3,2,12,11,12 
GP,M,18,U,LE3,T,2,1,at_home,other,course,mother,4,2,0,yes,yes,yes,yes,yes,yes,yes,yes,4,3,2,4,5,3,14,10,8,9 GP,F,17,U,GT3,A,2,1,other,other,course,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,10,12,10,12 GP,F,17,U,LE3,T,4,3,health,other,reputation,father,1,2,0,no,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,14,13,13,14 GP,M,17,R,GT3,T,2,2,other,other,course,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,5,2,1,1,1,4,11,11,11 GP,M,17,U,GT3,T,4,4,teacher,teacher,reputation,mother,1,2,0,yes,yes,no,yes,yes,yes,yes,yes,4,5,5,1,3,2,14,11,9,9 GP,M,16,U,GT3,T,4,4,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,4,2,4,1,2,14,13,13 GP,M,16,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,4,2,1,1,5,18,9,7,6 GP,M,16,U,GT3,T,3,2,at_home,other,reputation,mother,2,3,0,no,no,no,yes,yes,yes,yes,yes,5,3,3,1,3,2,10,11,9,10 GP,M,17,U,LE3,T,2,2,other,other,home,father,1,2,0,no,no,yes,yes,no,yes,yes,yes,4,4,2,5,5,4,4,14,13,13 GP,F,16,U,GT3,T,2,1,other,other,home,mother,1,1,0,no,no,no,no,yes,yes,yes,yes,4,5,2,1,1,5,20,13,12,12 GP,F,17,R,GT3,T,2,1,at_home,services,course,mother,3,2,0,no,no,no,yes,yes,yes,no,no,2,1,1,1,1,3,2,13,11,11 GP,M,18,U,GT3,T,2,2,other,services,reputation,father,1,2,1,no,no,no,no,yes,no,yes,no,5,5,4,5,5,2,0,7,7,0 GP,M,17,U,LE3,T,4,3,health,other,course,mother,2,2,0,no,no,no,yes,yes,yes,yes,yes,2,5,5,5,4,5,14,12,12,12 GP,M,17,R,LE3,A,4,4,teacher,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,3,3,3,2,3,4,2,10,11,12 GP,M,16,U,LE3,T,4,3,teacher,other,course,mother,1,1,0,no,no,no,yes,no,yes,yes,no,5,4,5,1,1,3,0,6,0,0 GP,M,16,U,GT3,T,4,4,services,services,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,5,3,2,1,2,5,0,13,12,12 GP,F,18,U,GT3,T,2,1,other,other,course,other,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,4,4,1,1,3,0,7,0,0 GP,M,16,U,GT3,T,2,1,other,other,course,mother,3,1,0,no,no,no,no,yes,yes,yes,no,4,3,3,1,1,4,6,18,18,18 
GP,M,17,U,GT3,T,2,3,other,other,course,father,2,1,0,no,no,no,no,yes,yes,yes,no,5,2,2,1,1,2,4,12,12,13 GP,M,22,U,GT3,T,3,1,services,services,other,mother,1,1,3,no,no,no,no,no,no,yes,yes,5,4,5,5,5,1,16,6,8,8 GP,M,18,R,LE3,T,3,3,other,services,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8,3,5,5 GP,M,16,U,GT3,T,0,2,other,other,other,mother,1,1,0,no,no,yes,no,no,yes,yes,no,4,3,2,2,4,5,0,13,15,15 GP,M,18,U,GT3,T,3,2,services,other,course,mother,2,1,1,no,no,no,no,yes,no,yes,no,4,4,5,5,4,5,0,6,8,8 GP,M,16,U,GT3,T,3,3,at_home,other,reputation,other,3,2,0,yes,yes,no,no,no,yes,yes,no,5,3,3,1,3,2,6,7,10,10 GP,M,18,U,GT3,T,2,1,services,services,other,mother,1,1,1,no,no,no,no,no,no,yes,no,3,2,5,2,5,5,4,6,9,8 GP,M,16,R,GT3,T,2,1,other,other,course,mother,2,1,0,no,no,no,yes,no,yes,no,no,3,3,2,1,3,3,0,8,9,8 GP,M,17,R,GT3,T,2,1,other,other,course,mother,1,1,0,no,no,no,no,no,yes,yes,no,4,4,2,5,4,5,0,8,12,12 GP,M,17,U,LE3,T,1,1,health,other,course,mother,2,1,1,no,yes,no,yes,yes,yes,yes,no,4,4,4,1,2,5,2,7,9,8 GP,F,17,U,LE3,T,4,2,teacher,services,reputation,mother,1,4,0,no,yes,yes,yes,yes,yes,yes,no,4,2,3,1,1,4,6,14,12,13 GP,M,19,U,LE3,A,4,3,services,at_home,reputation,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,1,1,1,1,12,11,11,11 GP,M,18,U,GT3,T,2,1,other,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,2,4,1,2,4,8,15,14,14 GP,F,17,U,LE3,T,2,2,services,services,course,father,1,4,0,no,no,yes,yes,yes,yes,yes,yes,3,4,1,1,1,2,0,10,9,0 GP,F,18,U,GT3,T,4,3,services,other,home,father,1,2,0,no,yes,yes,no,yes,yes,yes,yes,3,1,2,1,3,2,21,17,18,18 GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,2,0,no,yes,yes,no,no,yes,yes,no,4,3,2,1,1,3,2,8,8,8 GP,M,18,R,GT3,T,3,2,other,other,course,mother,1,3,0,no,no,no,yes,no,yes,no,no,5,3,2,1,1,3,1,13,12,12 GP,F,17,U,GT3,T,3,3,other,other,home,mother,1,3,0,no,no,no,yes,no,yes,no,no,3,2,3,1,1,4,4,10,9,9 GP,F,18,U,GT3,T,2,2,at_home,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,3,1,1,3,0,9,10,0 
GP,M,18,R,LE3,A,3,4,other,other,reputation,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,2,5,3,4,1,13,17,17,17 GP,M,17,U,GT3,T,3,1,services,other,other,mother,1,2,0,no,no,yes,yes,yes,yes,yes,yes,5,4,4,5,4,5,2,9,9,10 GP,F,18,R,GT3,T,4,4,teacher,other,reputation,mother,2,2,0,no,no,yes,yes,yes,yes,yes,no,4,3,4,2,2,4,8,12,10,11 GP,M,18,U,GT3,T,4,2,health,other,reputation,father,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,5,4,5,1,3,5,10,10,9,10 GP,F,18,R,GT3,T,2,1,other,other,reputation,mother,2,2,0,no,yes,no,no,yes,no,yes,yes,4,3,5,1,2,3,0,6,0,0 GP,F,19,U,GT3,T,3,3,other,services,home,other,1,2,2,no,yes,yes,yes,yes,yes,yes,no,4,3,5,3,3,5,15,9,9,9 GP,F,18,U,GT3,T,2,3,other,services,reputation,father,1,4,0,no,yes,yes,yes,yes,yes,yes,yes,4,5,5,5,3,2,4,15,14,14 GP,F,18,U,LE3,T,1,1,other,other,home,mother,2,2,0,no,yes,yes,no,no,yes,no,no,4,4,3,1,1,3,2,11,11,11 GP,M,17,R,GT3,T,1,2,at_home,at_home,home,mother,1,2,0,no,yes,yes,yes,no,yes,no,yes,3,5,2,2,2,1,2,15,14,14 GP,F,17,U,GT3,T,2,4,at_home,health,reputation,mother,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,3,1,1,1,2,10,10,10 GP,F,17,U,LE3,T,2,2,services,other,course,mother,2,2,0,yes,yes,yes,no,yes,yes,yes,yes,4,4,4,5,3,5,6,12,12,12 GP,F,18,R,GT3,A,3,2,other,services,home,mother,2,2,0,no,no,no,no,no,no,yes,yes,4,1,1,1,1,5,75,10,9,9 GP,M,18,U,GT3,T,4,4,teacher,services,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,3,2,4,1,4,3,22,9,9,9 GP,F,18,U,GT3,T,4,4,health,health,reputation,father,1,2,1,yes,yes,no,yes,yes,yes,yes,yes,2,4,4,1,1,4,15,9,8,8 GP,M,18,U,LE3,T,4,3,teacher,services,course,mother,2,1,0,no,no,yes,yes,yes,yes,yes,no,4,2,3,1,2,1,8,10,11,10 GP,M,17,U,LE3,A,4,1,services,other,home,mother,2,1,0,no,no,yes,yes,yes,yes,yes,yes,4,5,4,5,4,5,30,8,8,8 GP,M,17,U,LE3,A,3,2,teacher,services,home,mother,1,1,1,no,no,no,no,yes,yes,yes,no,4,4,4,5,4,3,19,11,9,10 GP,F,18,R,LE3,T,1,1,at_home,other,reputation,mother,2,4,0,no,yes,yes,yes,yes,yes,no,no,5,2,2,1,1,3,1,12,12,12 
GP,F,18,U,GT3,T,1,1,other,other,home,mother,2,2,0,yes,no,no,yes,yes,yes,yes,no,5,4,4,1,1,4,4,8,9,10 GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,no,yes,yes,no,5,4,5,1,2,5,4,10,9,11 GP,M,17,U,GT3,T,1,1,other,other,reputation,father,1,2,0,no,no,yes,no,no,yes,yes,no,4,3,3,1,2,4,2,12,10,11 GP,F,18,U,GT3,T,2,2,at_home,at_home,other,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,2,5,18,18,19 GP,F,17,U,GT3,T,1,1,services,teacher,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,6,13,12,12 GP,M,18,U,GT3,T,2,1,services,services,reputation,mother,1,3,0,no,no,yes,yes,yes,yes,yes,no,4,2,4,1,3,2,6,15,14,14 GP,M,18,U,LE3,A,4,4,teacher,teacher,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,3,1,1,2,9,15,13,15 GP,M,18,U,GT3,T,4,2,teacher,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,2,1,4,5,11,12,11,11 GP,F,17,U,GT3,T,4,3,health,services,reputation,mother,1,3,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,2,3,0,15,15,15 GP,F,18,U,LE3,T,2,1,services,at_home,reputation,mother,1,2,1,no,no,no,no,yes,yes,yes,yes,5,4,3,1,1,5,12,12,12,13 GP,F,17,R,LE3,T,3,1,services,other,reputation,mother,2,4,0,no,yes,yes,no,yes,yes,no,no,3,1,2,1,1,3,6,18,18,18 GP,M,18,R,LE3,T,3,2,services,other,reputation,mother,2,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,1,4,8,14,13,14 GP,M,17,U,GT3,T,3,3,health,other,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,no,4,4,3,1,3,5,4,14,12,11 GP,F,19,U,GT3,T,4,4,health,other,reputation,other,2,2,0,no,yes,yes,yes,yes,yes,yes,no,2,3,4,2,3,2,0,10,9,0 GP,F,18,U,LE3,T,4,3,other,other,home,other,2,2,0,no,yes,yes,no,yes,yes,yes,yes,4,4,5,1,2,2,10,10,8,8 GP,F,18,U,GT3,T,4,3,other,other,reputation,father,1,4,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,14,13,14 GP,M,18,U,LE3,T,4,4,teacher,teacher,home,mother,1,1,0,no,yes,yes,no,yes,yes,yes,yes,1,4,2,2,2,1,5,16,15,16 GP,F,18,U,LE3,A,4,4,health,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,yes,4,2,4,1,1,4,14,12,10,11 
GP,M,17,U,LE3,T,4,4,other,teacher,home,father,2,1,0,no,no,yes,no,yes,yes,yes,no,4,1,1,2,2,5,0,11,11,10 GP,F,17,U,GT3,T,4,2,other,other,reputation,mother,2,3,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,15,12,14 GP,F,17,U,GT3,T,3,2,health,health,reputation,father,1,4,0,no,yes,yes,yes,no,yes,yes,no,5,2,2,1,2,5,0,17,17,18 GP,M,19,U,GT3,T,3,3,other,other,home,other,1,2,1,no,yes,no,yes,yes,yes,yes,yes,4,4,4,1,1,3,20,15,14,13 GP,F,18,U,GT3,T,2,4,services,at_home,reputation,other,1,2,1,no,yes,yes,yes,yes,yes,yes,no,4,4,3,1,1,3,8,14,12,12 GP,M,20,U,GT3,A,3,2,services,other,course,other,1,1,0,no,no,no,yes,yes,yes,no,no,5,5,3,1,1,5,0,17,18,18 GP,M,19,U,GT3,T,4,4,teacher,services,reputation,other,2,1,1,no,yes,yes,no,yes,yes,yes,yes,4,3,4,1,1,4,38,8,9,8 GP,M,19,R,GT3,T,3,3,other,services,reputation,father,1,2,1,no,no,no,yes,yes,yes,no,yes,4,5,3,1,2,5,0,15,12,12 GP,F,19,U,LE3,T,1,1,at_home,other,reputation,other,1,2,1,yes,yes,no,yes,no,yes,yes,no,4,4,3,1,3,3,18,12,10,10 GP,F,19,U,LE3,T,1,2,services,services,home,other,1,2,1,no,no,no,yes,no,yes,no,yes,4,2,4,2,2,3,0,9,9,0 GP,F,19,U,GT3,T,2,1,at_home,other,other,other,3,2,0,no,yes,no,no,yes,no,yes,yes,3,4,1,1,1,2,20,14,12,13 GP,M,19,U,GT3,T,1,2,other,services,course,other,1,2,1,no,no,no,no,no,yes,yes,no,4,5,2,2,2,4,3,13,11,11 GP,F,19,U,LE3,T,3,2,services,other,reputation,other,2,2,1,no,yes,yes,no,no,yes,yes,yes,4,2,2,1,2,1,22,13,10,11 GP,F,19,U,GT3,T,1,1,at_home,health,home,other,1,3,2,no,no,no,no,no,yes,yes,yes,4,1,2,1,1,3,14,15,13,13 GP,F,19,R,GT3,T,2,3,other,other,reputation,other,1,3,1,no,no,no,no,yes,yes,yes,yes,4,1,2,1,1,3,40,13,11,11 GP,F,18,U,GT3,T,2,1,services,other,course,mother,2,2,0,no,yes,yes,yes,yes,yes,yes,no,5,3,3,1,2,1,0,8,8,0 GP,F,18,U,GT3,T,4,3,other,other,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,3,4,1,1,5,9,9,10,9 GP,F,17,R,GT3,T,3,4,at_home,services,course,father,1,3,0,no,yes,yes,yes,no,yes,yes,no,4,3,4,2,5,5,0,11,11,10 
GP,F,18,U,GT3,T,4,4,teacher,other,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,4,4,3,3,5,2,11,11,11 GP,F,17,U,GT3,A,4,3,services,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,5,2,2,1,2,5,23,13,13,13 GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,0,no,yes,no,no,yes,yes,no,yes,4,2,2,1,1,3,12,11,9,9 GP,F,17,R,LE3,T,2,2,services,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,3,3,2,2,2,3,3,11,11,11 GP,F,17,U,GT3,T,3,1,services,services,course,father,1,3,0,no,yes,no,no,no,yes,yes,no,3,4,3,2,3,5,1,12,14,15 GP,F,17,U,LE3,T,0,2,at_home,at_home,home,father,2,3,0,no,no,no,no,yes,yes,yes,no,3,3,3,2,3,2,0,16,15,15 GP,M,18,U,GT3,T,4,4,other,other,course,mother,1,3,0,no,no,no,yes,yes,yes,yes,no,4,3,3,2,2,3,3,9,12,11 GP,M,17,U,GT3,T,3,3,other,services,reputation,mother,1,1,0,no,no,no,yes,no,yes,yes,no,4,3,5,3,5,5,3,14,15,16 GP,M,17,R,GT3,T,2,2,services,other,course,mother,4,1,0,no,yes,no,no,yes,yes,yes,no,4,4,5,5,5,4,8,11,10,10 GP,F,17,U,GT3,T,4,4,teacher,services,course,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,no,5,4,4,1,3,4,7,10,9,9 GP,F,17,U,GT3,T,4,4,teacher,teacher,course,mother,2,3,0,no,yes,yes,no,no,yes,yes,yes,4,3,3,1,2,4,4,14,14,14 GP,M,18,U,LE3,T,2,2,other,other,course,mother,1,4,0,no,yes,no,yes,yes,yes,yes,no,4,5,5,5,4,5,2,9,8,8 GP,F,17,R,GT3,T,2,4,at_home,other,course,father,1,3,0,no,yes,no,no,yes,yes,yes,yes,4,4,3,1,1,5,7,12,14,14 GP,F,18,U,GT3,T,3,3,services,services,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,no,5,3,4,1,1,4,0,7,0,0 GP,F,18,U,LE3,T,2,2,other,other,home,other,1,2,0,no,no,no,yes,no,yes,yes,yes,4,3,3,1,1,2,0,8,8,0 GP,F,18,R,GT3,T,2,2,at_home,other,course,mother,2,4,0,no,no,no,yes,yes,yes,no,no,4,4,4,1,1,4,0,10,9,0 GP,F,17,U,GT3,T,3,4,services,other,course,mother,1,3,0,no,no,no,no,yes,yes,yes,no,4,4,5,1,3,5,16,16,15,15 GP,F,19,R,GT3,A,3,1,services,at_home,home,other,1,3,1,no,no,yes,no,yes,yes,no,no,5,4,3,1,2,5,12,14,13,13 GP,F,17,U,GT3,T,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,yes,4,3,2,2,3,2,0,7,8,0 
GP,F,18,U,LE3,T,3,3,services,services,home,mother,1,4,0,no,yes,no,no,yes,yes,yes,no,5,3,3,1,1,1,7,16,15,17 GP,F,17,R,GT3,A,3,2,other,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,3,2,3,2,4,9,10,10 GP,F,19,U,GT3,T,2,1,services,services,home,other,1,3,1,no,no,yes,yes,yes,yes,yes,yes,4,3,4,1,3,3,4,11,12,11 GP,M,18,U,GT3,T,4,4,teacher,services,home,father,1,2,1,no,yes,no,yes,yes,yes,yes,no,4,3,3,2,2,2,0,10,10,0 GP,M,18,U,LE3,T,3,4,services,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,3,1,3,5,11,16,15,15 GP,F,17,U,GT3,A,2,2,at_home,at_home,home,father,1,2,1,no,yes,no,no,yes,yes,yes,yes,3,3,1,1,2,4,0,9,8,0 GP,F,18,U,GT3,T,2,3,at_home,other,course,mother,1,3,0,no,yes,no,no,yes,yes,yes,no,4,3,3,1,2,3,4,11,10,10 GP,F,18,U,GT3,T,3,2,other,services,other,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,4,3,2,3,1,7,13,13,14 GP,M,18,R,GT3,T,4,3,teacher,services,course,mother,1,3,0,no,no,no,no,yes,yes,yes,yes,5,3,2,1,2,4,9,16,15,16 GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,3,0,no,yes,yes,no,yes,yes,yes,yes,5,4,5,2,3,5,0,10,10,9 GP,F,17,U,GT3,T,4,3,health,other,reputation,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,1,3,4,0,13,15,15 MS,M,18,R,GT3,T,3,2,other,other,course,mother,2,1,1,no,yes,no,no,no,yes,yes,no,2,5,5,5,5,5,10,11,13,13 MS,M,19,R,GT3,T,1,1,other,services,home,other,3,2,3,no,no,no,no,yes,yes,yes,no,5,4,4,3,3,2,8,8,7,8 MS,M,17,U,GT3,T,3,3,health,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,yes,no,4,5,4,5,3,3,2,13,13,13 MS,M,18,U,LE3,T,1,3,at_home,services,course,mother,1,1,1,no,no,no,no,yes,no,yes,yes,4,3,3,2,3,3,7,8,7,8 MS,M,19,R,GT3,T,1,1,other,other,home,other,3,1,1,no,yes,no,no,yes,yes,yes,no,4,4,4,3,3,5,4,8,8,8 MS,M,17,R,GT3,T,4,3,services,other,home,mother,2,2,0,no,yes,yes,yes,no,yes,yes,yes,4,5,5,4,5,2,4,13,11,11 MS,F,18,U,GT3,T,3,3,services,services,course,father,1,2,0,no,yes,no,no,yes,yes,no,yes,5,3,4,1,1,5,0,10,9,9 MS,F,17,R,GT3,T,4,4,teacher,services,other,father,2,2,0,no,yes,yes,yes,yes,yes,yes,no,4,3,3,1,2,5,4,12,13,13 
MS,F,17,U,LE3,A,3,2,services,other,reputation,mother,2,2,0,no,no,no,no,yes,yes,no,yes,1,2,3,1,2,5,2,12,12,11 MS,M,18,U,LE3,T,1,1,other,services,home,father,2,1,0,no,no,no,no,no,yes,yes,yes,3,3,2,1,2,3,4,10,10,10 MS,F,18,U,LE3,T,1,1,at_home,services,course,father,2,3,0,no,no,no,no,yes,yes,yes,no,5,3,2,1,1,4,0,18,16,16 MS,F,18,R,LE3,A,1,4,at_home,other,course,mother,3,2,0,no,no,no,no,yes,yes,no,yes,4,3,4,1,4,5,0,13,13,13 MS,M,18,R,LE3,T,1,1,at_home,other,other,mother,2,2,1,no,no,no,yes,no,no,no,no,4,4,3,2,3,5,2,13,12,12 MS,F,18,U,GT3,T,3,3,services,services,other,mother,2,2,0,no,yes,no,no,yes,yes,yes,yes,4,3,2,1,3,3,0,11,11,10 MS,F,17,U,LE3,T,4,4,at_home,at_home,course,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,yes,2,3,4,1,1,1,0,16,15,15 MS,F,17,R,GT3,T,1,2,other,services,course,father,2,2,0,no,no,no,no,no,yes,no,no,3,2,2,1,2,3,0,12,11,12 MS,M,18,R,GT3,T,1,3,at_home,other,course,mother,2,2,0,no,yes,yes,no,yes,yes,no,no,3,3,4,2,4,3,4,10,10,10 MS,M,18,U,LE3,T,4,4,teacher,services,other,mother,2,3,0,no,no,yes,no,yes,yes,yes,yes,4,2,2,2,2,5,0,13,13,13 MS,F,17,R,GT3,T,1,1,other,services,reputation,mother,3,1,1,no,yes,yes,no,yes,yes,yes,yes,5,2,1,1,2,1,0,7,6,0 MS,F,18,U,GT3,T,2,3,at_home,services,course,father,2,1,0,no,yes,yes,no,yes,yes,yes,yes,5,2,3,1,2,4,0,11,10,10 MS,F,18,R,GT3,T,4,4,other,teacher,other,father,3,2,0,no,yes,yes,no,no,yes,yes,yes,3,2,2,4,2,5,10,14,12,11 MS,F,19,U,LE3,T,3,2,services,services,home,other,2,2,2,no,no,no,yes,yes,yes,no,yes,3,2,2,1,1,3,4,7,7,9 MS,M,18,R,LE3,T,1,2,at_home,services,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3,14,12,12 MS,F,17,U,GT3,T,2,2,other,at_home,home,mother,1,3,0,no,no,no,yes,yes,yes,no,yes,3,4,3,1,1,3,8,13,11,11 MS,F,17,R,GT3,T,1,2,other,other,course,mother,1,1,0,no,no,no,yes,yes,yes,yes,no,3,5,5,4,5,1,14,6,5,5 MS,F,18,R,LE3,T,4,4,other,other,reputation,mother,2,3,0,no,no,no,no,yes,yes,yes,no,5,4,4,1,1,1,0,19,18,19 
MS,F,18,R,GT3,T,1,1,other,other,home,mother,4,3,0,no,no,no,no,yes,yes,yes,no,4,3,2,1,2,4,2,8,8,10 MS,F,20,U,GT3,T,4,2,health,other,course,other,2,3,2,no,yes,yes,no,no,yes,yes,yes,5,4,3,1,1,3,4,15,14,15 MS,F,18,R,LE3,T,4,4,teacher,services,course,mother,1,2,0,no,no,yes,yes,yes,yes,yes,no,5,4,3,3,4,2,4,8,9,10 MS,F,18,U,GT3,T,3,3,other,other,home,mother,1,2,0,no,no,yes,no,yes,yes,yes,yes,4,1,3,1,2,1,0,15,15,15 MS,F,17,R,GT3,T,3,1,at_home,other,reputation,mother,1,2,0,no,yes,yes,yes,no,yes,yes,no,4,5,4,4,5,1,17,10,10,10 MS,M,18,U,GT3,T,4,4,teacher,teacher,home,father,1,2,0,no,no,yes,yes,no,yes,yes,no,3,2,4,1,4,2,4,15,14,14 MS,M,18,R,GT3,T,2,1,other,other,other,mother,2,1,0,no,no,no,yes,no,yes,yes,yes,4,4,3,1,3,5,5,7,6,7 MS,M,17,U,GT3,T,2,3,other,services,home,father,2,2,0,no,no,no,yes,yes,yes,yes,no,4,4,3,1,1,3,2,11,11,10 MS,M,19,R,GT3,T,1,1,other,services,other,mother,2,1,1,no,no,no,no,yes,yes,no,no,4,3,2,1,3,5,0,6,5,0 MS,M,18,R,GT3,T,4,2,other,other,home,father,2,1,1,no,no,yes,no,yes,yes,no,no,5,4,3,4,3,3,14,6,5,5 MS,F,18,R,GT3,T,2,2,at_home,other,other,mother,2,3,0,no,no,yes,no,yes,yes,no,no,5,3,3,1,3,4,2,10,9,10 MS,F,18,R,GT3,T,4,4,teacher,at_home,reputation,mother,3,1,0,no,yes,yes,yes,yes,yes,yes,yes,4,4,3,2,2,5,7,6,5,6 MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,0,7,5,0 MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,3,4,1,1,1,0,7,9,8 MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,1,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,0,6,5,0 MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9 MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16 MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,5,5,3,3,10,8,7 MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10 
MS,M,19,U,LE3,T,1,1,other,at_home,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,3,3,3,5,5,8,9,9 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Luke Chang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore solutions 2 | Solutions/* 3 | .ipynb_checkpoints -------------------------------------------------------------------------------- /Notebooks/1_Introduction_to_Programming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to programming\n", 8 | "_Written by Luke Chang_\n", 9 | "\n", 10 | "In this notebook we will begin to learn how to use Python. There are many different ways to install Python, but we recommend starting using Anaconda which is preconfigured for scientific computing. Start with installing [Python 3.7](https://www.anaconda.com/distribution/). For those who prefer a more configurable IDE, [Pycharm](https://www.jetbrains.com/pycharm/) is a nice option. Python is a modular interpreted language with an intuitive minimal syntax that is quickly becoming one of the most popular languages for [conducting research](http://www.talyarkoni.org/blog/2013/11/18/the-homogenization-of-scientific-computing-or-why-python-is-steadily-eating-other-languages-lunch/). You can use python for [stimulus presentation](http://www.psychopy.org/), [data analysis](http://statsmodels.sourceforge.net/), [machine-learning](http://scikit-learn.org/stable/), [scraping data](https://www.crummy.com/software/BeautifulSoup/), creating websites with [flask](http://flask.pocoo.org/) or [django](https://www.djangoproject.com/), or [neuroimaging data analysis](http://nipy.org/).\n", 11 | "\n", 12 | "There are lots of free useful resources to learn how to use python and various modules. See Yaroslav Halchenko's excellent [Dartmouth course](https://github.com/dartmouth-pbs/psyc161). 
[Codecademy](https://www.codecademy.com/) is a great interactive tutorial. [Stack Overflow](http://stackoverflow.com/) is an incredibly useful resource for asking specific questions and seeing responses to others that have been rated by the development community. " 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Jupyter Notebooks\n", 20 | "We will primarily be using [Jupyter Notebooks](http://jupyter.org/) to interface with Python. A Jupyter notebook consists of **cells**. The two main types of cells you will use are code cells and markdown cells.\n", 21 | "\n", 22 | "A **_code cell_** contains actual code that you want to run. You can specify a cell as a code cell using the pulldown menu in the toolbar in your Jupyter notebook. Otherwise, you can hit esc and then y (denoted \"esc, y\") while a cell is selected to specify that it is a code cell. Note that you will have to hit enter after doing this to start editing it.\n", 23 | "If you want to execute the code in a code cell, hit \"shift + enter.\" Note that code cells are executed in the order you execute them. That is to say, the ordering of the cells for which you hit \"shift + enter\" is the order in which the code is executed. If you did not explicitly execute a cell early in the document, its results are not known to the Python interpreter.\n", 24 | "\n", 25 | "**_Markdown cells_** contain text. The text is written in markdown, a lightweight markup language. You can read about its syntax [here](http://daringfireball.net/projects/markdown/syntax). Note that you can also insert HTML into markdown cells, and this will be rendered properly. As you are typing the contents of these cells, the results appear as text. Hitting \"shift + enter\" renders the text in the formatting you specify. You can specify a cell as being a markdown cell in the Jupyter toolbar, or by hitting \"esc, m\" in the cell. 
Again, you have to hit enter after using the quick keys to bring the cell into edit mode.\n", 26 | "\n", 27 | "In general, when you want to add a new cell, you can use the \"Insert\" pulldown menu from the Jupyter toolbar. The shortcut to insert a cell below is \"esc, b\" and to insert a cell above is \"esc, a.\" Alternatively, you can execute a cell and automatically add a new one below it by hitting \"alt + enter.\"" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "print(\"Hello World\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Package Management\n", 44 | "Package management in Python has been dramatically improving. Anaconda has its own package manager called 'conda'. Use this if you would like to install a new module as it is optimized to work with anaconda. \n", 45 | "\n", 46 | "```\n", 47 | "!conda install *package*\n", 48 | "```\n", 49 | "\n", 50 | "However, sometimes conda doesn't have a particular package. In this case use the default python package manager called 'pip'. 
\n", 51 | "\n", 52 | "These commands can be run in your unix terminal or you can send them to the shell from a Jupyter notebook by starting the line with ```!```\n", 53 | "\n", 54 | "It is easy to get help on how to use the package managers\n", 55 | "\n", 56 | "```\n", 57 | "!pip help install\n", 58 | "```" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "!pip help install" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!pip list --outdated" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "!pip install setuptools --upgrade" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Variables\n", 93 | "* Built-in\n", 94 | " * Numeric types:\n", 95 | " * **int**, **float**, **long**, complex\n", 96 | " * **str**ing \n", 97 | " * **bool**ean\n", 98 | " * True / False\n", 99 | " * **NoneType**\n", 100 | "* User defined\n", 101 | "\n", 102 | "* Use the type() function to find the type for a value or variable\n", 103 | "\n", 104 | "* Data can be converted using cast commands" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Integer\n", 114 | "a = 1\n", 115 | "print(type(a))\n", 116 | "\n", 117 | "# Float\n", 118 | "b = 1.0\n", 119 | "print(type(b))\n", 120 | "\n", 121 | "# String\n", 122 | "c = 'hello'\n", 123 | "print(type(c))\n", 124 | "\n", 125 | "# Boolean\n", 126 | "d = True\n", 127 | "print(type(d))\n", 128 | "\n", 129 | "# None\n", 130 | "e = None\n", 131 | "print(type(e))\n", 132 | "\n", 133 | "# Cast integer to string\n", 134 | "print(type(str(a)))" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 
141 | "## Math Operators\n", 142 | "* +, -, *, and /\n", 143 | "* Exponentiation **\n", 144 | "* Modulo %\n", 145 | "\n", 146 | "* Note that division with integers in Python 2.7 automatically rounds, which may not be intended. It is recommended to import the division module from python3 `from __future__ import division`" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "ExecuteTime": { 154 | "end_time": "2019-03-27T14:34:18.317397Z", 155 | "start_time": "2019-03-27T14:34:18.312483Z" 156 | } 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Addition\n", 161 | "a = 2 + 7\n", 162 | "print(a)\n", 163 | "\n", 164 | "# Subtraction\n", 165 | "b = a - 5\n", 166 | "print(b)\n", 167 | "\n", 168 | "# Multiplication\n", 169 | "print(b*2)\n", 170 | "\n", 171 | "# Exponentiation\n", 172 | "print(b**2)\n", 173 | "\n", 174 | "# Modulo\n", 175 | "print(4%9)\n", 176 | "\n", 177 | "# Division\n", 178 | "print(4/9)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## String Operators\n", 186 | "* Some of the arithmetic operators also have meaning for strings. E.g. 
for string concatenation use `+` sign\n", 187 | "* String repetition: Use `*` sign with a number of repetitions" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# Combine string\n", 197 | "a = 'Hello'\n", 198 | "b = 'World'\n", 199 | "print(a + b)\n", 200 | "\n", 201 | "# Repeat String\n", 202 | "print(a*5)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Logical Operators\n", 210 | "Perform logical comparison and return Boolean value\n", 211 | "\n", 212 | "```python\n", 213 | "x == y # x is equal to y\n", 214 | "x != y # x is not equal to y\n", 215 | "x > y # x is greater than y\n", 216 | "x < y # x is less than y\n", 217 | "x >= y # x is greater than or equal to y \n", 218 | "x <= y # x is less than or equal to y```\n", 219 | "\n", 220 | "| X | not X | \n", 221 | "|-------|--------|\n", 222 | "| True | False | \n", 223 | "| False | True | \n", 224 | "\n", 225 | "| X | Y | X AND Y | X OR Y | \n", 226 | "|------|------|---------|--------|\n", 227 | "|True | True | True | True | \n", 228 | "|True | False| False | True | \n", 229 | "|False | True | False | True | \n", 230 | "|False | False| False | False | " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Works for string\n", 240 | "a = 'hello'\n", 241 | "b = 'world'\n", 242 | "c = 'Hello'\n", 243 | "print(a==b)\n", 244 | "print(a==c)\n", 245 | "print(a!=b)\n", 246 | "\n", 247 | "# Works for numeric\n", 248 | "d = 5\n", 249 | "e = 8\n", 250 | "print(d < e)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Conditional Logic (if...)\n", 258 | "Unlike most other languages, Python uses tab formatting rather than closing conditional statements (e.g., end).\n", 259 | "\n", 260 | "* Syntax:\n", 261 | "\n", 
262 | "``` python \n", 263 | "if condition: \n", 264 | " do something\n", 265 | "```\n", 266 | "\n", 267 | "* Implicit conversion of the value to bool() happens if `condition` is of a different type than **bool**, thus all of the following should work:\n", 268 | "\n", 269 | "```python\n", 270 | "if condition:\n", 271 | " do_something\n", 272 | "elif condition:\n", 273 | " do_alternative1\n", 274 | "else:\n", 275 | " do_otherwise # often reserved to report an error\n", 276 | " # after a long list of options\n", 277 | " ```\n" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "n = 1\n", 287 | "\n", 288 | "if n:\n", 289 | " print(\"n is non-0\")\n", 290 | "\n", 291 | "if n is None:\n", 292 | " print(\"n is None\")\n", 293 | " \n", 294 | "if n is not None:\n", 295 | " print(\"n is not None\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## Loops\n", 303 | "* **for** loop is probably the most popular loop construct in Python:\n", 304 | "\n", 305 | "```python\n", 306 | "for target in sequence:\n", 307 | " do_statements\n", 308 | "```\n", 309 | "\n", 310 | "* However, it's also possible to use a **while** loop to repeat statements while `condition` remains True:\n", 311 | "\n", 312 | "```python\n", 313 | "while condition:\n", 314 | " do_statements\n", 315 | "```" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "ExecuteTime": { 323 | "end_time": "2019-03-27T14:35:32.341248Z", 324 | "start_time": "2019-03-27T14:35:32.336945Z" 325 | } 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "string = \"Python is going to make conducting research easier\"\n", 330 | "for c in string:\n", 331 | " print(c)\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": { 338 | "ExecuteTime": { 339 | "end_time":
"2019-03-27T14:35:49.426964Z", 340 | "start_time": "2019-03-27T14:35:49.421669Z" 341 | } 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "x = 0\n", 346 | "end = 10\n", 347 | "\n", 348 | "csum = 0\n", 349 | "while x < end:\n", 350 | " csum += x\n", 351 | " print(x, csum)\n", 352 | " x += 1\n", 353 | "print(\"Exited with x==%d\" % x )" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Functions\n", 361 | "A **function** is a named sequence of statements that performs a computation. You define the function by giving it a name, specify a sequence of statements, and optionally values to return. Later, you can “call” the function by name. \n", 362 | "```python\n", 363 | "def make_upper_case(text):\n", 364 | " return (text.upper())\n", 365 | "```\n", 366 | "* The expression in the parenthesis is the **argument**. \n", 367 | "* It is common to say that a function **“takes” an argument** and **“returns” a result**.\n", 368 | "* The result is called the **return value**.\n", 369 | "\n", 370 | "The first line of the function definition is called the **header**; the rest is called the **body**.\n", 371 | "\n", 372 | "The header has to end with a colon and the body has to be indented.\n", 373 | "It is a common practice to use 4 spaces for indentation, and to avoid mixing with tabs.\n", 374 | "\n", 375 | "Function body in Python ends whenever a statement begins at the original level of indentation. There is no **end** keyword or any other identifier to signal the end of a function. Indentation is part of the language syntax in Python, making it more readable and less cluttered."
376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "ExecuteTime": { 383 | "end_time": "2019-03-27T14:35:59.444165Z", 384 | "start_time": "2019-03-27T14:35:59.440980Z" 385 | } 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "def make_upper_case(text):\n", 390 | " return (text.upper())\n", 391 | "\n", 392 | "string = \"Python is going to make conducting research easier\"\n", 393 | "\n", 394 | "print(make_upper_case(string))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "## Python Containers\n", 402 | "There are 4 main types of builtin containers for storing data in Python:\n", 403 | "* list\n", 404 | "* tuple\n", 405 | "* dict\n", 406 | "* set" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "### Lists\n", 414 | "In Python, a list is a mutable sequence of values. Mutable means that we can change separate entries within a list. 
For a more in depth tutorial on lists look [here](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/02d-Python-Fundamentals-Containers-Lists.ipynb)\n", 415 | "\n", 416 | "* Each value in the list is an element or item\n", 417 | "* Elements can be any Python data type\n", 418 | "* Lists can mix data types\n", 419 | "\n", 420 | "* Lists are initialized with ```[]``` or ```list()```\n", 421 | "```python\n", 422 | "l = [1,2,3]\n", 423 | "```\n", 424 | "* \n", 425 | "Elements within a list are indexed (**starting with 0**)\n", 426 | "```python\n", 427 | "l[0]\n", 428 | "```\n", 429 | "\n", 430 | "* \n", 431 | "Elements can be nested lists\n", 432 | "```python\n", 433 | "nested = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n", 434 | "```\n", 435 | "\n", 436 | "* \n", 437 | "Lists can be *sliced*.\n", 438 | "```python\n", 439 | "l[start:stop:stride]\n", 440 | "```\n", 441 | "\n", 442 | "* Like all python containers, lists have many useful methods that can be applied\n", 443 | "```python\n", 444 | "a.insert(index,new element)\n", 445 | "a.append(element to add at end)\n", 446 | "len(a)\n", 447 | "```\n", 448 | "\n", 449 | "* \n", 450 | "List comprehension is a *Very* powerful technique allowing for efficient construction of new lists.\n", 451 | "```python\n", 452 | "[a for a in l]\n", 453 | "```\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "ExecuteTime": { 461 | "end_time": "2019-03-27T14:36:30.240853Z", 462 | "start_time": "2019-03-27T14:36:30.236249Z" 463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "# Indexing and Slicing\n", 468 | "a = ['lists','are','arrays']\n", 469 | "print(a[0])\n", 470 | "print(a[1:3])\n", 471 | "\n", 472 | "# List methods\n", 473 | "a.insert(2,'python')\n", 474 | "a.append('.')\n", 475 | "print(a)\n", 476 | "print(len(a))\n", 477 | "\n", 478 | "# List Comprehension\n", 479 | "print([x.upper() for x in a])" 480 | ] 481 | }, 482 | { 483 | "cell_type": 
"markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "### Dictionaries\n", 487 | "\n", 488 | "* In Python, a dictionary (or `dict`) is mapping between a set of\n", 489 | "indices (**keys**) and a set of **values**\n", 490 | "\n", 491 | "* The items in a dictionary are key-value pairs\n", 492 | "\n", 493 | "* Keys can be any Python data type\n", 494 | "\n", 495 | "* Dictionaries are unordered\n", 496 | "\n", 497 | "* Here is a more indepth tutorial on [dictionaries](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03c-Python-Fundamentals-Containers-Dicts.ipynb)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "ExecuteTime": { 505 | "end_time": "2019-03-27T14:36:42.674472Z", 506 | "start_time": "2019-03-27T14:36:42.670435Z" 507 | } 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "# Dictionaries\n", 512 | "eng2sp = {}\n", 513 | "eng2sp['one'] = 'uno'\n", 514 | "print(eng2sp)\n", 515 | "\n", 516 | "eng2sp = {'one': 'uno', 'two': 'dos', 'three': 'tres'}\n", 517 | "print(eng2sp)\n", 518 | "\n", 519 | "print(eng2sp.keys())\n", 520 | "print(eng2sp.values())" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "### Tuples\n", 528 | "In Python, a **tuple** is an immutable sequence of values, meaning they can't be changed\n", 529 | "\n", 530 | "* Each value in the tuple is an element or item\n", 531 | "\n", 532 | "* Elements can be any Python data type\n", 533 | "\n", 534 | "* Tuples can mix data types\n", 535 | "\n", 536 | "* Elements can be nested tuples\n", 537 | "\n", 538 | "* **Essentially tuples are immutable lists**\n", 539 | "\n", 540 | "Here is a nice tutorial on [tuples](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03b-Python-Fundamentals-Containers-Tuples.ipynb)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "ExecuteTime": { 
548 | "end_time": "2019-03-27T14:36:51.403867Z", 549 | "start_time": "2019-03-27T14:36:51.400160Z" 550 | } 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "numbers = (1, 2, 3, 4)\n", 555 | "print(numbers)\n", 556 | "\n", 557 | "t2 = 1, 2\n", 558 | "print(t2)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "## sets\n", 566 | "In Python, a `set` is an efficient storage for \"membership\" checking\n", 567 | "\n", 568 | "* `set` is like a `dict` but only with keys and without values\n", 569 | "\n", 570 | "* a `set` can also perform set operations (e.g., union intersection)\n", 571 | "\n", 572 | "* Here is more info on [sets](http://nbviewer.jupyter.org/github/dartmouth-pbs/psyc161/blob/master/classes/03d-Python-Fundamentals-Containers-Sets.ipynb)\n" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": { 579 | "ExecuteTime": { 580 | "end_time": "2019-03-27T14:37:02.025425Z", 581 | "start_time": "2019-03-27T14:37:02.021137Z" 582 | } 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "# Union\n", 587 | "print({1, 2, 3, 'mom', 'dad'} | {2, 3, 10})\n", 588 | "\n", 589 | "# Intersection\n", 590 | "print({1, 2, 3, 'mom', 'dad'} & {2, 3, 10})\n", 591 | "\n", 592 | "# Difference\n", 593 | "print({1, 2, 3, 'mom', 'dad'} - {2, 3, 10})" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "## Modules\n", 601 | "A *Module* is a python file that contains a collection of related definitions. Python has *hundreds* of standard modules. These are organized into what is known as the [Python Standard Library](http://docs.python.org/library/). You can also create and use your own modules. 
To use functionality from a module, you first have to import the entire module or parts of it into your namespace\n", 602 | "\n", 603 | "* To import the entire module, use \n", 604 | "\n", 605 | " ```python\n", 606 | " import module_name```\n", 607 | "\n", 608 | "* You can also import a module using a specific name \n", 609 | "\n", 610 | " ```python\n", 611 | " import module_name as new_module_name```\n", 612 | "\n", 613 | "* To import specific definitions (e.g. functions, variables, etc) from the module into your local namespace, use\n", 614 | "\n", 615 | "```python\n", 616 | "from module_name import name1, name2\n", 617 | "```\n", 618 | " which will make those available directly in your ```namespace```" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "ExecuteTime": { 626 | "end_time": "2017-01-09T09:18:26.329329", 627 | "start_time": "2017-01-09T09:18:26.324573" 628 | } 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "import os\n", 633 | "from glob import glob" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "Here let's try and get the path of the current working directory using functions from the `os` module" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "os.path.abspath(os.path.curdir)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "It looks like we are currently in the notebooks folder of the github repository. Let's use glob, a pattern matching function, to list all of the csv files in the Data folder. 
" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": { 663 | "ExecuteTime": { 664 | "end_time": "2019-03-27T14:37:10.021529Z", 665 | "start_time": "2019-03-27T14:37:09.940612Z" 666 | } 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "data_file_list = glob(os.path.join('..','Data','*csv'))\n", 671 | "print(data_file_list)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "This gives us a list of the files including the relative path from the current directory. What if we wanted just the filenames? There are several different ways to do this. First, we can use the the `os.path.basename` function. We loop over every file, grab the base file name and then append it to a new list. " 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "ExecuteTime": { 686 | "end_time": "2017-01-09T09:26:35.521842", 687 | "start_time": "2017-01-09T09:26:35.517653" 688 | } 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "file_list = []\n", 693 | "for f in data_file_list:\n", 694 | " file_list.append(os.path.basename(f))\n", 695 | "\n", 696 | "print(file_list)" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Alternatively, we could loop over all files and split on the `/` character. This will create a new list where each element is whatever characters are separated by the splitting character. We can then take the last element of each list." 
704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": { 710 | "ExecuteTime": { 711 | "end_time": "2017-01-09T09:27:57.077454", 712 | "start_time": "2017-01-09T09:27:57.073701" 713 | } 714 | }, 715 | "outputs": [], 716 | "source": [ 717 | "file_list = []\n", 718 | "for f in data_file_list:\n", 719 | " file_list.append(f.split('/')[-1])\n", 720 | "\n", 721 | "print(file_list)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "It is also sometimes even cleaner to do this as a list comprehension" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "ExecuteTime": { 736 | "end_time": "2017-01-09T09:28:39.734003", 737 | "start_time": "2017-01-09T09:28:39.729813" 738 | } 739 | }, 740 | "outputs": [], 741 | "source": [ 742 | "[os.path.basename(x) for x in data_file_list]" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "# Exercises" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": {}, 755 | "source": [ 756 | "### Find Even Numbers\n", 757 | "Let’s say I give you a list saved in a variable: a = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]. Make a new list that has only the even elements of this list in it." 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "### Find Maximal Range\n", 772 | "Given an array length 1 or more of ints, return the difference between the largest and smallest values in the array. 
" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": null, 778 | "metadata": {}, 779 | "outputs": [], 780 | "source": [] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "### Duplicated Numbers\n", 787 | "Find the numbers in list a that are also in list b\n", 788 | "\n", 789 | "a = [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361]\n", 790 | "\n", 791 | "b = [0, 4, 16, 36, 64, 100, 144, 196, 256, 324]" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 805 | "### Speeding Ticket Fine\n", 806 | "You are driving a little too fast on the highway, and a police officer stops you. Write a function that takes the speed as an input and returns the fine. Your function must use a dictionary.\n", 807 | "\n", 808 | "\n", 809 | "If speed is 60 or less, the result is `$0`. If speed is between 61 and 80 inclusive, the result is `$100`. If speed is 81 or more, the result is `$500`. 
\n" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [] 818 | } 819 | ], 820 | "metadata": { 821 | "kernelspec": { 822 | "display_name": "Python 3", 823 | "language": "python", 824 | "name": "python3" 825 | }, 826 | "language_info": { 827 | "codemirror_mode": { 828 | "name": "ipython", 829 | "version": 3 830 | }, 831 | "file_extension": ".py", 832 | "mimetype": "text/x-python", 833 | "name": "python", 834 | "nbconvert_exporter": "python", 835 | "pygments_lexer": "ipython3", 836 | "version": "3.7.2" 837 | }, 838 | "toc": { 839 | "base_numbering": 1, 840 | "nav_menu": {}, 841 | "number_sections": true, 842 | "sideBar": true, 843 | "skip_h1_title": false, 844 | "title_cell": "Table of Contents", 845 | "title_sidebar": "Contents", 846 | "toc_cell": false, 847 | "toc_position": {}, 848 | "toc_section_display": true, 849 | "toc_window_display": false 850 | } 851 | }, 852 | "nbformat": 4, 853 | "nbformat_minor": 1 854 | } 855 | -------------------------------------------------------------------------------- /Notebooks/2.1_MatplotlibTemplates.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-03-27T14:37:58.582789Z", 9 | "start_time": "2019-03-27T14:37:57.460651Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "%matplotlib inline\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "from scipy import stats\n", 19 | "\n", 20 | "def autolabel(rects):\n", 21 | " # attach some text labels\n", 22 | " for rect in rects:\n", 23 | " height = rect.get_height()\n", 24 | " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n", 25 | " '%2.2f' % float(height),\n", 26 | " ha='center', va='bottom')" 27 | ] 28 | }, 29 | { 30 | "cell_type": 
"markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Simple Plots in Python\n", 34 | "\n", 35 | "In this tutorial we'll show you some basic templates of scientific plots using Python matplotlib.\n", 36 | "\n", 37 | "# Bar graphs with standard error bars for 1 group" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "ExecuteTime": { 45 | "end_time": "2019-03-27T14:38:02.687415Z", 46 | "start_time": "2019-03-27T14:38:02.318322Z" 47 | }, 48 | "scrolled": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "# based on http://matplotlib.org/examples/api/barchart_demo.html\n", 53 | "\n", 54 | "# Make some fake data\n", 55 | "d = {'gender': np.hstack([np.ones(10), np.zeros(10)]), 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n", 56 | "df = pd.DataFrame(d)\n", 57 | "\n", 58 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n", 59 | "vals = ['scores'] # This is the column name of the variable to plot on Y axis\n", 60 | "group = ['gender'] # This is the grouping variable for the X axis\n", 61 | "\n", 62 | "# Get means for each group\n", 63 | "means = df[vals+group].groupby(group).mean().squeeze()\n", 64 | "# Get standard error of means for each group\n", 65 | "sems = df[vals+group].groupby(group).sem().squeeze()\n", 66 | "\n", 67 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n", 68 | "ind = np.arange(np.size(np.unique(df[group]),0)) # location of bars\n", 69 | "width = .5 # Width of bars\n", 70 | "# (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error,ecolor=errorbarcolor)\n", 71 | "rects1 = ax.bar(ind - width/2,means,width=.5,color='lightsalmon',yerr=sems,ecolor='blue') \n", 72 | "# Look up different colors here: http://stackoverflow.com/questions/22408237/named-colors-in-matplotlib\n", 73 | "\n", 74 | "# configure axes properties to make pretty\n", 75 | 
"ax.set_ylabel('scores')\n", 76 | "ax.set_xlabel('gender')\n", 77 | "ax.set_title('Scores by gender')\n", 78 | "ax.set_xticks(ind)\n", 79 | "ax.set_xticklabels(['Male','Female'])\n", 80 | "ax.set_xlim([-.5,1.5]) \n", 81 | "ax.set_ylim([0,2])\n", 82 | "\n", 83 | "# This part calls the function autolabel() defined above, and labels the bars with values\n", 84 | "autolabel(rects1)\n", 85 | "\n", 86 | "plt.show()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "# Bar graphs with standard error bars for 2 group" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "ExecuteTime": { 101 | "end_time": "2019-03-27T14:38:07.243834Z", 102 | "start_time": "2019-03-27T14:38:07.090095Z" 103 | } 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "# Make some fake data\n", 108 | "d = {'race': np.random.permutation(np.hstack([np.ones(10), np.zeros(10)])), \n", 109 | " 'gender': np.hstack([np.ones(10), np.zeros(10)]), \n", 110 | " 'scores': np.hstack([np.random.rand(10), np.random.rand(10)+1])}\n", 111 | "df = pd.DataFrame(d)\n", 112 | "\n", 113 | "# Change this part and replace with the variables you want to plot and the grouping variable column name.\n", 114 | "val =['scores']\n", 115 | "group1 = ['gender']\n", 116 | "group2 = ['race']\n", 117 | "\n", 118 | "# Get means and sems for Gender group\n", 119 | "means1 = df[val+group1].groupby(group1).mean().squeeze()\n", 120 | "sems1 = df[val+group1].groupby(group1).sem().squeeze()\n", 121 | "# Get means and sems for Race group\n", 122 | "means2 = df[val+group2].groupby(group2).mean().squeeze()\n", 123 | "sems2 = df[val+group2].groupby(group2).sem().squeeze()\n", 124 | "\n", 125 | "fig,ax = plt.subplots(figsize=(10,5)) # Change figure size in (width,height)\n", 126 | "ind = np.array([0.,1.]) # location of bars\n", 127 | "width = .4 # Width of bars\n", 128 | "\n", 129 | "# plot score by gender\n", 130 | "rects1 = ax.bar(ind - 
width,means1,width,color='lightcoral',yerr=sems1,ecolor='k') # (bar x-location, bar heights, width=bar width, color=bar color, yerr=standard error)\n", 131 | "# plot score by race \n", 132 | "rects2 = ax.bar(ind,means2,width,color='lightblue',yerr=sems2,ecolor='k')\n", 133 | "\n", 134 | "# configure axes properties to make pretty\n", 135 | "ax.set_ylabel('scores')\n", 136 | "ax.set_xlabel('gender')\n", 137 | "ax.set_title('Scores by gender and race')\n", 138 | "ax.set_xticks(ind)\n", 139 | "ax.set_xticklabels(['Male','Female'])\n", 140 | "ax.set_xlim([ind[0]-width*1.25,ind[-1]+width*1.25]) \n", 141 | "ax.set_ylim([0,1.8])\n", 142 | "\n", 143 | "ax.legend(['Race0','Race1'])\n", 144 | "\n", 145 | "autolabel(rects1)\n", 146 | "autolabel(rects2)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# Scatterplots of 1 group with jittered location\n", 154 | "\n", 155 | "If you try to plot something like a scaled data, you won't be able to see how clustered they are because they would just plot on top of each other. One way to avoid this is to jitter the x,y locations around the actual value." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "ExecuteTime": { 163 | "end_time": "2019-03-27T14:38:10.892018Z", 164 | "start_time": "2019-03-27T14:38:10.778229Z" 165 | }, 166 | "scrolled": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "# Make some fake data\n", 171 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n", 172 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n", 173 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n", 174 | "df = pd.DataFrame(d)\n", 175 | "ax = df.plot(kind='scatter',x='gender',y='scores')\n", 176 | "ax.set_title('Values are stacked')\n", 177 | "plt.show()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Here is the fix. " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "ExecuteTime": { 192 | "end_time": "2019-03-27T14:38:14.040412Z", 193 | "start_time": "2019-03-27T14:38:13.939597Z" 194 | } 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "# Set x,y values for each group\n", 199 | "gender0 = 0 # value of first group\n", 200 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze() # Grabs y values for Gender =0\n", 201 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n", 202 | "x0 = np.ones(len(y0))*gender0 +(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n", 203 | "\n", 204 | "gender1 = 1 # value of second group\n", 205 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n", 206 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n", 207 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n", 208 | "\n", 209 | "fig,ax = plt.subplots(figsize=(5,5))\n", 210 | "ax.scatter(x0,y0,color='lightcoral')\n", 211 | "ax.scatter(x1,y1,color='lightcoral')\n", 212 | "ax.set_ylabel('scores')\n", 
213 | "ax.set_xlabel('gender')\n", 214 | "ax.set_title('Values are now dispersed')\n", 215 | "ax.set_xticks([0,1])\n", 216 | "ax.set_xticklabels(['Male','Female'])\n", 217 | "ax.set_xlim([-.5,1.5]) \n", 218 | "ax.grid() # puts grid on\n", 219 | "plt.show()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "source": [ 228 | "# Drawing trend line on a scatterplot" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "ExecuteTime": { 236 | "end_time": "2019-03-27T14:38:31.813333Z", 237 | "start_time": "2019-03-27T14:38:31.403344Z" 238 | } 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "import statsmodels.formula.api as smf\n", 243 | "import statsmodels.api as sm\n", 244 | "\n", 245 | "d = {'race': np.random.permutation(np.hstack([np.ones(20), np.zeros(20)])),\n", 246 | " 'gender': np.hstack([np.ones(20), np.zeros(20)]), \n", 247 | " 'scores': np.round(10*np.hstack([np.random.rand(20), np.random.rand(20)+1]))}\n", 248 | "df = pd.DataFrame(d)\n", 249 | "lm = smf.ols(formula = \"scores ~ gender\",data=df).fit()\n", 250 | "print(lm.summary())\n", 251 | "\n", 252 | "# Save the slope for gender to b1 and intercept to b0\n", 253 | "b1 = lm.params[1] # This is slope\n", 254 | "b0 = lm.params[0] # This is intercept" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2019-03-27T14:38:34.895112Z", 263 | "start_time": "2019-03-27T14:38:34.792217Z" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# Set x,y values for each group\n", 269 | "gender0 = 0 # value of first group\n", 270 | "y0 = df[['scores']].loc[df['gender']==gender0].values.squeeze()\n", 271 | "y0 = y0+(np.random.rand(len(y0))-.5)*.1 #Change after + sign to control dispersion\n", 272 | "x0 = np.ones(len(y0))*gender0 + (np.random.rand(len(y0))-.5)*.1 #Change after + sign to control 
dispersion\n", 273 | "\n", 274 | "gender1 = 1 # value of second group\n", 275 | "y1 = df[['scores']].loc[df['gender']==gender1].values.squeeze()\n", 276 | "y1 = y1+(np.random.rand(len(y1))-.5)*.1\n", 277 | "x1 = np.ones(len(y1))*gender1 + (np.random.rand(len(y1))-.5)*.1\n", 278 | "\n", 279 | "fig,ax = plt.subplots(figsize=(5,5))\n", 280 | "ax.scatter(x0,y0,color='lightcoral')\n", 281 | "ax.scatter(x1,y1,color='lightcoral')\n", 282 | "\n", 283 | "# Part that adds the line\n", 284 | "spacing = 10\n", 285 | "minx = df[['gender']].min().squeeze()\n", 286 | "maxx = df[['gender']].max().squeeze()\n", 287 | "lx = np.linspace(minx,maxx,spacing) # make x coordinates \n", 288 | "ly = b0+lx*b1 # Estimate the y values using betas\n", 289 | "ax.plot(lx,ly,'-k')\n", 290 | "\n", 291 | "ax.set_ylabel('scores')\n", 292 | "ax.set_xlabel('gender')\n", 293 | "ax.set_title('Values are now dispersed')\n", 294 | "ax.set_xticks([0,1])\n", 295 | "ax.set_xticklabels(['Male','Female'])\n", 296 | "ax.set_xlim([-.5,1.5]) \n", 297 | "ax.grid()\n", 298 | "plt.show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [] 314 | } 315 | ], 316 | "metadata": { 317 | "kernelspec": { 318 | "display_name": "Python 3", 319 | "language": "python", 320 | "name": "python3" 321 | }, 322 | "language_info": { 323 | "codemirror_mode": { 324 | "name": "ipython", 325 | "version": 3 326 | }, 327 | "file_extension": ".py", 328 | "mimetype": "text/x-python", 329 | "name": "python", 330 | "nbconvert_exporter": "python", 331 | "pygments_lexer": "ipython3", 332 | "version": "3.7.2" 333 | }, 334 | "toc": { 335 | "base_numbering": 1, 336 | "nav_menu": {}, 337 | "number_sections": true, 338 | "sideBar": true, 339 | "skip_h1_title": false, 340 | "title_cell": "Table of Contents", 341 | 
"title_sidebar": "Contents", 342 | "toc_cell": false, 343 | "toc_position": {}, 344 | "toc_section_display": true, 345 | "toc_window_display": false 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 1 350 | } 351 | -------------------------------------------------------------------------------- /Notebooks/3_Introduction_to_Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Linear Regression Analysis\n", 11 | "\n", 12 | "*Written by Jin Cheong & Luke Chang*\n", 13 | "\n", 14 | "In this lab we are going to learn how to do simple data analyses using python. \n", 15 | "This module includes learning how to run simple linear regression models, comparing models, & visualizing fits. - This notebook was adapted from material available [here](https://github.com/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb)\n", 16 | "\n", 17 | "After the tutorial you will have the chance to apply the methods to a new set of data. 
\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "ExecuteTime": { 25 | "end_time": "2017-01-18T14:44:10.530703", 26 | "start_time": "2017-01-18T14:44:09.731729" 27 | }, 28 | "collapsed": true, 29 | "deletable": true, 30 | "editable": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "%matplotlib inline\n", 35 | "\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "import statsmodels.api as sm\n", 40 | "import statsmodels.formula.api as smf\n", 41 | "import statsmodels.stats.api as stats" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "deletable": true, 48 | "editable": true 49 | }, 50 | "source": [ 51 | "We will load the data which is a comma delimited text file using the `read_csv()` method from pandas. \n", 52 | "The '../Data' indicates that we will move one folder up and then into the Data folder." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "ExecuteTime": { 60 | "end_time": "2017-01-18T14:44:10.536720", 61 | "start_time": "2017-01-18T14:44:10.532385" 62 | }, 63 | "collapsed": false, 64 | "deletable": true, 65 | "editable": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Load the data\n", 70 | "df = pd.read_csv('../Data/salary.csv')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "ExecuteTime": { 78 | "end_time": "2017-01-18T14:44:10.543760", 79 | "start_time": "2017-01-18T14:44:10.537887" 80 | }, 81 | "collapsed": false, 82 | "deletable": true, 83 | "editable": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df = df.dropna()\n", 88 | "df = df[df.gender!=2]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "ExecuteTime": { 96 | "end_time": "2017-01-18T14:44:10.554847", 97 | "start_time": "2017-01-18T14:44:10.545364" 98 | }, 99 | 
"collapsed": false, 100 | "deletable": true, 101 | "editable": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "print 'There are %i rows and %i columns in this data set' % df.shape\n", 106 | "df.isnull().sum()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "ExecuteTime": { 114 | "end_time": "2017-01-18T14:44:10.765524", 115 | "start_time": "2017-01-18T14:44:10.556141" 116 | }, 117 | "collapsed": false, 118 | "deletable": true, 119 | "editable": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "df[['salary','years']].plot(kind='scatter', x='years', y='salary')" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2017-01-18T14:44:10.810764", 132 | "start_time": "2017-01-18T14:44:10.767183" 133 | }, 134 | "collapsed": false, 135 | "deletable": true, 136 | "editable": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "# Oneshot visualization\n", 141 | "## create a new numericalSeries called dept_num !! Just for visualization !! 
\n", 142 | "df['dept_num'] = df.departm.map({'bio':0, 'chem':1,'geol':2,'neuro':3,'stat':4,'physics':5,'math':6})\n", 143 | "df.tail()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "ExecuteTime": { 151 | "end_time": "2017-01-18T14:44:11.455813", 152 | "start_time": "2017-01-18T14:44:10.828546" 153 | }, 154 | "collapsed": false, 155 | "deletable": true, 156 | "editable": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "## Now plot all four categories\n", 161 | "fig, axs = plt.subplots(1, 4, sharey=True)\n", 162 | "df.plot(kind='scatter', x='gender', y='salary', ax=axs[0], figsize=(16, 4))\n", 163 | "df.plot(kind='scatter', x='dept_num', y='salary', ax=axs[1])\n", 164 | "df.plot(kind='scatter', x='years', y='salary', ax=axs[2])\n", 165 | "df.plot(kind='scatter', x='age', y='salary', ax=axs[3])\n", 166 | "# The problem is that it treats department as a continuous variable. " 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "deletable": true, 173 | "editable": true 174 | }, 175 | "source": [ 176 | "## Linear Regression\n", 177 | "\n", 178 | "Now let's move on to running an analysis.\n", 179 | "\n", 180 | "Simple linear regression is an approach for predicting a **quantitative response** using a **single feature** (or \"predictor\" or \"input variable\"). It takes the following form:\n", 181 | "\n", 182 | "$y = \\beta_0 + \\beta_1\\cdot x $\n", 183 | "\n", 184 | "What does each term represent?\n", 185 | "- $y$ is the response\n", 186 | "- $x$ is the feature\n", 187 | "- $\\beta_0$ is the intercept\n", 188 | "- $\\beta_1$ is the coefficient for x\n", 189 | "\n", 190 | "Together, $\\beta_0$ and $\\beta_1$ are called the **model coefficients**. To create your model, you must \"estimate\" the values of these coefficients. And once we've estimated these parameters, we can use the model to predict things!" 
191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "deletable": true, 197 | "editable": true 198 | }, 199 | "source": [ 200 | "## Estimating Model Coefficients\n", 201 | "\n", 202 | "Generally speaking, coefficients are estimated using the **least squares criterion**, which means we find the line (mathematically) which minimizes the **sum of squared residuals** (or \"sum of squared errors\"):\n", 203 | "\n", 204 | "\n", 205 | "\n", 206 | "What elements are present in the diagram?\n", 207 | "- The black dots are the **observed values** of x and y.\n", 208 | "- The blue line is our **least squares line**.\n", 209 | "- The red lines are the **residuals**, which are the distances between the observed values and the least squares line.\n", 210 | "\n", 211 | "How do the model coefficients relate to the least squares line?\n", 212 | "- $\\beta_0$ is the **intercept** (the value of $y$ when $x$=0)\n", 213 | "- $\\beta_1$ is the **slope** (the change in $y$ divided by change in $x$)\n", 214 | "\n", 215 | "Here is a graphical depiction of those calculations:\n", 216 | "\n", 217 | "" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "deletable": true, 224 | "editable": true 225 | }, 226 | "source": [ 227 | "To run a regression we will be using the statsmodels module that was loaded above. We will first initialize a model object with the regression equation and the data to use. The format for specifying the regression equation is similar to R (y ~ x).\n", 228 | "\n", 229 | "Here we will estimate the effect of gender on salary. Gender is a \"dummy variable\", meaning that it is only ones and zeros.\n", 230 | "\n", 231 | "After we have initialized the object instance we can run the fit method (this can be chained so you only need to run one line). After the parameters have been estimated we can query attributes of the model such as the estimated model parameters."
232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "ExecuteTime": { 239 | "end_time": "2017-01-18T14:44:11.484659", 240 | "start_time": "2017-01-18T14:44:11.473799" 241 | }, 242 | "collapsed": false, 243 | "deletable": true, 244 | "editable": true, 245 | "scrolled": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "lm = smf.ols(formula = \"salary ~ gender\",data=df).fit()\n", 250 | "\n", 251 | "# print the coefficients\n", 252 | "print(lm.params)\n", 253 | "print(lm.params[1])" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "deletable": true, 260 | "editable": true 261 | }, 262 | "source": [ 263 | "## Interpreting Model Coefficients\n", 264 | "\n", 265 | "How do we interpret the estimated coefficients for this model?\n", 266 | "- $\\beta_0$, or the intercept, reflects the average salary for the reference condition of the dummy code (i.e., when gender = 0). This means that males make an average of `$69108.5`\n", 267 | "- to get the estimated female salary we need to add $\\beta_1$ (-13388.83), which becomes `$55719.67`\n", 268 | "\n", 269 | "We can also get a summary of all of the model statistics using the `summary()` method." 
270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "deletable": true, 276 | "editable": true 277 | }, 278 | "source": [ 279 | " " 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "ExecuteTime": { 287 | "end_time": "2017-01-18T14:44:11.965467", 288 | "start_time": "2017-01-18T14:44:11.954564" 289 | }, 290 | "collapsed": false, 291 | "deletable": true, 292 | "editable": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "print(lm.summary())" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "deletable": true, 303 | "editable": true 304 | }, 305 | "source": [ 306 | "## Confidence in our Model\n", 307 | "\n", 308 | "Now let's examine the output. To learn more about Statsmodels and how to interpret the output, DataRobot has some decent posts on [simple linear regression](http://www.datarobot.com/blog/ordinary-least-squares-in-python/) and [multiple linear regression](http://www.datarobot.com/blog/multiple-regression-using-statsmodels/).\n", 309 | "\n", 310 | "\n", 311 | "**Question:** Is linear regression a high bias/low variance model, or a low bias/high variance model?\n", 312 | "\n", 313 | "**Answer:** High bias/low variance. Under repeated sampling, the line will stay roughly in the same place (low variance), but the average of those models won't do a great job capturing the true relationship (high bias). Note that low variance is a useful characteristic when you don't have a lot of training data!\n", 314 | "\n", 315 | "A closely related concept is **confidence intervals**. 
Statsmodels calculates 95% confidence intervals for our model coefficients, which are interpreted as follows: If the population from which this sample was drawn was **sampled 100 times**, approximately **95 of those confidence intervals** would contain the \"true\" coefficient.\n", 316 | "\n", 317 | "## Hypothesis Testing and p-values\n", 318 | "\n", 319 | "Closely related to confidence intervals is **hypothesis testing**. Generally speaking, you start with a **null hypothesis** and an **alternative hypothesis** (that is opposite the null). Then, you check whether the data supports **rejecting the null hypothesis** or **failing to reject the null hypothesis**.\n", 320 | "\n", 321 | "(Note that \"failing to reject\" the null is not the same as \"accepting\" the null hypothesis. The alternative hypothesis may indeed be true, except that you just don't have enough data to show that.)\n", 322 | "\n", 323 | "As it relates to model coefficients, here is the conventional hypothesis test:\n", 324 | "- **null hypothesis:** There is no relationship between gender and salary (and thus $\\beta_1$ equals zero)\n", 325 | "- **alternative hypothesis:** There is a relationship between gender and salary (and thus $\\beta_1$ is not equal to zero)\n", 326 | "\n", 327 | "How do we test this hypothesis? Intuitively, we reject the null (and thus believe the alternative) if the 95% confidence interval **does not include zero**. Conversely, the **p-value** represents the probability that the coefficient is actually zero:\n", 328 | "\n", 329 | "## Overall Model Fit\n", 330 | "\n", 331 | "The overall fit of the model to the data can be described using the AIC or BIC, which are information criteria that penalize for the number of free parameters in the model. A common way to evaluate the overall fit of a linear model is the **$R^2$** value. 
$R^2$ is the **proportion of variance explained**, meaning the proportion of variance in the observed data that is explained by the model, or the reduction in error over the **null model**. (The null model just predicts the mean of the observed response, and thus it has an intercept and no slope.)\n", 332 | "\n", 333 | "$R^2$ is between 0 and 1, and higher is better because it means that more variance is explained by the model. Here's an example of what R-squared \"looks like\":\n", 334 | "\n", 335 | "In the model above, we see that $R^2$=0.09, which means that gender does not do a great job at predicting the overall salary." 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "deletable": true, 342 | "editable": true 343 | }, 344 | "source": [ 345 | "## Multiple Linear Regression\n", 346 | "\n", 347 | "Simple linear regression can easily be extended to include multiple features. This is called **multiple linear regression**:\n", 348 | "\n", 349 | "$y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$\n", 350 | "\n", 351 | "Each $x$ represents a different feature, and each feature has its own coefficient. 
In this case:\n", 352 | "\n", 353 | "$Salary = \\beta_0 + \\beta_1 \\cdot Gender + \\beta_2 \\cdot Years$\n", 354 | "\n", 355 | "Let's use Statsmodels to estimate these coefficients:" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "ExecuteTime": { 363 | "end_time": "2017-01-18T14:44:12.437296", 364 | "start_time": "2017-01-18T14:44:12.419615" 365 | }, 366 | "collapsed": false, 367 | "deletable": true, 368 | "editable": true 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "lm2 = smf.ols(formula = \"salary ~ gender + years\",data=df).fit()\n", 373 | "print(lm2.summary())" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": { 379 | "deletable": true, 380 | "editable": true 381 | }, 382 | "source": [ 383 | "Here we see that `years` also significantly explains additional variance.\n", 384 | "\n", 385 | "This is reflected in the $R^2$, which has now increased from 0.09 to 0.143. It is important to note that **$R^2$ will always increase as you add more features to the model**, even if they are unrelated to the response. Thus, selecting the model with the highest R-squared is not a reliable approach for choosing the best linear model.\n", 386 | "\n", 387 | "There is alternative to $R^2$ called **adjusted $R^2$** that penalizes model complexity (to control for overfitting), but it generally [under-penalizes complexity](http://scott.fortmann-roe.com/docs/MeasuringError.html).\n", 388 | "\n", 389 | "Next week we will explore another approach known as **Cross-validation.** Importantly, cross-validation can be applied to any model, whereas the methods described above only apply to linear models.\n", 390 | "\n", 391 | "Here, we will try and see if the new model significantly explains additional variance controlling for the extra free parameter using a nested model comparison." 
392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "ExecuteTime": { 399 | "end_time": "2017-01-18T14:44:12.769250", 400 | "start_time": "2017-01-18T14:44:12.755716" 401 | }, 402 | "collapsed": false, 403 | "deletable": true, 404 | "editable": true 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "print(stats.anova_lm(lm,lm2))" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": { 414 | "deletable": true, 415 | "editable": true 416 | }, 417 | "source": [ 418 | "We see that the addition of `years` in model2 significantly explains additional variance when compared to model 1. Notice that this is similar to the results provided by the t-tests on the individual parameters. Using model comparison to test incrementally adding a single predictor is a feature selection method known as stepwise regression. However, you can also perform model comparison on models with many different features as long as one is nested within the other.\n", 419 | "\n", 420 | "When we only had one predictor we could investigate the relationship using a scatterplot. Now that we have 2 predictors, we need to make a 3 dimensional plot to see the effect of each predictor on salary. 
" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "ExecuteTime": { 428 | "end_time": "2017-01-18T14:44:13.989664", 429 | "start_time": "2017-01-18T14:44:13.091418" 430 | }, 431 | "collapsed": false, 432 | "deletable": true, 433 | "editable": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "from mpl_toolkits.mplot3d import Axes3D\n", 438 | "\n", 439 | "fig = plt.figure(figsize=(18,6))\n", 440 | "idx = [131,132,133]\n", 441 | "pos = {0:(-35,20),1:(-90,0),2:(0,0)}\n", 442 | "for i,ii in enumerate(idx):\n", 443 | " ax = fig.add_subplot(ii, projection='3d')\n", 444 | "\n", 445 | " # generate a mesh\n", 446 | " x_surf = np.arange(0,11)/10.0\n", 447 | " y_surf = np.arange(0,35,3)\n", 448 | " x_surf, y_surf = np.meshgrid(x_surf, y_surf)\n", 449 | "\n", 450 | " exog = pd.core.frame.DataFrame({'gender': x_surf.ravel(), 'years': y_surf.ravel()})\n", 451 | " exog['intercept'] = np.ones(exog.shape[0])\n", 452 | " out = lm2.predict(exog = exog,transform=True)\n", 453 | "\n", 454 | " ax.plot_surface(x_surf, y_surf,\n", 455 | " out.reshape(x_surf.shape),\n", 456 | " rstride=1,\n", 457 | " cstride=1,\n", 458 | " color='None',\n", 459 | " alpha = 0.2)\n", 460 | "\n", 461 | " ax.scatter(df['gender'], df['years'], df['salary'],\n", 462 | " c='blue',\n", 463 | " marker='o',\n", 464 | " alpha=1)\n", 465 | "\n", 466 | " ax.set_xlabel('Gender')\n", 467 | " ax.set_ylabel('Years')\n", 468 | " ax.set_zlabel('Salary')\n", 469 | " ax.azim = pos[i][0]\n", 470 | " ax.elev = pos[i][1]\n", 471 | "\n", 472 | "plt.tight_layout()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "deletable": true, 479 | "editable": true 480 | }, 481 | "source": [ 482 | "Notice how when we rotate the 3D plot, we can make projections into 2 dimensional space. This is the effect of each predictor on salary controlling for the other. \n", 483 | "\n", 484 | "We can also generate additional diagnostic plots." 
485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "ExecuteTime": { 492 | "end_time": "2017-01-18T14:44:35.826100", 493 | "start_time": "2017-01-18T14:44:35.156463" 494 | }, 495 | "collapsed": false, 496 | "deletable": true, 497 | "editable": true, 498 | "scrolled": false 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "f = sm.graphics.plot_regress_exog(lm2, 'years')\n", 503 | "f.set_size_inches(10,5)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": { 509 | "deletable": true, 510 | "editable": true 511 | }, 512 | "source": [ 513 | "Lastly, we can create a standard ANOVA table with F statistics as if doing an ANOVA. The F statistic tests whether at least one of the coefficients of the variables of a category is significantly different from 0 or not.\n", 514 | "\n", 515 | "There are different types of Sum of Squared Error (SSQ) to consider. We recommend using Type 3 by default. See [here](https://mcfromnz.wordpress.com/2011/03/02/anova-type-iiiiii-ss-explained/) for a more in-depth discussion." 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "ExecuteTime": { 523 | "end_time": "2017-01-18T14:44:14.602669", 524 | "start_time": "2017-01-18T14:44:14.590320" 525 | }, 526 | "collapsed": false, 527 | "deletable": true, 528 | "editable": true 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "# Type 3 SSQ: valid for models with significant interaction terms\n", 533 | "print('ANOVA table for Type 3 SSQ')\n", 534 | "print(stats.anova_lm(lm2,typ=3))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": { 540 | "deletable": true, 541 | "editable": true 542 | }, 543 | "source": [ 544 | "## Additional Resources for Learning about Regression\n", 545 | "\n", 546 | "- To go much more in-depth on linear regression, read Chapter 3 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/). 
Alternatively, watch the [related videos](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/) or read a [quick reference guide](http://www.dataschool.io/applying-and-interpreting-linear-regression/) to the key points in that chapter.\n", 547 | "- This [introduction to linear regression](http://people.duke.edu/~rnau/regintro.htm) is much more detailed and mathematically thorough, and includes lots of good advice.\n", 548 | "- This is a relatively quick post on the [assumptions of linear regression](http://pareonline.net/getvn.asp?n=2&v=8)." 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": { 554 | "collapsed": false, 555 | "deletable": true, 556 | "editable": true 557 | }, 558 | "source": [ 559 | "# Exercises\n", 560 | "Use the 'salary_exercise.csv' to apply the analyses we learned today.\n", 561 | "This dataset was adapted from material available [here](http://data.princeton.edu/wws509/datasets/#salary)\n", 562 | "\n", 563 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n", 564 | "\n", 565 | "- sx = Sex, coded 1 for female and 0 for male\n", 566 | "- rk = Rank, coded\n", 567 | " - 1 for assistant professor,\n", 568 | " - 2 for associate professor, and\n", 569 | " - 3 for full professor\n", 570 | "- yr = Number of years in current rank\n", 571 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n", 572 | "- yd = Number of years since highest degree was earned\n", 573 | "- sl = Academic year salary, in dollars.\n", 574 | "\n", 575 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. Page 194.\n", 576 | "##### Make sure you check the data for missing values or other errors before beginning your analysis.\n", 577 | "\n", 578 | "## Exercise 1\n", 579 | "Run a simple linear regression model in which you predict salary from sex of the professors. 
\n", 580 | "Is there a gender difference in salary? \n", 581 | "\n", 582 | "## Exercise 2\n", 583 | "Experiment with different combination of variables in your model to answer the following questions. \n", 584 | "What is your best model to explain salary from the given data? \n", 585 | "Is there a gender difference in salary?" 586 | ] 587 | } 588 | ], 589 | "metadata": { 590 | "anaconda-cloud": {}, 591 | "kernelspec": { 592 | "display_name": "Python 2", 593 | "language": "python", 594 | "name": "python2" 595 | }, 596 | "language_info": { 597 | "codemirror_mode": { 598 | "name": "ipython", 599 | "version": 2 600 | }, 601 | "file_extension": ".py", 602 | "mimetype": "text/x-python", 603 | "name": "python", 604 | "nbconvert_exporter": "python", 605 | "pygments_lexer": "ipython2", 606 | "version": "2.7.13" 607 | }, 608 | "toc": { 609 | "nav_menu": { 610 | "height": "262px", 611 | "width": "252px" 612 | }, 613 | "navigate_menu": true, 614 | "number_sections": true, 615 | "sideBar": true, 616 | "threshold": 4, 617 | "toc_cell": false, 618 | "toc_section_display": "block", 619 | "toc_window_display": false 620 | } 621 | }, 622 | "nbformat": 4, 623 | "nbformat_minor": 0 624 | } 625 | -------------------------------------------------------------------------------- /Notebooks/4_Mediation_&_Moderation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Mediation Analysis\n", 11 | "Written by Jin Cheong & Luke Chang\n", 12 | "\n", 13 | "A mediation analysis is conducted when a researcher is interested in the **mechanism** underlying how variable X has an effect on variable Y. It attempts to make a causal inference that a direct effect might be better explained by an indirect effect through a mediating variable. 
\n", 14 | "\n", 15 | "Consider the instance below where X has an effect **c** on Y:\n", 16 | "\n", 17 | "### 1) $Y = \\beta_1 + c \\cdot X $\n", 18 | "\n", 19 | "In this model, there may be a third variable **M** which mediates the effect of X on Y. In other words, the variable M is partially responsible for the effect X has on Y.\n", 20 | "\n", 21 | "To conduct a mediation analysis one estimates two additional models: 1) the effect of X on M, and 2) the effect of X *and* M on Y. \n", 22 | "\n", 23 | "### 2) $M = \\beta_2 + a \\cdot X $\n", 24 | "\n", 25 | "### 3) $Y = \\beta_3 + c' \\cdot X + b \\cdot M $ \n", 26 | " \n", 27 | " \n", 28 | "Now the direct effect of X on Y, denoted as **C**,can be broken down into two parts:\n", 29 | "\n", 30 | "### $c = a \\cdot b + c' $\n", 31 | "\n", 32 | "$ a \\cdot b $ is the indirect effect of X on Y via the mediator. \n", 33 | "$c'$ is the remaining direct effect of X on Y controlling for M. \n", 34 | "\n", 35 | "This relationship is depicted below. Note that M and Y both also have error included in the model.\n", 36 | "\n", 37 | "***Question*** Why does X not have error included in the model?\n", 38 | "\n", 39 | "***Answer*** Because X is only a regressor not an outcome variable in the models and standard regression does not estimate error on regressors. See [orthogonal regression](http://davegiles.blogspot.com/2014/11/orthogonal-regression-first-steps.html) for a technique that models error on both X and Y.\n", 40 | "\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "### Here are a few examples of mediations. Can you think of more?\n", 45 | "\n", 46 | "1) The effect of failure on depressed feelings is mediated by internalization of failure. \n", 47 | "\n", 48 | "2) Effect of CBT treatment is mediated by changes in cognition. 
\n", 49 | "\n", 50 | "3) Effect of food intake on weight gain is mediated by metabolism.\n", 51 | "\n", 52 | "#### For more information there is a nice [tutorial](http://davidakenny.net/cm/mediate.htm) on mediation by David Kenny, one of the author's of the original mediation paper." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "deletable": true, 59 | "editable": true 60 | }, 61 | "source": [ 62 | "## Simulate a mediation\n", 63 | "\n", 64 | "In this section we will simulate a mediation.\n", 65 | "\n", 66 | "This is a case in which the true effect of X on Y is positive, but appears negative without testing for mediation. " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "ExecuteTime": { 74 | "end_time": "2017-02-08T11:04:14.080567", 75 | "start_time": "2017-02-08T11:04:14.071762" 76 | }, 77 | "collapsed": false, 78 | "deletable": true, 79 | "editable": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "%matplotlib inline\n", 84 | "\n", 85 | "import numpy as np\n", 86 | "import pandas as pd\n", 87 | "import matplotlib.pyplot as plt\n", 88 | "import statsmodels.formula.api as smf\n", 89 | "import statsmodels.api as sm\n", 90 | "from scipy import stats\n", 91 | "\n", 92 | "def sobel_test(a, b, se_a, se_b):\n", 93 | " '''\n", 94 | " Sobel test for significance of mediation\n", 95 | " \n", 96 | " Args: \n", 97 | " a: coefficient from X to mediator variable, M\n", 98 | " b: coefficient from M to Y \n", 99 | " se_a: Standard error of A\n", 100 | " se_b: Standard error fo B\n", 101 | "\n", 102 | " Returns: \n", 103 | " t: Sobel's test statistic\n", 104 | " pval : Two-tailed probability assuming normal distribution\n", 105 | " '''\n", 106 | " \n", 107 | " SE = np.sqrt( (a**2)*(se_a**2) + (b**2)*(se_b**2))\n", 108 | " t = (a*b) / SE\n", 109 | " n = 100000000\n", 110 | " pval = stats.t.sf(np.abs(t), n-1)*2\n", 111 | " return t, pval" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 
116 | "execution_count": null, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2017-02-08T11:04:14.211563", 120 | "start_time": "2017-02-08T11:04:14.206080" 121 | }, 122 | "collapsed": false, 123 | "deletable": true, 124 | "editable": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# set random seed so everyone gets same results\n", 129 | "np.random.seed(1)\n", 130 | "\n", 131 | "# Determine effects\n", 132 | "a = -3 # effect of x to M\n", 133 | "b = 3 # effect of M to y\n", 134 | "cq = .1 # effect of x on y controlling for M\n", 135 | "\n", 136 | "# Create a random data x\n", 137 | "x = np.random.rand(100) \n", 138 | "m = x * a + np.random.rand(100)\n", 139 | "\n", 140 | "# Create Y\n", 141 | "y = np.dot(np.array([x,m]).T,[cq,b]) + np.random.rand(100)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "ExecuteTime": { 149 | "end_time": "2017-02-08T11:04:14.550299", 150 | "start_time": "2017-02-08T11:04:14.364989" 151 | }, 152 | "collapsed": false, 153 | "deletable": true, 154 | "editable": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "plt.scatter(x,y)\n", 159 | "plt.xlabel('X')\n", 160 | "plt.ylabel('Y')\n", 161 | "plt.title('X -> Y')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "ExecuteTime": { 169 | "end_time": "2017-02-08T11:04:14.728465", 170 | "start_time": "2017-02-08T11:04:14.551956" 171 | }, 172 | "collapsed": false, 173 | "deletable": true, 174 | "editable": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "plt.scatter(x,m)\n", 179 | "plt.xlabel('X')\n", 180 | "plt.ylabel('M')\n", 181 | "plt.title('X -> M')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "ExecuteTime": { 189 | "end_time": "2017-02-08T11:04:15.133448", 190 | "start_time": "2017-02-08T11:04:14.954491" 191 | }, 192 | "collapsed": false, 193 | "deletable": 
true, 194 | "editable": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "plt.scatter(m,y)\n", 199 | "plt.xlabel('M')\n", 200 | "plt.ylabel('Y')\n", 201 | "plt.title('M -> Y')" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "deletable": true, 208 | "editable": true 209 | }, 210 | "source": [ 211 | "### 1) Test effect of X on Y" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "ExecuteTime": { 219 | "end_time": "2017-02-08T11:04:15.148139", 220 | "start_time": "2017-02-08T11:04:15.135054" 221 | }, 222 | "collapsed": false, 223 | "deletable": true, 224 | "editable": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "X = pd.DataFrame({'Intercept':np.ones(len(x)),'X':x})\n", 229 | "lm1 = smf.OLS(y,X).fit()\n", 230 | "print lm1.summary()\n", 231 | "ec = lm1.params[1] # save total effect c to ec" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "deletable": true, 238 | "editable": true 239 | }, 240 | "source": [ 241 | "### 2) Test effect of X on M" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "ExecuteTime": { 249 | "end_time": "2017-02-08T11:04:15.410553", 250 | "start_time": "2017-02-08T11:04:15.397131" 251 | }, 252 | "collapsed": false, 253 | "deletable": true, 254 | "editable": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "lm2 = smf.OLS(m,X).fit()\n", 259 | "print lm2.summary()\n", 260 | "ea = lm2.params[1] # Save the effect of X on M, a, to ea\n", 261 | "sea = lm2.bse[1]" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "deletable": true, 268 | "editable": true 269 | }, 270 | "source": [ 271 | "### 3) Test effect of X and M on Y " 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "ExecuteTime": { 279 | "end_time": "2017-02-08T11:04:15.737957", 280 | 
"start_time": "2017-02-08T11:04:15.724569" 281 | }, 282 | "collapsed": false, 283 | "deletable": true, 284 | "editable": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "X['M'] = m\n", 289 | "lm3 = smf.OLS(y,X).fit()\n", 290 | "print lm3.summary()\n", 291 | "ecq,eb = lm3.params[1:3]\n", 292 | "seb = lm3.bse[2]" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "deletable": true, 299 | "editable": true 300 | }, 301 | "source": [ 302 | "### Show how the effect is broken down to direct and indirect effects\n", 303 | "Recall how the overall effect C was decomposed to indirect effect (a*b) and direct effect (c')\n", 304 | "### $c = a \\cdot b + c' $" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "ExecuteTime": { 312 | "end_time": "2017-02-08T11:04:16.057799", 313 | "start_time": "2017-02-08T11:04:16.053220" 314 | }, 315 | "collapsed": false, 316 | "deletable": true, 317 | "editable": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "print('c : %.2f') % ec\n", 322 | "print('a : %.2f') % ea\n", 323 | "print('b : %.2f') % eb\n", 324 | "print('c\\' : %.2f') % ecq" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "ExecuteTime": { 332 | "end_time": "2017-02-08T11:04:16.227940", 333 | "start_time": "2017-02-08T11:04:16.222887" 334 | }, 335 | "collapsed": false, 336 | "deletable": true, 337 | "editable": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "print('Total effect C: %.2f') % ec\n", 342 | "print('is decomposed into the indirect(mediated) effect a*b: %.2f') % (ea*eb)\n", 343 | "print('plus the direct effect c\\': %.2f') % ecq\n", 344 | "print('which adds up to %.2f') % (ea*eb+ecq)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "deletable": true, 351 | "editable": true 352 | }, 353 | "source": [ 354 | "## Run a Sobel Test for Significance of 
Mediation\n", 355 | "\n", 356 | "One way to test the significance of a mediation is to perform a Sobel test, where the indirect effect(a*b) is divided by an estimated standard error of the two. This assumes that the product would be normally distributed which may not always be the case. \n", 357 | "\n", 358 | "An alternative method is to bootstrap with replacement on the observed data to generate a 95% confidence interval. You can try this by writing a for-loop that resamples from the data and generate a distribution of the indirect effects(a*b). If the confidence interval does not include 0, it can be considered as significant. " 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "ExecuteTime": { 366 | "end_time": "2017-02-08T11:04:16.587230", 367 | "start_time": "2017-02-08T11:04:16.582745" 368 | }, 369 | "collapsed": false, 370 | "deletable": true, 371 | "editable": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "t,p = sobel_test(ea,eb,sea,seb)\n", 376 | "print('Sobel\\'s test of significance t = %2.2f') % t\n", 377 | "print('Two-tailed p-value p = %2.5f ') % p" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "deletable": true, 384 | "editable": true 385 | }, 386 | "source": [ 387 | "# Moderation Analysis\n", 388 | "\n", 389 | "In a moderation analysis, the moderator modifies or changes the relationship between two variables, akin to an interaction term. Moderation is slightly different from an interaction due to the additional constraint that there is a causal relationship from X to Y, BUT not from Z to Y. Therefore, a moderation implies an interaction exists but an interaction does not imply a moderation. \n", 390 | "\n", 391 | "Here is a schematic representation of a moderation relationship. 
\n", 392 | "This diagram hypothesize that Stress has a causal relationship to Depression \n", 393 | "but the effect of Stress is different for people with high or low Social Support \n", 394 | "\n", 395 | "\n", 396 | "\n", 397 | "This can be reprsented by an interaction, \n", 398 | "\n", 399 | "\n", 400 | "\n", 401 | "\n", 402 | "\n", 403 | "\n", 404 | "The pictures have been retrieved from [here](http://www.victoria.ac.nz/psyc/paul-jose-files/helpcentre/help5_moderation_example.php)\n", 405 | "\n", 406 | "### Here are a few examples of moderations. Can you think of more?\n", 407 | "\n", 408 | "1) The effect of compliments on future grades is moderated by growth mindset [(Carol Dweck)](http://mindsetonline.com/)\n", 409 | "\n", 410 | "2) Effect of favorability on government behavior is moderated by political affiliation. \n", 411 | "\n", 412 | "3) Effect of pressure on performance is moderated by confidence (choking vs boosting). \n", 413 | "\n", 414 | "### For more information look at [homepage of Kenny](http://davidakenny.net/cm/moderation.htm) who started all this. " 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "deletable": true, 421 | "editable": true 422 | }, 423 | "source": [ 424 | "## Example \n", 425 | "Here we examine whether the effect of buying books (**Buy**) on enjoyment of reading (**Enjoy**) is moderated by frequency of reading (**Read**).\n", 426 | "\n", 427 | "The moderation effect exists if there is an interaction of buying and reading on enjoyment." 
428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "ExecuteTime": { 435 | "end_time": "2017-02-08T11:08:49.941382", 436 | "start_time": "2017-02-08T11:08:49.931106" 437 | }, 438 | "collapsed": true 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "df = pd.DataFrame({\n", 443 | " 'Enjoy':[4, 16, 4, 12, 9, 5, 15, 21, 3, 4, 8, 11, 7, 5, 8, 19, 11, 9, 9, 13, 11, 21, 18, 12, 15, 3, 2, 10, 7, 9, 5, 6, 9, 12, 9, 5, 17, 15, 9, 7, 5, 10, 10, 6, 7, 9, 12, 2, 1, 5, 7, 5, 8, 5, 5, 11, 8, 9, 13, 9, 19, 8, 21, 1, 11, 8, 6, 23, 2, 9, 13, 4, 10, 12, 5, 7, 10, 11, 12, 13],\n", 444 | " 'Buy':[5, 8, 0, 8, 3, 0, 8, 9, 8, 1, 7, 2, 2, 9, 3, 8, 8, 9, 5, 7, 7, 9, 6, 7, 8, 4, 4, 3, 1, 4, 1, 5, 5, 2, 9, 5, 7, 8, 2, 4, 1, 4, 0, 1, 0, 8, 2, 4, 0, 0, 0, 1, 2, 2, 2, 7, 5, 1, 9, 9, 8, 1, 7, 2, 5, 2, 4, 9, 1, 6, 3, 0, 7, 5, 2, 3, 1, 8, 6, 4],\n", 445 | " 'Read':[0, 8, 0, 5, 4, 9, 6, 9, 1, 3, 3, 3, 8, 0, 9, 8, 6, 0, 5, 6, 1, 7, 7, 5, 7, 2, 0, 5, 1, 7, 7, 4, 6, 5, 3, 1, 7, 6, 0, 4, 0, 9, 5, 9, 2, 3, 5, 2, 5, 2, 9, 1, 1, 7, 9, 3, 0, 4, 4, 3, 8, 8, 8, 2, 3, 7, 1, 8, 6, 1, 7, 0, 3, 2, 5, 3, 8, 6, 9, 7] \n", 446 | "})" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "deletable": true, 453 | "editable": true 454 | }, 455 | "source": [ 456 | "# Importance of centering variables for interaction\n", 457 | "The interaction effect can be **VERY** different if you don't center your variables" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": { 464 | "ExecuteTime": { 465 | "end_time": "2017-02-08T11:08:51.761113", 466 | "start_time": "2017-02-08T11:08:51.752846" 467 | }, 468 | "collapsed": false, 469 | "deletable": true, 470 | "editable": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "df['Interaction'] = df.Read*df.Buy\n", 475 | "np.corrcoef(df.Buy,df.Interaction)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | 
"metadata": { 482 | "ExecuteTime": { 483 | "end_time": "2017-02-08T11:09:41.621877", 484 | "start_time": "2017-02-08T11:09:41.462656" 485 | }, 486 | "collapsed": false, 487 | "deletable": true, 488 | "editable": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "plt.scatter(df.Buy,df.Interaction)\n", 493 | "plt.xlabel('Read * Buy')\n", 494 | "plt.ylabel('Buy')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "ExecuteTime": { 502 | "end_time": "2017-02-08T11:08:59.826515", 503 | "start_time": "2017-02-08T11:08:59.816012" 504 | }, 505 | "collapsed": false, 506 | "deletable": true, 507 | "editable": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "center = lambda x: (x - x.mean())\n", 512 | "df[['Read_Centered','Buy_Centered']] = df[['Read','Buy']].apply(center)\n", 513 | "df['Interaction_Centered'] = df['Read_Centered'] * df['Buy_Centered']\n", 514 | "np.corrcoef(df.Buy,df.Interaction_Centered)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "ExecuteTime": { 522 | "end_time": "2017-02-08T11:09:01.142976", 523 | "start_time": "2017-02-08T11:09:00.976403" 524 | }, 525 | "collapsed": false, 526 | "deletable": true, 527 | "editable": true 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "plt.scatter(df.Buy,df.Interaction_Centered)\n", 532 | "plt.xlabel('Read * Buy')\n", 533 | "plt.ylabel('Buy')" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": { 540 | "ExecuteTime": { 541 | "end_time": "2017-02-08T11:09:05.225235", 542 | "start_time": "2017-02-08T11:09:05.208483" 543 | }, 544 | "collapsed": false 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction\", data = df).fit()\n", 549 | "print mod.summary()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | 
"ExecuteTime": { 557 | "end_time": "2017-02-08T11:09:05.834256", 558 | "start_time": "2017-02-08T11:09:05.816079" 559 | }, 560 | "collapsed": false, 561 | "deletable": true, 562 | "editable": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "mod = smf.ols(formula = \"Enjoy ~ Buy + Read + Interaction_Centered\", data = df).fit()\n", 567 | "print mod.summary()" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": { 574 | "collapsed": true 575 | }, 576 | "outputs": [], 577 | "source": [] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": { 582 | "deletable": true, 583 | "editable": true 584 | }, 585 | "source": [ 586 | "# Exercises\n", 587 | "\n", 588 | "## Exercise 1 - Mediation\n", 589 | "\n", 590 | "For exercise 1, let's revisit the salary dataset. ('../Data/salary_exercise.csv') \n", 591 | "\n", 592 | "These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:\n", 593 | "\n", 594 | "- sx = Sex, coded 1 for female and 0 for male\n", 595 | "- rk = Rank, coded\n", 596 | " - 1 for assistant professor,\n", 597 | " - 2 for associate professor, and\n", 598 | " - 3 for full professor\n", 599 | "- yr = Number of years in current rank\n", 600 | "- dg = Highest degree, coded 1 if doctorate, 0 if masters\n", 601 | "- yd = Number of years since highest degree was earned\n", 602 | "- sl = Academic year salary, in dollars.\n", 603 | "\n", 604 | "Reference: S. Weisberg (1985). Applied Linear Regression, Second Edition. New York: John Wiley and Sons. 
Page 194.\n", 605 | "\n", 606 | "**Make sure you check the data for missing values or other errors before beginning your analysis.**\n", 607 | "\n", 608 | "### Question:\n", 609 | "Evaluate whether academic rank mediates the relationship between salary and gender" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "## Exercise 2 - Moderation\n", 626 | "\n", 627 | "For Exercise 2, we will use the alcohol dataset again ('../Data/student-mat.csv').\n", 628 | "\n", 629 | "Below is the information about the dataset. \n", 630 | "\n", 631 | "Attribute Information: \n", 632 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n", 633 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n", 634 | "3 age - student's age (numeric: from 15 to 22) \n", 635 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n", 636 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n", 637 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n", 638 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n", 639 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n", 640 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n", 641 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. 
administrative or police), 'at_home' or 'other') \n", 642 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n", 643 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n", 644 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) \n", 645 | "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n", 646 | "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n", 647 | "16 schoolsup - extra educational support (binary: yes or no) \n", 648 | "17 famsup - family educational support (binary: yes or no) \n", 649 | "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n", 650 | "19 activities - extra-curricular activities (binary: yes or no) \n", 651 | "20 nursery - attended nursery school (binary: yes or no) \n", 652 | "21 higher - wants to take higher education (binary: yes or no) \n", 653 | "22 internet - Internet access at home (binary: yes or no) \n", 654 | "23 romantic - with a romantic relationship (binary: yes or no) \n", 655 | "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n", 656 | "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n", 657 | "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n", 658 | "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n", 659 | "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n", 660 | "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n", 661 | "30 absences - number of school absences (numeric: from 0 to 93) \n", 662 | "31 G1 - first period grade (numeric: from 0 to 20) \n", 663 | "31 G2 - second period grade (numeric: from 
0 to 20) \n", 664 | "32 G3 - final grade (numeric: from 0 to 20, output target) \n", 665 | "\n", 666 | "### Question:\n", 667 | "\n", 668 | "The amount you go out seems to be associated with increased daily alcohol consumption. Does being in a relationship dampen this association? Are there any other variables that appear to moderate this relationship?\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": { 675 | "collapsed": true 676 | }, 677 | "outputs": [], 678 | "source": [] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": { 684 | "collapsed": true 685 | }, 686 | "outputs": [], 687 | "source": [] 688 | } 689 | ], 690 | "metadata": { 691 | "anaconda-cloud": {}, 692 | "kernelspec": { 693 | "display_name": "Python [default]", 694 | "language": "python", 695 | "name": "python2" 696 | }, 697 | "language_info": { 698 | "codemirror_mode": { 699 | "name": "ipython", 700 | "version": 2 701 | }, 702 | "file_extension": ".py", 703 | "mimetype": "text/x-python", 704 | "name": "python", 705 | "nbconvert_exporter": "python", 706 | "pygments_lexer": "ipython2", 707 | "version": "2.7.13" 708 | }, 709 | "toc": { 710 | "nav_menu": { 711 | "height": "468px", 712 | "width": "252px" 713 | }, 714 | "navigate_menu": true, 715 | "number_sections": true, 716 | "sideBar": true, 717 | "threshold": 4, 718 | "toc_cell": false, 719 | "toc_section_display": "block", 720 | "toc_window_display": false 721 | } 722 | }, 723 | "nbformat": 4, 724 | "nbformat_minor": 0 725 | } 726 | -------------------------------------------------------------------------------- /Notebooks/5_Prediction_&_Regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic Regression Analysis\n", 8 | "\n", 9 | "*Written by Jin Cheong & Luke Chang*\n", 10 | "\n", 11 | "In this lab we 
are going to learn about more advanced topics in regression analysis.\n", 12 | "- How to use link functions to perform regressions on other types of data such as categorical variables\n", 13 | "- How to perform cross-validation to assess degree of overfitting\n", 14 | "- How to deal with multicollinearity with regularization. \n", 15 | "\n", 16 | "After the tutorial you will have the chance to apply the methods on a separate dataset. \n", 17 | "\n", 18 | "\n", 19 | "## Introduction to logistic regression \n", 20 | "\n", 21 | "A logistic regression, also called a logit model, is used to model outcome variables that are binary. Examples of such variables are \"hit versus miss,\" \"success versus fail,\" or \"male versus female.\" We can use a logistic regression to calculate the probability of an item being one or the other.\n", 22 | "\n", 23 | "In the logit model, the log odds of the outcome is modeled as a linear combination of the predictor variables. \n", 24 | "Lets recall that the simple linear regression had the following form :\n", 25 | "### $y = \\beta_0 + \\beta_1\\cdot x $\n", 26 | "\n", 27 | "We can extend this to a multiple linear regression with multiple features into the following form : \n", 28 | "### $y = \\beta_0 + \\beta_1x_1 + ... + \\beta_nx_n$ \n", 29 | "\n", 30 | " \n", 31 | "In logistic regression, instead of estimating the __value__ of Y from Xs, we estimate the __probability__ of Y. For instance if our model tries to predict whether a person is male(Y=0) or female(Y=1), then a Y value of .8 would mean it has 80% chance of being a female.\n", 32 | "Formally, the linear regression model is passed into a sigmoid function. Here is the simple form with one predictor variable.\n", 33 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n", 34 | "\n", 35 | "We can easily extend this to multiple regression with n-number of predictors.\n", 36 | "\n", 37 | "\n", 38 | "## $P(Y) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1 + ... 
+ \\beta_nx_n)}}$\n", 39 | "\n", 40 | "Now let's see how we can achieve this in code. \n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "ExecuteTime": { 48 | "end_time": "2017-01-27T10:48:11.684432", 49 | "start_time": "2017-01-27T10:48:11.679536" 50 | }, 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "%matplotlib inline\n", 56 | "\n", 57 | "import numpy as np\n", 58 | "import pandas as pd\n", 59 | "import matplotlib.pyplot as plt\n", 60 | "import statsmodels.formula.api as smf\n", 61 | "import seaborn as sns\n", 62 | "from IPython.display import display, HTML" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Logistic regression analysis\n", 70 | "\n", 71 | "Let's start off by plotting a sigmoid function and add necessary modules." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "ExecuteTime": { 79 | "end_time": "2017-01-27T10:34:03.578429", 80 | "start_time": "2017-01-27T10:34:03.408176" 81 | }, 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# Plot a sigmoid function\n", 87 | "plt.figure(figsize=(5,5)) # open a figure base and determine the size (width, height)\n", 88 | "\n", 89 | "def sigmoid(t): # Define the sigmoid function\n", 90 | " return (1/(1 + np.e**(-t))) \n", 91 | "\n", 92 | "plot_range = np.arange(-6, 6, 0.1) # range of x values to plot\n", 93 | "y_values = sigmoid(plot_range)\n", 94 | "\n", 95 | "# Plot curve\n", 96 | "plt.plot(plot_range, # X-axis range\n", 97 | " y_values, # Predicted values\n", 98 | " color=\"red\",linewidth=3)\n", 99 | "plt.title(\"Example of a sigmoid function\",fontsize=18)\n", 100 | "plt.ylabel(\"Probability\",fontsize=16)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Now we will load our sample data and create a 'Alcabuse' variable from 'Dalc' and 'Walc' 
(weekday and weekend alcohol consumption level, respectively) and make it a binary variable by calling it alcohol abuse if a student scores more than 8 on drinking alcohol in a week. The dataset was retrieved from [here](http://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION#) but slightly modified to better illustrate our purpose." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "ExecuteTime": { 115 | "end_time": "2017-01-27T10:34:28.969833", 116 | "start_time": "2017-01-27T10:34:28.958171" 117 | }, 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "# Load the data \n", 123 | "df = pd.read_csv('../Data/student-mat.csv')\n", 124 | "df['Alcabuse']=0\n", 125 | "df.loc[df['Dalc']+df['Walc'] >=8,'Alcabuse']=1 # Let's call it abuse if more than 8 alcohol consumption per week" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "The hypothesis we want to test is : __\"A student is more likely to abuse alcohol if he has more free time.\"__ \n", 133 | "'Alcabuse' will be our response variable Y, measuring the number of drinks per week. \n", 134 | "'freetime' will be our predictor variable X, measuring the degree of free time.\n", 135 | "\n", 136 | "We will use the logit function in the statsmodels package. 
" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "ExecuteTime": { 144 | "end_time": "2017-01-27T10:34:38.993233", 145 | "start_time": "2017-01-27T10:34:38.969608" 146 | }, 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "logm = smf.logit('Alcabuse ~ freetime',data=df).fit()\n", 152 | "print(logm.summary())" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## Interpreting logistic model coefficients for prediction\n", 160 | "\n", 161 | "We ran our first logit model but we cannot interpret the coefficients like we did in the linear regression model. Because we applied a logit transform, the coefficients are in log-odds. \n", 162 | "To make them more interpretable we can exponentiate the coefficients and interpret them as either probability or odds-ratios. \n", 163 | "\n", 164 | "### Interpreting coefficients as probabilities\n", 165 | "To change our log-odds coefficients into probabilities we pass the log-odds value of Y into a sigmoid function.\n", 166 | "For instance, let's see what the probability is for a student to be an alcohol abuser if he had a level 5 of free time (x=5). \n", 167 | "\n", 168 | "$x = 5$ \n", 169 | "$\\beta_0 = -7.6262$ \n", 170 | "$\\beta_1 = 1.6416$ \n", 171 | "## $ Y = \\beta_0 + \\beta_1\\cdot x = -7.6262 + 1.6416 \\cdot 5$ \n", 172 | "## $P(Y) = \\frac{1}{1 + e^{-(Y)}} = \\frac{1}{1 + e^{-(-7.6262 + 1.6416 \\cdot 5)}}$\n", 173 | "\n", 174 | "We implement this with the following code by using the 'sigmoid' function as defined earlier in this module.\n", 175 | "We also plot the fit of the model." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "ExecuteTime": { 183 | "end_time": "2017-01-27T10:35:53.052903", 184 | "start_time": "2017-01-27T10:35:53.043411" 185 | }, 186 | "collapsed": false, 187 | "scrolled": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "x = 5\n", 192 | "print('x = %i' %x) # x\n", 193 | "print('b0 = %f' %logm.params[0]) # b0\n", 194 | "print('b1 = %f' %logm.params[1]) # b1\n", 195 | "palcabuse = sigmoid(logm.params[0] + logm.params[1] * x)\n", 196 | "print('palcabuse = %f' %palcabuse)\n", 197 | "print('If a student has %i hours of free time per week, then he has %.2f %% probability of alcohol abuse' % (x,palcabuse*100))\n", 198 | "# Note that we had to use double percent sign %% to print % \n", 199 | "# because a single % is reserved for printing inline numbers as in %i or %.2f" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Plotting the predictions of the model" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "ExecuteTime": { 214 | "end_time": "2017-01-27T10:37:07.982811", 215 | "start_time": "2017-01-27T10:37:07.767818" 216 | }, 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# Plot model fit\n", 222 | "xs = np.arange(min(df.freetime)-1,max(df.freetime)+1,0.1)\n", 223 | "ys = np.array([sigmoid(logm.params[0] + logm.params[1] * x) for x in xs])\n", 224 | "plt.figure(figsize=(8,4))\n", 225 | "plt.plot(xs,ys,linewidth=3)\n", 226 | "jitter = np.random.random(len(df.Alcabuse))/5\n", 227 | "ys_outcomes = np.abs((df.Alcabuse==1)-0.01-jitter)\n", 228 | "alpha = 0.7\n", 229 | "plt.plot(df.freetime[df.Alcabuse == 1], ys_outcomes[df.Alcabuse ==1], 'b.', alpha=alpha)\n", 230 | "plt.plot(df.freetime[df.Alcabuse == 0], ys_outcomes[df.Alcabuse ==0], 'r.', alpha=alpha)\n", 231 | "plt.xlabel(\"Freetime\",fontsize=16)\n", 232 | 
"plt.ylabel(\"Probability of being an alcohol abuser\",fontsize=16)\n", 233 | "plt.title(\"Predicting alcohol abuse by free time\",fontsize=18)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Interpreting coefficients as odds\n", 241 | "In some cases it could be more useful to consider the odds of an event happening. \n", 242 | "For example, we might want to be able to make statementes such as \"If a person smokes, he is 10 times more likely to get lung cancer compared to a person who doesn't smoke.\" The odds of an event happening is defined by\n", 243 | "\n", 244 | "## $odds = \\frac{P(\\text{event})}{P(\\text{no event})} = \\frac{P(\\text{Y})}{P(\\text{not Y})}$\n", 245 | "while $P(\\text{Y}) = \\frac{1}{1 + e^{-(\\beta_0 + \\beta_1x_1)}}$\n", 246 | "\n", 247 | "and $P(\\text{not Y}) = 1 - P(\\text{Y})$\n", 248 | "\n", 249 | "If we want to look at how much the odds change for a unit increase in free time, we can use the following \n", 250 | "\n", 251 | "## $\\triangle odds = \\frac{\\text{odds after a unit change in predictor}}{\\text{original odds}} $\n", 252 | "\n", 253 | "So, if we want to calculate the change in odds ratios of being an alcohol abuser when free time increases from 4 to 5, we can calculate the following: \n", 254 | "## $\\triangle odds = \\frac{\\text{odds(Y) when x = 5}}{\\text{odds(Y) when x = 4}} = \\frac{\\frac{\\text{P(Y) when x = 5}}{\\text{1 - (P(Y) when x = 5)}}}{\\frac{\\text{P(Y) when x = 4}}{\\text{1 - (P(Y) when x = 4)}}}$\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2017-01-27T10:39:10.389767", 263 | "start_time": "2017-01-27T10:39:10.383370" 264 | }, 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "odd1 = sigmoid(logm.params[0] + logm.params[1] * 5) / (1-sigmoid(logm.params[0] + logm.params[1] * 5))\n", 270 | "odd2 = sigmoid(logm.params[0] 
+ logm.params[1] * 4) / (1-sigmoid(logm.params[0] + logm.params[1] * 4))\n", 271 | "dodds = odd1/odd2\n", 272 | "print('A student with 5hrs of free time compared to a student with 4 is %.2f times more likely to be an alcohol abuser' %dodds)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## Cross-validation \n", 280 | "\n", 281 | "The best way to compare accuracy of models and see how well your model generalize is to train (or fit) your model on a training dataset then test it on a separate test dataset. If you train and test your model on the same dataset (as we did above) you are likely to get a positvely biased result. The true generalizeability can only be observed when you test your model on a separate hold-out test dataset on which the model was not trained. A simple way to do cross-validation is to split the dataset before you begin your analysis. \n", 282 | "\n", 283 | "There are many ways to split the dataset but for simplicity we will use the train_test_split function in the module sci-kit learn. \n", 284 | "We will split the data so that the training dataset consists 60% of the data and the test dataset consists 40% of the data.\n", 285 | "\n", 286 | "### 1. 
Split the data into training and testing datasets" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "ExecuteTime": { 294 | "end_time": "2017-01-27T10:39:55.750103", 295 | "start_time": "2017-01-27T10:39:55.687554" 296 | }, 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "from sklearn.cross_validation import train_test_split\n", 302 | "np.random.seed(12345678) # set the random number generator seed to get consistent results\n", 303 | "trainratio = .6\n", 304 | "train, test = train_test_split(df, train_size=trainratio, random_state=1)\n", 305 | "print('train-test ratio %.2f' %(float(len(train))/(len(train)+len(test))))" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### 2. Fit the model on the training dataset\n", 313 | "Now we fit a new model and save it to logmt. \n", 314 | "We will try to predict 'Alcabuse' with 'freetime' and 'goout' \n", 315 | "Note that we specify what dataset we are using by setting data = train. " 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "ExecuteTime": { 323 | "end_time": "2017-01-27T10:42:00.202890", 324 | "start_time": "2017-01-27T10:42:00.184033" 325 | }, 326 | "collapsed": false 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "# We run the model on the train dataset\n", 331 | "logmt = smf.logit('Alcabuse ~ freetime + goout',data=train).fit()\n", 332 | "print(logmt.summary())" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### 3. Calculate predictions on the test dataset\n", 340 | "We now use the model with coefficients estimated from the training dataset (logmt) and plug in the predictor variable values from the test dataset by using the .predict method. This method outputs the probabilities for us so we don't have to apply separate sigmoid transform. 
Therefore the output are the probabilities predicted with our model on the test dataset." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "ExecuteTime": { 348 | "end_time": "2017-01-27T10:42:10.419775", 349 | "start_time": "2017-01-27T10:42:10.410035" 350 | }, 351 | "collapsed": false 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "ytestpred = logmt.predict(test)\n", 356 | "np.round(ytestpred,2)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### 4. Calculate accuracy of the predictions\n", 364 | "To calculate the accuracy of the model we need to specify the cutoff for the probabilities to say whether one should be classified as an alcohol abuser or not. For simplicity, let's say that we'll classify the person as an alcohol abuser if the estimated probability is greater than 50%. \n", 365 | "\n", 366 | "We will then calculate the accuracy of the model in classifying alcohol abuse for the training data and the test data separately" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "ExecuteTime": { 374 | "end_time": "2017-01-27T10:42:29.512250", 375 | "start_time": "2017-01-27T10:42:29.479882" 376 | }, 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "cutoff = .5\n", 382 | "\n", 383 | "# Calculate accuracy for train\n", 384 | "ytrainpred = logmt.predict(train)\n", 385 | "ytrainpred[ytrainpred>=cutoff] =1 \n", 386 | "ytrainpred[ytrainpred=cutoff] =1 \n", 392 | "ytestpred[ytestpred=cutoff] =1 \n", 475 | "ytestpred[ytestpred\n", 501 | "\n", 502 | "Regularization attempts to solve this problem by introducting a loss function that penalizes the model for each additional features added to the model. The AIC / BIC values we mentioned in the first session is a form of regularization in comparing models. 
\n", 503 | "\n", 504 | "In this section we will focus on feature selection regularization methods Lasso and Ridge regressions.\n", 505 | "\n", 506 | "## Lasso regression\n", 507 | "In short, [Lasso](http://stats.stackexchange.com/questions/17251/what-is-the-lasso-in-regression-analysis) is a feature selection method that reduces the number of features to use in a regression. \n", 508 | "This is useful if you have a lot of variables that are correlated or you have more variables than observations. \n", 509 | "[Here](http://studentlife.cs.dartmouth.edu/smartgpa.pdf) is a study from Dartmouth that used Lasso on the Student Life Dataset.\n", 510 | "\n", 511 | "In our example, we will try to find which variables can predict the level of alcohol consumption 'AlcSum'. " 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "ExecuteTime": { 519 | "end_time": "2017-01-27T11:05:03.397385", 520 | "start_time": "2017-01-27T11:05:03.393882" 521 | }, 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "df['AlcSum']=df['Dalc']+df['Walc']" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "ExecuteTime": { 534 | "end_time": "2017-01-27T11:05:06.581723", 535 | "start_time": "2017-01-27T11:05:06.577263" 536 | }, 537 | "collapsed": false 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "# We have a lot of features to choose from\n", 542 | "df.keys()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": { 549 | "ExecuteTime": { 550 | "end_time": "2017-01-27T11:05:19.241902", 551 | "start_time": "2017-01-27T11:05:18.929428" 552 | }, 553 | "collapsed": false 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "# Make a model with all the feature variables\n", 558 | "formula = 'AlcSum ~ school+sex+age+address+famsize+Pstatus+Medu+\\\n", 559 | " Fedu+Mjob+Fjob+reason+guardian+traveltime+\\\n", 560 
| " studytime+failures+schoolsup+famsup+paid+activities+nursery+higher+\\\n", 561 | " internet+romantic+famrel+freetime+goout+health+absences+G1+G2+G3'\n", 562 | "\n", 563 | "# Generate a design matrix with the formula\n", 564 | "# This automatically generates separate regressors for categorical variables.\n", 565 | "from patsy import dmatrices\n", 566 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n", 567 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n", 568 | "penalty = .036\n", 569 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=1.0)\n", 570 | "print regm.summary()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "### How do we determine the penalty value? \n", 578 | "Essentially, we want to select the regularization parameter by identifying the one from a set of possible values (e.g. grid search) that results in the best fit of the model to the data. However, it is important to note that it is easy to introduce bias into this process by trying a bunch of alphas and selecting the one that works best. This can lead to optimistic evaluations of how well your model works.\n", 579 | "\n", 580 | "Cross-validation is an ideal method to deal with this. We can use cross-validation to select the alpha while adding minimal bias to the overall model prediction.\n", 581 | "\n", 582 | "There are several different metrics of model fit to use as a criterion variable. Log-likelihood or mean squared error are the two most common depending on how you are estimating the parameters (i.e., maximizing log-likelihood or minimizizing squared error). 
The Akaike information criterion (AIC) and the Bayes Information criterion (BIC) are two common metrics that apply different penalties to these metrics for the number of free parameters in your model.\n", 583 |     "\n", 584 |     "Here we will demonstrate using both to select an optimal value of the regularization parameter alpha of the Lasso estimator from an example provided by [scikit-learn](http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html). For cross-validation, we use 20-fold with 2 algorithms to compute the Lasso path: coordinate descent, as implemented by the LassoCV class.\n" 585 |    ] 586 |   }, 587 |   { 588 |    "cell_type": "code", 589 |    "execution_count": null, 590 |    "metadata": { 591 |     "ExecuteTime": { 592 |      "end_time": "2017-01-27T11:11:14.478267", 593 |      "start_time": "2017-01-27T11:11:14.157680" 594 |     }, 595 |     "collapsed": false 596 |    }, 597 |    "outputs": [], 598 |    "source": [ 599 |     "from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC\n", 600 |     "import time\n", 601 |     "\n", 602 |     "# LassoLarsIC: least angle regression with BIC/AIC criterion\n", 603 |     "model_bic = LassoLarsIC(criterion='bic')\n", 604 |     "model_bic.fit(X, y)\n", 605 |     "alpha_bic_ = model_bic.alpha_\n", 606 |     "\n", 607 |     "model_aic = LassoLarsIC(criterion='aic')\n", 608 |     "model_aic.fit(X, y)\n", 609 |     "alpha_aic_ = model_aic.alpha_\n", 610 |     "\n", 611 |     "def plot_ic_criterion(model, name, value, color):\n", 612 |     "    alpha_ = model.alpha_\n", 613 |     "    alphas_ = model.alphas_\n", 614 |     "    criterion_ = model.criterion_\n", 615 |     "    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,\n", 616 |     "             linewidth=3, label='%s criterion' % name)\n", 617 |     "    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,\n", 618 |     "                label='%s estimate alpha: %.3f' %(name,value) )\n", 619 |     "    plt.xlabel('-log(alpha)',fontsize=16)\n", 620 |     "    plt.ylabel('Model Fit Criterion',fontsize=16)\n", 621 |     "\n", 622 |     "plt.figure(figsize=(15,5))\n", 623 | 
"plot_ic_criterion(model_aic, 'AIC', alpha_aic_ , 'b')\n", 624 | "plot_ic_criterion(model_bic, 'BIC', alpha_bic_ , 'r')\n", 625 | "plt.legend()\n", 626 | "plt.title('Information-criterion for model selection',fontsize=18)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "Now let's create a plot showing how well each alpha value performs across cross-validation folds" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "ExecuteTime": { 641 | "end_time": "2017-01-27T11:23:24.140077", 642 | "start_time": "2017-01-27T11:23:23.747748" 643 | }, 644 | "collapsed": false 645 | }, 646 | "outputs": [], 647 | "source": [ 648 | "# LassoCV: coordinate descent\n", 649 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n", 650 | "\n", 651 | "# Compute paths\n", 652 | "print(\"Computing regularization path using the coordinate descent lasso...\")\n", 653 | "t1 = time.time()\n", 654 | "model = LassoCV(cv=5).fit(X, y)\n", 655 | "t_lasso_cv = time.time() - t1\n", 656 | "\n", 657 | "# Display results\n", 658 | "m_log_alphas = -np.log10(model.alphas_)\n", 659 | "\n", 660 | "# Display results\n", 661 | "plt.figure(figsize=(15,5))\n", 662 | "plt.plot(m_log_alphas, model.mse_path_, ':')\n", 663 | "plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',\n", 664 | " label='Average across the folds', linewidth=2)\n", 665 | "plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',\n", 666 | " label='CV estimate alpha: %.3f' %model.alpha_ )\n", 667 | "plt.legend()\n", 668 | "plt.xlabel('-log(alpha)',fontsize=16)\n", 669 | "plt.ylabel('Mean square error',fontsize=16)\n", 670 | "plt.title('Mean square error on each fold: coordinate descent',fontsize=18)\n", 671 | "plt.axis('tight')" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "## Ridge regression\n", 679 | "The goal of the ridge function is to 
choose a penalty λ for which the coefficients are not rapidly changing and have “sensible” signs. It is especially useful when data suffers from multicollinearity, that is some of your predictor variables are highly correlated. Unlike LASSO, ridge does not produce a sparse solution, but rather shrinks variables that are highly collinear towards zero." 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": { 686 | "ExecuteTime": { 687 | "end_time": "2017-01-27T11:26:15.846904", 688 | "start_time": "2017-01-27T11:26:14.695671" 689 | }, 690 | "collapsed": false, 691 | "scrolled": false 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "from sklearn import linear_model\n", 696 | "y, X = dmatrices(formula_like=formula, data=df, return_type='dataframe')\n", 697 | "n_alphas = 200\n", 698 | "alphas = np.logspace(-2, 6, n_alphas)\n", 699 | "\n", 700 | "# Get optimal lambda\n", 701 | "clf = linear_model.RidgeCV(alphas=alphas)\n", 702 | "clf.fit(X,y)\n", 703 | "aestim = clf.alpha_ # Estimated regularization param\n", 704 | "\n", 705 | "# Get paths\n", 706 | "clf = linear_model.Ridge(fit_intercept=False)\n", 707 | "coefs = []\n", 708 | "for a in alphas:\n", 709 | " clf.set_params(alpha=a)\n", 710 | " clf.fit(X, y)\n", 711 | " coefs.append(clf.coef_)\n", 712 | "\n", 713 | "###############################################################################\n", 714 | "# Display results\n", 715 | "plt.figure(figsize=(15,8))\n", 716 | "\n", 717 | "ax = plt.gca()\n", 718 | "ax.plot(alphas, np.squeeze(coefs))\n", 719 | "ax.set_xscale('log')\n", 720 | "plt.axvline(aestim, linestyle='--', color='k', label='alpha: CV estimate')\n", 721 | "plt.legend()\n", 722 | "plt.xlabel('Lambda',fontsize=16)\n", 723 | "plt.ylabel('Weights',fontsize=16)\n", 724 | "plt.title('Ridge coefficients as a function of the regularization',fontsize=18)\n", 725 | "plt.axis('tight')" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | 
"source": [ 732 | "The points on the left vertical axis (the left ends of the lines) are the ordinary least squares regression values.\n", 733 | "These occur for k equal zero. As k is increased, the values of the regression estimates change, often wildly at first.\n", 734 | "At some point, the coefficients seem to settle down and then gradually drift towards zero.\n", 735 | "\n", 736 | "The task of the ridge regression analyst is to determine at what value of k these coefficients are at their stable\n", 737 | "values. A vertical line is drawn at the value selected for reporting purposes. It is anticipated that you would run\n", 738 | "the program several times until an appropriate value of k is determined. In our case, the selected alpha value is indicated by a vertical line at which a = 65.93. The smaller the value of k, the smaller the amount of bias that is included in the estimates." 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": { 745 | "ExecuteTime": { 746 | "end_time": "2017-01-27T11:26:48.340880", 747 | "start_time": "2017-01-27T11:26:48.333706" 748 | }, 749 | "collapsed": false 750 | }, 751 | "outputs": [], 752 | "source": [ 753 | "clf = linear_model.Ridge(fit_intercept=False)\n", 754 | "a = aestim\n", 755 | "clf.set_params(alpha=a)\n", 756 | "clf.fit(X, y)\n", 757 | "np.round(clf.coef_,3)" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": { 764 | "ExecuteTime": { 765 | "end_time": "2017-01-27T11:26:56.928052", 766 | "start_time": "2017-01-27T11:26:56.884641" 767 | }, 768 | "collapsed": false 769 | }, 770 | "outputs": [], 771 | "source": [ 772 | "# Ridge if L1_wt = 0 , Lasso if L1_wt = 1\n", 773 | "penalty = aestim\n", 774 | "regm = smf.OLS(y,X).fit_regularized(method='coord_descent', maxiter=1000, alpha=penalty, L1_wt=0)\n", 775 | "print regm.summary()" 776 | ] 777 | }, 778 | { 779 | "cell_type": "markdown", 780 | "metadata": { 781 | "collapsed": true 
782 | }, 783 | "source": [ 784 | "# Individual exercise\n", 785 | "Below is the information about the dataset. \n", 786 | "\n", 787 | "Attribute Information: \n", 788 | "1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) \n", 789 | "2 sex - student's sex (binary: 'F' - female or 'M' - male) \n", 790 | "3 age - student's age (numeric: from 15 to 22) \n", 791 | "4 address - student's home address type (binary: 'U' - urban or 'R' - rural) \n", 792 | "5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) \n", 793 | "6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) \n", 794 | "7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n", 795 | "8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) \n", 796 | "9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n", 797 | "10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') \n", 798 | "11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') \n", 799 | "12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') \n", 800 | "13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. 
to 1 hour, or 4 - >1 hour) \n", 801 |     "14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) \n", 802 |     "15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) \n", 803 |     "16 schoolsup - extra educational support (binary: yes or no) \n", 804 |     "17 famsup - family educational support (binary: yes or no) \n", 805 |     "18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) \n", 806 |     "19 activities - extra-curricular activities (binary: yes or no) \n", 807 |     "20 nursery - attended nursery school (binary: yes or no) \n", 808 |     "21 higher - wants to take higher education (binary: yes or no) \n", 809 |     "22 internet - Internet access at home (binary: yes or no) \n", 810 |     "23 romantic - with a romantic relationship (binary: yes or no) \n", 811 |     "24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) \n", 812 |     "25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) \n", 813 |     "26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) \n", 814 |     "27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) \n", 815 |     "28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) \n", 816 |     "29 health - current health status (numeric: from 1 - very bad to 5 - very good) \n", 817 |     "30 absences - number of school absences (numeric: from 0 to 93) \n", 818 |     "31 G1 - first period grade (numeric: from 0 to 20) \n", 819 |     "32 G2 - second period grade (numeric: from 0 to 20) \n", 820 |     "33 G3 - final grade (numeric: from 0 to 20, output target) \n", 821 |     "\n", 822 |     "## Exercise 1\n", 823 |     "Before you begin, you may want to replace all the 'yes' or 'no' responses in the dataframe to numerical values such as 1 or 0\n", 824 |     "using the df.replace() method. 
\n", 825 | "\n", 826 | "Run a logistic model to see whether studytime predicts 'wanting to take higher education'. \n", 827 | "\n", 828 | "Based on the model, what is the probability that John wants to pursue higher education if his\n", 829 | "studytime value is 2 (he studies 2 hours/week)?\n", 830 | "\n", 831 | "Based on the model what is the probability that Laura wants to pursue higher education if her\n", 832 | "studytime value is 3 (she studies 5 hours per week)? \n", 833 | "\n", 834 | "How much more likely (in odds) is Laura likely to want to pursue higher education than John ?\n", 835 | "\n", 836 | "## Exercise 2\n", 837 | "Run a LASSO regression to find what variables predict studytime. \n", 838 | "Use the penalty parameter (alpha) estimated through cross validation of coordinate descent. \n", 839 | "What is your interpretation of the results? " 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": { 846 | "collapsed": true 847 | }, 848 | "outputs": [], 849 | "source": [] 850 | } 851 | ], 852 | "metadata": { 853 | "anaconda-cloud": {}, 854 | "kernelspec": { 855 | "display_name": "Python 2", 856 | "language": "python", 857 | "name": "python2" 858 | }, 859 | "language_info": { 860 | "codemirror_mode": { 861 | "name": "ipython", 862 | "version": 2 863 | }, 864 | "file_extension": ".py", 865 | "mimetype": "text/x-python", 866 | "name": "python", 867 | "nbconvert_exporter": "python", 868 | "pygments_lexer": "ipython2", 869 | "version": "2.7.13" 870 | }, 871 | "toc": { 872 | "nav_menu": { 873 | "height": "512px", 874 | "width": "252px" 875 | }, 876 | "navigate_menu": true, 877 | "number_sections": true, 878 | "sideBar": true, 879 | "threshold": 4, 880 | "toc_cell": false, 881 | "toc_section_display": "block", 882 | "toc_window_display": false 883 | } 884 | }, 885 | "nbformat": 4, 886 | "nbformat_minor": 0 887 | } 888 | -------------------------------------------------------------------------------- 
/Notebooks/6_Introduction_to_SocialNetworkAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Social Network Analysis \n", 11 | "Written by Jin Cheong & Luke Chang\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "ExecuteTime": { 19 | "end_time": "2017-02-13T09:24:16.143566", 20 | "start_time": "2017-02-13T09:24:15.172556" 21 | }, 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "%matplotlib inline\n", 29 | "\n", 30 | "import numpy as np\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "try:\n", 33 | " import networkx as nx\n", 34 | "except:\n", 35 | " # Install NetworkX\n", 36 | " !pip install networkx" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "deletable": true, 43 | "editable": true 44 | }, 45 | "source": [ 46 | "# Primer to Network Analysis\n", 47 | "\n", 48 | "A network is made up of two main components: nodes and edges. \n", 49 | "\n", 50 | "The nodes are the individuals, words, or entities that compose a network, and the edges are the links that connect one node to another. \n", 51 | "\n", 52 | "Here is an example of a node and an edge.\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "Now we can try drawing our own network using the NetworkX package.\n", 57 | "Let's start off by initializing a Network, and call it G. 
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "ExecuteTime": { 65 | "end_time": "2017-02-13T09:24:16.148155", 66 | "start_time": "2017-02-13T09:24:16.145064" 67 | }, 68 | "collapsed": true, 69 | "deletable": true, 70 | "editable": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Initialize Graph object\n", 75 | "G = nx.Graph()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "deletable": true, 82 | "editable": true 83 | }, 84 | "source": [ 85 | "Now we can add a node using the .add_node('nodename') method" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "ExecuteTime": { 93 | "end_time": "2017-02-13T09:24:16.291054", 94 | "start_time": "2017-02-13T09:24:16.149742" 95 | }, 96 | "collapsed": false, 97 | "deletable": true, 98 | "editable": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "G = nx.Graph()\n", 103 | "G.add_node('Jin')\n", 104 | "G.add_node('Luke')\n", 105 | "G.add_node('Eshin')\n", 106 | "plt.figure(figsize=(5,3))\n", 107 | "nx.draw(G,with_labels=True,node_size=5000,font_size=20,alpha=.5)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "deletable": true, 114 | "editable": true 115 | }, 116 | "source": [ 117 | "Notice there are no connections between the two nodes because we haven't added any edges yet.\n", 118 | "\n", 119 | "To add edges between nodes, for example node1 and node 2, we can use the .add_edge('node1','node2') method on our graph." 
120 |    ] 121 |   }, 122 |   { 123 |    "cell_type": "code", 124 |    "execution_count": null, 125 |    "metadata": { 126 |     "ExecuteTime": { 127 |      "end_time": "2017-02-13T09:24:16.452812", 128 |      "start_time": "2017-02-13T09:24:16.292498" 129 |     }, 130 |     "collapsed": false, 131 |     "deletable": true, 132 |     "editable": true, 133 |     "scrolled": true 134 |    }, 135 |    "outputs": [], 136 |    "source": [ 137 |     "G = nx.Graph()\n", 138 |     "G.add_edge('Jin','Luke',weight=1)\n", 139 |     "G.add_edge('Jin','Eshin',weight=1)\n", 140 |     "G.add_edge('Luke','Eshin',weight=1)\n", 141 |     "plt.figure(figsize=(5,3))\n", 142 |     "pos = nx.spring_layout(G) # One way of specifiying a layout for nodes.\n", 143 |     "nx.draw(G,pos,with_labels=True,node_size=5000,font_size=20,alpha=.5)" 144 |    ] 145 |   }, 146 |   { 147 |    "cell_type": "markdown", 148 |    "metadata": { 149 |     "deletable": true, 150 |     "editable": true 151 |    }, 152 |    "source": [ 153 |     "Now you can see that we have an edge connecting Jin and Luke. \n", 154 |     "\n", 155 |     "There are several different types of graphs.\n", 156 |     "\n", 157 |     "1. **weighted** vs **unweighted** graphs\n", 158 |     "    - a graph is said to be **unweighted** if every connection is either 1 or 0\n", 159 |     "    - it is **weighted** if the edges take on other values indicating the strength of the relationship.\n", 160 |     "\n", 161 |     "\n", 162 |     "2. **directed** vs **undirected** graphs\n", 163 |     "    - a graph is **undirected** if the edges are symmetrical\n", 164 |     "    - it is **directed** if the edges are asymmetrical, meaning that each edge can indicate the direction of the relationship\n", 165 |     "\n", 166 |     "For simplicity, today we will only cover unweighted and undirected graphs. " 167 |    ] 168 |   }, 169 |   { 170 |    "cell_type": "markdown", 171 |    "metadata": {}, 172 |    "source": [ 173 |     "There are multiple ways to indicate nodes and edges in a graph. \n", 174 |     "As illustrated above, we can manually add each node and edge. This approach is fine for small graphs, but won't scale well. 
It is also possible to add nodes by using a Python dictionary.\n", 175 |     "The key is the node and the values indicate what that key(node) connects to. " 176 |    ] 177 |   }, 178 |   { 179 |    "cell_type": "code", 180 |    "execution_count": null, 181 |    "metadata": { 182 |     "ExecuteTime": { 183 |      "end_time": "2017-02-13T09:24:16.903054", 184 |      "start_time": "2017-02-13T09:24:16.454433" 185 |     }, 186 |     "collapsed": false, 187 |     "deletable": true, 188 |     "editable": true 189 |    }, 190 |    "outputs": [], 191 |    "source": [ 192 |     "d = {'Jin':['Luke','Eshin','Antonia','Andy','Sophie','Rob','Charlie','Vanessa'], \n", 193 |     "     'Luke':['Eshin','Antonia','Seth','Andy','Sophie','Rob','Charlie','Vanessa'],\n", 194 |     "     'Antonia':['Luke','Jin','Eshin','Seth','Andy'],\n", 195 |     "     'Eshin':['Heidi','Antonia','Jin','Sam','Andy'],\n", 196 |     "     'Sophie':['Rob','Vanessa'],\n", 197 |     "     'Rob':['Sophie','Vanessa']}\n", 198 |     "G = nx.Graph(d)\n", 199 |     "plt.figure(figsize=(15,8))\n", 200 |     "np.random.seed(2) # Just to keep things same\n", 201 |     "pos = pos = nx.fruchterman_reingold_layout(G) # Another way of specifiying a layout for nodes.\n", 202 |     "nx.draw(G,with_labels=True,node_size=1500,font_size=20,alpha=.3,width=2)" 203 |    ] 204 |   }, 205 |   { 206 |    "cell_type": "markdown", 207 |    "metadata": { 208 |     "deletable": true, 209 |     "editable": true 210 |    }, 211 |    "source": [ 212 |     "# What should we look for in a network ? \n", 213 |     "\n", 214 |     "## Micromeasures\n", 215 |     "Centrality measures track the importance of a node derived from its position in the network.\n", 216 |     "\n", 217 |     "There are four main groups of centrality depending on the type of statistics used:\n", 218 |     "1. degree - how connected a node is\n", 219 |     "2. betweenness - how important a node is in connecting other nodes\n", 220 |     "3. closeness - how easily a node can reach other nodes\n", 221 |     "4. neighbors' characteristics - how important, central, or influential a node's neighbors are. 
\n", 222 | "\n", 223 | "\n", 224 | "\n", 225 | "Reference:\n", 226 | "Social and Economic Networks by Matthew O. Jackson. Princeton University Press. \n", 227 | "Picture: [Wikipedia article](https://en.wikipedia.org/wiki/Centrality) \n", 228 | "Other examples: http://www.orgnet.com/sna.html \n", 229 | "\n", 230 | "## Degree Centrality\n", 231 | "\n", 232 | "The degree centrality measures the number of edges of a given node. \n", 233 | "\n", 234 | "In other words, it measures how connected a node is." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "ExecuteTime": { 242 | "end_time": "2017-02-13T09:24:17.296820", 243 | "start_time": "2017-02-13T09:24:16.904690" 244 | }, 245 | "collapsed": false, 246 | "deletable": true, 247 | "editable": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "def print_network(val_map,title=None):\n", 252 | " print('\\x1b[1;31m'+title+'\\x1b[0m')\n", 253 | " for k, v in val_map.iteritems():\n", 254 | " print k.ljust(15), str(round(v,3)).ljust(30)\n", 255 | "\n", 256 | "def plot_network(val_map, title=None):\n", 257 | " values = [val_map.get(node, 0.25) for node in G.nodes()]\n", 258 | " plt.figure(figsize=(12,5))\n", 259 | " np.random.seed(2)\n", 260 | "# pos = nx.spring_layout(G)\n", 261 | " pos = nx.fruchterman_reingold_layout(G,dim=2)\n", 262 | " ec = nx.draw_networkx_edges(G,pos,alpha=.2)\n", 263 | " nc = nx.draw_networkx_nodes(G,pos,node_size=1000,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n", 264 | " nx.draw_networkx_labels(G,pos,font_size=18)\n", 265 | " # nx.draw(G,pos,node_size=400,with_labels=True,alpha=.5,cmap=plt.get_cmap('jet'),node_color=values)\n", 266 | " plt.colorbar(nc)\n", 267 | " plt.axis('off')\n", 268 | " plt.suptitle(title,fontsize=18)\n", 269 | " plt.show()\n", 270 | "\n", 271 | "# Get degree centrality values\n", 272 | "d = nx.degree_centrality(G)\n", 273 | "title = 'Degree Centrality Map'\n", 274 | "# print centrality 
values\n", 275 |     "print_network(d,title)\n", 276 |     "# plot graph with values\n", 277 |     "plot_network(d,title)" 278 |    ] 279 |   }, 280 |   { 281 |    "cell_type": "markdown", 282 |    "metadata": { 283 |     "deletable": true, 284 |     "editable": true 285 |    }, 286 |    "source": [ 287 |     "Luke has the highest number of connections and therefore has the highest degree centrality. Jin is next, and then Eshin.\n", 288 |     "\n", 289 |     "# Betweenness Centrality\n", 290 |     "Think of Betweenness Centrality as a bottleneck or a broker of two separate networks. \n", 291 |     "\n", 292 |     "The measure is the fraction of the total number of shortest paths a node lies on between two other nodes.\n", 293 |     "\n", 294 |     "Here Eshin has the highest betweenness centrality because he connects the most different people." 295 |    ] 296 |   }, 297 |   { 298 |    "cell_type": "code", 299 |    "execution_count": null, 300 |    "metadata": { 301 |     "ExecuteTime": { 302 |      "end_time": "2017-02-13T09:24:17.659747", 303 |      "start_time": "2017-02-13T09:24:17.298352" 304 |     }, 305 |     "collapsed": false, 306 |     "deletable": true, 307 |     "editable": true 308 |    }, 309 |    "outputs": [], 310 |    "source": [ 311 |     "d = nx.betweenness_centrality(G)\n", 312 |     "title = \"Betweenness Centrality\"\n", 313 |     "print_network(d,title)\n", 314 |     "plot_network(d,title)" 315 |    ] 316 |   }, 317 |   { 318 |    "cell_type": "markdown", 319 |    "metadata": { 320 |     "deletable": true, 321 |     "editable": true 322 |    }, 323 |    "source": [ 324 |     "# Closeness Centrality\n", 325 |     "The closeness centrality measures how close a given node is to any other node. \n", 326 |     "This is measured by the average length of the shortest path between a node and all other nodes. \n", 327 |     "Thus you have higher closeness centrality if you can get to everyone faster than others. 
" 328 |    ] 329 |   }, 330 |   { 331 |    "cell_type": "code", 332 |    "execution_count": null, 333 |    "metadata": { 334 |     "ExecuteTime": { 335 |      "end_time": "2017-02-13T09:24:18.015038", 336 |      "start_time": "2017-02-13T09:24:17.661151" 337 |     }, 338 |     "collapsed": false, 339 |     "deletable": true, 340 |     "editable": true 341 |    }, 342 |    "outputs": [], 343 |    "source": [ 344 |     "d = nx.closeness_centrality(G)\n", 345 |     "title = \"Closeness Centrality\"\n", 346 |     "print_network(d,title)\n", 347 |     "plot_network(d,title)" 348 |    ] 349 |   }, 350 |   { 351 |    "cell_type": "markdown", 352 |    "metadata": { 353 |     "deletable": true, 354 |     "editable": true 355 |    }, 356 |    "source": [ 357 |     "# Eigenvector Centrality\n", 358 |     "\n", 359 |     "The underlying assumption of the Eigenvector centrality is that a node's importance is determined by how important its neighbors are. \n", 360 |     "\n", 361 |     "For instance, having more influential friends (betweenness centrality) can be more important than just having a greater number of friends (degree centrality). \n", 362 |     "\n", 363 |     "Now we can observe that Luke is back to being more important than Eshin (who had higher betweenness centrality), because Luke is friends with Eshin who also has high centrality." 
364 |    ] 365 |   }, 366 |   { 367 |    "cell_type": "code", 368 |    "execution_count": null, 369 |    "metadata": { 370 |     "ExecuteTime": { 371 |      "end_time": "2017-02-13T09:24:18.378309", 372 |      "start_time": "2017-02-13T09:24:18.016469" 373 |     }, 374 |     "collapsed": false, 375 |     "deletable": true, 376 |     "editable": true 377 |    }, 378 |    "outputs": [], 379 |    "source": [ 380 |     "d = nx.eigenvector_centrality(G)\n", 381 |     "title = \"Eigenvector Centrality\"\n", 382 |     "print_network(d,title)\n", 383 |     "plot_network(d,title)" 384 |    ] 385 |   }, 386 |   { 387 |    "cell_type": "markdown", 388 |    "metadata": { 389 |     "deletable": true, 390 |     "editable": true 391 |    }, 392 |    "source": [ 393 |     "## Macromeasures\n", 394 |     "\n", 395 |     "Macromeasures are useful to understand the network as a whole or to compare networks. \n", 396 |     "\n", 397 |     "We will cover three fundamental measures of the network structure.\n", 398 |     "1. Degree distribution\n", 399 |     "2. Average shortest path \n", 400 |     "3. Clustering Coefficients\n", 401 |     "\n", 402 |     "## Degree Distribution\n", 403 |     "One fundamental characteristic is the distribution of degrees, number of edges for each node, which we can easily plot. \n", 404 |     "From this distribution we can discern attributes such as whether everyone is connected to everybody or whether there are many lonely nodes." 
405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "ExecuteTime": { 412 | "end_time": "2017-02-13T09:24:19.263314", 413 | "start_time": "2017-02-13T09:24:18.379752" 414 | }, 415 | "collapsed": false, 416 | "deletable": true, 417 | "editable": true 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "degree_sequence=sorted([d for n,d in G.degree().iteritems()], reverse=True) # degree sequence\n", 422 | "\n", 423 | "G2=nx.complete_graph(5)\n", 424 | "degree_sequence2=sorted([d for n,d in G2.degree().iteritems()], reverse=True) # degree sequence\n", 425 | "\n", 426 | "fig, [(ax1,ax2),(ax3,ax4)] = plt.subplots(2,2,figsize=(15,10))\n", 427 | "ax1.hist(degree_sequence,bins=range(0,7),rwidth=.8)\n", 428 | "ax1.set_title(\"Degree Histogram for Graph 1\",fontsize=16)\n", 429 | "ax1.set_ylabel(\"Count\",fontsize=14)\n", 430 | "ax1.set_xlabel(\"Degree\",fontsize=14)\n", 431 | "ax1.set_xticks([d+0.4 for d in degree_sequence])\n", 432 | "ax1.set_xticklabels([d for d in degree_sequence])\n", 433 | "ax1.set_ylim((0,6))\n", 434 | "\n", 435 | "nx.draw(G,pos=nx.circular_layout(G),ax=ax2)\n", 436 | "ax2.set_title('Network 1: Graph of our class',fontsize=16)\n", 437 | "\n", 438 | "ax3.hist(degree_sequence2,bins=range(0,7),rwidth=.8)\n", 439 | "ax3.set_title(\"Degree Histogram for Complete Graph\",fontsize=16)\n", 440 | "ax3.set_ylabel(\"Count\",fontsize=14)\n", 441 | "ax3.set_xlabel(\"Degree\",fontsize=14)\n", 442 | "ax3.set_xticks([d+0.4 for d in degree_sequence2])\n", 443 | "ax3.set_xticklabels([d for d in degree_sequence2])\n", 444 | "ax3.set_ylim((0,6))\n", 445 | "\n", 446 | "nx.draw(G2,pos=nx.circular_layout(G2),ax=ax4)\n", 447 | "ax4.set_title('Network 2: Graph of a Complete Graph',fontsize=16)\n", 448 | "\n", 449 | "plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)\n", 450 | "plt.show()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "deletable": true, 457 | "editable": true 458 | }, 
459 |    "source": [ 460 |     "## Average Shortest Path\n", 461 |     "\n", 462 |     "The average shortest path length is the average of calculating the shortest number of edges it takes for every pair of nodes in the network. \n", 463 |     "\n", 464 |     "This can be a measure of how efficient/fast the network is in the distribution of information, rumors, disease, etc." 465 |    ] 466 |   }, 467 |   { 468 |    "cell_type": "code", 469 |    "execution_count": null, 470 |    "metadata": { 471 |     "ExecuteTime": { 472 |      "end_time": "2017-02-13T09:24:19.269671", 473 |      "start_time": "2017-02-13T09:24:19.265105" 474 |     }, 475 |     "collapsed": false, 476 |     "deletable": true, 477 |     "editable": true, 478 |     "scrolled": true 479 |    }, 480 |    "outputs": [], 481 |    "source": [ 482 |     "print 'Network 1 average shortest path: ' + str(round(nx.average_shortest_path_length(G),2))\n", 483 |     "print 'Network 2 average shortest path: ' + str(nx.average_shortest_path_length(G2))" 484 |    ] 485 |   }, 486 |   { 487 |    "cell_type": "markdown", 488 |    "metadata": { 489 |     "deletable": true, 490 |     "editable": true 491 |    }, 492 |    "source": [ 493 |     "## Clustering coefficient. \n", 494 |     "\n", 495 |     "Another way to look at how tight-knit a network is to look at how clustered it is. \n", 496 |     "To estimate clustering, we start from cliques, which can be considered as a subnetwork composed of three nodes forming a triangle. \n", 497 |     "\n", 498 |     "We can first calculate for each node, how many cliques(triangles) it is a member of. \n", 499 |     "Then we calculate transitivity, which is simply the number of triangles in the network divided by the number of all possible triangles." 
500 |    ] 501 |   }, 502 |   { 503 |    "cell_type": "code", 504 |    "execution_count": null, 505 |    "metadata": { 506 |     "ExecuteTime": { 507 |      "end_time": "2017-02-13T09:24:19.633788", 508 |      "start_time": "2017-02-13T09:24:19.271359" 509 |     }, 510 |     "collapsed": false, 511 |     "deletable": true, 512 |     "editable": true, 513 |     "scrolled": false 514 |    }, 515 |    "outputs": [], 516 |    "source": [ 517 |     "d = nx.triangles(G)\n", 518 |     "print_network(d,'Number of cliques(triangles)')\n", 519 |     "print \n", 520 |     "print 'Transitivity : ' + str(nx.transitivity(G))\n", 521 |     "\n", 522 |     "plot_network(d,'Transitivity')" 523 |    ] 524 |   }, 525 |   { 526 |    "cell_type": "markdown", 527 |    "metadata": { 528 |     "deletable": true, 529 |     "editable": true 530 |    }, 531 |    "source": [ 532 |     "A similar approach is to calculate clustering coefficient for each node and for the graph.\n", 533 |     "Clustering is the measure of how likely it is, if A has edges to B and C, that B and C are also related. \n", 534 |     "\n", 535 |     "For instance, Charlie forms a triangle with Luke and Jin, but Sam and Heidi don't form a single triangle.\n", 536 |     "\n", 537 |     "We can calculate this for each node and then get an average value overall to get the average clustering coefficient." 
538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "ExecuteTime": { 545 | "end_time": "2017-02-13T09:24:19.642416", 546 | "start_time": "2017-02-13T09:24:19.635348" 547 | }, 548 | "collapsed": false, 549 | "deletable": true, 550 | "editable": true 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "d = nx.clustering(G)\n", 555 | "print_network(d,'Clustering Coefficients per node')\n", 556 | "print \n", 557 | "print 'Average clustering coefficient : ' + str(nx.average_clustering(G))" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true, 565 | "deletable": true, 566 | "editable": true 567 | }, 568 | "outputs": [], 569 | "source": [] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": { 574 | "collapsed": true, 575 | "deletable": true, 576 | "editable": true 577 | }, 578 | "source": [ 579 | "# Exercise\n", 580 | "Your assignment this week is to construct your own social network. It could be based on a sports team, your close friends, or family. It could also reflect more abstract aspects of relationships such as social support, gossip, problem solver, etc.\n", 581 | "\n", 582 | "Calculate two different metrics of the network to provide insight into how different members play different roles." 
583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": { 589 | "collapsed": true, 590 | "deletable": true, 591 | "editable": true 592 | }, 593 | "outputs": [], 594 | "source": [] 595 | } 596 | ], 597 | "metadata": { 598 | "anaconda-cloud": {}, 599 | "kernelspec": { 600 | "display_name": "Python [default]", 601 | "language": "python", 602 | "name": "python2" 603 | }, 604 | "language_info": { 605 | "codemirror_mode": { 606 | "name": "ipython", 607 | "version": 2 608 | }, 609 | "file_extension": ".py", 610 | "mimetype": "text/x-python", 611 | "name": "python", 612 | "nbconvert_exporter": "python", 613 | "pygments_lexer": "ipython2", 614 | "version": "2.7.13" 615 | }, 616 | "toc": { 617 | "nav_menu": { 618 | "height": "260px", 619 | "width": "252px" 620 | }, 621 | "navigate_menu": true, 622 | "number_sections": true, 623 | "sideBar": true, 624 | "threshold": 4, 625 | "toc_cell": false, 626 | "toc_section_display": "block", 627 | "toc_window_display": false 628 | } 629 | }, 630 | "nbformat": 4, 631 | "nbformat_minor": 0 632 | } 633 | -------------------------------------------------------------------------------- /Notebooks/7_Introduction_to_Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Introduction to Web Scraping\n", 11 | "Often we are interested in getting data from a website. 
Modern websites are often built using a [REST](https://en.wikipedia.org/wiki/Representational_state_transfer) framework that has an Application Programming Interface ([API](https://en.wikipedia.org/wiki/Application_programming_interface)) to make [HTTP](https://www.tutorialspoint.com/http/http_requests.htm) requests to retrieve structured data in the form of [JSON](https://en.wikipedia.org/wiki/JSON) or XML.\n", 12 | "\n", 13 | "However, when there is not a clear API, we might need to perform web scraping by directly grabbing the data ourselves" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "ExecuteTime": { 21 | "end_time": "2017-03-06T13:26:29.942689Z", 22 | "start_time": "2017-03-06T08:26:29.777081-05:00" 23 | }, 24 | "collapsed": true, 25 | "deletable": true, 26 | "editable": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "try:\n", 31 | " import requests\n", 32 | "except:\n", 33 | " !pip install requests\n", 34 | "try:\n", 35 | " from bs4 import BeautifulSoup\n", 36 | "except:\n", 37 | " !pip install bs4" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "deletable": true, 44 | "editable": true 45 | }, 46 | "source": [ 47 | "## Getting data using requests\n", 48 | "[Requests](http://docs.python-requests.org/en/master/) is an excellent library for performing HTTP requests. \n", 49 | "\n", 50 | "In this simple example, we will scrape data from the PBS faculty webpage." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "ExecuteTime": { 58 | "end_time": "2017-03-06T13:26:32.826356Z", 59 | "start_time": "2017-03-06T08:26:32.720782-05:00" 60 | }, 61 | "collapsed": false, 62 | "deletable": true, 63 | "editable": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "page = requests.get(\"http://pbs.dartmouth.edu/people\")\n", 68 | "print(page)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "source": [ 78 | "Here the response '200' indicates that the get request was successful. Now let's look at the actual text that was downloaded from the webpage." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "ExecuteTime": { 86 | "end_time": "2017-03-06T05:28:31.476250Z", 87 | "start_time": "2017-03-06T00:28:31.463432-05:00" 88 | }, 89 | "collapsed": false, 90 | "deletable": true, 91 | "editable": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "print(page.content)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "deletable": true, 102 | "editable": true 103 | }, 104 | "source": [ 105 | "Here you can see that we have downloaded all of the data from the PBS faculty page and that it is in the form of HTML. \n", 106 | "HTML is a markup language that tells a browser how to layout content. HTML consists of elements called tags. Each tag indicates a beginning and end. Here are a few examples: \n", 107 | "\n", 108 | " - `` - indicates hyperlink\n", 109 | " - `

` - indicates paragraph\n", 110 | " - `
` - indicates a division, or area, of the page.\n", 111 | " - `` - bolds any text inside.\n", 112 | " - `` - italicizes any text inside.\n", 113 | " - `

` - indicates a header\n", 114 | " - `
` - creates a table.\n", 115 | " - `
    ` - ordered list\n", 116 | " - `
      ` - unordered list\n", 117 | " - `
    • ` - list item\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "deletable": true, 124 | "editable": true 125 | }, 126 | "source": [ 127 | "## Parsing HTML using Beautiful Soup\n", 128 | "There are many libraries that can be helpful for quickly parsing structured text such as HTML. We will be using Beautiful Soup as an example." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "ExecuteTime": { 136 | "end_time": "2017-03-06T13:26:39.183625Z", 137 | "start_time": "2017-03-06T08:26:39.071917-05:00" 138 | }, 139 | "collapsed": false, 140 | "deletable": true, 141 | "editable": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "soup = BeautifulSoup(page.content, 'html.parser')\n", 146 | "print(soup.prettify())" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "deletable": true, 153 | "editable": true 154 | }, 155 | "source": [ 156 | "Here we are going to find the unordered list tagged with the id 'faculty-container'. We are then going to look for any nested tag that use the 'h4' header tag. This should give us all of the lines with the faculty names as a list." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "ExecuteTime": { 164 | "end_time": "2017-03-06T13:26:41.798326Z", 165 | "start_time": "2017-03-06T08:26:41.769082-05:00" 166 | }, 167 | "collapsed": false, 168 | "deletable": true, 169 | "editable": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "names_html = soup.find_all('ul',id='faculty-container')[0].find_all('h4')\n", 174 | "names = [x.text for x in names_html]\n", 175 | "print(names)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "source": [ 185 | "What if we wanted to get all of the faculty email addresses?" 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "ExecuteTime": { 193 | "end_time": "2017-03-06T13:26:45.729051Z", 194 | "start_time": "2017-03-06T08:26:45.696816-05:00" 195 | }, 196 | "collapsed": false, 197 | "deletable": true, 198 | "editable": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "email_html = soup.find_all('ul',id='faculty-container')[0].find_all('span',{'class' : 'contact'})\n", 203 | "email = [x.text for x in email_html]\n", 204 | "print(email)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "deletable": true, 211 | "editable": true 212 | }, 213 | "source": [ 214 | "## Parsing string data\n", 215 | "What if we wanted to grab the name from the list of email addresses?" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "ExecuteTime": { 223 | "end_time": "2017-03-06T06:16:47.161190Z", 224 | "start_time": "2017-03-06T01:16:47.154047-05:00" 225 | }, 226 | "collapsed": false, 227 | "deletable": true, 228 | "editable": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "print([x.split('@')[0] for x in email])" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "collapsed": true, 239 | "deletable": true, 240 | "editable": true 241 | }, 242 | "source": [ 243 | "One thing we might do with this data is create a dictionary with names and emails of all of the professors in the department. This could be useful if we wanted to send a bulk email to them. 
" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "ExecuteTime": { 251 | "end_time": "2017-03-06T13:37:49.514990Z", 252 | "start_time": "2017-03-06T08:37:49.509404-05:00" 253 | }, 254 | "collapsed": false, 255 | "deletable": true, 256 | "editable": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "email_dict = dict([(x.split('@')[0],x) for x in email])\n", 261 | "print(email_dict)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "You can see that every name also includes an initial. Let's try to just pull out the first and last name." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "ExecuteTime": { 276 | "end_time": "2017-03-06T13:37:51.752857Z", 277 | "start_time": "2017-03-06T08:37:51.747826-05:00" 278 | }, 279 | "collapsed": false 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "for x in email_dict.keys():\n", 284 | " old = x.split('.')\n", 285 | " email_dict[\" \".join([i for i in old if len(i) > 2])] = email_dict[x]\n", 286 | " del email_dict[x]\n", 287 | "\n", 288 | "print(email_dict)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "deletable": true, 295 | "editable": true 296 | }, 297 | "source": [ 298 | "## Interacting with web page using Selenium\n", 299 | "Sometimes we need to directly interact with aspects of the webpage. Maybe there is a form that needs to be submitted or a javascript button that needs to be pressed. [Selenium](http://selenium-python.readthedocs.io/) is a useful tool for these types of tasks." 
300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 2", 315 | "language": "python", 316 | "name": "python2" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 2 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython2", 328 | "version": "2.7.13" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 2 333 | } 334 | -------------------------------------------------------------------------------- /Notebooks/Figures/centrality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/centrality.png -------------------------------------------------------------------------------- /Notebooks/Figures/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/estimating_coefficients.png -------------------------------------------------------------------------------- /Notebooks/Figures/hw2-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-3.png -------------------------------------------------------------------------------- /Notebooks/Figures/hw2-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/hw2-4.png 
-------------------------------------------------------------------------------- /Notebooks/Figures/mediation1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/mediation1.png -------------------------------------------------------------------------------- /Notebooks/Figures/moderator1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator1.gif -------------------------------------------------------------------------------- /Notebooks/Figures/moderator2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator2.gif -------------------------------------------------------------------------------- /Notebooks/Figures/moderator3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/moderator3.jpeg -------------------------------------------------------------------------------- /Notebooks/Figures/nodeedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/nodeedge.png -------------------------------------------------------------------------------- /Notebooks/Figures/slope_intercept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljchang/psyc63/b3329228952c9c6793196db502b0442e77863e46/Notebooks/Figures/slope_intercept.png 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # psyc63 2 | This GitHub repository provides lab materials that accompany Psyc63 Experimental Study of Behavior at Dartmouth College. Labs will consist of working through [Jupyter Notebooks](http://jupyter.org/) that contain exercises in Python. Jupyter Notebooks can be installed by downloading and installing [Anaconda Python 2.7](https://www.continuum.io/downloads). 3 | 4 | ## Jupyter Notebooks 5 | ### Introduction to Python 6 | - Installation 7 | - Notebooks 8 | - Variables 9 | - Loops and Conditional Statements 10 | - Functions 11 | - Containers 12 | - Modules 13 | 14 | ### Data Analysis I 15 | - Data Checking 16 | - Data Plotting 17 | - Linear Model Regression 18 | - Model Comparisons 19 | 20 | ### Data Analysis II 21 | - Logistic Regression 22 | - Classification 23 | - Prediction 24 | - Cross-validation 25 | - Regularization Techniques 26 | - Lasso & Ridge 27 | 28 | ### Data Analysis III 29 | - Mediation Analysis 30 | - Moderation Analysis 31 | 32 | ### Social Network Analysis 33 | - Primer to Social Network Analysis 34 | - NetworkX 35 | - Micromeasures 36 | - degree centrality 37 | - betweenness centrality 38 | - closeness centrality 39 | - Eigenvector centrality 40 | - Macromeasures 41 | - Degree Distribution 42 | - Average shortest path 43 | - Clustering coefficients 44 | 45 | 46 | --------------------------------------------------------------------------------