├── .gitignore ├── 2020_materials ├── data │ ├── cpc_camcan_demographics.csv │ ├── cpc_camcan_demographics_nordan.csv │ ├── cpc_camcan_features.csv │ └── cpc_camcan_features_nordan.csv ├── tasks │ └── cpc_normative_modeling_instructions.ipynb └── tasks_key │ └── key_cpc_machinelearning.ipynb ├── README.md ├── data ├── Z_estimates.csv ├── Z_estimates_counts.csv ├── Z_long_format.csv ├── fcon1000_te.csv ├── fcon1000_te_Z.csv ├── fcon1000_tr.csv ├── nilearn_order.csv ├── phenotypes_lh.txt ├── phenotypes_rh.txt ├── phenotypes_sc.txt ├── sz_ct.npy ├── sz_labels.npy ├── sz_z.npy ├── task1_phenotypes.txt ├── test_data.csv └── train_data.csv ├── nm_utils.py ├── presentation ├── GPU.png ├── How_nm_compressed2020.pdf ├── Normative_Modeling_a_Framework_for_Clinical_Machinelearning.pdf ├── Runtime1.png ├── Runtime2.png ├── keyboard_pref.png ├── settings1.png └── settings2.png └── tasks ├── 1_fit_normative_models.ipynb ├── 2_apply_normative_models.ipynb ├── 3_Visualizations.ipynb └── 4_post_hoc_analysis.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints 3 | tasks/.ipynb_checkpoints/ 4 | tasks_key/.ipynb_checkpoints/ 5 | .DS_Store -------------------------------------------------------------------------------- /2020_materials/data/cpc_camcan_demographics.csv: -------------------------------------------------------------------------------- 1 | paricipants,age,sex_name,sex,IQ_random 2 | CC110033,24,MALE,1,73 3 | CC110037,18,MALE,1,103 4 | CC110045,24,FEMALE,0,124 5 | CC110056,22,FEMALE,0,124 6 | CC110062,20,MALE,1,126 7 | CC110069,28,FEMALE,0,140 8 | CC110087,28,FEMALE,0,86 9 | CC110098,23,MALE,1,108 10 | CC110101,23,MALE,1,80 11 | CC110126,22,FEMALE,0,89 12 | CC110174,25,FEMALE,0,139 13 | CC110182,18,FEMALE,0,80 14 | CC110187,25,FEMALE,0,110 15 | CC110319,28,FEMALE,0,110 16 | CC110411,25,MALE,1,95 17 | CC110606,20,MALE,1,124 18 | CC112141,29,MALE,1,137 19 | CC120008,26,MALE,1,114 20 | 
CC120049,28,MALE,1,147 21 | CC120061,19,MALE,1,130 22 | CC120065,25,FEMALE,0,108 23 | CC120120,25,MALE,1,123 24 | CC120123,19,FEMALE,0,79 25 | CC120137,18,MALE,1,99 26 | CC120166,28,MALE,1,103 27 | CC120182,26,MALE,1,89 28 | CC120184,25,FEMALE,0,128 29 | CC120208,24,FEMALE,0,98 30 | CC120212,20,MALE,1,131 31 | CC120218,27,FEMALE,0,86 32 | CC120234,24,FEMALE,0,109 33 | CC120264,28,MALE,1,131 34 | CC120276,23,FEMALE,0,91 35 | CC120286,22,MALE,1,138 36 | CC120309,27,MALE,1,81 37 | CC120313,28,MALE,1,132 38 | CC120319,27,FEMALE,0,101 39 | CC120347,21,FEMALE,0,124 40 | CC120376,18,FEMALE,0,104 41 | CC120409,18,MALE,1,137 42 | CC120462,18,FEMALE,0,116 43 | CC120469,25,FEMALE,0,88 44 | CC120470,28,FEMALE,0,70 45 | CC120550,19,MALE,1,120 46 | CC120640,26,FEMALE,0,122 47 | CC120727,23,FEMALE,0,135 48 | CC120764,27,MALE,1,145 49 | CC120795,24,MALE,1,121 50 | CC120816,23,MALE,1,122 51 | CC120987,20,FEMALE,0,81 52 | CC121106,28,FEMALE,0,125 53 | CC121111,18,MALE,1,102 54 | CC121144,26,MALE,1,95 55 | CC121158,28,FEMALE,0,117 56 | CC121194,24,FEMALE,0,142 57 | CC121200,26,FEMALE,0,87 58 | CC121317,25,FEMALE,0,98 59 | CC121397,27,MALE,1,129 60 | CC121411,26,FEMALE,0,122 61 | CC121428,26,FEMALE,0,146 62 | CC121479,26,FEMALE,0,83 63 | CC121685,20,MALE,1,129 64 | CC121795,25,FEMALE,0,75 65 | CC122016,23,MALE,1,136 66 | CC122172,27,MALE,1,78 67 | CC122405,27,MALE,1,88 68 | CC122620,24,FEMALE,0,150 69 | CC210023,31,MALE,1,71 70 | CC210051,35,FEMALE,0,101 71 | CC210088,36,FEMALE,0,92 72 | CC210124,35,FEMALE,0,112 73 | CC210148,30,FEMALE,0,106 74 | CC210172,31,FEMALE,0,78 75 | CC210174,30,MALE,1,104 76 | CC210182,31,FEMALE,0,125 77 | CC210250,29,MALE,1,123 78 | CC210304,32,FEMALE,0,88 79 | CC210314,34,FEMALE,0,110 80 | CC210422,34,MALE,1,132 81 | CC210519,29,FEMALE,0,122 82 | CC210526,37,MALE,1,118 83 | CC210617,34,MALE,1,88 84 | CC210657,37,MALE,1,89 85 | CC212153,29,FEMALE,0,76 86 | CC220098,32,FEMALE,0,116 87 | CC220107,34,MALE,1,108 88 | CC220115,29,FEMALE,0,137 89 | 
CC220132,31,MALE,1,126 90 | CC220151,30,MALE,1,138 91 | CC220198,37,FEMALE,0,144 92 | CC220203,34,FEMALE,0,125 93 | CC220223,33,MALE,1,119 94 | CC220232,36,FEMALE,0,134 95 | CC220234,35,MALE,1,133 96 | CC220284,34,FEMALE,0,149 97 | CC220323,33,FEMALE,0,77 98 | CC220335,34,MALE,1,112 99 | CC220352,29,FEMALE,0,75 100 | CC220372,37,MALE,1,117 101 | CC220394,33,FEMALE,0,119 102 | CC220419,31,MALE,1,117 103 | CC220506,35,FEMALE,0,136 104 | CC220511,32,FEMALE,0,116 105 | CC220518,30,MALE,1,89 106 | CC220519,29,FEMALE,0,148 107 | CC220526,28,FEMALE,0,93 108 | CC220535,32,MALE,1,88 109 | CC220567,37,MALE,1,134 110 | CC220610,32,FEMALE,0,140 111 | CC220635,36,FEMALE,0,111 112 | CC220697,35,MALE,1,146 113 | CC220713,33,FEMALE,0,101 114 | CC220806,35,FEMALE,0,98 115 | CC220828,33,FEMALE,0,150 116 | CC220843,37,MALE,1,139 117 | CC220901,35,FEMALE,0,98 118 | CC220920,34,FEMALE,0,112 119 | CC220974,37,MALE,1,78 120 | CC220999,37,MALE,1,100 121 | CC221002,37,FEMALE,0,101 122 | CC221031,38,MALE,1,102 123 | CC221033,28,FEMALE,0,144 124 | CC221038,29,MALE,1,77 125 | CC221040,36,MALE,1,142 126 | CC221054,31,MALE,1,122 127 | CC221107,35,MALE,1,94 128 | CC221209,29,FEMALE,0,94 129 | CC221220,37,FEMALE,0,124 130 | CC221244,32,FEMALE,0,79 131 | CC221324,36,MALE,1,140 132 | CC221336,34,MALE,1,126 133 | CC221352,37,FEMALE,0,134 134 | CC221373,29,MALE,1,87 135 | CC221487,35,MALE,1,136 136 | CC221511,36,FEMALE,0,143 137 | CC221527,34,FEMALE,0,134 138 | CC221565,32,FEMALE,0,71 139 | CC221580,31,FEMALE,0,84 140 | CC221585,37,MALE,1,114 141 | CC221595,33,MALE,1,124 142 | CC221648,30,FEMALE,0,143 143 | CC221733,36,MALE,1,136 144 | CC221737,34,FEMALE,0,149 145 | CC221740,35,FEMALE,0,131 146 | CC221755,32,MALE,1,87 147 | CC221775,32,FEMALE,0,126 148 | CC221828,32,FEMALE,0,109 149 | CC221886,33,MALE,1,92 150 | CC221935,37,MALE,1,110 151 | CC221954,32,FEMALE,0,133 152 | CC221977,37,MALE,1,81 153 | CC221980,34,MALE,1,141 154 | CC222120,37,MALE,1,106 155 | CC222125,34,MALE,1,110 156 | 
CC222185,32,FEMALE,0,102 157 | CC222258,35,FEMALE,0,102 158 | CC222264,37,MALE,1,90 159 | CC222304,38,MALE,1,148 160 | CC222326,35,MALE,1,79 161 | CC222367,37,MALE,1,149 162 | CC222496,38,MALE,1,122 163 | CC222555,29,MALE,1,133 164 | CC222652,30,MALE,1,115 165 | CC222797,31,FEMALE,0,119 166 | CC222956,32,FEMALE,0,128 167 | CC223085,38,MALE,1,144 168 | CC223115,36,MALE,1,101 169 | CC223286,36,MALE,1,137 170 | CC310008,45,MALE,1,122 171 | CC310051,42,FEMALE,0,85 172 | CC310052,42,FEMALE,0,89 173 | CC310086,47,FEMALE,0,146 174 | CC310127,47,FEMALE,0,72 175 | CC310129,40,MALE,1,112 176 | CC310135,42,FEMALE,0,100 177 | CC310142,48,MALE,1,137 178 | CC310160,41,MALE,1,136 179 | CC310203,39,FEMALE,0,84 180 | CC310214,46,FEMALE,0,110 181 | CC310224,47,FEMALE,0,79 182 | CC310252,47,MALE,1,124 183 | CC310256,44,MALE,1,119 184 | CC310263,44,MALE,1,88 185 | CC310331,46,MALE,1,118 186 | CC310361,43,FEMALE,0,146 187 | CC310385,48,MALE,1,149 188 | CC310391,41,MALE,1,143 189 | CC310397,41,FEMALE,0,136 190 | CC310400,43,MALE,1,122 191 | CC310402,46,FEMALE,0,71 192 | CC310407,39,FEMALE,0,148 193 | CC310410,41,FEMALE,0,142 194 | CC310414,40,MALE,1,135 195 | CC310450,46,FEMALE,0,105 196 | CC310463,45,FEMALE,0,97 197 | CC310473,46,MALE,1,98 198 | CC312058,47,MALE,1,150 199 | CC312149,45,MALE,1,129 200 | CC312222,48,FEMALE,0,94 201 | CC320002,46,MALE,1,148 202 | CC320022,40,MALE,1,140 203 | CC320036,45,FEMALE,0,131 204 | CC320059,48,MALE,1,92 205 | CC320077,46,FEMALE,0,130 206 | CC320088,41,MALE,1,148 207 | CC320089,47,FEMALE,0,112 208 | CC320107,47,MALE,1,87 209 | CC320109,39,MALE,1,146 210 | CC320116,42,MALE,1,127 211 | CC320160,48,FEMALE,0,133 212 | CC320199,45,MALE,1,70 213 | CC320202,45,MALE,1,141 214 | CC320206,47,MALE,1,97 215 | CC320218,47,FEMALE,0,114 216 | CC320267,38,MALE,1,116 217 | CC320269,42,FEMALE,0,88 218 | CC320297,40,MALE,1,135 219 | CC320321,40,FEMALE,0,96 220 | CC320325,44,FEMALE,0,106 221 | CC320336,44,FEMALE,0,100 222 | CC320342,40,FEMALE,0,81 223 | 
CC320359,47,MALE,1,99 224 | CC320361,41,FEMALE,0,79 225 | CC320379,48,MALE,1,117 226 | CC320400,48,FEMALE,0,109 227 | CC320417,39,MALE,1,146 228 | CC320428,45,MALE,1,146 229 | CC320429,48,MALE,1,119 230 | CC320445,47,FEMALE,0,117 231 | CC320448,42,MALE,1,128 232 | CC320461,42,MALE,1,72 233 | CC320478,40,MALE,1,99 234 | CC320500,46,MALE,1,133 235 | CC320553,48,FEMALE,0,108 236 | CC320568,44,FEMALE,0,76 237 | CC320574,40,MALE,1,88 238 | CC320575,39,FEMALE,0,95 239 | CC320576,45,FEMALE,0,96 240 | CC320608,45,MALE,1,96 241 | CC320616,39,MALE,1,113 242 | CC320621,46,FEMALE,0,143 243 | CC320636,45,MALE,1,84 244 | CC320651,42,MALE,1,139 245 | CC320661,43,FEMALE,0,99 246 | CC320680,41,MALE,1,73 247 | CC320686,40,FEMALE,0,88 248 | CC320687,41,FEMALE,0,122 249 | CC320698,45,FEMALE,0,150 250 | CC320759,44,FEMALE,0,141 251 | CC320776,47,MALE,1,78 252 | CC320814,43,FEMALE,0,121 253 | CC320818,43,MALE,1,81 254 | CC320850,47,FEMALE,0,133 255 | CC320861,47,FEMALE,0,108 256 | CC320870,47,FEMALE,0,107 257 | CC320888,47,FEMALE,0,100 258 | CC320893,47,FEMALE,0,131 259 | CC320904,42,MALE,1,109 260 | CC321000,40,MALE,1,86 261 | CC321025,40,MALE,1,82 262 | CC321053,40,FEMALE,0,78 263 | CC321069,41,FEMALE,0,149 264 | CC321073,43,FEMALE,0,116 265 | CC321087,40,FEMALE,0,137 266 | CC321107,44,MALE,1,113 267 | CC321137,41,FEMALE,0,133 268 | CC321140,48,FEMALE,0,124 269 | CC321154,48,MALE,1,133 270 | CC321174,39,FEMALE,0,146 271 | CC321201,47,FEMALE,0,77 272 | CC321203,40,MALE,1,127 273 | CC321281,46,FEMALE,0,110 274 | CC321291,44,FEMALE,0,135 275 | CC321331,46,MALE,1,89 276 | CC321368,45,MALE,1,148 277 | CC321428,42,FEMALE,0,90 278 | CC321431,48,FEMALE,0,103 279 | CC321464,45,MALE,1,112 280 | CC321504,42,MALE,1,91 281 | CC321506,39,FEMALE,0,72 282 | CC321529,39,FEMALE,0,150 283 | CC321544,44,MALE,1,149 284 | CC321557,44,FEMALE,0,145 285 | CC321585,44,FEMALE,0,148 286 | CC321594,43,MALE,1,82 287 | CC321595,44,FEMALE,0,121 288 | CC321880,46,FEMALE,0,88 289 | CC321899,49,FEMALE,0,89 290 | 
CC321976,42,FEMALE,0,140 291 | CC322186,47,MALE,1,126 292 | CC410015,51,FEMALE,0,103 293 | CC410032,55,MALE,1,139 294 | CC410040,55,MALE,1,148 295 | CC410084,57,FEMALE,0,130 296 | CC410086,57,MALE,1,137 297 | CC410091,57,FEMALE,0,144 298 | CC410094,54,FEMALE,0,74 299 | CC410097,49,FEMALE,0,113 300 | CC410101,56,MALE,1,126 301 | CC410113,55,MALE,1,109 302 | CC410119,57,MALE,1,116 303 | CC410121,52,FEMALE,0,70 304 | CC410129,58,MALE,1,149 305 | CC410169,49,FEMALE,0,99 306 | CC410173,55,MALE,1,128 307 | CC410177,57,MALE,1,102 308 | CC410179,55,MALE,1,123 309 | CC410182,53,FEMALE,0,126 310 | CC410220,52,FEMALE,0,124 311 | CC410222,55,FEMALE,0,150 312 | CC410226,56,MALE,1,77 313 | CC410243,56,FEMALE,0,135 314 | CC410248,54,MALE,1,92 315 | CC410251,53,FEMALE,0,110 316 | CC410284,50,FEMALE,0,101 317 | CC410286,49,FEMALE,0,74 318 | CC410287,58,FEMALE,0,139 319 | CC410289,58,FEMALE,0,76 320 | CC410297,58,FEMALE,0,77 321 | CC410323,51,FEMALE,0,94 322 | CC410325,54,FEMALE,0,88 323 | CC410354,48,MALE,1,146 324 | CC410387,57,MALE,1,78 325 | CC410390,56,MALE,1,80 326 | CC410432,52,MALE,1,110 327 | CC410447,58,FEMALE,0,119 328 | CC412004,54,MALE,1,96 329 | CC412021,54,FEMALE,0,120 330 | CC420004,49,FEMALE,0,79 331 | CC420060,51,MALE,1,147 332 | CC420061,57,MALE,1,102 333 | CC420071,52,FEMALE,0,139 334 | CC420075,52,MALE,1,128 335 | CC420089,48,FEMALE,0,94 336 | CC420091,52,MALE,1,70 337 | CC420094,56,MALE,1,130 338 | CC420100,56,MALE,1,124 339 | CC420137,56,MALE,1,77 340 | CC420143,53,FEMALE,0,122 341 | CC420148,56,FEMALE,0,73 342 | CC420149,52,MALE,1,104 343 | CC420157,58,MALE,1,94 344 | CC420162,52,FEMALE,0,89 345 | CC420167,51,FEMALE,0,112 346 | CC420173,49,FEMALE,0,96 347 | CC420180,56,FEMALE,0,129 348 | CC420182,52,FEMALE,0,118 349 | CC420197,55,FEMALE,0,87 350 | CC420198,58,MALE,1,98 351 | CC420202,51,MALE,1,147 352 | CC420204,53,FEMALE,0,71 353 | CC420217,50,MALE,1,102 354 | CC420222,55,MALE,1,76 355 | CC420226,50,MALE,1,101 356 | CC420229,52,MALE,1,93 357 | 
CC420231,54,MALE,1,72 358 | CC420236,53,MALE,1,85 359 | CC420241,55,MALE,1,98 360 | CC420244,49,MALE,1,94 361 | CC420259,51,FEMALE,0,90 362 | CC420260,50,FEMALE,0,134 363 | CC420261,54,FEMALE,0,74 364 | CC420286,56,MALE,1,75 365 | CC420322,49,MALE,1,129 366 | CC420324,52,FEMALE,0,94 367 | CC420348,57,FEMALE,0,133 368 | CC420356,54,MALE,1,102 369 | CC420364,54,MALE,1,76 370 | CC420383,48,FEMALE,0,120 371 | CC420392,51,FEMALE,0,125 372 | CC420396,53,MALE,1,122 373 | CC420402,49,MALE,1,149 374 | CC420412,52,MALE,1,144 375 | CC420433,51,MALE,1,75 376 | CC420435,53,FEMALE,0,107 377 | CC420454,54,MALE,1,145 378 | CC420462,55,FEMALE,0,103 379 | CC420464,55,FEMALE,0,130 380 | CC420493,51,FEMALE,0,118 381 | CC420566,50,FEMALE,0,114 382 | CC420582,54,MALE,1,135 383 | CC420587,49,FEMALE,0,140 384 | CC420589,52,MALE,1,146 385 | CC420623,51,MALE,1,85 386 | CC420720,50,FEMALE,0,136 387 | CC420729,56,FEMALE,0,119 388 | CC420776,49,FEMALE,0,115 389 | CC420888,50,FEMALE,0,138 390 | CC510015,58,FEMALE,0,91 391 | CC510017,63,FEMALE,0,85 392 | CC510039,60,FEMALE,0,133 393 | CC510043,58,MALE,1,73 394 | CC510050,59,FEMALE,0,70 395 | CC510062,64,MALE,1,71 396 | CC510076,62,MALE,1,73 397 | CC510086,65,MALE,1,128 398 | CC510115,62,FEMALE,0,130 399 | CC510161,58,MALE,1,101 400 | CC510163,64,FEMALE,0,93 401 | CC510179,62,FEMALE,0,77 402 | CC510208,66,FEMALE,0,80 403 | CC510220,66,MALE,1,77 404 | CC510226,67,MALE,1,96 405 | CC510237,66,MALE,1,93 406 | CC510242,60,MALE,1,104 407 | CC510243,60,MALE,1,111 408 | CC510255,62,MALE,1,97 409 | CC510256,63,MALE,1,130 410 | CC510258,60,MALE,1,146 411 | CC510259,60,FEMALE,0,120 412 | CC510284,60,FEMALE,0,129 413 | CC510304,66,FEMALE,0,116 414 | CC510321,64,FEMALE,0,110 415 | CC510323,64,FEMALE,0,109 416 | CC510329,64,MALE,1,107 417 | CC510342,63,FEMALE,0,102 418 | CC510354,62,FEMALE,0,111 419 | CC510355,65,FEMALE,0,123 420 | CC510392,67,MALE,1,120 421 | CC510393,68,MALE,1,107 422 | CC510395,68,MALE,1,92 423 | CC510415,64,FEMALE,0,71 424 | 
CC510433,61,FEMALE,0,142 425 | CC510434,65,FEMALE,0,134 426 | CC510438,62,MALE,1,87 427 | CC510473,65,MALE,1,148 428 | CC510474,66,MALE,1,76 429 | CC510480,68,MALE,1,103 430 | CC510483,60,FEMALE,0,92 431 | CC510486,63,FEMALE,0,134 432 | CC510511,68,FEMALE,0,142 433 | CC510534,66,MALE,1,100 434 | CC510548,62,MALE,1,85 435 | CC510551,61,MALE,1,147 436 | CC510568,60,MALE,1,115 437 | CC510572,63,MALE,1,78 438 | CC510609,59,MALE,1,101 439 | CC510629,59,MALE,1,107 440 | CC510639,62,MALE,1,137 441 | CC510648,61,FEMALE,0,73 442 | CC512003,62,MALE,1,135 443 | CC520002,64,FEMALE,0,134 444 | CC520011,60,FEMALE,0,82 445 | CC520013,67,MALE,1,117 446 | CC520042,58,MALE,1,118 447 | CC520053,64,MALE,1,145 448 | CC520055,61,FEMALE,0,88 449 | CC520065,60,MALE,1,93 450 | CC520078,63,MALE,1,126 451 | CC520083,65,FEMALE,0,137 452 | CC520097,62,FEMALE,0,96 453 | CC520114,64,FEMALE,0,111 454 | CC520122,67,MALE,1,91 455 | CC520127,66,MALE,1,122 456 | CC520134,67,FEMALE,0,146 457 | CC520136,65,FEMALE,0,83 458 | CC520147,61,FEMALE,0,73 459 | CC520162,68,MALE,1,90 460 | CC520168,59,MALE,1,87 461 | CC520175,61,MALE,1,96 462 | CC520197,59,FEMALE,0,129 463 | CC520200,67,FEMALE,0,123 464 | CC520209,66,FEMALE,0,78 465 | CC520211,63,FEMALE,0,140 466 | CC520215,63,FEMALE,0,150 467 | CC520239,65,FEMALE,0,122 468 | CC520247,63,MALE,1,73 469 | CC520253,58,MALE,1,104 470 | CC520254,66,FEMALE,0,104 471 | CC520275,66,MALE,1,123 472 | CC520279,68,FEMALE,0,86 473 | CC520287,59,FEMALE,0,92 474 | CC520377,63,FEMALE,0,93 475 | CC520390,65,FEMALE,0,97 476 | CC520391,64,FEMALE,0,84 477 | CC520395,61,FEMALE,0,106 478 | CC520398,59,FEMALE,0,142 479 | CC520424,63,MALE,1,75 480 | CC520436,63,FEMALE,0,149 481 | CC520477,66,MALE,1,115 482 | CC520480,60,MALE,1,83 483 | CC520503,66,MALE,1,74 484 | CC520517,65,MALE,1,79 485 | CC520552,64,MALE,1,98 486 | CC520560,65,MALE,1,89 487 | CC520562,66,MALE,1,93 488 | CC520584,59,FEMALE,0,137 489 | CC520585,68,FEMALE,0,78 490 | CC520597,64,MALE,1,104 491 | 
CC520607,64,FEMALE,0,126 492 | CC520624,60,FEMALE,0,132 493 | CC520673,67,MALE,1,129 494 | CC520745,63,FEMALE,0,88 495 | CC520775,61,FEMALE,0,137 496 | CC520868,67,MALE,1,107 497 | CC520980,68,MALE,1,70 498 | CC521040,63,FEMALE,0,124 499 | CC610022,68,FEMALE,0,109 500 | CC610028,68,MALE,1,105 501 | CC610039,70,MALE,1,124 502 | CC610040,72,MALE,1,109 503 | CC610046,72,MALE,1,81 504 | CC610050,71,FEMALE,0,145 505 | CC610051,68,MALE,1,142 506 | CC610052,77,MALE,1,89 507 | CC610058,73,MALE,1,98 508 | CC610061,76,FEMALE,0,78 509 | CC610071,69,FEMALE,0,150 510 | CC610076,77,FEMALE,0,120 511 | CC610095,71,FEMALE,0,83 512 | CC610096,73,FEMALE,0,146 513 | CC610099,69,FEMALE,0,114 514 | CC610101,68,MALE,1,136 515 | CC610120,70,MALE,1,94 516 | CC610146,76,FEMALE,0,111 517 | CC610178,72,MALE,1,105 518 | CC610210,75,MALE,1,114 519 | CC610212,77,FEMALE,0,111 520 | CC610227,76,FEMALE,0,128 521 | CC610285,71,MALE,1,132 522 | CC610288,69,MALE,1,73 523 | CC610292,72,FEMALE,0,117 524 | CC610308,69,MALE,1,105 525 | CC610344,71,FEMALE,0,115 526 | CC610372,70,MALE,1,145 527 | CC610392,75,FEMALE,0,117 528 | CC610405,72,FEMALE,0,150 529 | CC610462,76,FEMALE,0,108 530 | CC610469,73,FEMALE,0,122 531 | CC610496,70,MALE,1,124 532 | CC610508,77,FEMALE,0,110 533 | CC610568,76,MALE,1,104 534 | CC610575,71,FEMALE,0,113 535 | CC610576,72,FEMALE,0,104 536 | CC610594,69,FEMALE,0,115 537 | CC610614,74,FEMALE,0,109 538 | CC610625,70,FEMALE,0,73 539 | CC610631,77,FEMALE,0,78 540 | CC610653,71,MALE,1,125 541 | CC610658,78,MALE,1,122 542 | CC610671,70,FEMALE,0,77 543 | CC610697,73,FEMALE,0,92 544 | CC620005,74,FEMALE,0,143 545 | CC620026,70,FEMALE,0,134 546 | CC620044,73,FEMALE,0,83 547 | CC620073,69,MALE,1,85 548 | CC620085,70,FEMALE,0,92 549 | CC620090,71,FEMALE,0,94 550 | CC620106,71,FEMALE,0,131 551 | CC620114,73,MALE,1,143 552 | CC620118,71,FEMALE,0,145 553 | CC620121,68,MALE,1,117 554 | CC620129,75,MALE,1,124 555 | CC620152,73,MALE,1,92 556 | CC620164,72,FEMALE,0,107 557 | CC620193,76,MALE,1,140 
558 | CC620200,76,FEMALE,0,114 559 | CC620259,68,FEMALE,0,103 560 | CC620262,69,FEMALE,0,103 561 | CC620264,76,FEMALE,0,143 562 | CC620279,77,MALE,1,97 563 | CC620284,75,FEMALE,0,121 564 | CC620314,74,FEMALE,0,85 565 | CC620354,78,FEMALE,0,93 566 | CC620359,68,MALE,1,84 567 | CC620405,76,FEMALE,0,146 568 | CC620406,68,MALE,1,142 569 | CC620413,76,MALE,1,92 570 | CC620429,69,MALE,1,82 571 | CC620436,78,MALE,1,86 572 | CC620442,71,FEMALE,0,115 573 | CC620444,77,FEMALE,0,135 574 | CC620451,75,MALE,1,117 575 | CC620454,75,FEMALE,0,144 576 | CC620466,75,MALE,1,94 577 | CC620479,69,MALE,1,73 578 | CC620490,74,FEMALE,0,144 579 | CC620496,70,FEMALE,0,135 580 | CC620499,71,MALE,1,132 581 | CC620515,75,FEMALE,0,122 582 | CC620518,78,MALE,1,113 583 | CC620526,73,FEMALE,0,84 584 | CC620527,69,MALE,1,136 585 | CC620549,73,FEMALE,0,147 586 | CC620557,74,FEMALE,0,148 587 | CC620560,72,MALE,1,139 588 | CC620567,74,MALE,1,85 589 | CC620572,78,FEMALE,0,84 590 | CC620592,74,FEMALE,0,128 591 | CC620610,76,MALE,1,120 592 | CC620619,71,MALE,1,97 593 | CC620659,71,FEMALE,0,143 594 | CC620685,77,FEMALE,0,76 595 | CC620720,78,FEMALE,0,121 596 | CC620785,69,MALE,1,82 597 | CC620793,71,MALE,1,92 598 | CC620821,70,MALE,1,90 599 | CC620885,78,MALE,1,140 600 | CC620919,78,MALE,1,87 601 | CC620935,71,MALE,1,70 602 | CC621011,76,MALE,1,137 603 | CC621080,70,MALE,1,98 604 | CC621118,76,MALE,1,112 605 | CC621128,75,MALE,1,115 606 | CC621184,69,MALE,1,83 607 | CC621199,72,MALE,1,85 608 | CC621248,72,FEMALE,0,148 609 | CC621284,79,MALE,1,142 610 | CC621642,73,MALE,1,102 611 | CC710037,78,FEMALE,0,136 612 | CC710088,83,FEMALE,0,127 613 | CC710099,85,FEMALE,0,97 614 | CC710131,85,MALE,1,90 615 | CC710154,83,MALE,1,113 616 | CC710176,78,FEMALE,0,81 617 | CC710214,79,MALE,1,143 618 | CC710223,79,FEMALE,0,147 619 | CC710313,81,MALE,1,99 620 | CC710342,83,FEMALE,0,131 621 | CC710350,81,MALE,1,99 622 | CC710382,83,MALE,1,121 623 | CC710416,82,FEMALE,0,74 624 | CC710429,80,MALE,1,143 625 | 
CC710446,85,MALE,1,87 626 | CC710462,82,FEMALE,0,140 627 | CC710486,79,FEMALE,0,127 628 | CC710494,80,MALE,1,128 629 | CC710501,80,MALE,1,87 630 | CC710518,83,FEMALE,0,103 631 | CC710548,83,MALE,1,145 632 | CC710551,85,MALE,1,135 633 | CC710566,83,MALE,1,118 634 | CC710591,85,FEMALE,0,111 635 | CC710664,82,FEMALE,0,142 636 | CC710679,84,MALE,1,132 637 | CC710858,79,MALE,1,116 638 | CC710982,79,MALE,1,111 639 | CC711027,80,MALE,1,143 640 | CC711035,88,FEMALE,0,123 641 | CC711128,80,MALE,1,143 642 | CC711141,85,MALE,1,141 643 | CC711158,80,MALE,1,112 644 | CC711244,85,FEMALE,0,85 645 | CC711245,85,MALE,1,116 646 | CC712027,87,MALE,1,74 647 | CC712085,81,MALE,1,113 648 | CC720023,82,FEMALE,0,81 649 | CC720071,82,MALE,1,120 650 | CC720103,80,MALE,1,115 651 | CC720119,79,MALE,1,142 652 | CC720180,78,MALE,1,109 653 | CC720188,78,MALE,1,108 654 | CC720238,80,FEMALE,0,140 655 | CC720290,84,MALE,1,146 656 | CC720304,80,FEMALE,0,112 657 | CC720329,80,MALE,1,141 658 | CC720330,80,MALE,1,108 659 | CC720358,83,FEMALE,0,130 660 | CC720359,81,MALE,1,102 661 | CC720400,86,FEMALE,0,104 662 | CC720407,82,MALE,1,140 663 | CC720497,80,FEMALE,0,93 664 | CC720511,79,MALE,1,118 665 | CC720516,84,FEMALE,0,136 666 | CC720622,81,FEMALE,0,116 667 | CC720646,81,MALE,1,126 668 | CC720655,79,FEMALE,0,148 669 | CC720670,79,FEMALE,0,150 670 | CC720685,81,MALE,1,102 671 | CC720723,84,FEMALE,0,124 672 | CC720774,87,MALE,1,87 673 | CC720941,79,MALE,1,120 674 | CC720986,83,MALE,1,149 675 | CC721052,79,MALE,1,72 676 | CC721107,79,FEMALE,0,76 677 | CC721114,79,FEMALE,0,128 678 | CC721224,87,FEMALE,0,74 679 | CC721291,80,FEMALE,0,96 680 | CC721292,79,MALE,1,134 681 | CC721374,86,MALE,1,132 682 | CC721377,80,MALE,1,130 683 | CC721392,80,MALE,1,137 684 | CC721418,79,FEMALE,0,122 685 | CC721434,84,MALE,1,148 686 | CC721449,80,MALE,1,83 687 | CC721504,82,MALE,1,92 688 | CC721519,79,FEMALE,0,127 689 | CC721585,79,FEMALE,0,107 690 | CC721618,81,FEMALE,0,128 691 | CC721648,80,FEMALE,0,143 692 | 
CC721704,82,FEMALE,0,77 693 | CC721707,80,FEMALE,0,94 694 | CC721729,81,MALE,1,112 695 | CC721888,78,FEMALE,0,115 696 | CC721891,83,MALE,1,104 697 | CC721894,80,FEMALE,0,88 698 | CC721957,83,MALE,1,105 699 | CC722077,82,FEMALE,0,134 700 | CC722216,86,FEMALE,0,73 701 | CC722421,79,FEMALE,0,128 702 | CC722522,84,FEMALE,0,101 703 | CC722536,79,FEMALE,0,150 704 | CC722542,79,MALE,1,116 705 | CC722651,79,FEMALE,0,128 706 | CC722891,84,FEMALE,0,129 707 | CC723197,80,FEMALE,0,96 708 | CC723395,86,FEMALE,0,145 -------------------------------------------------------------------------------- /2020_materials/data/cpc_camcan_demographics_nordan.csv: -------------------------------------------------------------------------------- 1 | paricipants,age,sex_name,sex,IQ_random 2 | NORDAN,65,MALE,1,74 3 | DEM_PATIENT1,66,MALE,1,74 4 | DEM_PATIENT2,65,MALE,1,74 5 | DEM_PATIENT3,67,MALE,1,74 6 | DEM_PATIENT4,65,MALE,1,77 7 | DEM_PATIENT5,68,MALE,1,78 8 | DEM_PATIENT6,63,MALE,1,72 9 | DEM_PATIENT7,65,MALE,1,72 10 | DEM_PATIENT8,70,MALE,1,76 11 | DEM_PATIENT9,62,MALE,1,72 12 | DEM_PATIENT10,61,MALE,1,80 -------------------------------------------------------------------------------- /2020_materials/data/cpc_camcan_features_nordan.csv: -------------------------------------------------------------------------------- 1 | participants,left_Hippocampal_tail,left_subiculum,left_CA1,left_hippocampal-fissure,left_presubiculum,left_parasubiculum,left_molecular_layer_HP,left_GC-ML-DG,left_CA3,left_CA4,left_fimbria,left_HATA,left_Whole_hippocampus,right_Hippocampal_tail,right_subiculum,right_CA1,right_hippocampal-fissure,right_presubiculum,right_parasubiculum,right_molecular_layer_HP,right_GC-ML-DG,right_CA3,right_CA4,right_fimbria,right_HATA,right_Whole_hippocampus 2 | NORDAN,610,330,260,120,280,42,413,240,80,175,46,48,2644,480,328,280,126,226,42,416,203,101,174,22,45,2443 3 | DEM_PATIENT1,406,314,301,107,287,59,392,217,149,147,41,46,2466,311,312,347,103,229,39,312,214,138,222,29,43,2299 4 | 
DEM_PATIENT2,413,312,304,110,278,53,394,220,133,169,42,56,2484,303,311,339,114,207,40,310,214,138,226,37,46,2285 5 | DEM_PATIENT3,412,305,310,108,210,57,382,239,135,171,44,43,2416,306,314,347,104,217,32,314,215,137,220,29,49,2284 6 | DEM_PATIENT4,410,307,315,101,218,54,412,202,123,184,53,52,2431,307,304,347,107,214,39,304,201,139,224,36,31,2253 7 | DEM_PATIENT5,409,306,302,114,257,49,401,223,102,213,46,49,2471,303,308,342,115,217,41,314,206,136,217,28,52,2279 8 | DEM_PATIENT6,409,311,307,108,310,44,395,217,127,110,51,50,2439,307,302,337,102,233,33,311,212,138,245,37,33,2290 9 | DEM_PATIENT7,412,302,302,113,289,50,399,217,106,116,41,46,2393,303,305,344,104,227,44,315,215,135,207,27,55,2281 10 | DEM_PATIENT8,414,315,306,101,292,41,399,201,145,104,50,48,2416,306,301,348,107,227,49,312,206,140,229,24,51,2300 11 | DEM_PATIENT9,411,313,308,102,226,57,398,212,117,152,47,49,2392,306,311,348,102,220,33,313,205,138,249,30,41,2296 12 | DEM_PATIENT10,406,312,308,106,210,51,407,219,120,140,43,58,2380,302,310,333,101,226,47,307,212,136,223,21,43,2261 -------------------------------------------------------------------------------- /2020_materials/tasks/cpc_normative_modeling_instructions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "accelerator": "GPU", 6 | "colab": { 7 | "name": "cpc_normative_modeling_instructions.ipynb", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true, 11 | "include_colab_link": true 12 | }, 13 | "kernelspec": { 14 | "display_name": "Python 3", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.7.4" 29 | } 30 | }, 31 | "cells": [ 32 | { 33 
| "cell_type": "markdown", 34 | "metadata": { 35 | "id": "view-in-github", 36 | "colab_type": "text" 37 | }, 38 | "source": [ 39 | "\"Open" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "colab_type": "text", 46 | "id": "hC8rsih7PHa_" 47 | }, 48 | "source": [ 49 | "# **CPC TUTORIAL ON NORMATIVE MODELING**\n", 50 | "\n", 51 | "\n", 52 | "Created by \n", 53 | "\n", 54 | "Mariam Zabihi [@m_zabihi](https://twitter.com/m_zabihi)\n", 55 | "\n", 56 | "Saige Rutherford [@being_saige](https://twitter.com/being_saige)\n", 57 | "\n", 58 | "Thomas Wolfers [@ThomasWolfers](https://twitter.com/ThomasWolfers)\n", 59 | "_______________________________________________________________________________" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "colab_type": "text", 66 | "id": "irl08XE1AG9v" 67 | }, 68 | "source": [ 69 | "## **Background Story**\n", 70 | "\n", 71 | "Morten and Ingrid are concerned about the health of their father, Nordan. He recently turned 65 years. A few months ago he could not find his way home. Together, they visit a neurologist/psychiatrist to conduct a number of cognitive tests. However, those tests were inconclusive. While Nordan has a relatively low IQ it could not explain his trouble returning home.\n", 72 | "\n", 73 | "Recently, the family heard about a new screening technique called normative modeling with which one can place individuals in reference to a population norm on for instance measures such as brain volume. Nordan would like to undertake this procedure to better know what is going on and to potentially find targets for treatment. Therefore, the family booked an appointment with you, the normative modeling specialist. To find out what is going on you compare Nordan's hyppocampus to the norm and to a group of persons with Dementia disorders, who have a similar IQ, age as well as the same sex as Nordan.\n", 74 | "\n", 75 | "Do your best to get as far as you can. 
However, you do not need to feel bad if you cannot complete everything during the tutorial.\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "colab_type": "text", 82 | "id": "udo6yANOCpvp" 83 | }, 84 | "source": [ 85 | "## **Task 0:** Load data and install the pcntoolkit" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "colab_type": "code", 92 | "id": "yawDkTLoKYRu", 93 | "colab": {} 94 | }, 95 | "source": [ 96 | "#install normative modeling\n", 97 | "!pip install pcntoolkit" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "colab_type": "text", 106 | "id": "EHDKe2ohCxP4" 107 | }, 108 | "source": [ 109 | "**Option 1:** Connect your Google Drive account, and load data from Google Drive. Having Google Drive connected will allow you to save any files created back to your Drive folder. This step will require you to download the csv files from [Github](https://github.com/saigerutherford/CPC_2020/tree/master/data) to your computer, and then make a folder in your Google Drive account and upload the csv files to this folder. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "metadata": { 115 | "colab_type": "code", 116 | "id": "0SMVyxNZqmlv", 117 | "colab": {} 118 | }, 119 | "source": [ 120 | "from google.colab import drive\n", 121 | "drive.mount('/content/drive')\n", 122 | "\n", 123 | "#change dir to data on your google drive\n", 124 | "import os\n", 125 | "os.chdir('drive/My Drive/name-of-folder-where-you-uploaded-csv-files-from-Github/')\n", 126 | "\n", 127 | "# code by T. Wolfers" 128 | ], 129 | "execution_count": null, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "colab_type": "text", 136 | "id": "Bst55nPJDHKb" 137 | }, 138 | "source": [ 139 | "**Option 2:** Import the files directly from Github, and skip adding them to Google Drive." 
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "metadata": { 145 | "colab_type": "code", 146 | "id": "zuuSkJwPDRrv", 147 | "colab": {} 148 | }, 149 | "source": [ 150 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_demographics.csv\n", 151 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_demographics_nordan.csv\n", 152 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_features.csv\n", 153 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_features_nordan.csv\n", 154 | " \n", 155 | "# code by S. Rutherford" 156 | ], 157 | "execution_count": null, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "colab_type": "text", 164 | "id": "kvSiRjysuGkV" 165 | }, 166 | "source": [ 167 | "## **TASK 1:** Format input data" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": { 173 | "colab_type": "text", 174 | "id": "N2Bon1mJAVjJ" 175 | }, 176 | "source": [ 177 | "You have four files. The features and demographics file for the normsample and two files of the same name for Nordan your test sample. As one of your coworkers has done the preporcessing and quality control there are more subjects in the demographics file than in the features file of the norm sample. Please select the overlap of participants between those two files. \n", 178 | "\n", 179 | "\n", 180 | "*Question for your understanding:*\n", 181 | "\n", 182 | "1) Why do we have to select the overlap between participants in terms of featrues and demographics?" 
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "colab_type": "code", 189 | "id": "_RSfxGWku6fU", 190 | "colab": {} 191 | }, 192 | "source": [ 193 | "import pandas as pd\n", 194 | "\n", 195 | "#CODE HERE" 196 | ], 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "colab_type": "text", 204 | "id": "fUufLg4lQWdn" 205 | }, 206 | "source": [ 207 | "## **TASK 2:** Prepare the covariate_normsample and testresponse_normsample file. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "colab_type": "text", 214 | "id": "g1i1qp7AAh1Q" 215 | }, 216 | "source": [ 217 | "As mentioned in the introductory presentation those files need a specific format and the entries need to be seperated by spaces. Use whatever method you know to prepare those files based on the data provided in TASK 1. Save those files in .txt format in your drive. Also get rid of the column names and participant IDs.\n", 218 | "\n", 219 | "Given that we only have limited time in this practical we have to make a selection for the features based on your prior knowledge. With the information in mind that Nordan does not remember his way home, which subfield of the hyppocampus is probably a good target for the investigations?\n", 220 | "Select a maximum of four hyppocampal regions as features.\n", 221 | "\n", 222 | "NOTE: Normative modeling is a screening tool we just make this selection due to time constraints, in reality we build these models on millions of putative biomarkers that are not restricted to brain imaging.\n", 223 | "\n", 224 | "\n", 225 | "*Qestions for your understanding:*\n", 226 | "\n", 227 | "2) What is the requirement for the features in terms of variable properties (e.g. dicotomous or continous)? 3) What is the requirement for the covariates in terms of these properties? 4) What are the requirements for both together? 
5) How does this depend on the algorithm used?" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "colab_type": "code", 234 | "id": "lzt6llxyRPyY", 235 | "colab": {} 236 | }, 237 | "source": [ 238 | "#CODE HERE" 239 | ], 240 | "execution_count": null, 241 | "outputs": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "colab_type": "text", 247 | "id": "irR4FAIvQ8ds" 248 | }, 249 | "source": [ 250 | "## **TASK 3:** Estimate normative model\n" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": { 256 | "colab_type": "text", 257 | "id": "XV61hQUoA1Kd" 258 | }, 259 | "source": [ 260 | "Once you have prepared and saved all the necessary files. Look at the pcntoolkit for running normative modeling. Select an appropriate method set up the toolkit and run your analyses using 2-fold cross validation in the normsample. Change the output suffix from estimate to '_2fold'. \n", 261 | "\n", 262 | "HINT: You primarily need the estimate function. \n", 263 | "\n", 264 | "SUGGESTION: While this process is running you can go to the next TASK 4, you will have no doubt when it is correctly running.\n", 265 | "\n", 266 | "*Question for your understanding:*\n", 267 | "\n", 268 | "6) What does cvfolds mean and why do we use it? 7) What is the output of the estimate function and what does it mean?" 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "colab_type": "code", 275 | "id": "yRTusEg6SRNL", 276 | "colab": {} 277 | }, 278 | "source": [ 279 | "import pcntoolkit as pcn\n", 280 | "\n", 281 | "#CODE HERE" 282 | ], 283 | "execution_count": null, 284 | "outputs": [] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "colab_type": "text", 290 | "id": "Nonuk7d_SNM6" 291 | }, 292 | "source": [ 293 | "## **TASK 4:** Estimate the forward model of the normative model\n" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "colab_type": "text", 300 | "id": "fmn4TD_tBE70" 301 | }, 302 | "source": [ 303 | "In order to visualize the normative trajectories you first need to run the forward model. To this end you need to set up an appropriate covariate_forwardmodel file that covers the age range appropriately for both sexes. Save this file as .txt . Then you can input the files you made in TASK 1 as well as the file you made now and run the forward model using the appropriate specifications.\n", 304 | "\n", 305 | "*Question for your understanding:*\n", 306 | "\n", 307 | "8) What is yhat and ys2? 9) Why does the output of the forward model not include the Z-scores?" 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "metadata": { 313 | "colab_type": "code", 314 | "id": "22U-knkWSPsZ", 315 | "colab": {} 316 | }, 317 | "source": [ 318 | "#CODE HERE" 319 | ], 320 | "execution_count": null, 321 | "outputs": [] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "colab_type": "text", 327 | "id": "wxeZlXshQ7eS" 328 | }, 329 | "source": [ 330 | "## **TASK 5:** Visualize forward model" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "colab_type": "text", 337 | "id": "BVodlChrBg25" 338 | }, 339 | "source": [ 340 | "Visualize the forward model of the normative model similar to the figure below.\n", 341 | "\n", 342 | "![1-s2.0-S245190221830329X-gr2.jpg](data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAcQBxAAD/4QDoRXhpZgAATU0AKgAAAAgABwESAAMAAAABAAEAAAEaAAUAAAABAAAAYgEbAAUAAAABAAAAagEoAAMAAAABAAIAAAExAAIAAAAcAAAAcgEyAAIAAAAUAAAAjodpAAQAAAABAAAAogAAAAAAAABxAAAAAQAAAHEAAAABQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTk6MDE6MTAgMjE6MDA6MDYAAAOQBAACAAAAFAAAAMygAgAEAAAAAQAAAMmgAwAEAAAAAQAAAHIAAAAAMjAxODoxMjoxMSAxNDozNToxNwD/4RGtaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLwA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/PiA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA1LjQuMCI+IDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+IDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiIHhtbG5zOnhtcE1NPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvbW0vIiB4bWxuczpzdFJlZj0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL3NUeXBlL1Jlc291cmNlUmVmIyIgeG1sbnM6c3RFdnQ9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZUV2ZW50IyIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczpwaG90b3Nob3A9Imh0dHA6Ly9ucy5hZG9iZS5jb20vcGhvdG9zaG9wLzEuMC8iIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyIgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDpFNTg4MUU5QkVDMTRFOTExOEI3RkU1RDU4RURGM0Q0RSIgeG1wTU06RG9jdW1lbnRJRD0ieG1wLmRpZDoyRUM1RUY5ODQ5RkRFODExQTNBM0EyODcy
MEY2OTA2NCIgeG1wTU06T3JpZ2luYWxEb2N1bWVudElEPSJ4bXAuZGlkOjJFQzVFRjk4NDlGREU4MTFBM0EzQTI4NzIwRjY5MDY0IiB4bXA6TW9kaWZ5RGF0ZT0iMjAxOS0wMS0xMFQyMTowMDowNiswNTozMCIgeG1wOkNyZWF0b3JUb29sPSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHhtcDpNZXRhZGF0YURhdGU9IjIwMTktMDEtMTBUMjE6MDA6MDYrMDU6MzAiIHhtcDpDcmVhdGVEYXRlPSIyMDE4LTEyLTExVDE0OjM1OjE3KzAxOjAwIiBwaG90b3Nob3A6SUNDUHJvZmlsZT0iRUNJLVJHQi5pY2MiIHBob3Rvc2hvcDpDb2xvck1vZGU9IjMiIGRjOmZvcm1hdD0iaW1hZ2UvanBlZyI+IDx4bXBNTTpEZXJpdmVkRnJvbSBzdFJlZjpvcmlnaW5hbERvY3VtZW50SUQ9InhtcC5kaWQ6MkVDNUVGOTg0OUZERTgxMUEzQTNBMjg3MjBGNjkwNjQiIHN0UmVmOmluc3RhbmNlSUQ9InhtcC5paWQ6RTQ4ODFFOUJFQzE0RTkxMThCN0ZFNUQ1OEVERjNENEUiIHN0UmVmOmRvY3VtZW50SUQ9InhtcC5kaWQ6MkVDNUVGOTg0OUZERTgxMUEzQTNBMjg3MjBGNjkwNjQiLz4gPHhtcE1NOkhpc3Rvcnk+IDxyZGY6U2VxPiA8cmRmOmxpIHN0RXZ0OnNvZnR3YXJlQWdlbnQ9IkFkb2JlIFBob3Rvc2hvcCBDUzYgKFdpbmRvd3MpIiBzdEV2dDp3aGVuPSIyMDE4LTEyLTExVDE0OjM1OjE3KzAxOjAwIiBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjJFQzVFRjk4NDlGREU4MTFBM0EzQTI4NzIwRjY5MDY0IiBzdEV2dDphY3Rpb249ImNyZWF0ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjA6NTE6MDErMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6N0QzQ0MyOERFQTE0RTkxMTlFRTJGRkMzQkZENTkzMUUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjA6NTE6MDErMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6N0UzQ0MyOERFQTE0RTkxMTlFRTJGRkMzQkZENTkzMUUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjE6MDA6MDYrMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6RTQ4ODFFOUJFQzE0RTkxMThCN0ZFNUQ1OEVERjNENEUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDphY3Rpb249ImNvbnZlcnRlZCIgc3RFdnQ6cGFyYW1ldGVycz0iZnJvbSBpbWFnZS90aWZmIHRvIGltYWdlL2pwZWciLz4gPHJkZjpsaSBzdEV2dDphY3Rpb249ImRlcml2ZWQiIHN0RXZ0OnBhcmFtZXRlcnM9ImNvbnZlcnRlZCBmcm9tIGlt
YWdlL3RpZmYgdG8gaW1hZ2UvanBlZyIvPiA8cmRmOmxpIHN0RXZ0OnNvZnR3YXJlQWdlbnQ9IkFkb2JlIFBob3Rvc2hvcCBDUzUgV2luZG93cyIgc3RFdnQ6Y2hhbmdlZD0iLyIgc3RFdnQ6d2hlbj0iMjAxOS0wMS0xMFQyMTowMDowNiswNTozMCIgc3RFdnQ6aW5zdGFuY2VJRD0ieG1wLmlpZDpFNTg4MUU5QkVDMTRFOTExOEI3RkU1RDU4RURGM0Q0RSIgc3RFdnQ6YWN0aW9uPSJzYXZlZCIvPiA8L3JkZjpTZXE+IDwveG1wTU06SGlzdG9yeT4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgPD94cGFja2V0IGVuZD0idyI/PgD/7QBkUGhvdG9zaG9wIDMuMAA4QklNBAQAAAAAACwcAVoAAxslRxwCAAACAAIcAj4ACDIwMTgxMjExHAI/AAsxNDM1MTcrMDEwMDhCSU0EJQAAAAAAEG8VJXtBp2YwhKsNALMYCvf/4gIsSUNDX1BST0ZJTEUAAQEAAAIcQURCRQIQAABtbnRyUkdCIFhZWiAHzwACABYADAADAA9hY3NwQVBQTAAAAABub25lAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUFEQkUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApjcHJ0AAAA/AAAACRkZXNjAAABIAAAAGd3dHB0AAABiAAAABRia3B0AAABnAAAABRyVFJDAAABsAAAAA5nVFJDAAABwAAAAA5iVFJDAAAB0AAAAA5yWFlaAAAB4AAAABRnWFlaAAAB9AAAABRiWFlaAAACCAAAABR0ZXh0AAAAAChjKSAxOTk5IEFkb2JlIFN5c3RlbXMgSW5jLgBkZXNjAAAAAAAAAAxFQ0ktUkdCLmljYwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFhZWiAAAAAAAAD23AABAAAAANM6WFlaIAAAAAAAAAAAAAAAAAAAAABjdXJ2AAAAAAAAAAEBzQAAY3VydgAAAAAAAAABAc0AAGN1cnYAAAAAAAAAAQHNAABYWVogAAAAAAAApngAAFH+AAAAAFhZWiAAAAAAAAAtlAAAmiAA
ABFdWFlaIAAAAAAAACLJAAAT4gAAwdD/wAARCAByAMkDAREAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9sAQwADAgICAgIDAgICAwMDAwQGBAQEBAQIBgYFBgkICgoJCAkJCgwPDAoLDgsJCQ0RDQ4PEBAREAoMEhMSEBMPEBAQ/9sAQwEDAwMEAwQIBAQIEAsJCxAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQ/90ABAAa/9oADAMBAAIRAxEAPwD9KKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgD//0P0ooAKACgAoAOvSgA74/CgAoATI9aAI2ubdDhp4wfQsKAEN1bH/AJbKfpQAC7tu9wg+poAVbm3f7k8bfRwaAJeoyOlAB060AFABQAUAFABQAUAFABQAUAFABQAUAf/R/SigAoAKADvjHPpQBheNNFHinw7e+Hk1e4sJLxAont8tIuGDfdBGQcEEZHB6ik9iZarQs6Fp91pGjWWj/aJbn7HAlubm6O6SXaANxAP16mjoNKyL/kO/+tuHPsuEH6c/rTGRQpYXG4wtFP5b7GO4SFWHY5zg0CumWFVV4RAO2BxQMcBnj1oAzbLxFoWpand6NY6tbT3thj7TAkm54s/3h7dPai6Emm7GXceOtBi8c2/w/mgum1K5tjcq/k5iwBkjd1zheuMZIGeaTeocyvY6A2trnIgRT6qAp/Mc0xgLZBzHJKnuJCf50AL5U6/cuc/76A/yIoAN10vVY3+jFf8AGgBPPK/6y2lX3wGH6c0AKLq3Jx5qqTxtb5T+RoAl6jI70AJmgBaADmgDmvHsvjmPRUb4fQ2cuom4TeLkgAQ87sZ4z936DOMmkyZX6HRQed5KG4CibaDIE+7ux82PbNMrdD6ACgD/0v0ooABzxQBG06r8iZdv7q9PxNAFe7mS1tJr3UJ1htreNpZNucKijJJPXoD0x+NAm7K5R8JeKNA8X6QuteG7jz7RpGj3GMoQy9QQ3I6g/iKW4JqWxtUxh0PTn0oA5nwf4D0DwEupPpM1yV1K5+1Tm5m3heuAOOnJ5OW5GScUlZCUbECfFHwdJ4suPBg1Ei8toBM0hx5B6HYHBOW5HQY5PcUlLUSkr2NseI9GbBW73g9CIXZT+IWqHc8x8HSeC/AnxFvtH0jRNTxqsRuJdavHPlpn5/KBZRhcgck5LADoAahPUzi+WTZ6amteGpphKmraa0wHlhxPGXx6A5zV3RppuZPifTPGup61oN74V8R29lpltcF9ShZcm5jJXhTg54DDHy/ezkkYpMUtdjqc8c9fWmUNWRHd40dWaMgOoOSpPTPpQA6gAHPSgBCFPDDII/OgCP7Jb5ysYT/c+Q/oRmgBPJlX7ly3sHG4f0P60AGb
pesccg9m2n8jmgA+0Kv+shkT/gOcfiKAHpNFLxHKrewPNAD6ACgAwfQ0Af/T/SV5FjALH73RR1NAEfzzErI/ljqUB+bB9T6f5zQBMiqgwi7evSgCO6ktobaaW8aNbeNGaYyY2hAPm3Z4xjOc0MH5nCaV8SfhzBa3WjeBPJvp7G2kuodO060ZRKRyQmFC5LYBOMjIJqbroRzx+yWvAfjDxd410qS+vPCn/CPyRzGIC9MjFhgHKoQhI98gfWhNsak2a+sj+ydHvNc1rVtRuILKBriSK1AiyFBJChMN0HdjVA9DA0Txd8NtS0nR9buYreyfWXaKzTUV3TPIrFSNzZ7/AMWccgUk11C8VqzV0Xxl4P1zTdT8S+FhHqH9nLJFMba32zPsUtsXIBIPYdDnr1waBeO6LXgXxb/wm/hy38Qrpd1p3ns6CG4HzfKcbgeMg+vH0prUcXc3x7cdse31oHYa8Uco2yRq+eMMuf50aBocr43k8P8AhLw7eeJD4TjvXtgp8q0gVZGy2MlgMgDue1JtImTsjEg1PxFe+ItBh0nwxq1vomq2Auri7e+l32rkE7CrsVXHyjBHORjoanUWrZcsfAdp4EvvEHjPS9cv5Jb5XurxLqJbhW25Y7UTY2euBn246U7W1FyqOpk2/jH4i+ONA0jxB8PbfThbyXpjvEvE8tzEp5ADFht9SrE8j0NF2F3LY1LT4h2XjQeI/DFhZ61p95pyvZTXSW3meVI25Q6bSWOCpI47Ci9xqV7oxvhZeeL/AA7qV34R8U6rb3ej2kZNjqN05iuLiQsDt2SMWwMtnIBG0DODSjfqKN07M9QhvbO4lkgt7uGWSLHmIkgZkyMjIHTIq7mhNQAUAHU0AMeGKT/WRhvTcOlADfs4H+qlkT2DZH5NkUAIBdKRkxyj3+U/4fyoArfb73/nwm/OgD//1P0ljhRCXGS54Lnkn+lAHN2HgLTtO8c6h48jv717vULdbeSB5MwooC9B1/hHBJHXFK1tSeVLUj8e+OJPB+m/aNN0K6127WaOKS1tclolbPzOQp2jOMA4ySKG7bDlLlWg3xLrOnaboP8Aa/jzVF0vTZwsTWse4kmQcxuwG9uM5ChRgHOaL9xN2Wpn+GfhJ4L0DxPb+NfC3nWyG18qG2hlzAysv3+cscqcgFiOhFJR1ugjBbo7ODU9NuL2bTbfULWS7tQDPAkwMkYYZBZRyAR3IqrjumYN54V1258eWfimPxVcxaVBbNBLpQX5JWIbk84zyCTjOVFKzE03K/Q43WPDeva58cdOfVNEkufDGl2fnWrtABbwy7Ccg9AxkC8Hngcbealp3Ia9/XY7qO78B+DFe1jv9D0RbiRpmjM8Vv5jnGWwSOTge4xVaI091GBqHx3+FlvBcHS/GGn6zdxI7Ja6Y5u3eQA4X9wr4yeOelF+wr9jh/CXxT8Wf8JDfeJ/HVn4os9LktwkWl2nhfUruOMjbiQSRwFccMc7j1wQMVMXrqQnJO7Op1b456eumi68M+CfGGqSuyYSbw7qFlCUJ+95s0IQ9eADkk57VTfYpy00NGD43eB/lXUl1/SpSuSmoeHdQgAP1aAKw9xRe473RzmoftCeHIPH+maJDr2iW2hTQM13d6jObSRJMHAUTbOny4ABzk9MUr6ibfMdWPjB4Fn8Tab4X03WoNTn1RN0c9hKlxbp1ADOrEAnaenpzT5lsNyWx2caRRr5cSKiDI2qAAPwHSnsUvI8w8Ya74j1H4paF4L8LXlzpQi/4mGoz/Z90N1GAPl3DqAoZecDLAZ4qXe+hm9ZaHY2XjXwzrfiXUvBdvcNNqWnRiS5heI7CMgEBjwcblz7mnctNN2KWk/DDwnouu6n4i06K8hu9WbdJ5V08aL3OzYVxk9ck+2BxRy2FypGjDte8n07S/FgluLTBmtZmjnMWem4DEgz7tSBGX4p8fL4Hn0228RLbTNqk5t7c2+9HZsgElCGUD5hnLj71O9tBOVnY3hrkMPF9Y39p6mS
Auo9y8e5cfUjHegq5atNRsL8FrG9gnA4PlSBsexx0phdFigYUALtb+6fyoA//9X9Iby9trKMSXMgBc7UCgs7t6Ko5P4UBcqCLUtS5uHaxtu0KMPOYf7Tg/J/uqc/7XagWrLtta29lCtvaQrDGuQFUfn9T65/WiwWK2saHo/iGybTda0+G8tmYP5Uq7lBHQ/Uf1NFrjdnozg9U+K9t4N8cXPhzxi+jaJoMdojadNJdL59w2FwscP3m6t8qpxjnqKlPUhN3scbqFt4m1/xoPGXwV8K6vZPcsz6jqGtRtYWl4u5CEVJv3+0lfmYQ4PykdKVtbktNvQPiH4p+PFhrGm+GLPWNLttV1hS0Nto+meYEXkc3VyxViCMnEKgDBJINOTYSlJaGZqfw/j1f4teHfCWtavr2tGGzF5q66zezXNrK23cwjQFYgMZXIjUZ7k4ATvzBd8x6JqPwl+B+oeGJhp/h7w1pFgzAnUtMgtoGUhu8wX14O7PXFN2ZckmrnR634f1eLwQNA+H+qR6dd29vFBZ3Ep3AIuAckA8lRjODyc+9O1loHLp7pxVj8MfH2v+JdRn+Inidp9Mm0wWASxmMa3BKgFinRcMCxOBubHGMgyotkRhJ7s9J8MeHrDwnoNn4f01pmtrKMxo0z7nbJzyfxJ7D6dKvyNErIu/2hZfbjp326D7WF84wCUeaE7NtPOM9+lA7q5zlvofjJ/HOoahqWs2l14ZmtVjt9OaIFlkwuScjpkMc5OcgYGAalLUnladx+q/DD4b62HXVvAXh673kEtLpsRbPY525BHqD3qrD5UzzLV/AGn+Adf03TNE1Dx0sfiC82rJpWoGKKxThVDjDCXaG6v0VTz6xZJ6EWUWds3gr4maSfM8P/F2e828GDxDpFvdIQO2+3Fu47ckt+NWXY4zx/dfGrRLCTXIPDvhPSp45Ee/1fTdTQPdoAAkbrcxRkDOODI+eMGpehEubc66D42+HLeGObxXoniLw2kiiT7Tf6Y8lqQeQ32q3MsODkEEuOO9O5SlbcveDrb4X3uv6j418Fa3peoXurjbPNZaglwp5ydoViFOQMigEknc6y80vS9RaB9Q061uTbOJoPOhV/KccBl3AkHHTGPzFFirIs9ucc5z/WmBVu9L0y+IkvLCCZx913jBYfRuopWCxW/sTyebDVb+3x1Uy+cv0xIGIH0IpisLnxFb8EWN+OvG63b8vmUn3yv4UAZP/CXy/wDQla7/AN+j/hQM/9b9D9R03ULbSb+50N459be2f7PcXhyPMIOAccKucfKoxQxNaaGP4G8Wvc6VJpni7XNLfxFpKM+qpDIoWAbjgt/CMAgNg4BJ5qU+4ovuZr/F2LX5ZLH4WeG7vxg8bGJ7+GVbbSonHUNdvkSEdxCshHene4XvsVdX0HxzfaXda38SfiG+k6VaxNPPpnhWNoPkAyQ13IDO/AIzGITn86Qbak/whm+EV5Ddy/DXSbe3uk2G9eSFvtjs2SDLM5Lyng/MWP8ASmmmCkmaHh/xX4ok8S6/p3iWwtFsLacLpn2ItNNImTy6oSV42n5guCSKL6gpO50bXerXOGtdEWLHR7yZUPPcLGH/ACJU0DZyfxDuviFaWdnDoekwasmoT/Zb6O2hKNFAwwSWZjwem7GBjNJ36EzcrEU3wg8DaT4KvPDt9faiuiK7385kucbWVQS+QvZV6Ht70cugnCKVjlfD+gP4h8f+H/Efgq41CfwZaac1mZRqUqGJkV18oqzBx0Q4HGADmptd3QkryViXxhp3iv4dWGveIdI8V3GuStdRXL2NzfSKbG0JbJwJMuSSi54GB0PNN6bCd4rc6K/vPFviTwNp/iT4U6xN9rvGR9t84YCPlXA8wHBDD8lOM8U9Wrot3cbo0bX4bCDxWnj06sX11rRbeVpYg0X3QpwqFPTA7YA4zT5eoctnc6T/AIqOPqumXPuWkh/TD0ytSjrXiweGNLm1nxFpc9tZ2yhpZ4pY5ETJwBgkNySPur3F
K9gvYpXvi7w/r/hG5vrLXbnTLW9tJDDqLW8sSxAqf3gcgDI9iOlK90Ju6F+GDWcfg+wtLPxePEflhx9tLYZxuY4IzuXHI5ORjmmmEdih8Y/BGu+PPCw0bQdRht5I5vPkimXAnCjhd2CVOce3rRNXWgpxclobXgCw1zS/Bul6b4khtIb61gEMkdsf3SKpIQDHH3MA44z0oW2o4rSzIdc+F/w38SzNceIPAXh/ULiTrNcadE0h99+NxPv1p2Ksjgdf0XwP4E1+08PaF4+8XeEru8ge5jW1v3vLKONQfvRXYljQfIfuqvA9MGk3YiVoml4au/idrGlpr3g/4meH/FFmztH5eraI9m7MvUGW3cBT7mE+uO9Fxp82xqjx18RdM/d+IPg5qNwy8GbQdVtbyI+4E7W8n4FPxNFx3AfG/wAC2/7vxB/bfh64GR5Gr6PdQHPoH2GNz6bXINFxOSRs+HviR4D8VOkGheLdLurp13m0FyguVH+1ETvX8RRdDUlLY6XB9DTGf//X+4pfFPxI8axm28GeEm8N2EwIOs+I0KyhD/FDYxsJSx9ZWix3DDilqJ6lPw7+zl8NtIKT65preIrtDu36p+8iLZBz5GRGcEA5ZSQehFCjYlQS1On8F+AtE+Hg1abT9QuWh1Cf7U4uZAEgHPCgAADBPJyeOTximlYajysg8afETSvDmhXmpvaG9toUXzdwGGV22AhScupJ6gbP9qpbsEpJEnhPwv4Zn02PXLCyjjh1iKK5eKC3FrG4K5UNGoycA9HLAHmmrdASW6OqgggtolhtoY4ol4VUTao+mOlMdh0kiRDMjqn+8cUDIzdIfuCST02ocfnQBzms3/jA+J7DTbPQLWfw/dQyC/mnPzg88HDbQOnBBzk8CkyWaui2On6dp0Vp4esdPtrEZMa2rjy+TyQVHPPU0LyGrLYx4fh7okXi3UPGhsxJf6nb/ZZ45Zt0DIQoI2Fe4UA549utK2txcqvc6K3iazgS3tbGCKGNQqRxHaqjjgKFAH/6/WqWg7EhmkA+a1k+oKn+ZoGH2qPo6yr2IMZH8qAIb2DStYtZdN1CK2u7e5XZJBIAyyDvkenSgN9xDo2kvpLaH9gg/s9oDbm3CYjMR4K4GABzS02DRqxnweC/DmnaKNE0rRrO2hhjZLcNCspic5O75s5IY56/jRZWJcVsjnfA/h3UfDNhDoPi3xybjW7iaWW3EV62WiGOFjk4bHJzs43Y6AUlpuJK27Ot+y65b82+rxTL6XVuCx/4FGVH/jpp6laiG91qAEXGipMD3s7lST9RJsx+Zphc5DxE+t33jnRb9WsbbQreKWO+XUbAiUl9wYLIUKgEbRhWA4Od3SpauS1eVzstCsdC0/TktPDlvZQWKklI7QJ5YJOSfl4zzyfami9FsaFMCO6t7e8t5bS7gSaCdCkkcgyrqeCCD2OcUMLLqYGo/Dj4f6tpkei6p4K0S8soRiOG4sInWP3UFSVPuKEkhJJbEH/Cqvh3/wBCfZfrQM//0P0oGc8sfXrnB/GgDkPGWp+KNS0eew+Gc9u+rRSokkz7TFCo5K5Py7/YA4yc4yDUu72Jk7rQ2tP0iZ4Le48Qype3qqrOAMQRy45MceMDkcMctz1FNaDXmZrfD3w7/wAJnP47m8+S9nthayRySgwFAAN23HXaMY6d8d6Vri5Ve5k6H4d17wJofiCfTdWvvE15dStd2sVyvyRnBwgyw3HnnGAcAADrRsKzjdnSeG7nW9W0Ozv/ABDZtp19NHuntlIyhycZOSRkAHAPGQO2aaKjtqZvhrwpr+j+Kdc1nU/E76hY6g4NnaMn/HsM9Mk+mBx1xk80JO4opp3bOrxj/CmUV9QsYNTsLnTbrd5N3C8Em1ip2spU4I5BwetAMyvBng/SvAugxeHtGluZLaJ2kDXD73ZmPJJAAH4DFJKwkrG7TGM82PzDD5i+YBuKZ5A9cUXAfnHPpQBy7w+Ph8QFlS5sP+ESFrtaIj9952OO2Qc4
77dvbNTrcnVyKWtfFnwLoXiK78L69qBtLi0gWaSWaA+ScgEKuASWwQcBcdcHijmXUTmk7MzT8W/Dt346sfBOg6Rd3/2oK015bHbFErIHVwB95cEEnIAHTPSi+ugufU6HXPB15q3iHR9cg8R3lomlMWa2UsRNnscOox67g2R0xTa1Kcbu4up+BtG1TxPp/i++s0n1LS0KW8gd0CjJPK5IbBJIPHP4UWG43dzoPPcf623kH+0MMP05pjAXMGeZdh9Gyp/I0Ac7B4906fx7P8P10+9W7t7Rbk3DR5hxgHAPXocZ6buKXNfQnmu7G1PomkXUhmm02ATf89VTbIPo4+b9aLDsZevadr9jo15P4Rvrh9SjiZrW3upRJFJIOgJky3I/2gPpQ9hO9tCLw1dePDoVnP4m0zTTqDJm4igmMZQ5IH95SSoGfmA5x0FJX6gnK2pqf25HCMX+n31pz/HCXX8WjLKB9SKY7jf+El8O/wDQbsv+/i/40xn/0f0G8QR67caFfXaWzSzxW8j22mwS7TK+07Q8nUknHyjA9d1J7CexyXwI8CX/AIV8Pz6zr63kWq6zIZZ4LjjylVm2/L2Yg5Pfn2pRT3IpppanpRmLkrbqJCOrZwoqjQQQbjvnbzD16YA/D/HP4UAS4GMDj6UAL+FAHK6dqHj1/GurWOo6JaJ4egt1awuUceZLJ8o2nnv8/UDGB1BpXZMea4/wBrvirxBpM954t8NHRLuO6eOODJO+MAYbnnqSPQ4yOKEEW3udPTKCgAoA5m38A6Rb+Orjx/HdXpvrm2Fs0Rl/dBQAMgf8BHHTqcUrWdyeVXudNTKOX8EeBU8GNqrLrl9qX9qXZuj9qOfKzngepOeT3wKSQkrDfFfwy8H+NdSstV8Q6cZ57NSo2OUEik52vj7wzyPqaTjcTimdDZaZpunbfsFhb25WNIQY4gpEa8KuQOgxgDOKpaDSSMCx8V6vc+O73wlJ4UvIdOtLZZotTb/VSMQpKj5QO5HBLZU5GDST1EndnUUyg/lQAm0HggE/SgDD8V6/4f8ABWjTeI9Y82C2hKIWgQ7yWbAXCnnnnmjRCbSL2l3kWq6Zaarpl2z215ClxD5kfVHGRwMEcHnmgFrqWd9wn3oFYeqv/Q4/nQMpa9JqUujXsOj3K2WoPA62s1xGSkcmDtJwCMA49aGJ6rQr+DbfxHZ+GrG38WajBf6rGjLPcQ42tySv1+XAPHXmktNwSsrM0/sOm/8APlD/AN8imPlR/9L9JXlEeF5Lnoq9TQAwxvL80547RqePxPWgCbgDGBhenoKADuR3HX2oAKAOX8eSeP4rOx/4V/BYy3DXSC7F0QFEOOSM9Rnr39KTZMr9DqPf26UygoAKACgAoAKACgAoAKACgBOep54AJxzQAtAHL+ONE8Z6ydL/AOEQ8Sx6T9muhLeb13edH6Dg5xzwcA55IwKTTJkm7WOoPXpjPUfn+FMohu7W1v7d7W9tYriCTho5UDqw91NANXJEjSNVijRURFCqqjAAHReOABxgCjYVhtwbgW8ptBGZ9jeUJM7d+OM47ZxSew/Q5X4ax/ERNLupPiNNbteyXLPbpDszHF6EoMHnOBycdTSV+pMOa2p1TW8LncEAY/xKdp/MVRRH9lh/uT/9/wA0Af/T/SaOJIx8o+bue5oAdQADrQBy/g7wHa+Dr/Wr+21a+vDrNz9pZLh8rEcn7vvyRn0CiklbUlRS1OoplBQAUAFABQAUAFABQAUAFABQBy/h7SfG1n4q1y+17xBb3mjXTA6baohDwj34GOODydx546UknclJp3Z1FMoKACgAoAKACgAoAKAOY1zx9p2geLNG8JXFhey3GtZ8qaKLMaYOPmOfUHOOg5NJslys7WOnx70yj//U/SigAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKAEKhiGKgsDwTyR680BYXBoA/9X9KKACgAoAKACgAoAKACgAoAKACgAoAKAC
gAoAKACgAoAKACgAoAKACgAoABQA7A9BQB//2Q==)\n", 343 | "\n", 344 | "HINT: First create a function that calculates the confidence intervals and then plot yhat, y2 of the forward model. Finally, plot the data of individual participants." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "colab_type": "code", 351 | "id": "ii0H9GDwv-ha", 352 | "colab": {} 353 | }, 354 | "source": [ 355 | "import numpy as np\n", 356 | "import matplotlib.pyplot as plt\n", 357 | "\n", 358 | "#CODE HERE" 359 | ], 360 | "execution_count": null, 361 | "outputs": [] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "colab_type": "text", 367 | "id": "yM4z1BtyWwiF" 368 | }, 369 | "source": [ 370 | "## **TASK 6:** Apply the normative model to Nordan's data and the dementia patients." 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "metadata": { 376 | "colab_type": "code", 377 | "id": "eVTYxKjvWBvm", 378 | "colab": {} 379 | }, 380 | "source": [ 381 | "#CODE HERE" 382 | ], 383 | "execution_count": null, 384 | "outputs": [] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": { 389 | "colab_type": "text", 390 | "id": "LFnHCy0XVVwl" 391 | }, 392 | "source": [ 393 | "## **TASK 7:** In which hippocampal subfield(s) does Nordan deviate extremely? \n" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "colab_type": "text", 400 | "id": "jUhmPAOZB0kp" 401 | }, 402 | "source": [ 403 | "No coding necessary just create a presentation which includes recommendations to Nordan and his family. \n", 404 | "Use i) |Z| > 3.6 ii) |Z| > 1.96 as definitions for extreme normative deviations." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": { 410 | "colab_type": "text", 411 | "id": "AqQhxN9pEFGC" 412 | }, 413 | "source": [ 414 | "## **TASK 8 (OPTIONAL):** Implement a function that calculates percentage change. 
" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "colab_type": "text", 421 | "id": "weASKkZNBMW5" 422 | }, 423 | "source": [ 424 | "Percentage change = $\\frac{x1 - x2}{|x2|}*100$" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "metadata": { 430 | "colab_type": "code", 431 | "id": "0vIt9fd7EmJx", 432 | "colab": {} 433 | }, 434 | "source": [ 435 | "#CODE HERE" 436 | ], 437 | "execution_count": null, 438 | "outputs": [] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": { 443 | "colab_type": "text", 444 | "id": "1Mypo4xrT7ID" 445 | }, 446 | "source": [ 447 | "## **TASK 9 (OPTIONAL):** Visualize percent change\n", 448 | "\n", 449 | "\n", 450 | "\n" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "colab_type": "text", 457 | "id": "1I1Kwv5iBUJj" 458 | }, 459 | "source": [ 460 | "Plot the percentage change in Yhat of the forward model in reference to age 20. Do that for both sexes separately." 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "metadata": { 466 | "colab_type": "code", 467 | "id": "1DoJid7R1DBX", 468 | "scrolled": true, 469 | "colab": {} 470 | }, 471 | "source": [ 472 | "#CODE HERE" 473 | ], 474 | "execution_count": null, 475 | "outputs": [] 476 | } 477 | ] 478 | } -------------------------------------------------------------------------------- /2020_materials/tasks_key/key_cpc_machinelearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "accelerator": "GPU", 6 | "colab": { 7 | "name": "key_cpc_machinelearning.ipynb", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 
| "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.7.4" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "colab_type": "text", 35 | "id": "hC8rsih7PHa_" 36 | }, 37 | "source": [ 38 | "# **CPC TUTORIAL ON NORMATIVE MODELING**\n", 39 | "\n", 40 | "\n", 41 | "Created by \n", 42 | "\n", 43 | "Mariam Zabihi [@m_zabihi](https://twitter.com/m_zabihi)\n", 44 | "\n", 45 | "Saige Rutherford [@being_saige](https://twitter.com/being_saige)\n", 46 | "\n", 47 | "Thomas Wolfers [@ThomasWolfers](https://twitter.com/ThomasWolfers)\n", 48 | "_______________________________________________________________________________" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "colab_type": "text", 55 | "id": "irl08XE1AG9v" 56 | }, 57 | "source": [ 58 | "## **Background Story**\n", 59 | "\n", 60 | "Morten and Ingrid are concerned about the health of their father, Nordan. He recently turned 65 years. A few months ago he could not find his way home. Together, they visit a neurologist/psychiatrist to conduct a number of cognitive tests. However, those tests were inconclusive. While Nordan has a relatively low IQ it could not explain his trouble returning home.\n", 61 | "\n", 62 | "Recently, the family heard about a new screening technique called normative modeling with which one can place individuals in reference to a population norm on for instance measures such as brain volume. Nordan would like to undertake this procedure to better know what is going on and to potentially find targets for treatment. Therefore, the family booked an appointment with you, the normative modeling specialist. To find out what is going on you compare Nordan's hyppocampus to the norm and to a group of persons with Dementia disorders, who have a similar IQ, age as well as the same sex as Nordan.\n", 63 | "\n", 64 | "Do your best to get as far as you can. 
However, you do not need to feel bad if you cannot complete everything during the tutorial.\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "colab_type": "text", 71 | "id": "udo6yANOCpvp" 72 | }, 73 | "source": [ 74 | "## **Task 0:** Load data and install the pcntoolkit" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "colab_type": "code", 81 | "id": "yawDkTLoKYRu", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "#install normative modeling\n", 86 | "!pip install pcntoolkit" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "colab_type": "text", 95 | "id": "EHDKe2ohCxP4" 96 | }, 97 | "source": [ 98 | "**Option 1:** Connect your Google Drive account, and load data from Google Drive. Having Google Drive connected will allow you to save any files created back to your Drive folder. This step will require you to download the csv files from [Github](https://github.com/saigerutherford/CPC_2020/tree/master/data) to your computer, and then make a folder in your Google Drive account and upload the csv files to this folder. " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "metadata": { 104 | "colab_type": "code", 105 | "id": "0SMVyxNZqmlv", 106 | "colab": {} 107 | }, 108 | "source": [ 109 | "from google.colab import drive\n", 110 | "drive.mount('/content/drive')\n", 111 | "\n", 112 | "#change dir to data on your google drive\n", 113 | "import os\n", 114 | "os.chdir('drive/My Drive/name-of-folder-where-you-uploaded-csv-files-from-Github/') #Change this path to match the path to your data in Google Drive\n", 115 | "\n", 116 | "# code by T. 
Wolfers" 117 | ], 118 | "execution_count": null, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "colab_type": "text", 125 | "id": "Bst55nPJDHKb" 126 | }, 127 | "source": [ 128 | "**Option 2:** Import the files directly from Github, and skip adding them to Google Drive." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "metadata": { 134 | "colab_type": "code", 135 | "id": "zuuSkJwPDRrv", 136 | "colab": {} 137 | }, 138 | "source": [ 139 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_demographics.csv\n", 140 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_demographics_nordan.csv\n", 141 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_features.csv\n", 142 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_2020/master/data/cpc_camcan_features_nordan.csv\n", 143 | " \n", 144 | "# code by S. Rutherford" 145 | ], 146 | "execution_count": null, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "colab_type": "text", 153 | "id": "kvSiRjysuGkV" 154 | }, 155 | "source": [ 156 | "## **TASK 1:** Format input data" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "colab_type": "text", 163 | "id": "N2Bon1mJAVjJ" 164 | }, 165 | "source": [ 166 | "You have four files. The features and demographics file for the normsample and two files of the same name for Nordan your test sample. As one of your coworkers has done the preporcessing and quality control there are more subjects in the demographics file than in the features file of the norm sample. Please select the overlap of participants between those two files. 
\n", 167 | "\n", 168 | "\n", 169 | "*Question for your understanding:*\n", 170 | "\n", 171 | "1) Why do we have to select the overlap between participants in terms of featrues and demographics?" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "colab_type": "code", 178 | "id": "_RSfxGWku6fU", 179 | "colab": {} 180 | }, 181 | "source": [ 182 | "import pandas as pd\n", 183 | "\n", 184 | "# read in the files.\n", 185 | "norm_demographics = pd.read_csv('cpc_camcan_demographics.csv',\n", 186 | " sep= \",\",\n", 187 | " index_col = 0)\n", 188 | "norm_features = pd.read_csv('cpc_camcan_features.csv',\n", 189 | " sep=\",\",\n", 190 | " index_col = 0)\n", 191 | "\n", 192 | "# check columns through print [there are other better options]\n", 193 | "print(norm_demographics)\n", 194 | "print(norm_features)\n", 195 | "\n", 196 | "# find overlap in terms of participants between norm_sample_features and \n", 197 | "# norm_sample_demographics\n", 198 | "\n", 199 | "norm_demographics_features = pd.concat([norm_demographics, norm_features],\n", 200 | " axis = 1,\n", 201 | " join = 'inner') # inner checks overlap\n", 202 | " # outer combines\n", 203 | "print(norm_demographics_features)\n", 204 | "\n", 205 | "# code by T. Wolfers" 206 | ], 207 | "execution_count": null, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "colab_type": "text", 214 | "id": "fUufLg4lQWdn" 215 | }, 216 | "source": [ 217 | "## **TASK 2:** Prepare the covariate_normsample and testresponse_normsample file. " 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "colab_type": "text", 224 | "id": "g1i1qp7AAh1Q" 225 | }, 226 | "source": [ 227 | "As mentioned in the introductory presentation those files need a specific format and the entries need to be seperated by spaces. Use whatever method you know to prepare those files based on the data provided in TASK 1. Save those files in .txt format in your drive. 
Also get rid of the column names and participant IDs.\n", 228 | "\n", 229 | "Given that we only have limited time in this practical, we have to make a selection for the features based on your prior knowledge. With the information in mind that Nordan does not remember his way home, which subfield of the hippocampus is probably a good target for the investigations?\n", 230 | "Select a maximum of four hippocampal regions as features.\n", 231 | "\n", 232 | "NOTE: Normative modeling is a screening tool; we just make this selection due to time constraints. In reality we build these models on millions of putative biomarkers that are not restricted to brain imaging.\n", 233 | "\n", 234 | "\n", 235 | "*Questions for your understanding:*\n", 236 | "\n", 237 | "2) What is the requirement for the features in terms of variable properties (e.g. dichotomous or continuous)? 3) What is the requirement for the covariates in terms of these properties? 4) What are the requirements for both together? 5) How does this depend on the algorithm used?" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "metadata": { 243 | "colab_type": "code", 244 | "id": "lzt6llxyRPyY", 245 | "colab": {} 246 | }, 247 | "source": [ 248 | "# prepare covariate_normsample for sex and age\n", 249 | "covariate_normsample = norm_demographics_features[['sex',\n", 250 | " 'age']] \n", 251 | "\n", 252 | "covariate_normsample.to_csv('covariate_normsample.txt',\n", 253 | " sep = ' ',\n", 254 | " header = False, \n", 255 | " index = False)\n", 256 | "\n", 257 | "# prepare features_normsample for relevant hippocampal subfields\n", 258 | "features_normsample = norm_demographics_features[['left_CA1', \n", 259 | " 'left_CA3',\n", 260 | " 'right_CA1',\n", 261 | " 'right_CA3']]\n", 262 | "\n", 263 | "features_normsample.to_csv('features_normsample.txt', \n", 264 | " sep = ' ', \n", 265 | " header = False, \n", 266 | " index = False)\n", 267 | "\n", 268 | "# code by T. 
Wolfers" 269 | ], 270 | "execution_count": null, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "colab_type": "text", 277 | "id": "irR4FAIvQ8ds" 278 | }, 279 | "source": [ 280 | "## **TASK 3:** Estimate normative model\n" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "colab_type": "text", 287 | "id": "XV61hQUoA1Kd" 288 | }, 289 | "source": [ 290 | "Once you have prepared and saved all the necessary files. Look at the pcntoolkit for running normative modeling. Select an appropritate method set up the toolkit and run your analyses using 2-fold cross validation in the normsample. Change the output suffix from estimate to '_2fold'. \n", 291 | "\n", 292 | "HINT: You primarily need the estimate function. \n", 293 | "\n", 294 | "SUGGESTION: While this process is running you can go to the next TASK 4, you will have no doubt when it is correctly running.\n", 295 | "\n", 296 | "*Question for your understaning:*\n", 297 | "\n", 298 | "6) What does cvfolds mean and why do we use it? 7) What is the output of the estimate function and what does it mean?" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "metadata": { 304 | "colab_type": "code", 305 | "id": "yRTusEg6SRNL", 306 | "colab": {} 307 | }, 308 | "source": [ 309 | "import pcntoolkit as pcn\n", 310 | "\n", 311 | "# run normative modeling using 2-fold cross-validation\n", 312 | "\n", 313 | "pcn.normative.estimate(covfile = 'covariate_normsample.txt', \n", 314 | " respfile = 'features_normsample.txt',\n", 315 | " cvfolds = 2,\n", 316 | " alg = 'gpr',\n", 317 | " outputsuffix = '_2fold')\n", 318 | "\n", 319 | "# code by T. 
Wolfers" 320 | ], 321 | "execution_count": null, 322 | "outputs": [] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": { 327 | "colab_type": "text", 328 | "id": "Nonuk7d_SNM6" 329 | }, 330 | "source": [ 331 | "## **TASK 4:** Estimate the forward model of the normative model\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "colab_type": "text", 338 | "id": "fmn4TD_tBE70" 339 | }, 340 | "source": [ 341 | "In order to visualize the normative trajectories you first need to run the forward model. To this end you need to set up an appropriate covariate_forwardmodel file that covers the age range appropriately for both sexes. Save this file as .txt. Then you can input the files you made in TASK 1 as well as the file you made now and run the forward model using the appropriate specifications.\n", 342 | "\n", 343 | "*Question for your understanding:*\n", 344 | "\n", 345 | "8) What is yhat and ys2? 9) Why does the output of the forward model not include the Z-scores?" 
346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "metadata": { 351 | "colab_type": "code", 352 | "id": "22U-knkWSPsZ", 353 | "colab": {} 354 | }, 355 | "source": [ 356 | "# create covariate_forwardmodel.txt file\n", 357 | "covariate_forwardmodel = {'sex': [0, 0, 0, 0, 0, 0, 0,\n", 358 | " 1, 1, 1, 1, 1, 1, 1],\n", 359 | " 'age': [20, 30, 40, 50, 60, 70, 80,\n", 360 | " 20, 30, 40, 50, 60, 70, 80]}\n", 361 | "covariate_forwardmodel = pd.DataFrame(data=covariate_forwardmodel)\n", 362 | "\n", 363 | "covariate_forwardmodel.to_csv('covariate_forwardmodel.txt', \n", 364 | " sep = ' ', \n", 365 | " header = False, \n", 366 | " index = False)\n", 367 | "\n", 368 | "# estimate forward model\n", 369 | "pcn.normative.estimate(covfile = 'covariate_normsample.txt', \n", 370 | " respfile = 'features_normsample.txt',\n", 371 | " testcov = 'covariate_forwardmodel.txt',\n", 372 | " cvfolds = None,\n", 373 | " alg = 'gpr',\n", 374 | " outputsuffix = '_forward')\n", 375 | "\n", 376 | "# code by T. 
Wolfers" 377 | ], 378 | "execution_count": null, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "colab_type": "text", 385 | "id": "wxeZlXshQ7eS" 386 | }, 387 | "source": [ 388 | "## **TASK 5:** Visualize forward model" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "colab_type": "text", 395 | "id": "BVodlChrBg25" 396 | }, 397 | "source": [ 398 | "Visualize the forward model of the normative model similar to the figure below.\n", 399 | "\n", 400 | "![1-s2.0-S245190221830329X-gr2.jpg](data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAcQBxAAD/4QDoRXhpZgAATU0AKgAAAAgABwESAAMAAAABAAEAAAEaAAUAAAABAAAAYgEbAAUAAAABAAAAagEoAAMAAAABAAIAAAExAAIAAAAcAAAAcgEyAAIAAAAUAAAAjodpAAQAAAABAAAAogAAAAAAAABxAAAAAQAAAHEAAAABQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTk6MDE6MTAgMjE6MDA6MDYAAAOQBAACAAAAFAAAAMygAgAEAAAAAQAAAMmgAwAEAAAAAQAAAHIAAAAAMjAxODoxMjoxMSAxNDozNToxNwD/4RGtaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLwA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/PiA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA1LjQuMCI+IDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+IDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiIHhtbG5zOnhtcE1NPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvbW0vIiB4bWxuczpzdFJlZj0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL3NUeXBlL1Jlc291cmNlUmVmIyIgeG1sbnM6c3RFdnQ9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZUV2ZW50IyIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczpwaG90b3Nob3A9Imh0dHA6Ly9ucy5hZG9iZS5jb20vcGhvdG9zaG9wLzEuMC8iIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyIgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDpFNTg4MUU5QkVDMTRFOTExOEI3RkU1RDU4RURGM0Q0RSIgeG1wTU06RG9jdW1lbnRJRD0ieG1wLmRpZDoyRUM1RUY5ODQ5RkRFODExQTNBM0EyODcyMEY2OTA2NCIgeG1wTU06T3JpZ2luYWxEb2N1bWVudElEPSJ4bXAuZGlkOjJFQzVFRjk4NDlGREU4MTFBM0EzQTI4NzIwRjY5MDY0IiB4bXA6TW9kaWZ5RGF0ZT0iMjAxOS0wMS0xMFQyMTowMDowNiswNTozMCIgeG1wOkNyZWF0b3JUb29sPSJ
BZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHhtcDpNZXRhZGF0YURhdGU9IjIwMTktMDEtMTBUMjE6MDA6MDYrMDU6MzAiIHhtcDpDcmVhdGVEYXRlPSIyMDE4LTEyLTExVDE0OjM1OjE3KzAxOjAwIiBwaG90b3Nob3A6SUNDUHJvZmlsZT0iRUNJLVJHQi5pY2MiIHBob3Rvc2hvcDpDb2xvck1vZGU9IjMiIGRjOmZvcm1hdD0iaW1hZ2UvanBlZyI+IDx4bXBNTTpEZXJpdmVkRnJvbSBzdFJlZjpvcmlnaW5hbERvY3VtZW50SUQ9InhtcC5kaWQ6MkVDNUVGOTg0OUZERTgxMUEzQTNBMjg3MjBGNjkwNjQiIHN0UmVmOmluc3RhbmNlSUQ9InhtcC5paWQ6RTQ4ODFFOUJFQzE0RTkxMThCN0ZFNUQ1OEVERjNENEUiIHN0UmVmOmRvY3VtZW50SUQ9InhtcC5kaWQ6MkVDNUVGOTg0OUZERTgxMUEzQTNBMjg3MjBGNjkwNjQiLz4gPHhtcE1NOkhpc3Rvcnk+IDxyZGY6U2VxPiA8cmRmOmxpIHN0RXZ0OnNvZnR3YXJlQWdlbnQ9IkFkb2JlIFBob3Rvc2hvcCBDUzYgKFdpbmRvd3MpIiBzdEV2dDp3aGVuPSIyMDE4LTEyLTExVDE0OjM1OjE3KzAxOjAwIiBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjJFQzVFRjk4NDlGREU4MTFBM0EzQTI4NzIwRjY5MDY0IiBzdEV2dDphY3Rpb249ImNyZWF0ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjA6NTE6MDErMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6N0QzQ0MyOERFQTE0RTkxMTlFRTJGRkMzQkZENTkzMUUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjA6NTE6MDErMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6N0UzQ0MyOERFQTE0RTkxMTlFRTJGRkMzQkZENTkzMUUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDpzb2Z0d2FyZUFnZW50PSJBZG9iZSBQaG90b3Nob3AgQ1M1IFdpbmRvd3MiIHN0RXZ0OmNoYW5nZWQ9Ii8iIHN0RXZ0OndoZW49IjIwMTktMDEtMTBUMjE6MDA6MDYrMDU6MzAiIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6RTQ4ODFFOUJFQzE0RTkxMThCN0ZFNUQ1OEVERjNENEUiIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiLz4gPHJkZjpsaSBzdEV2dDphY3Rpb249ImNvbnZlcnRlZCIgc3RFdnQ6cGFyYW1ldGVycz0iZnJvbSBpbWFnZS90aWZmIHRvIGltYWdlL2pwZWciLz4gPHJkZjpsaSBzdEV2dDphY3Rpb249ImRlcml2ZWQiIHN0RXZ0OnBhcmFtZXRlcnM9ImNvbnZlcnRlZCBmcm9tIGltYWdlL3RpZmYgdG8gaW1hZ2UvanBlZyIvPiA8cmRmOmxpIHN0RXZ0OnNvZnR3YXJlQWdlbnQ9IkFkb2JlIFBob3Rvc2hvcCBDUzUgV2luZG93cyIgc3RFdnQ6Y2hhbmdlZD0iLyIgc3RFdnQ6d2hlbj0iMjAxOS0wMS0xMFQyMTowMDowNiswNTo
zMCIgc3RFdnQ6aW5zdGFuY2VJRD0ieG1wLmlpZDpFNTg4MUU5QkVDMTRFOTExOEI3RkU1RDU4RURGM0Q0RSIgc3RFdnQ6YWN0aW9uPSJzYXZlZCIvPiA8L3JkZjpTZXE+IDwveG1wTU06SGlzdG9yeT4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgPD94cGFja2V0IGVuZD0idyI/PgD/7QBkUGhvdG9zaG9wIDMuMAA4QklNBAQAAAAAACwcAVoAAxslRxwCAAACAAIcAj4ACDIwMTgxMjExHAI/AAsxNDM1MTcrMDEwMDhCSU0EJQAAAAAAEG8VJXtBp2YwhKsNALMYCvf/4gIsSUNDX1BST0ZJTEUAAQEAAAIcQURCRQIQAABtbnRyUkdCIFhZWiAHzwACABYADAADAA9hY3NwQVBQTAAAAABub25lAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUFEQkUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApjcHJ0AAAA/AAAACRkZXNjAAABIAAAAGd3dHB0AAABiAAAABRia3B0AAABnAAAABRyVFJDAAABsAAAAA5nVFJDAAABwAAAAA5iVFJDAAAB0AAAAA5yWFlaAAAB4AAAABRnWFlaAAAB9AAAABRiWFlaAAACCAAAABR0ZXh0AAAAAChjKSAxOTk5IEFkb2JlIFN5c3RlbXMgSW5jLgBkZXNjAAAAAAAAAAxFQ0ktUkdCLmljYwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFhZWiAAAAAAAAD23AABAAAAANM6WFlaIAAAAAAAAAAAAAAAAAAAAABjdXJ2AAAAAAAAAAEBzQAAY3VydgAAAAAAAAABAc0AAGN1cnYAAAAAAAAAAQHNAABYWVogAAAAAAAApngAAFH+AAAAAFhZWiAAAAAAAAAtlAAAmiAAABFdWFlaIAAAAAAAACLJAAAT4gAAwdD/wAARCAByAMkDAREAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhc
YGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9sAQwADAgICAgIDAgICAwMDAwQGBAQEBAQIBgYFBgkICgoJCAkJCgwPDAoLDgsJCQ0RDQ4PEBAREAoMEhMSEBMPEBAQ/9sAQwEDAwMEAwQIBAQIEAsJCxAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQ/90ABAAa/9oADAMBAAIRAxEAPwD9KKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgD//0P0ooAKACgAoAOvSgA74/CgAoATI9aAI2ubdDhp4wfQsKAEN1bH/AJbKfpQAC7tu9wg+poAVbm3f7k8bfRwaAJeoyOlAB060AFABQAUAFABQAUAFABQAUAFABQAUAf/R/SigAoAKADvjHPpQBheNNFHinw7e+Hk1e4sJLxAont8tIuGDfdBGQcEEZHB6ik9iZarQs6Fp91pGjWWj/aJbn7HAlubm6O6SXaANxAP16mjoNKyL/kO/+tuHPsuEH6c/rTGRQpYXG4wtFP5b7GO4SFWHY5zg0CumWFVV4RAO2BxQMcBnj1oAzbLxFoWpand6NY6tbT3thj7TAkm54s/3h7dPai6Emm7GXceOtBi8c2/w/mgum1K5tjcq/k5iwBkjd1zheuMZIGeaTeocyvY6A2trnIgRT6qAp/Mc0xgLZBzHJKnuJCf50AL5U6/cuc/76A/yIoAN10vVY3+jFf8AGgBPPK/6y2lX3wGH6c0AKLq3Jx5qqTxtb5T+RoAl6jI70AJmgBaADmgDmvHsvjmPRUb4fQ2cuom4TeLkgAQ87sZ4z936DOMmkyZX6HRQed5KG4CibaDIE+7ux82PbNMrdD6ACgD/0v0ooABzxQBG06r8iZdv7q9PxNAFe7mS1tJr3UJ1htreNpZNucKijJJPXoD0x+NAm7K5R8JeKNA8X6QuteG7jz7RpGj3GMoQy9QQ3I6g/iKW4JqWxtUxh0PTn0oA5nwf4D0DwEupPpM1yV1K5+1Tm5m3heuAOOnJ5OW5GScUlZCUbECfFHwdJ4suPBg1Ei8toBM0hx5B6HYHBOW5HQY5PcUlLUSkr2NseI9GbBW73g9CIXZT+IWqHc8x8HSeC/AnxFvtH0jRNTxqsRuJdavHPlpn5/KBZRhcgck5LADoAahPUzi+WTZ6amteGpphKmraa0wHlhxPGXx6A5zV3RppuZPifTPGup61oN74V8R29lpltcF9ShZcm5jJXhTg54DDHy/ezkkYpMUtdjqc8c9fWmUNWRHd40dWaMgOoOSpPTPpQA6gAHPSgBCFPDDII/OgCP7Jb5ysYT/c+Q/oRmgBPJlX7ly3sHG4f0P60AGbpesccg9m2n8jmgA+0Kv+shkT/gOcfiKAHpNFLxHKrewPNAD6ACgAwfQ0Af/T/SV5FjALH73RR1NAEfzzErI/ljqUB+bB9T6f5zQBMiqgwi7evSgCO6ktobaaW8aNbeNGaYyY2hAPm3Z4xjOc0MH5nCaV8SfhzBa3WjeBPJvp7G2kuodO060ZRKR
yQmFC5LYBOMjIJqbroRzx+yWvAfjDxd410qS+vPCn/CPyRzGIC9MjFhgHKoQhI98gfWhNsak2a+sj+ydHvNc1rVtRuILKBriSK1AiyFBJChMN0HdjVA9DA0Txd8NtS0nR9buYreyfWXaKzTUV3TPIrFSNzZ7/AMWccgUk11C8VqzV0Xxl4P1zTdT8S+FhHqH9nLJFMba32zPsUtsXIBIPYdDnr1waBeO6LXgXxb/wm/hy38Qrpd1p3ns6CG4HzfKcbgeMg+vH0prUcXc3x7cdse31oHYa8Uco2yRq+eMMuf50aBocr43k8P8AhLw7eeJD4TjvXtgp8q0gVZGy2MlgMgDue1JtImTsjEg1PxFe+ItBh0nwxq1vomq2Auri7e+l32rkE7CrsVXHyjBHORjoanUWrZcsfAdp4EvvEHjPS9cv5Jb5XurxLqJbhW25Y7UTY2euBn246U7W1FyqOpk2/jH4i+ONA0jxB8PbfThbyXpjvEvE8tzEp5ADFht9SrE8j0NF2F3LY1LT4h2XjQeI/DFhZ61p95pyvZTXSW3meVI25Q6bSWOCpI47Ci9xqV7oxvhZeeL/AA7qV34R8U6rb3ej2kZNjqN05iuLiQsDt2SMWwMtnIBG0DODSjfqKN07M9QhvbO4lkgt7uGWSLHmIkgZkyMjIHTIq7mhNQAUAHU0AMeGKT/WRhvTcOlADfs4H+qlkT2DZH5NkUAIBdKRkxyj3+U/4fyoArfb73/nwm/OgD//1P0ljhRCXGS54Lnkn+lAHN2HgLTtO8c6h48jv717vULdbeSB5MwooC9B1/hHBJHXFK1tSeVLUj8e+OJPB+m/aNN0K6127WaOKS1tclolbPzOQp2jOMA4ySKG7bDlLlWg3xLrOnaboP8Aa/jzVF0vTZwsTWse4kmQcxuwG9uM5ChRgHOaL9xN2Wpn+GfhJ4L0DxPb+NfC3nWyG18qG2hlzAysv3+cscqcgFiOhFJR1ugjBbo7ODU9NuL2bTbfULWS7tQDPAkwMkYYZBZRyAR3IqrjumYN54V1258eWfimPxVcxaVBbNBLpQX5JWIbk84zyCTjOVFKzE03K/Q43WPDeva58cdOfVNEkufDGl2fnWrtABbwy7Ccg9AxkC8Hngcbealp3Ia9/XY7qO78B+DFe1jv9D0RbiRpmjM8Vv5jnGWwSOTge4xVaI091GBqHx3+FlvBcHS/GGn6zdxI7Ja6Y5u3eQA4X9wr4yeOelF+wr9jh/CXxT8Wf8JDfeJ/HVn4os9LktwkWl2nhfUruOMjbiQSRwFccMc7j1wQMVMXrqQnJO7Op1b456eumi68M+CfGGqSuyYSbw7qFlCUJ+95s0IQ9eADkk57VTfYpy00NGD43eB/lXUl1/SpSuSmoeHdQgAP1aAKw9xRe473RzmoftCeHIPH+maJDr2iW2hTQM13d6jObSRJMHAUTbOny4ABzk9MUr6ibfMdWPjB4Fn8Tab4X03WoNTn1RN0c9hKlxbp1ADOrEAnaenpzT5lsNyWx2caRRr5cSKiDI2qAAPwHSnsUvI8w8Ya74j1H4paF4L8LXlzpQi/4mGoz/Z90N1GAPl3DqAoZecDLAZ4qXe+hm9ZaHY2XjXwzrfiXUvBdvcNNqWnRiS5heI7CMgEBjwcblz7mnctNN2KWk/DDwnouu6n4i06K8hu9WbdJ5V08aL3OzYVxk9ck+2BxRy2FypGjDte8n07S/FgluLTBmtZmjnMWem4DEgz7tSBGX4p8fL4Hn0228RLbTNqk5t7c2+9HZsgElCGUD5hnLj71O9tBOVnY3hrkMPF9Y39p6mSAuo9y8e5cfUjHegq5atNRsL8FrG9gnA4PlSBsexx0phdFigYUALtb+6fyoA//9X9Iby9trKMSXMgBc7UCgs7t6Ko5P4UBcqCLUtS5uHaxtu0KMPOYf7Tg/J/uqc/7XagWrLtta29lCtvaQrDGuQFUfn9T65/WiwWK2saHo/iGybTda0+G8tmYP5
Uq7lBHQ/Uf1NFrjdnozg9U+K9t4N8cXPhzxi+jaJoMdojadNJdL59w2FwscP3m6t8qpxjnqKlPUhN3scbqFt4m1/xoPGXwV8K6vZPcsz6jqGtRtYWl4u5CEVJv3+0lfmYQ4PykdKVtbktNvQPiH4p+PFhrGm+GLPWNLttV1hS0Nto+meYEXkc3VyxViCMnEKgDBJINOTYSlJaGZqfw/j1f4teHfCWtavr2tGGzF5q66zezXNrK23cwjQFYgMZXIjUZ7k4ATvzBd8x6JqPwl+B+oeGJhp/h7w1pFgzAnUtMgtoGUhu8wX14O7PXFN2ZckmrnR634f1eLwQNA+H+qR6dd29vFBZ3Ep3AIuAckA8lRjODyc+9O1loHLp7pxVj8MfH2v+JdRn+Inidp9Mm0wWASxmMa3BKgFinRcMCxOBubHGMgyotkRhJ7s9J8MeHrDwnoNn4f01pmtrKMxo0z7nbJzyfxJ7D6dKvyNErIu/2hZfbjp326D7WF84wCUeaE7NtPOM9+lA7q5zlvofjJ/HOoahqWs2l14ZmtVjt9OaIFlkwuScjpkMc5OcgYGAalLUnladx+q/DD4b62HXVvAXh673kEtLpsRbPY525BHqD3qrD5UzzLV/AGn+Adf03TNE1Dx0sfiC82rJpWoGKKxThVDjDCXaG6v0VTz6xZJ6EWUWds3gr4maSfM8P/F2e828GDxDpFvdIQO2+3Fu47ckt+NWXY4zx/dfGrRLCTXIPDvhPSp45Ee/1fTdTQPdoAAkbrcxRkDOODI+eMGpehEubc66D42+HLeGObxXoniLw2kiiT7Tf6Y8lqQeQ32q3MsODkEEuOO9O5SlbcveDrb4X3uv6j418Fa3peoXurjbPNZaglwp5ydoViFOQMigEknc6y80vS9RaB9Q061uTbOJoPOhV/KccBl3AkHHTGPzFFirIs9ucc5z/WmBVu9L0y+IkvLCCZx913jBYfRuopWCxW/sTyebDVb+3x1Uy+cv0xIGIH0IpisLnxFb8EWN+OvG63b8vmUn3yv4UAZP/CXy/wDQla7/AN+j/hQM/9b9D9R03ULbSb+50N459be2f7PcXhyPMIOAccKucfKoxQxNaaGP4G8Wvc6VJpni7XNLfxFpKM+qpDIoWAbjgt/CMAgNg4BJ5qU+4ovuZr/F2LX5ZLH4WeG7vxg8bGJ7+GVbbSonHUNdvkSEdxCshHene4XvsVdX0HxzfaXda38SfiG+k6VaxNPPpnhWNoPkAyQ13IDO/AIzGITn86Qbak/whm+EV5Ddy/DXSbe3uk2G9eSFvtjs2SDLM5Lyng/MWP8ASmmmCkmaHh/xX4ok8S6/p3iWwtFsLacLpn2ItNNImTy6oSV42n5guCSKL6gpO50bXerXOGtdEWLHR7yZUPPcLGH/ACJU0DZyfxDuviFaWdnDoekwasmoT/Zb6O2hKNFAwwSWZjwem7GBjNJ36EzcrEU3wg8DaT4KvPDt9faiuiK7385kucbWVQS+QvZV6Ht70cugnCKVjlfD+gP4h8f+H/Efgq41CfwZaac1mZRqUqGJkV18oqzBx0Q4HGADmptd3QkryViXxhp3iv4dWGveIdI8V3GuStdRXL2NzfSKbG0JbJwJMuSSi54GB0PNN6bCd4rc6K/vPFviTwNp/iT4U6xN9rvGR9t84YCPlXA8wHBDD8lOM8U9Wrot3cbo0bX4bCDxWnj06sX11rRbeVpYg0X3QpwqFPTA7YA4zT5eoctnc6T/AIqOPqumXPuWkh/TD0ytSjrXiweGNLm1nxFpc9tZ2yhpZ4pY5ETJwBgkNySPur3FK9gvYpXvi7w/r/hG5vrLXbnTLW9tJDDqLW8sSxAqf3gcgDI9iOlK90Ju6F+GDWcfg+wtLPxePEflhx9tLYZxuY4IzuXHI5ORjmmmEdih8Y/BGu+PPCw0bQdRht5I5vPkimXAnCjhd2CVOce3rRNXWgpxclobXgCw1zS/Bul6b4khtIb61gEMkds
f3SKpIQDHH3MA44z0oW2o4rSzIdc+F/w38SzNceIPAXh/ULiTrNcadE0h99+NxPv1p2Ksjgdf0XwP4E1+08PaF4+8XeEru8ge5jW1v3vLKONQfvRXYljQfIfuqvA9MGk3YiVoml4au/idrGlpr3g/4meH/FFmztH5eraI9m7MvUGW3cBT7mE+uO9Fxp82xqjx18RdM/d+IPg5qNwy8GbQdVtbyI+4E7W8n4FPxNFx3AfG/wAC2/7vxB/bfh64GR5Gr6PdQHPoH2GNz6bXINFxOSRs+HviR4D8VOkGheLdLurp13m0FyguVH+1ETvX8RRdDUlLY6XB9DTGf//X+4pfFPxI8axm28GeEm8N2EwIOs+I0KyhD/FDYxsJSx9ZWix3DDilqJ6lPw7+zl8NtIKT65preIrtDu36p+8iLZBz5GRGcEA5ZSQehFCjYlQS1On8F+AtE+Hg1abT9QuWh1Cf7U4uZAEgHPCgAADBPJyeOTximlYajysg8afETSvDmhXmpvaG9toUXzdwGGV22AhScupJ6gbP9qpbsEpJEnhPwv4Zn02PXLCyjjh1iKK5eKC3FrG4K5UNGoycA9HLAHmmrdASW6OqgggtolhtoY4ol4VUTao+mOlMdh0kiRDMjqn+8cUDIzdIfuCST02ocfnQBzms3/jA+J7DTbPQLWfw/dQyC/mnPzg88HDbQOnBBzk8CkyWaui2On6dp0Vp4esdPtrEZMa2rjy+TyQVHPPU0LyGrLYx4fh7okXi3UPGhsxJf6nb/ZZ45Zt0DIQoI2Fe4UA549utK2txcqvc6K3iazgS3tbGCKGNQqRxHaqjjgKFAH/6/WqWg7EhmkA+a1k+oKn+ZoGH2qPo6yr2IMZH8qAIb2DStYtZdN1CK2u7e5XZJBIAyyDvkenSgN9xDo2kvpLaH9gg/s9oDbm3CYjMR4K4GABzS02DRqxnweC/DmnaKNE0rRrO2hhjZLcNCspic5O75s5IY56/jRZWJcVsjnfA/h3UfDNhDoPi3xybjW7iaWW3EV62WiGOFjk4bHJzs43Y6AUlpuJK27Ot+y65b82+rxTL6XVuCx/4FGVH/jpp6laiG91qAEXGipMD3s7lST9RJsx+Zphc5DxE+t33jnRb9WsbbQreKWO+XUbAiUl9wYLIUKgEbRhWA4Od3SpauS1eVzstCsdC0/TktPDlvZQWKklI7QJ5YJOSfl4zzyfami9FsaFMCO6t7e8t5bS7gSaCdCkkcgyrqeCCD2OcUMLLqYGo/Dj4f6tpkei6p4K0S8soRiOG4sInWP3UFSVPuKEkhJJbEH/Cqvh3/wBCfZfrQM//0P0oGc8sfXrnB/GgDkPGWp+KNS0eew+Gc9u+rRSokkz7TFCo5K5Py7/YA4yc4yDUu72Jk7rQ2tP0iZ4Le48Qype3qqrOAMQRy45MceMDkcMctz1FNaDXmZrfD3w7/wAJnP47m8+S9nthayRySgwFAAN23HXaMY6d8d6Vri5Ve5k6H4d17wJofiCfTdWvvE15dStd2sVyvyRnBwgyw3HnnGAcAADrRsKzjdnSeG7nW9W0Ozv/ABDZtp19NHuntlIyhycZOSRkAHAPGQO2aaKjtqZvhrwpr+j+Kdc1nU/E76hY6g4NnaMn/HsM9Mk+mBx1xk80JO4opp3bOrxj/CmUV9QsYNTsLnTbrd5N3C8Em1ip2spU4I5BwetAMyvBng/SvAugxeHtGluZLaJ2kDXD73ZmPJJAAH4DFJKwkrG7TGM82PzDD5i+YBuKZ5A9cUXAfnHPpQBy7w+Ph8QFlS5sP+ESFrtaIj9952OO2Qc477dvbNTrcnVyKWtfFnwLoXiK78L69qBtLi0gWaSWaA+ScgEKuASWwQcBcdcHijmXUTmk7MzT8W/Dt346sfBOg6Rd3/2oK015bHbFErIHVwB95cEEnIAHTPSi+ugufU6HXPB15q3iHR9cg8R3lomlMWa2UsRNnscOox67g2R0xTa1Kcbu4up+BtG
1TxPp/i++s0n1LS0KW8gd0CjJPK5IbBJIPHP4UWG43dzoPPcf623kH+0MMP05pjAXMGeZdh9Gyp/I0Ac7B4906fx7P8P10+9W7t7Rbk3DR5hxgHAPXocZ6buKXNfQnmu7G1PomkXUhmm02ATf89VTbIPo4+b9aLDsZevadr9jo15P4Rvrh9SjiZrW3upRJFJIOgJky3I/2gPpQ9hO9tCLw1dePDoVnP4m0zTTqDJm4igmMZQ5IH95SSoGfmA5x0FJX6gnK2pqf25HCMX+n31pz/HCXX8WjLKB9SKY7jf+El8O/wDQbsv+/i/40xn/0f0G8QR67caFfXaWzSzxW8j22mwS7TK+07Q8nUknHyjA9d1J7CexyXwI8CX/AIV8Pz6zr63kWq6zIZZ4LjjylVm2/L2Yg5Pfn2pRT3IpppanpRmLkrbqJCOrZwoqjQQQbjvnbzD16YA/D/HP4UAS4GMDj6UAL+FAHK6dqHj1/GurWOo6JaJ4egt1awuUceZLJ8o2nnv8/UDGB1BpXZMea4/wBrvirxBpM954t8NHRLuO6eOODJO+MAYbnnqSPQ4yOKEEW3udPTKCgAoA5m38A6Rb+Orjx/HdXpvrm2Fs0Rl/dBQAMgf8BHHTqcUrWdyeVXudNTKOX8EeBU8GNqrLrl9qX9qXZuj9qOfKzngepOeT3wKSQkrDfFfwy8H+NdSstV8Q6cZ57NSo2OUEik52vj7wzyPqaTjcTimdDZaZpunbfsFhb25WNIQY4gpEa8KuQOgxgDOKpaDSSMCx8V6vc+O73wlJ4UvIdOtLZZotTb/VSMQpKj5QO5HBLZU5GDST1EndnUUyg/lQAm0HggE/SgDD8V6/4f8ABWjTeI9Y82C2hKIWgQ7yWbAXCnnnnmjRCbSL2l3kWq6Zaarpl2z215ClxD5kfVHGRwMEcHnmgFrqWd9wn3oFYeqv/Q4/nQMpa9JqUujXsOj3K2WoPA62s1xGSkcmDtJwCMA49aGJ6rQr+DbfxHZ+GrG38WajBf6rGjLPcQ42tySv1+XAPHXmktNwSsrM0/sOm/8APlD/AN8imPlR/9L9JXlEeF5Lnoq9TQAwxvL80547RqePxPWgCbgDGBhenoKADuR3HX2oAKAOX8eSeP4rOx/4V/BYy3DXSC7F0QFEOOSM9Rnr39KTZMr9DqPf26UygoAKACgAoAKACgAoAKACgBOep54AJxzQAtAHL+ONE8Z6ydL/AOEQ8Sx6T9muhLeb13edH6Dg5xzwcA55IwKTTJkm7WOoPXpjPUfn+FMohu7W1v7d7W9tYriCTho5UDqw91NANXJEjSNVijRURFCqqjAAHReOABxgCjYVhtwbgW8ptBGZ9jeUJM7d+OM47ZxSew/Q5X4ax/ERNLupPiNNbteyXLPbpDszHF6EoMHnOBycdTSV+pMOa2p1TW8LncEAY/xKdp/MVRRH9lh/uT/9/wA0Af/T/SaOJIx8o+bue5oAdQADrQBy/g7wHa+Dr/Wr+21a+vDrNz9pZLh8rEcn7vvyRn0CiklbUlRS1OoplBQAUAFABQAUAFABQAUAFABQBy/h7SfG1n4q1y+17xBb3mjXTA6baohDwj34GOODydx546UknclJp3Z1FMoKACgAoAKACgAoAKAOY1zx9p2geLNG8JXFhey3GtZ8qaKLMaYOPmOfUHOOg5NJslys7WOnx70yj//U/SigAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKAEKhiGKgsDwTyR680BYXBoA/9X9KKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoAKACgAoABQA7A9BQB//2Q==)\n", 401 | "\n", 402 | "HINT: First create a function that calculates the confidence intervals and then plot yhat, y2 of the forward model. 
Finally, plot the data of individual participants." 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "colab_type": "code", 409 | "id": "ii0H9GDwv-ha", 410 | "colab": {} 411 | }, 412 | "source": [ 413 | "import numpy as np\n", 414 | "import matplotlib.pyplot as plt\n", 415 | "\n", 416 | "# confidence interval calculation at x_forward\n", 417 | "def confidence_interval(s2,x,z):\n", 418 | " CI=np.zeros((len(x_forward),4))\n", 419 | " for i,xdot in enumerate(x_forward):\n", 420 | " ci_inx=np.isin(x,xdot)\n", 421 | " S2=s2[ci_inx]\n", 422 | " S_hat=np.mean(S2,axis=0)\n", 423 | " n=S2.shape[0]\n", 424 | " CI[i,:]=z*np.power(S_hat/n,.5)\n", 425 | " return CI \n", 426 | "\n", 427 | "\n", 428 | "feature_names=['left_CA1','left_CA3','right_CA1','right_CA3']\n", 429 | "sex_covariates=[ 'Female','Male']\n", 430 | "# Creating plots for Female and male \n", 431 | "for i,sex in enumerate(sex_covariates):\n", 432 | "#forward model data\n", 433 | " forward_yhat = pd.read_csv('yhat_forward.txt', sep = ' ', header=None)\n", 434 | " yhat_forward=forward_yhat.values\n", 435 | " yhat_forward=yhat_forward[7*i:7*(i+1)]\n", 436 | " x_forward=[20, 30, 40, 50, 60, 70, 80]\n", 437 | "\n", 438 | "# Find the index of the data exclusively for one sex. 
Female:0, Male: 1 \n", 439 | " inx=np.where(covariate_normsample.sex==i)[0]\n", 440 | " x=covariate_normsample.values[inx,1]\n", 441 | "# actual data\n", 442 | " y = pd.read_csv('features_normsample.txt', sep = ' ', header=None)\n", 443 | " y=y.values[inx]\n", 444 | "# confidence Interval yhat+ z *(std/n^.5)-->.95 % CI:z=1.96, 99% CI:z=2.58 \n", 445 | " s2= pd.read_csv('ys2_2fold.txt', sep = ' ', header=None)\n", 446 | " s2=s2.values[inx]\n", 447 | "\n", 448 | " CI_95=confidence_interval(s2,x,1.96)\n", 449 | " CI_99=confidence_interval(s2,x,2.58)\n", 450 | "\n", 451 | "# Creat a trejactroy for each point \n", 452 | " for j,name in enumerate(feature_names):\n", 453 | " fig=plt.figure()\n", 454 | " ax=fig.add_subplot(111)\n", 455 | " ax.plot(x_forward,yhat_forward[:,j], linewidth=4, label='Normative trejactory')\n", 456 | "\n", 457 | "\n", 458 | " ax.plot(x_forward,CI_95[:,j]+yhat_forward[:,j], linewidth=2,linestyle='--',c='g', label='95% confidence interval') \n", 459 | " ax.plot(x_forward,-CI_95[:,j]+yhat_forward[:,j], linewidth=2,linestyle='--',c='g') \n", 460 | "\n", 461 | " ax.plot(x_forward,CI_99[:,j]+yhat_forward[:,j], linewidth=1,linestyle='--',c='k', label='99% confidence interval') \n", 462 | " ax.plot(x_forward,-CI_99[:,j]+yhat_forward[:,j], linewidth=1,linestyle='--',c='k') \n", 463 | "\n", 464 | " ax.scatter(x,y[:,j],c='r', label=name)\n", 465 | " plt.legend(loc='upper left')\n", 466 | " plt.title('Normative trejectory of' +name+' in '+sex+' cohort')\n", 467 | " plt.show()\n", 468 | " plt.close()\n", 469 | " \n", 470 | "# code by M. Zabihi" 471 | ], 472 | "execution_count": null, 473 | "outputs": [] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "colab_type": "text", 479 | "id": "yM4z1BtyWwiF" 480 | }, 481 | "source": [ 482 | "## **TASK 6:** Apply the normative model to Nordan's data and the dementia patients." 
483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "metadata": { 488 | "colab_type": "code", 489 | "id": "eVTYxKjvWBvm", 490 | "colab": {} 491 | }, 492 | "source": [ 493 | "# read in Nordan's as well as the patient's demographics and features\n", 494 | "demographics_nordan = pd.read_csv('cpc_camcan_demographics_nordan.csv',\n", 495 | " sep= \",\",\n", 496 | " index_col = 0)\n", 497 | "features_nordan = pd.read_csv('cpc_camcan_features_nordan.csv',\n", 498 | " sep=\",\",\n", 499 | " index_col = 0)\n", 500 | "\n", 501 | "# create a covariate file for Nordan's as well as the patient's demograhpics\n", 502 | "covariate_nordan = demographics_nordan[['sex',\n", 503 | " 'age']] \n", 504 | "covariate_nordan.to_csv('covariate_nordan.txt',\n", 505 | " sep = ' ',\n", 506 | " header = False, \n", 507 | " index = False)\n", 508 | "\n", 509 | "# create the corresponding feature file\n", 510 | "features_nordan = features_nordan[['left_CA1', \n", 511 | " 'left_CA3',\n", 512 | " 'right_CA1',\n", 513 | " 'right_CA3']]\n", 514 | "\n", 515 | "features_nordan.to_csv('features_nordan.txt', \n", 516 | " sep = ' ', \n", 517 | " header = False, \n", 518 | " index = False)\n", 519 | "\n", 520 | "# apply normative modeling\n", 521 | "pcn.normative.estimate(covfile = 'covariate_normsample.txt', \n", 522 | " respfile = 'features_normsample.txt',\n", 523 | " testcov = 'covariate_nordan.txt',\n", 524 | " testresp = 'features_nordan.txt',\n", 525 | " cvfolds = None,\n", 526 | " alg = 'gpr',\n", 527 | " outputsuffix = '_nordan')\n", 528 | "\n", 529 | "# code by T. Wolfers" 530 | ], 531 | "execution_count": null, 532 | "outputs": [] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": { 537 | "colab_type": "text", 538 | "id": "LFnHCy0XVVwl" 539 | }, 540 | "source": [ 541 | "## **TASK 7:** In which hyppocampal subfield(s) does Nordan deviate extremely? 
\n" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": { 547 | "colab_type": "text", 548 | "id": "jUhmPAOZB0kp" 549 | }, 550 | "source": [ 551 | "No coding necessary just create a presentation which includes recommendations to Nordan and his family. \n", 552 | "Use i) |Z| > 3.6 ii) |Z| > 1.96 as definitions for extreme normative deviations." 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "colab_type": "text", 559 | "id": "AqQhxN9pEFGC" 560 | }, 561 | "source": [ 562 | "## **TASK 8 (OPTIONAL):** Implement a function that calculates percentage change. " 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": { 568 | "colab_type": "text", 569 | "id": "weASKkZNBMW5" 570 | }, 571 | "source": [ 572 | "Percentage change = $\\frac{x1 - x2}{|x2|}*100$" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "metadata": { 578 | "colab_type": "code", 579 | "id": "0vIt9fd7EmJx", 580 | "colab": {} 581 | }, 582 | "source": [ 583 | "# function that calculates percentage change\n", 584 | "def calculate_percentage_change(x1, x2):\n", 585 | " percentage_change = ((x1 - x2) / abs(x2)) * 100\n", 586 | " return percentage_change\n", 587 | "\n", 588 | "# code by T. Wolfers" 589 | ], 590 | "execution_count": null, 591 | "outputs": [] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": { 596 | "colab_type": "text", 597 | "id": "1Mypo4xrT7ID" 598 | }, 599 | "source": [ 600 | "## **TASK 9 (OPTIONAL):** Visualize percent change\n", 601 | "\n", 602 | "\n", 603 | "\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": { 609 | "colab_type": "text", 610 | "id": "1I1Kwv5iBUJj" 611 | }, 612 | "source": [ 613 | "Plot the prercentage change in Yhat of the forward model in reference to age 20. Do that for both sexes seperately." 
614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "metadata": { 619 | "colab_type": "code", 620 | "id": "1DoJid7R1DBX", 621 | "scrolled": true, 622 | "colab": {} 623 | }, 624 | "source": [ 625 | "import matplotlib.pyplot as plt\n", 626 | "\n", 627 | "forward_yhat = pd.read_csv('yhat_forward.txt', sep = ' ', header=None)\n", 628 | "\n", 629 | "# You can indicate here which hypocampal subfield you like to visualize\n", 630 | "hyppocampal_subfield = 0\n", 631 | "\n", 632 | "percentage_change_female = []\n", 633 | "percentage_change_male = []\n", 634 | "count = 0\n", 635 | "lengths = len(forward_yhat[hyppocampal_subfield])\n", 636 | "for entry in forward_yhat[hyppocampal_subfield]:\n", 637 | " if count > 0 and count < 7:\n", 638 | " loop_percentage_change_female = calculate_percentage_change(entry, \n", 639 | " forward_yhat.iloc[0,\n", 640 | " hyppocampal_subfield])\n", 641 | " percentage_change_female.append(loop_percentage_change_female)\n", 642 | " elif count > 7: \n", 643 | " loop_percentage_change_male = calculate_percentage_change(entry,\n", 644 | " forward_yhat.iloc[9,\n", 645 | " hyppocampal_subfield])\n", 646 | " percentage_change_male.append(loop_percentage_change_male)\n", 647 | " count = count + 1 \n", 648 | "\n", 649 | "names = ['30 compared to 20 years', \n", 650 | " '40 compared to 20 years', \n", 651 | " '50 compared to 20 years', \n", 652 | " '60 compared to 20 years', \n", 653 | " '70 compared to 20 years',\n", 654 | " '80 compared to 20 years']\n", 655 | "\n", 656 | "# females\n", 657 | "plt.subplot(121)\n", 658 | "plt.bar(names, percentage_change_female)\n", 659 | "plt.xticks(rotation=90)\n", 660 | "plt.ylim(-20, 2)\n", 661 | "\n", 662 | "# males\n", 663 | "plt.subplot(122)\n", 664 | "plt.bar(names, percentage_change_male)\n", 665 | "plt.xticks(rotation=90)\n", 666 | "plt.ylim(-20, 2)\n", 667 | "\n", 668 | "# code by T. 
Wolfers" 669 | ], 670 | "execution_count": null, 671 | "outputs": [] 672 | } 673 | ] 674 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning with Normative Modeling Tutorial 2 | # Computational Psychiatry Course 2022 3 | This repository contains written instructions, links to code, and data used for the (virtual) Machine Learning/Normative Modeling Practical at the [Computational Psychiatry Course](https://www.translationalneuromodeling.org/cpcourse/) on September 17th, 2022. 4 | 5 | This repository is a group effort by [Saige Rutherford](https://twitter.com/being_saige) and [Thomas Wolfers](https://twitter.com/ThomasWolfers). 6 | 7 | We will be running all of our code in Google Colab python notebooks. These are essentially Jupyter notebooks run in the :cloud: *cloud* :cloud:. 8 | Running our code using Colab will save us from dealing with python library installation and virtual environment setup. 9 | It also ensures that we are all working on the same operating system which makes troubleshooting much easier (since there are only 2 instructors and lots of students)! 10 | 11 | If you have never used Google Colab before, you can check out an introduction notebook with lots of helpful links here: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/notebooks/intro.ipynb) 12 | 13 | We will also be using the Pandas library for a lot of our code. There is a great intro to Pandas Colab notebook here: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/notebooks/mlcc/intro_to_pandas.ipynb) 14 | 15 | Other helpful pandas:panda_face:/plotting:bar_chart: links (not required to do during the practial, just added for those who might need extra python help): 16 | 1. 
[Pandas cheatsheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) 17 | 2. [Pandas Selecting/Indexing API](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html) 18 | 19 | ### :warning: Setup instructions for Google Colab :warning: 20 | You can open the python notebook that we will use in this practical directly from this Github account (the links to the notebook are at the bottom of this Read Me file). Before you open the notebook, make sure you are logged into a Google account. All of the code has been tested using Google Chrome web browser. When you are ready to begin, you will click on the **template** Google Colab button below. This will launch a new browser tab with the Google Colab notebook. 21 | 22 | Once you are in the Colab notebook tab, in the top right corner you will see a `Connect` (or `Reconnect`) button. Click on this, and a dropdown menu will appear as shown below. Click on `Connect to hosted runtime` this will allow you to run the notebook using Google’s cloud resources, which are likely much faster than your computer. If you would prefer to use your own computer’s resources (this is not recommended and instructors will not be able to help you troubleshoot if you are not running the notebook in the cloud), select `Connect to local runtime`. 23 | 24 | :warning: Note: sometimes if the notebook is left running for a long time without any activity (i.e. your computer goes to sleep), you will be disconnected from the runtime. In that case, you will need to click on this same button. It will appear as `Reconnect` instead of `Connect`. You will also need to re-run all code blocks. 25 | 26 | ![](presentation/Runtime1.png) 27 | 28 | :arrow_right: If you are using the Google cloud hosted option: in the upper left corner, you will see a button called `Runtime`. Click on `Runtime`, and another dropdown panel will appear (as shown below). Click on `Change runtime type`. 
29 | 30 | ![](presentation/Runtime2.png) 31 | 32 | :arrow_right: This box will open, and you can click the `GPU` option, then click `save`. 33 | 34 | ![](presentation/GPU.png) 35 | 36 | :arrow_right: In the same menu you used to change the runtime, there are several other optional things you can explore that may make your interacting with the notebook easier. Under ‘Tools’ there is a ‘Settings’ tab, which you can use to change the theme to light or dark mode using the ‘Site’ sub-tab. Then under the ‘Miscellaneous’ sub-tab, you can select Corgi or Kitty mode, and this will make cute animals walk across the top of your screen. There is no practical utility to this whatsoever, and it is for the sole purpose that cute animals spark joy. 37 | 38 | :arrow_right: Also under the ‘Tools’ tab, there is an option to look at Keyboard shortcuts. You don’t need to change any of these, but you can review some of them if you want to learn about speeding up your coding practice. 39 | 40 | ![](presentation/keyboard_pref.png) 41 | 42 | :arrow_right: In the Colab python notebook, there are 2 types of cells: text cells & ```code cells```. The text cells have plain text in them, that the notebook will not interpret as code. These are the cells that contain the background story & task instructions. The ```code``` cells have a :arrow_forward: play button on the left side. These are the cells that the notebook will run as code. To run a ```code cell```, you can either click on the play button :arrow_forward: on the left side or use ‘Shift + Enter’ (your cursor must be inside the code cell). 43 | 44 | ### Now you are ready to begin coding :brain: :computer:! 45 | ### Good luck :four_leaf_clover: and remember to have fun :smiley:! 
46 | 47 | Before clicking on the colab button below, make sure you are logged into a google account and using Chrome or Firefox internet browser (hopefully a current version) 48 | 49 | **Task 1: Fitting normative models from scratch** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saigerutherford/CPC_ML_tutorial/blob/master/tasks/1_fit_normative_models.ipynb) 50 | 51 | **Task 2: Applying pre-trained normative models** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saigerutherford/CPC_ML_tutorial/blob/master/tasks/2_apply_normative_models.ipynb) 52 | 53 | **Task 3: Interpreting and visualizing the outputs of normative models** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saigerutherford/CPC_ML_tutorial/blob/master/tasks/3_Visualizations.ipynb) 54 | 55 | **Task 4: Using the outputs (Z-scores) as features in predictive model** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saigerutherford/CPC_ML_tutorial/blob/master/tasks/4_post_hoc_analysis.ipynb) 56 | 57 | -------------------------------------------------------------------------------- /data/nilearn_order.csv: -------------------------------------------------------------------------------- 1 | ROI 2 | G_and_S_frontomargin 3 | G_and_S_occipital_inf 4 | G_and_S_paracentral 5 | G_and_S_subcentral 6 | G_and_S_transv_frontopol 7 | G_and_S_cingul-Ant 8 | G_and_S_cingul-Mid-Ant 9 | G_and_S_cingul-Mid-Post 10 | G_cingul-Post-dorsal 11 | G_cingul-Post-ventral 12 | G_cuneus 13 | G_front_inf-Opercular 14 | G_front_inf-Orbital 15 | G_front_inf-Triangul 16 | G_front_middle 17 | G_front_sup 18 | G_Ins_lg_and_S_cent_ins 19 | G_insular_short 20 | G_occipital_middle 21 | G_occipital_sup 22 | G_oc-temp_lat-fusifor 23 | G_oc-temp_med-Lingual 24 | 
G_oc-temp_med-Parahip 25 | G_orbital 26 | G_pariet_inf-Angular 27 | G_pariet_inf-Supramar 28 | G_parietal_sup 29 | G_postcentral 30 | G_precentral 31 | G_precuneus 32 | G_rectus 33 | G_subcallosal 34 | G_temp_sup-G_T_transv 35 | G_temp_sup-Lateral 36 | G_temp_sup-Plan_polar 37 | G_temp_sup-Plan_tempo 38 | G_temporal_inf 39 | G_temporal_middle 40 | Lat_Fis-ant-Horizont 41 | Lat_Fis-ant-Vertical 42 | Lat_Fis-post 43 | Medial_wall 44 | Pole_occipital 45 | Pole_temporal 46 | S_calcarine 47 | S_central 48 | S_cingul-Marginalis 49 | S_circular_insula_ant 50 | S_circular_insula_inf 51 | S_circular_insula_sup 52 | S_collat_transv_ant 53 | S_collat_transv_post 54 | S_front_inf 55 | S_front_middle 56 | S_front_sup 57 | S_interm_prim-Jensen 58 | S_intrapariet_and_P_trans 59 | S_oc_middle_and_Lunatus 60 | S_oc_sup_and_transversal 61 | S_occipital_ant 62 | S_oc-temp_lat 63 | S_oc-temp_med_and_Lingual 64 | S_orbital_lateral 65 | S_orbital_med-olfact 66 | S_orbital-H_Shaped 67 | S_parieto_occipital 68 | S_pericallosal 69 | S_postcentral 70 | S_precentral-inf-part 71 | S_precentral-sup-part 72 | S_suborbital 73 | S_subparietal 74 | S_temporal_inf 75 | S_temporal_sup 76 | S_temporal_transverse 77 | -------------------------------------------------------------------------------- /data/phenotypes_lh.txt: -------------------------------------------------------------------------------- 1 | lh_G&S_frontomargin_thickness 2 | lh_G&S_occipital_inf_thickness 3 | lh_G&S_paracentral_thickness 4 | lh_G&S_subcentral_thickness 5 | lh_G&S_transv_frontopol_thickness 6 | lh_G&S_cingul-Ant_thickness 7 | lh_G&S_cingul-Mid-Ant_thickness 8 | lh_G&S_cingul-Mid-Post_thickness 9 | lh_G_cingul-Post-dorsal_thickness 10 | lh_G_cingul-Post-ventral_thickness 11 | lh_G_cuneus_thickness 12 | lh_G_front_inf-Opercular_thickness 13 | lh_G_front_inf-Orbital_thickness 14 | lh_G_front_inf-Triangul_thickness 15 | lh_G_front_middle_thickness 16 | lh_G_front_sup_thickness 17 | lh_G_Ins_lg&S_cent_ins_thickness 18 | 
lh_G_insular_short_thickness 19 | lh_G_occipital_middle_thickness 20 | lh_G_occipital_sup_thickness 21 | lh_G_oc-temp_lat-fusifor_thickness 22 | lh_G_oc-temp_med-Lingual_thickness 23 | lh_G_oc-temp_med-Parahip_thickness 24 | lh_G_orbital_thickness 25 | lh_G_pariet_inf-Angular_thickness 26 | lh_G_pariet_inf-Supramar_thickness 27 | lh_G_parietal_sup_thickness 28 | lh_G_postcentral_thickness 29 | lh_G_precentral_thickness 30 | lh_G_precuneus_thickness 31 | lh_G_rectus_thickness 32 | lh_G_subcallosal_thickness 33 | lh_G_temp_sup-G_T_transv_thickness 34 | lh_G_temp_sup-Lateral_thickness 35 | lh_G_temp_sup-Plan_polar_thickness 36 | lh_G_temp_sup-Plan_tempo_thickness 37 | lh_G_temporal_inf_thickness 38 | lh_G_temporal_middle_thickness 39 | lh_Lat_Fis-ant-Horizont_thickness 40 | lh_Lat_Fis-ant-Vertical_thickness 41 | lh_Lat_Fis-post_thickness 42 | lh_Pole_occipital_thickness 43 | lh_Pole_temporal_thickness 44 | lh_S_calcarine_thickness 45 | lh_S_central_thickness 46 | lh_S_cingul-Marginalis_thickness 47 | lh_S_circular_insula_ant_thickness 48 | lh_S_circular_insula_inf_thickness 49 | lh_S_circular_insula_sup_thickness 50 | lh_S_collat_transv_ant_thickness 51 | lh_S_collat_transv_post_thickness 52 | lh_S_front_inf_thickness 53 | lh_S_front_middle_thickness 54 | lh_S_front_sup_thickness 55 | lh_S_interm_prim-Jensen_thickness 56 | lh_S_intrapariet&P_trans_thickness 57 | lh_S_oc_middle&Lunatus_thickness 58 | lh_S_oc_sup&transversal_thickness 59 | lh_S_occipital_ant_thickness 60 | lh_S_oc-temp_lat_thickness 61 | lh_S_oc-temp_med&Lingual_thickness 62 | lh_S_orbital_lateral_thickness 63 | lh_S_orbital_med-olfact_thickness 64 | lh_S_orbital-H_Shaped_thickness 65 | lh_S_parieto_occipital_thickness 66 | lh_S_pericallosal_thickness 67 | lh_S_postcentral_thickness 68 | lh_S_precentral-inf-part_thickness 69 | lh_S_precentral-sup-part_thickness 70 | lh_S_suborbital_thickness 71 | lh_S_subparietal_thickness 72 | lh_S_temporal_inf_thickness 73 | lh_S_temporal_sup_thickness 74 | 
lh_S_temporal_transverse_thickness 75 | lh_MeanThickness_thickness -------------------------------------------------------------------------------- /data/phenotypes_rh.txt: -------------------------------------------------------------------------------- 1 | rh_G&S_frontomargin_thickness 2 | rh_G&S_occipital_inf_thickness 3 | rh_G&S_paracentral_thickness 4 | rh_G&S_subcentral_thickness 5 | rh_G&S_transv_frontopol_thickness 6 | rh_G&S_cingul-Ant_thickness 7 | rh_G&S_cingul-Mid-Ant_thickness 8 | rh_G&S_cingul-Mid-Post_thickness 9 | rh_G_cingul-Post-dorsal_thickness 10 | rh_G_cingul-Post-ventral_thickness 11 | rh_G_cuneus_thickness 12 | rh_G_front_inf-Opercular_thickness 13 | rh_G_front_inf-Orbital_thickness 14 | rh_G_front_inf-Triangul_thickness 15 | rh_G_front_middle_thickness 16 | rh_G_front_sup_thickness 17 | rh_G_Ins_lg&S_cent_ins_thickness 18 | rh_G_insular_short_thickness 19 | rh_G_occipital_middle_thickness 20 | rh_G_occipital_sup_thickness 21 | rh_G_oc-temp_lat-fusifor_thickness 22 | rh_G_oc-temp_med-Lingual_thickness 23 | rh_G_oc-temp_med-Parahip_thickness 24 | rh_G_orbital_thickness 25 | rh_G_pariet_inf-Angular_thickness 26 | rh_G_pariet_inf-Supramar_thickness 27 | rh_G_parietal_sup_thickness 28 | rh_G_postcentral_thickness 29 | rh_G_precentral_thickness 30 | rh_G_precuneus_thickness 31 | rh_G_rectus_thickness 32 | rh_G_subcallosal_thickness 33 | rh_G_temp_sup-G_T_transv_thickness 34 | rh_G_temp_sup-Lateral_thickness 35 | rh_G_temp_sup-Plan_polar_thickness 36 | rh_G_temp_sup-Plan_tempo_thickness 37 | rh_G_temporal_inf_thickness 38 | rh_G_temporal_middle_thickness 39 | rh_Lat_Fis-ant-Horizont_thickness 40 | rh_Lat_Fis-ant-Vertical_thickness 41 | rh_Lat_Fis-post_thickness 42 | rh_Pole_occipital_thickness 43 | rh_Pole_temporal_thickness 44 | rh_S_calcarine_thickness 45 | rh_S_central_thickness 46 | rh_S_cingul-Marginalis_thickness 47 | rh_S_circular_insula_ant_thickness 48 | rh_S_circular_insula_inf_thickness 49 | rh_S_circular_insula_sup_thickness 50 | 
rh_S_collat_transv_ant_thickness 51 | rh_S_collat_transv_post_thickness 52 | rh_S_front_inf_thickness 53 | rh_S_front_middle_thickness 54 | rh_S_front_sup_thickness 55 | rh_S_interm_prim-Jensen_thickness 56 | rh_S_intrapariet&P_trans_thickness 57 | rh_S_oc_middle&Lunatus_thickness 58 | rh_S_oc_sup&transversal_thickness 59 | rh_S_occipital_ant_thickness 60 | rh_S_oc-temp_lat_thickness 61 | rh_S_oc-temp_med&Lingual_thickness 62 | rh_S_orbital_lateral_thickness 63 | rh_S_orbital_med-olfact_thickness 64 | rh_S_orbital-H_Shaped_thickness 65 | rh_S_parieto_occipital_thickness 66 | rh_S_pericallosal_thickness 67 | rh_S_postcentral_thickness 68 | rh_S_precentral-inf-part_thickness 69 | rh_S_precentral-sup-part_thickness 70 | rh_S_suborbital_thickness 71 | rh_S_subparietal_thickness 72 | rh_S_temporal_inf_thickness 73 | rh_S_temporal_sup_thickness 74 | rh_S_temporal_transverse_thickness 75 | rh_MeanThickness_thickness -------------------------------------------------------------------------------- /data/phenotypes_sc.txt: -------------------------------------------------------------------------------- 1 | Left-Lateral-Ventricle 2 | Left-Inf-Lat-Vent 3 | Left-Cerebellum-White-Matter 4 | Left-Cerebellum-Cortex 5 | Left-Thalamus-Proper 6 | Left-Caudate 7 | Left-Putamen 8 | Left-Pallidum 9 | 3rd-Ventricle 10 | 4th-Ventricle 11 | Brain-Stem 12 | Left-Hippocampus 13 | Left-Amygdala 14 | CSF 15 | Left-Accumbens-area 16 | Left-VentralDC 17 | Left-vessel 18 | Left-choroid-plexus 19 | Right-Lateral-Ventricle 20 | Right-Inf-Lat-Vent 21 | Right-Cerebellum-White-Matter 22 | Right-Cerebellum-Cortex 23 | Right-Thalamus-Proper 24 | Right-Caudate 25 | Right-Putamen 26 | Right-Pallidum 27 | Right-Hippocampus 28 | Right-Amygdala 29 | Right-Accumbens-area 30 | Right-VentralDC 31 | Right-vessel 32 | Right-choroid-plexus 33 | SubCortGrayVol 34 | TotalGrayVol 35 | SupraTentorialVol 36 | SupraTentorialVolNotVent 37 | EstimatedTotalIntraCranialVol 38 | 
-------------------------------------------------------------------------------- /data/sz_ct.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/data/sz_ct.npy -------------------------------------------------------------------------------- /data/sz_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/data/sz_labels.npy -------------------------------------------------------------------------------- /data/sz_z.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/data/sz_z.npy -------------------------------------------------------------------------------- /data/task1_phenotypes.txt: -------------------------------------------------------------------------------- 1 | lh_bankssts_thickness 2 | lh_caudalanteriorcingulate_thickness 3 | lh_caudalmiddlefrontal_thickness 4 | lh_cuneus_thickness 5 | lh_entorhinal_thickness 6 | lh_fusiform_thickness 7 | lh_inferiorparietal_thickness 8 | lh_inferiortemporal_thickness 9 | lh_isthmuscingulate_thickness 10 | lh_lateraloccipital_thickness 11 | lh_lateralorbitofrontal_thickness 12 | lh_lingual_thickness 13 | lh_medialorbitofrontal_thickness 14 | lh_middletemporal_thickness 15 | lh_parahippocampal_thickness 16 | lh_paracentral_thickness 17 | lh_parsopercularis_thickness 18 | lh_parsorbitalis_thickness 19 | lh_parstriangularis_thickness 20 | lh_pericalcarine_thickness 21 | lh_postcentral_thickness 22 | lh_posteriorcingulate_thickness 23 | lh_precentral_thickness 24 | lh_precuneus_thickness 25 | lh_rostralanteriorcingulate_thickness 26 | lh_rostralmiddlefrontal_thickness 27 | lh_superiorfrontal_thickness 28 | 
lh_superiorparietal_thickness 29 | lh_superiortemporal_thickness 30 | lh_supramarginal_thickness 31 | lh_frontalpole_thickness 32 | lh_temporalpole_thickness 33 | lh_transversetemporal_thickness 34 | lh_insula_thickness 35 | lh_MeanThickness_thickness 36 | rh_bankssts_thickness 37 | rh_caudalanteriorcingulate_thickness 38 | rh_caudalmiddlefrontal_thickness 39 | rh_cuneus_thickness 40 | rh_entorhinal_thickness 41 | rh_fusiform_thickness 42 | rh_inferiorparietal_thickness 43 | rh_inferiortemporal_thickness 44 | rh_isthmuscingulate_thickness 45 | rh_lateraloccipital_thickness 46 | rh_lateralorbitofrontal_thickness 47 | rh_lingual_thickness 48 | rh_medialorbitofrontal_thickness 49 | rh_middletemporal_thickness 50 | rh_parahippocampal_thickness 51 | rh_paracentral_thickness 52 | rh_parsopercularis_thickness 53 | rh_parsorbitalis_thickness 54 | rh_parstriangularis_thickness 55 | rh_pericalcarine_thickness 56 | rh_postcentral_thickness 57 | rh_posteriorcingulate_thickness 58 | rh_precentral_thickness 59 | rh_precuneus_thickness 60 | rh_rostralanteriorcingulate_thickness 61 | rh_rostralmiddlefrontal_thickness 62 | rh_superiorfrontal_thickness 63 | rh_superiorparietal_thickness 64 | rh_superiortemporal_thickness 65 | rh_supramarginal_thickness 66 | rh_frontalpole_thickness 67 | rh_temporalpole_thickness 68 | rh_transversetemporal_thickness 69 | rh_insula_thickness 70 | rh_MeanThickness_thickness -------------------------------------------------------------------------------- /nm_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import re 4 | import numpy as np 5 | import pandas as pd 6 | import shutil 7 | import pickle 8 | import subprocess 9 | 10 | import pcntoolkit.dataio.fileio as fileio 11 | 12 | #################################### FUNCTIONS ################################ 13 | def calibration_descriptives(x): 14 | n = np.shape(x)[0] 15 | m1 = np.mean(x) 16 | m2 = sum((x-m1)**2) 17 | m3 = 
sum((x-m1)**3) 18 | m4 = sum((x-m1)**4) 19 | s1 = np.std(x) 20 | skew = n*m3/(n-1)/(n-2)/s1**3 21 | sdskew = np.sqrt( 6*n*(n-1) / ((n-2)*(n+1)*(n+3)) ) 22 | kurtosis = (n*(n+1)*m4 - 3*m2**2*(n-1)) / ((n-1)*(n-2)*(n-3)*s1**4) 23 | sdkurtosis = np.sqrt( 4*(n**2-1) * sdskew**2 / ((n-3)*(n+5)) ) 24 | semean = np.sqrt(np.var(x)/n) 25 | sesd = s1/np.sqrt(2*(n-1)) 26 | cd = [skew, sdskew, kurtosis, sdkurtosis, semean, sesd] 27 | return cd 28 | 29 | 30 | 31 | def save_output(src_dir, dst_dir, savemodel=True): 32 | 33 | # move everything else to the destination dir 34 | files = [] 35 | files.extend(glob.glob(os.path.join(src_dir,'Z*'))) 36 | files.extend(glob.glob(os.path.join(src_dir,'yhat*'))) 37 | files.extend(glob.glob(os.path.join(src_dir,'ys2*'))) 38 | files.extend(glob.glob(os.path.join(src_dir,'Rho*'))) 39 | files.extend(glob.glob(os.path.join(src_dir,'pRho*'))) 40 | files.extend(glob.glob(os.path.join(src_dir,'RMSE*'))) 41 | files.extend(glob.glob(os.path.join(src_dir,'SMSE*'))) 42 | files.extend(glob.glob(os.path.join(src_dir,'MSLL*'))) 43 | files.extend(glob.glob(os.path.join(src_dir,'EXPV*'))) 44 | 45 | if savemodel: 46 | model_files = glob.glob(os.path.join(src_dir,'Models/*')) 47 | dst_model_dir = os.path.join(dst_dir, 'Models') 48 | os.makedirs(dst_model_dir, exist_ok=True) 49 | for f in model_files: 50 | fdir, fnam = os.path.split(f) 51 | shutil.move(f, os.path.join(dst_model_dir,fnam)) 52 | os.rmdir(os.path.join(src_dir,'Models')) 53 | else: 54 | # remove the model directory to save space 55 | shutil.rmtree(os.path.join(src_dir,'Models')) 56 | 57 | for f in files: 58 | fdir, fnam = os.path.split(f) 59 | shutil.move(f, os.path.join(dst_dir,fnam)) 60 | return 61 | 62 | def predict_on_new_sites(blr, hyp, X, y, Xs=None, 63 | ys=None, 64 | var_groups_test=None): 65 | """ Function to transfer the model to a new site""" 66 | # Get predictions from old model on new data X 67 | ys_ref, s2_ref = blr.predict(hyp, None, None, X) 68 | 69 | # Subtract the predictions 
from true data to get the residuals 70 | if blr.warp is None: 71 | residuals = ys_ref-y 72 | else: 73 | # Calculate the residuals in warped space 74 | y_ref_ws = blr.warp.f(y, hyp[1:blr.warp.get_n_params()+1]) 75 | residuals = ys_ref - y_ref_ws 76 | 77 | residuals_mu = np.mean(residuals) 78 | residuals_sd = np.std(residuals) 79 | 80 | # Adjust the mean with the mean of the residuals 81 | #blr.m = blr.m-np.ones((len(blr.m)))*residuals_mu 82 | #ys,s2 = blr.predict(hyp, None, None, Xs) 83 | if ys is None: 84 | if Xs is None: 85 | raise(ValueError, 'Either ys or Xs must be specified') 86 | else: 87 | ys, s2 = blr.predict(hyp, None, None, Xs) 88 | ys = ys - residuals_mu 89 | else: 90 | if blr.warp is not None: 91 | y_ws = blr.warp.f(y, hyp[1:blr.warp.get_n_params()+1]) 92 | ys = y_ws - residuals_mu 93 | else: 94 | ys = ys - residuals_mu 95 | 96 | # Set the deviation to the devations of the residuals 97 | s2 = np.ones(len(s2))*residuals_sd**2 98 | 99 | return ys, s2 100 | 101 | 102 | def test_func(x, epsilon, b): 103 | return np.sinh(b * np.arcsinh(x) + epsilon * b) 104 | 105 | def remove_bad_subjects(df, qc):#qc_file): 106 | 107 | """ 108 | Removes low-quality subjects from multi-site data based on Euler characteristic 109 | measure. 110 | 111 | * Inputs: 112 | - df: the data in a pandas' dataframe format. 113 | - qc: pandas dataframe containing the euler charcteristics. 114 | 115 | * Outputs: 116 | - df: the updated data after removing bad subjects. 117 | - removed_subjects: the list of removed subjects. 
118 | """ 119 | 120 | n = df.shape[0] 121 | 122 | euler_nums = qc['avg_en'].to_numpy(dtype=np.float32) 123 | # convert to numeric site indices 124 | #sites = df['site'].to_numpy(dtype=np.int) 125 | site_ids = pd.Series(df['site'], copy=True) 126 | for i,s in enumerate(site_ids.unique()): 127 | site_ids.loc[site_ids == s] = i 128 | sites = site_ids.to_numpy(dtype=np.int) 129 | subjects = qc.index 130 | for site in np.unique(sites): 131 | euler_nums[sites==site] = np.sqrt(-(euler_nums[sites==site])) - np.nanmedian(np.sqrt(-(euler_nums[sites==site]))) 132 | 133 | good_subjects = list(subjects[np.bitwise_or(euler_nums<=5, np.isnan(euler_nums))]) 134 | removed_subjects = list(subjects[euler_nums>5]) 135 | 136 | good_subjects = list(set(good_subjects)) 137 | 138 | dfout = df.loc[good_subjects] 139 | 140 | print(len(removed_subjects), 'subjects are removed!') 141 | 142 | return dfout, removed_subjects 143 | 144 | def retrieve_eulernum(freesurfer_dir, subjects=None): 145 | """ Get the Euler Characteristic from a set of subjects 146 | :param freesurfer_dir: Freesurfer SUBJECTS_DIR 147 | :param subjects: a list of subjects to process 148 | """ 149 | 150 | if subjects is None: 151 | subjects = [temp for temp in os.listdir(freesurfer_dir) 152 | if os.path.isdir(os.path.join(freesurfer_dir ,temp))] 153 | 154 | df = pd.DataFrame(index=subjects, columns=['lh_en','rh_en','avg_en']) 155 | missing_subjects = [] 156 | 157 | for s, sub in enumerate(subjects): 158 | sub_dir = os.path.join(freesurfer_dir, sub) 159 | log_file = os.path.join(sub_dir, 'scripts', 'recon-all.log') 160 | 161 | if os.path.exists(sub_dir): 162 | if os.path.exists(log_file): 163 | with open(log_file) as f: 164 | for line in f: 165 | # find the part that refers to the EC 166 | if re.search('orig.nofix lheno', line): 167 | eno_line = line 168 | f.close() 169 | eno_l = eno_line.split()[3][0:-1] # remove the trailing comma 170 | eno_r = eno_line.split()[6] 171 | euler = (float(eno_l) + float(eno_r)) / 2 172 | 173 | 
df.at[sub, 'lh_en'] = eno_l 174 | df.at[sub, 'rh_en'] = eno_r 175 | df.at[sub, 'avg_en'] = euler 176 | 177 | print('%d: Subject %s is successfully processed. EN = %f' 178 | %(s, sub, df.at[sub, 'avg_en'])) 179 | else: 180 | print('%d: Subject %s is missing log file, running QC ...' %(s, sub)) 181 | try: 182 | bashCommand = 'mris_euler_number '+ freesurfer_dir + sub +'/surf/lh.orig.nofix>' + 'temp_l.txt 2>&1' 183 | res = subprocess.run(bashCommand, stdout=subprocess.PIPE, shell=True) 184 | file = open('temp_l.txt', mode = 'r', encoding = 'utf-8-sig') 185 | lines = file.readlines() 186 | file.close() 187 | words = [] 188 | for line in lines: 189 | line = line.strip() 190 | words.append([item.strip() for item in line.split(' ')]) 191 | eno_l = np.float32(words[0][12]) 192 | 193 | bashCommand = 'mris_euler_number '+ freesurfer_dir + sub +'/surf/rh.orig.nofix>' + 'temp_r.txt 2>&1' 194 | res = subprocess.run(bashCommand, stdout=subprocess.PIPE, shell=True) 195 | file = open('temp_r.txt', mode = 'r', encoding = 'utf-8-sig') 196 | lines = file.readlines() 197 | file.close() 198 | words = [] 199 | for line in lines: 200 | line = line.strip() 201 | words.append([item.strip() for item in line.split(' ')]) 202 | eno_r = np.float32(words[0][12]) 203 | 204 | df.at[sub, 'lh_en'] = eno_l 205 | df.at[sub, 'rh_en'] = eno_r 206 | df.at[sub, 'avg_en'] = (eno_r + eno_l) / 2 207 | 208 | print('%d: Subject %s is successfully processed. EN = %f' 209 | %(s, sub, df.at[sub, 'avg_en'])) 210 | 211 | except: 212 | missing_subjects.append(sub) 213 | print('%d: QC is failed for subject %s.' %(s, sub)) 214 | 215 | else: 216 | missing_subjects.append(sub) 217 | print('%d: Subject %s is missing.' 
%(s, sub)) 218 | df = df.dropna() 219 | 220 | return df, missing_subjects 221 | 222 | def load_2d(filename): 223 | """ this simple function loads a data type supported by PCNtoolkit and 224 | ensures that the output is a 2d numpy array 225 | """ 226 | 227 | x = fileio.load(filename) 228 | if len(x.shape) == 1: 229 | x = x[:, np.newaxis] 230 | 231 | return x -------------------------------------------------------------------------------- /presentation/GPU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/GPU.png -------------------------------------------------------------------------------- /presentation/How_nm_compressed2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/How_nm_compressed2020.pdf -------------------------------------------------------------------------------- /presentation/Normative_Modeling_a_Framework_for_Clinical_Machinelearning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/Normative_Modeling_a_Framework_for_Clinical_Machinelearning.pdf -------------------------------------------------------------------------------- /presentation/Runtime1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/Runtime1.png -------------------------------------------------------------------------------- /presentation/Runtime2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/Runtime2.png -------------------------------------------------------------------------------- /presentation/keyboard_pref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/keyboard_pref.png -------------------------------------------------------------------------------- /presentation/settings1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/settings1.png -------------------------------------------------------------------------------- /presentation/settings2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/6e77e6e87b2780063dae57e87f0923dd54cf1845/presentation/settings2.png -------------------------------------------------------------------------------- /tasks/1_fit_normative_models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4b64f505-ad16-437a-94de-2646f35ae55f", 6 | "metadata": { 7 | "id": "4b64f505-ad16-437a-94de-2646f35ae55f" 8 | }, 9 | "source": [ 10 | "## Estimating lifespan normative models\n", 11 | "\n", 12 | "This notebook provides a complete walkthrough for an analysis of normative modelling using your own dataset. Training and testing data is provided for this tutorial. 
However, the idea is that you could subsitute our provided training and testing datasets for you own dataset (as long as it matches the same format!)\n", 13 | "\n", 14 | "First, if necessary, we install PCNtoolkit (note: this tutorial requires at least version 0.20)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "84ec2ca6-c0a2-4abf-8f05-29edc9e0fa24", 21 | "metadata": { 22 | "id": "84ec2ca6-c0a2-4abf-8f05-29edc9e0fa24" 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Make sure to click the restart runtime button at the \n", 27 | "# bottom of this code blocks' output (after you run the cell)\n", 28 | "! pip install pcntoolkit==0.20" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "909c3b45-ad46-4e6d-8732-dc5ac68488c6", 34 | "metadata": { 35 | "id": "909c3b45-ad46-4e6d-8732-dc5ac68488c6" 36 | }, 37 | "source": [ 38 | "Then we import the required libraries" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "DGQhP2LbElmI", 45 | "metadata": { 46 | "id": "DGQhP2LbElmI" 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "! 
git clone https://github.com/saigerutherford/CPC_ML_tutorial.git" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "id": "d451c106-08e2-4f5b-baf9-da240768e68b", 57 | "metadata": { 58 | "id": "d451c106-08e2-4f5b-baf9-da240768e68b" 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# we need to be in the CPC_ML_tutorial folder when we import the libraries in the code block below,\n", 63 | "# because there is a function called nm_utils that is in this folder that we need to import\n", 64 | "import os\n", 65 | "os.chdir('/content/CPC_ML_tutorial/')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 15, 71 | "id": "83c494d3-6ebd-4cde-aff0-8fc9344374dd", 72 | "metadata": { 73 | "id": "83c494d3-6ebd-4cde-aff0-8fc9344374dd" 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "import numpy as np\n", 78 | "import pandas as pd\n", 79 | "import pickle\n", 80 | "from matplotlib import pyplot as plt\n", 81 | "import seaborn as sns\n", 82 | "\n", 83 | "from pcntoolkit.normative import estimate, predict, evaluate\n", 84 | "from pcntoolkit.util.utils import compute_MSLL, create_design_matrix\n", 85 | "from nm_utils import calibration_descriptives, remove_bad_subjects, load_2d" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "9822cc19-48e9-428b-8c5e-e059fd2d23f7", 91 | "metadata": { 92 | "id": "9822cc19-48e9-428b-8c5e-e059fd2d23f7" 93 | }, 94 | "source": [ 95 | "Now, we configure the locations in which the data are stored. \n", 96 | "\n", 97 | "**Notes:** \n", 98 | "- The data are assumed to be in CSV format and will be loaded as pandas dataframes\n", 99 | "- Generally the raw data will be in a different location to the analysis\n", 100 | "- The data can have arbitrary columns but some are required by the script, i.e. 
'age', 'sex' and 'site', plus the phenotypes you wish to estimate (see below)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 16, 106 | "id": "7da01c88-7033-498b-a811-79ad58e8c17a", 107 | "metadata": { 108 | "id": "7da01c88-7033-498b-a811-79ad58e8c17a" 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "# where the raw data are stored\n", 113 | "data_dir = '/content/CPC_ML_tutorial/data/'\n", 114 | "\n", 115 | "# where the analysis takes place\n", 116 | "root_dir = '/content/CPC_ML_tutorial/'\n", 117 | "out_dir = os.path.join(root_dir,'models','test')\n", 118 | "\n", 119 | "# create the output directory if it does not already exist\n", 120 | "os.makedirs(out_dir, exist_ok=True)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "01141f19-a960-4823-baad-8604975304c3", 126 | "metadata": { 127 | "id": "01141f19-a960-4823-baad-8604975304c3" 128 | }, 129 | "source": [ 130 | "Now we load the data. \n", 131 | "\n", 132 | "We will load one pandas dataframe for the training set and one dataframe for the test set. We also configure a list of site ids." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 17, 138 | "id": "850fee6b-421f-41d9-8fd6-7e1dafbf0e9f", 139 | "metadata": { 140 | "id": "850fee6b-421f-41d9-8fd6-7e1dafbf0e9f" 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "df_tr = pd.read_csv(os.path.join(data_dir,'train_data.csv'), index_col=0) \n", 145 | "df_te = pd.read_csv(os.path.join(data_dir,'test_data.csv'), index_col=0)\n", 146 | "\n", 147 | "# extract a list of unique site ids from the training set\n", 148 | "site_ids = sorted(set(df_tr['site'].to_list()))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "29f9593a-d3c9-4d08-a877-8794203c0001", 154 | "metadata": { 155 | "id": "29f9593a-d3c9-4d08-a877-8794203c0001" 156 | }, 157 | "source": [ 158 | "### Configure which models to fit\n", 159 | "\n", 160 | "Next, we load the image derived phenotypes (IDPs) which we will process in this analysis. This is effectively just a list of columns in your dataframe. Here we estimate normative models for the left hemisphere, right hemisphere and cortical structures." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "7438ef7e-9340-4f13-8d57-816918923705", 167 | "metadata": { 168 | "id": "7438ef7e-9340-4f13-8d57-816918923705" 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "# we choose here to process all idps. Uncomment lines 2-7 (and comment line 11) to run models for the whole brain, but we suggest just starting with several ROIs\n", 173 | "#os.chdir(root_dir)\n", 174 | "#!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/master/data/task1_phenotypes.txt\n", 175 | "#with open(os.path.join(root_dir,'task1_phenotypes.txt')) as f:\n", 176 | "# idp_ids = f.read().splitlines()\n", 177 | "#for idx, ele in enumerate(idp_ids):\n", 178 | "# idp_ids[idx] = ele.replace('\\t', '')\n", 179 | "\n", 180 | "# we could also just specify a list of IDPs. 
Use this line to run just 2 models (not the whole brain)...this is a good place to start. If you have time,\n", 181 | "# you can uncomment the above line and run the whole brain models. Be sure to comment out this line if you uncomment the above line. \n", 182 | "idp_ids = ['lh_MeanThickness_thickness', 'rh_MeanThickness_thickness']" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "5d791db6-8fe5-450c-88eb-84a390b8753a", 188 | "metadata": { 189 | "id": "5d791db6-8fe5-450c-88eb-84a390b8753a" 190 | }, 191 | "source": [ 192 | "### Configure model parameters\n", 193 | "\n", 194 | "Now, we configure some parameters for the regression model we use to fit the normative model. Here we will use a 'warped' Bayesian linear regression model. To model non-Gaussianity, we select a sin arcsinh warp and to model non-linearity, we stick with the default value for the basis expansion (a cubic b-spline basis set with 5 knot points). Since we are sticking with the default value, we do not need to specify any parameters for this, but we do need to specify the limits. We choose to pad the input by a few years either side of the input range. We will also set a couple of options that control the estimation of the model\n", 195 | "\n", 196 | "For further details about the likelihood warping approach, see [Fraza et al 2021](https://www.biorxiv.org/content/10.1101/2021.04.05.438429v1)." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "0IYl-eg2xGWE", 203 | "metadata": { 204 | "id": "0IYl-eg2xGWE" 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "# check the min & max age of the dataset, use this info to update the xmin & xmax variables in the code block below. 
\n", 209 | "df_tr['age'].describe()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 22, 215 | "id": "e44e257c-676e-49d8-89ec-657e506c3b74", 216 | "metadata": { 217 | "id": "e44e257c-676e-49d8-89ec-657e506c3b74" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# which data columns do we wish to use as covariates? \n", 222 | "# You could add additional covariates from your own dataset here that you wish to use as predictors.\n", 223 | "# However, for this tutorial today we will keep it simple and just use age & sex. \n", 224 | "# Maybe discuss with your partner ideas you have for other covariates you would like to include.\n", 225 | "cols_cov = ['age','sex']\n", 226 | "\n", 227 | "# which warping function to use? We can set this to None in order to fit a vanilla Gaussian noise model\n", 228 | "warp = 'WarpSinArcsinh'\n", 229 | "\n", 230 | "# limits for cubic B-spline basis \n", 231 | "# check the min & max ages of the dataframes, add 5 to the max \n", 232 | "# and subtract 5 from the min and adjust these variables accordingly\n", 233 | "xmin = 13# set this variable\n", 234 | "xmax = 92# set this variable\n", 235 | "\n", 236 | "# Do we want to force the model to be refit every time? \n", 237 | "# When training normative model from scratch like we are doing in this notebook (not re-using a pre-trained model), \n", 238 | "# this variable should be = True\n", 239 | "force_refit = True \n", 240 | "\n", 241 | "# Absolute Z threshold above which a sample is considered to be an outlier (without fitting any model)\n", 242 | "outlier_thresh = 7" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "896842d7-8913-4137-9d86-4757c42bcf1b", 248 | "metadata": { 249 | "id": "896842d7-8913-4137-9d86-4757c42bcf1b" 250 | }, 251 | "source": [ 252 | "### Fit the models\n", 253 | "\n", 254 | "Now we fit the models. This involves looping over the IDPs we have selected. 
We will use a module from PCNtoolkit to set up the design matrices, containing the covariates, fixed effects for site and nonlinear basis expansion. " 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "a4e9b50c-574b-4e2c-a511-cc444db4393e", 261 | "metadata": { 262 | "id": "a4e9b50c-574b-4e2c-a511-cc444db4393e" 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "for idp_num, idp in enumerate(idp_ids): \n", 267 | " print('Running IDP', idp_num, idp, ':')\n", 268 | " \n", 269 | " # set output dir \n", 270 | " idp_dir = os.path.join(out_dir, idp)\n", 271 | " os.makedirs(os.path.join(idp_dir), exist_ok=True)\n", 272 | " os.chdir(idp_dir)\n", 273 | " \n", 274 | " # extract the response variables for training and test set\n", 275 | " y_tr = df_tr[idp].to_numpy() \n", 276 | " y_te = df_te[idp].to_numpy()\n", 277 | " \n", 278 | " # remove gross outliers and implausible values\n", 279 | " yz_tr = (y_tr - np.mean(y_tr)) / np.std(y_tr)\n", 280 | " yz_te = (y_te - np.mean(y_te)) / np.std(y_te)\n", 281 | " nz_tr = np.bitwise_and(np.abs(yz_tr) < outlier_thresh, y_tr > 0)\n", 282 | " nz_te = np.bitwise_and(np.abs(yz_te) < outlier_thresh, y_te > 0)\n", 283 | " y_tr = y_tr[nz_tr]\n", 284 | " y_te = y_te[nz_te]\n", 285 | " \n", 286 | " # write out the response variables for training and test\n", 287 | " resp_file_tr = os.path.join(idp_dir, 'resp_tr.txt')\n", 288 | " resp_file_te = os.path.join(idp_dir, 'resp_te.txt') \n", 289 | " np.savetxt(resp_file_tr, y_tr)\n", 290 | " np.savetxt(resp_file_te, y_te)\n", 291 | " \n", 292 | " # configure the design matrix\n", 293 | " X_tr = create_design_matrix(df_tr[cols_cov].loc[nz_tr], \n", 294 | " site_ids = df_tr['site'].loc[nz_tr],\n", 295 | " basis = 'bspline', \n", 296 | " xmin = xmin, \n", 297 | " xmax = xmax)\n", 298 | " X_te = create_design_matrix(df_te[cols_cov].loc[nz_te], \n", 299 | " site_ids = df_te['site'].loc[nz_te], \n", 300 | " all_sites=site_ids,\n", 301 | " basis = 'bspline', \n", 
302 | " xmin = xmin, \n", 303 | " xmax = xmax)\n", 304 | "\n", 305 | " # configure and save the covariates\n", 306 | " cov_file_tr = os.path.join(idp_dir, 'cov_bspline_tr.txt')\n", 307 | " cov_file_te = os.path.join(idp_dir, 'cov_bspline_te.txt')\n", 308 | " np.savetxt(cov_file_tr, X_tr)\n", 309 | " np.savetxt(cov_file_te, X_te)\n", 310 | "\n", 311 | " if not force_refit and os.path.exists(os.path.join(idp_dir, 'Models', 'NM_0_0_estimate.pkl')):\n", 312 | " print('Making predictions using a pre-existing model...')\n", 313 | " suffix = 'predict'\n", 314 | " \n", 315 | " # Make predictions with test data\n", 316 | " predict(cov_file_te, \n", 317 | " alg='blr', \n", 318 | " respfile=resp_file_te, \n", 319 | " model_path=os.path.join(idp_dir,'Models'),\n", 320 | " outputsuffix=suffix)\n", 321 | " else:\n", 322 | " print('Estimating the normative model...')\n", 323 | " estimate(cov_file_tr, resp_file_tr, testresp=resp_file_te, \n", 324 | " testcov=cov_file_te, alg='blr', optimizer = 'l-bfgs-b', \n", 325 | " savemodel=True, warp=warp, warp_reparam=True)\n", 326 | " suffix = 'estimate'" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "925f77cf-c873-4047-91ac-50b9571704fd", 332 | "metadata": { 333 | "id": "925f77cf-c873-4047-91ac-50b9571704fd" 334 | }, 335 | "source": [ 336 | "### Compute error metrics\n", 337 | "\n", 338 | "In this section we compute the following error metrics for all IDPs (all evaluated on the test set):\n", 339 | "\n", 340 | "- Negative log likelihood (NLL)\n", 341 | "- Explained variance (EV)\n", 342 | "- Mean standardized log loss (MSLL)\n", 343 | "- Bayesian information Criteria (BIC)\n", 344 | "- Skew and Kurtosis of the Z-distribution" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "2e9d7500-4f46-4ee1-9756-81758ae5b1d1", 351 | "metadata": { 352 | "id": "2e9d7500-4f46-4ee1-9756-81758ae5b1d1" 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "# initialise dataframe we will use to 
store quantitative metrics \n", 357 | "blr_metrics = pd.DataFrame(columns = ['eid', 'NLL', 'EV', 'MSLL', 'BIC','Skew','Kurtosis'])\n", 358 | "\n", 359 | "for idp_num, idp in enumerate(idp_ids): \n", 360 | " idp_dir = os.path.join(out_dir, idp)\n", 361 | " \n", 362 | " # load the predictions and true data. We use a custom function that ensures 2d arrays\n", 363 | " # equivalent to: y = np.loadtxt(filename); y = y[:, np.newaxis]\n", 364 | " yhat_te = load_2d(os.path.join(idp_dir, 'yhat_' + suffix + '.txt'))\n", 365 | " s2_te = load_2d(os.path.join(idp_dir, 'ys2_' + suffix + '.txt'))\n", 366 | " y_te = load_2d(os.path.join(idp_dir, 'resp_te.txt'))\n", 367 | " \n", 368 | " with open(os.path.join(idp_dir,'Models', 'NM_0_0_estimate.pkl'), 'rb') as handle:\n", 369 | " nm = pickle.load(handle) \n", 370 | " \n", 371 | " # compute error metrics\n", 372 | " if warp is None:\n", 373 | " metrics = evaluate(y_te, yhat_te) \n", 374 | " \n", 375 | " # compute MSLL manually as a sanity check\n", 376 | " y_tr_mean = np.array( [[np.mean(y_tr)]] )\n", 377 | " y_tr_var = np.array( [[np.var(y_tr)]] )\n", 378 | " MSLL = compute_MSLL(y_te, yhat_te, s2_te, y_tr_mean, y_tr_var) \n", 379 | " else:\n", 380 | " warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params()+1] \n", 381 | " W = nm.blr.warp\n", 382 | " \n", 383 | " # warp predictions\n", 384 | " med_te = W.warp_predictions(np.squeeze(yhat_te), np.squeeze(s2_te), warp_param)[0]\n", 385 | " med_te = med_te[:, np.newaxis]\n", 386 | " \n", 387 | " # evaluation metrics\n", 388 | " metrics = evaluate(y_te, med_te)\n", 389 | " \n", 390 | " # compute MSLL manually\n", 391 | " y_te_w = W.f(y_te, warp_param)\n", 392 | " y_tr_w = W.f(y_tr, warp_param)\n", 393 | " y_tr_mean = np.array( [[np.mean(y_tr_w)]] )\n", 394 | " y_tr_var = np.array( [[np.var(y_tr_w)]] )\n", 395 | " MSLL = compute_MSLL(y_te_w, yhat_te, s2_te, y_tr_mean, y_tr_var) \n", 396 | " \n", 397 | " Z = np.loadtxt(os.path.join(idp_dir, 'Z_' + suffix + '.txt'))\n", 398 | " [skew, sdskew, 
kurtosis, sdkurtosis, semean, sesd] = calibration_descriptives(Z)\n", 399 | " \n", 400 | " BIC = len(nm.blr.hyp) * np.log(y_tr.shape[0]) + 2 * nm.neg_log_lik\n", 401 | " \n", 402 | " blr_metrics.loc[len(blr_metrics)] = [idp, nm.neg_log_lik, metrics['EXPV'][0], \n", 403 | " MSLL[0], BIC, skew, kurtosis]\n", 404 | " \n", 405 | "display(blr_metrics)\n", 406 | "\n", 407 | "blr_metrics.to_csv(os.path.join(out_dir,'blr_metrics.csv'))" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "id": "NCpzbIwGxVWj", 414 | "metadata": { 415 | "id": "NCpzbIwGxVWj" 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "blr_metrics['EV'].describe()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "Et7L-t9RJl75", 426 | "metadata": { 427 | "id": "Et7L-t9RJl75" 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "blr_metrics['MSLL'].describe()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "s25LW4QuJqfW", 438 | "metadata": { 439 | "id": "s25LW4QuJqfW" 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "blr_metrics['EV'].hist()" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "id": "mBhUMsojJu5J", 450 | "metadata": { 451 | "id": "mBhUMsojJu5J" 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "blr_metrics['MSLL'].hist()" 456 | ] 457 | } 458 | ], 459 | "metadata": { 460 | "colab": { 461 | "name": "1_fit_normative_models.ipynb", 462 | "provenance": [], 463 | "toc_visible": true 464 | }, 465 | "kernelspec": { 466 | "display_name": "Python 3 (ipykernel)", 467 | "language": "python", 468 | "name": "python3" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 3 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython3", 480 | "version": "3.9.7" 481 | } 
482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 5 485 | } 486 | -------------------------------------------------------------------------------- /tasks/2_apply_normative_models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2d8fb4c8-4360-4fdc-b0a2-e1c2e22bd8f9", 6 | "metadata": { 7 | "id": "2d8fb4c8-4360-4fdc-b0a2-e1c2e22bd8f9" 8 | }, 9 | "source": [ 10 | "## Using lifespan models to make predictions on new data\n", 11 | "\n", 12 | "This notebook shows how to apply the coefficients from [pre-estimated normative models](https://www.biorxiv.org/content/10.1101/2021.08.08.455487v2) to new data. This can be done in two different ways: (i) using a new set of data derived from the same sites used to estimate the model and (ii) on a completely different set of sites. In the latter case, we also need to estimate the site effect, which requires some calibration/adaptation data. As an illustrative example, we use a dataset derived from the [1000 functional connectomes project](https://www.nitrc.org/forum/forum.php?thread_id=2907&forum_id=1383) and adapt the learned model to make predictions on these data. \n", 13 | "\n", 14 | "First, if necessary, we install PCNtoolkit (note: this tutorial requires at least version 0.20)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "8d05182a-5346-49d2-bfbf-fd3769ecc061", 21 | "metadata": { 22 | "colab": { 23 | "base_uri": "https://localhost:8080/", 24 | "height": 1000 25 | }, 26 | "id": "8d05182a-5346-49d2-bfbf-fd3769ecc061", 27 | "outputId": "22c20334-2291-4553-8e95-9477882ce5c5" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "! 
pip install pcntoolkit==0.20" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "5V6JFzpdJ43R", 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "5V6JFzpdJ43R", 43 | "outputId": "385682e4-f053-4cc6-d6b4-7e018eede435" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "! git clone https://github.com/predictive-clinical-neuroscience/braincharts.git" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "_5ZET1btKF6J", 54 | "metadata": { 55 | "id": "_5ZET1btKF6J" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# we need to be in the scripts folder when we import the libraries in the code block below,\n", 60 | "# because there is a function called nm_utils that is in the scripts folder that we need to import\n", 61 | "import os\n", 62 | "os.chdir('/content/braincharts/scripts/')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "b2227bc7-e798-470a-99bc-33561ce4511b", 68 | "metadata": { 69 | "id": "b2227bc7-e798-470a-99bc-33561ce4511b" 70 | }, 71 | "source": [ 72 | "Now we import the required libraries" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "ff661cf2-7d80-46bb-bcfb-1650a93eed3d", 79 | "metadata": { 80 | "id": "ff661cf2-7d80-46bb-bcfb-1650a93eed3d" 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "import numpy as np\n", 85 | "import pandas as pd\n", 86 | "import pickle\n", 87 | "from matplotlib import pyplot as plt\n", 88 | "import seaborn as sns\n", 89 | "\n", 90 | "from pcntoolkit.normative import estimate, predict, evaluate\n", 91 | "from pcntoolkit.util.utils import compute_MSLL, create_design_matrix\n", 92 | "from nm_utils import remove_bad_subjects, load_2d" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "TFxsGN-KgfE0", 98 | "metadata": { 99 | "id": "TFxsGN-KgfE0" 100 | }, 101 | "source": [ 102 | "We need to unzip the models. 
" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "0OvpUTaIgekS", 109 | "metadata": { 110 | "id": "0OvpUTaIgekS" 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "os.chdir('/content/braincharts/models/')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "WBP9CEVcgsjT", 121 | "metadata": { 122 | "colab": { 123 | "base_uri": "https://localhost:8080/" 124 | }, 125 | "id": "WBP9CEVcgsjT", 126 | "outputId": "da6f94f5-fff3-4ebb-aee1-45ddd0af0210" 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "ls" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "is47bTl_guD4", 137 | "metadata": { 138 | "id": "is47bTl_guD4" 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# we will use the biggest sample as our training set (approx. N=57000 subjects from 82 sites)\n", 143 | "# for more info on the other pretrained models available in this repository, \n", 144 | "# please refer to the accompanying preprint https://www.biorxiv.org/content/10.1101/2021.08.08.455487v2\n", 145 | "! unzip lifespan_57K_82sites.zip" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "802b1da6-04cc-4310-af81-f50d38c3e653", 151 | "metadata": { 152 | "id": "802b1da6-04cc-4310-af81-f50d38c3e653" 153 | }, 154 | "source": [ 155 | "Next, we configure some basic variables, like where we want the analysis to be done and which model we want to use.\n", 156 | "\n", 157 | "**Note:** We maintain a list of site ids for each dataset, which describe the site names in the training and test data (`site_ids_tr` and `site_ids_te`), plus also the adaptation data . The training site ids are provided as a text file in the distribution and the test ids are extracted automatically from the pandas dataframe (see below). If you use additional data from the sites (e.g. 
later waves from ABCD), it may be necessary to adjust the site names to match the names in the training set. See the accompanying [paper](https://www.biorxiv.org/content/10.1101/2021.08.08.455487v2) for more details." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "26b35c64-41fd-4ecd-bf6e-3e7b34a67279", 164 | "metadata": { 165 | "id": "26b35c64-41fd-4ecd-bf6e-3e7b34a67279" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "# which model do we wish to use?\n", 170 | "model_name = 'lifespan_57K_82sites'\n", 171 | "site_names = 'site_ids_ct_82sites.txt'\n", 172 | "\n", 173 | "# where the analysis takes place\n", 174 | "root_dir = '/content/braincharts'\n", 175 | "out_dir = os.path.join(root_dir, 'models', model_name)\n", 176 | "\n", 177 | "# load a set of site ids from this model. This must match the training data\n", 178 | "with open(os.path.join(root_dir,'docs', site_names)) as f:\n", 179 | " site_ids_tr = f.read().splitlines()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "8dbaebd7-4f86-47d8-82a5-1776eb96690f", 185 | "metadata": { 186 | "id": "8dbaebd7-4f86-47d8-82a5-1776eb96690f" 187 | }, 188 | "source": [ 189 | "### Download test dataset\n", 190 | "\n", 191 | "As mentioned above, to demonstrate this tool we will use a test dataset derived from the FCON 1000 dataset. We provide a prepackaged training/test split of these data in the required format (also after removing sites with only a few data points), [here](https://github.com/predictive-clinical-neuroscience/PCNtoolkit-demo/tree/main/data). 
You can get these data by running the following commands:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "60f72165-9b2f-4248-ba72-1a1f9683d280", 198 | "metadata": { 199 | "colab": { 200 | "base_uri": "https://localhost:8080/" 201 | }, 202 | "id": "60f72165-9b2f-4248-ba72-1a1f9683d280", 203 | "outputId": "7f665ae9-4bac-4b95-e733-d063624d24ea" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "os.chdir(root_dir)\n", 208 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/master/data/fcon1000_tr.csv\n", 209 | "!wget -nc https://raw.githubusercontent.com/saigerutherford/CPC_ML_tutorial/master/data/fcon1000_te.csv" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "3aab54a5-2579-48d8-a81b-bbd34cea1213", 215 | "metadata": { 216 | "id": "3aab54a5-2579-48d8-a81b-bbd34cea1213" 217 | }, 218 | "source": [ 219 | "### Load test data\n", 220 | "\n", 221 | "Now we load the test data and remove some subjects that may have poor scan quality. This assessment is based on the Freesurfer Euler characteristic as described in the papers below. \n", 222 | "\n", 223 | "**Note:** For the purposes of this tutorial, we make predictions for all sites in the FCON 1000 dataset, but two of them were also included in the training data (named 'Baltimore' and 'NewYork_a'). 
In this case, this will only slightly bias the accuracy, but in order to replicate the results in the paper, it would be necessary to additionally remove these sites from the test dataframe.\n", 224 | "\n", 225 | "**References**\n", 226 | "- [Kia et al 2021](https://www.biorxiv.org/content/10.1101/2021.05.28.446120v1.abstract)\n", 227 | "- [Rosen et al 2018](https://www.sciencedirect.com/science/article/abs/pii/S1053811917310832?via%3Dihub)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "262d429a-160b-4ba3-9ba4-9acc195bc644", 234 | "metadata": { 235 | "colab": { 236 | "base_uri": "https://localhost:8080/" 237 | }, 238 | "id": "262d429a-160b-4ba3-9ba4-9acc195bc644", 239 | "outputId": "e38c0a03-3f44-463b-e385-ec01eafb660a" 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "test_data = os.path.join(root_dir, 'fcon1000_te.csv')\n", 244 | "\n", 245 | "df_te = pd.read_csv(test_data, index_col=0)\n", 246 | "\n", 247 | "# remove some bad subjects\n", 248 | "df_te, bad_sub = remove_bad_subjects(df_te, df_te)\n", 249 | "\n", 250 | "# extract a list of unique site ids from the test set\n", 251 | "site_ids_te = sorted(set(df_te['site'].to_list()))" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "c636509a-8b12-43f1-811c-08cb22640be2", 257 | "metadata": { 258 | "id": "c636509a-8b12-43f1-811c-08cb22640be2" 259 | }, 260 | "source": [ 261 | "### Load adaptation data\n", 262 | "\n", 263 | "If the data you wish to make predictions for is not derived from the same scanning sites as those in the training set, it is necessary to learn the site effect so that we can account for it in the predictions. In order to do this in an unbiased way, we use a separate dataset, which we refer to as 'adaptation' data. This must contain data for all the same sites as in the test dataset and we assume these are coded in the same way, based on the 'sitenum' column in the dataframe. 
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "53551023-aff6-4934-ad2d-d77bc63c562d", 270 | "metadata": { 271 | "colab": { 272 | "base_uri": "https://localhost:8080/" 273 | }, 274 | "id": "53551023-aff6-4934-ad2d-d77bc63c562d", 275 | "outputId": "b59cc4e3-3646-47b7-eff8-0abb60dce75e" 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "adaptation_data = os.path.join(root_dir, 'fcon1000_tr.csv')\n", 280 | "\n", 281 | "df_ad = pd.read_csv(adaptation_data, index_col=0)\n", 282 | "\n", 283 | "# remove some bad subjects\n", 284 | "df_ad, bad_sub = remove_bad_subjects(df_ad, df_ad)\n", 285 | "\n", 286 | "# extract a list of unique site ids from the test set\n", 287 | "site_ids_ad = sorted(set(df_ad['site'].to_list()))\n", 288 | "\n", 289 | "if not all(elem in site_ids_ad for elem in site_ids_te):\n", 290 | " print('Warning: some of the testing sites are not in the adaptation data')" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "4f73e30e-c693-44b8-98c6-52b71b577ea8", 296 | "metadata": { 297 | "id": "4f73e30e-c693-44b8-98c6-52b71b577ea8" 298 | }, 299 | "source": [ 300 | "### Configure which models to fit\n", 301 | "\n", 302 | "Now, we configure which imaging derived phenotypes (IDPs) we would like to process. This is just a list of column names in the dataframe we have loaded above. \n", 303 | "\n", 304 | "We could load the whole set i.e., all phenotypes for which we have models for (188 brain regions)." 
305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "b48e104c-cbac-4ae2-8377-cd3ff80162fd", 311 | "metadata": { 312 | "id": "b48e104c-cbac-4ae2-8377-cd3ff80162fd" 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "# load the list of idps for left and right hemispheres, plus subcortical regions\n", 317 | "with open(os.path.join(root_dir,'docs','phenotypes_ct_lh.txt')) as f:\n", 318 | " idp_ids_lh = f.read().splitlines()\n", 319 | "with open(os.path.join(root_dir,'docs','phenotypes_ct_rh.txt')) as f:\n", 320 | " idp_ids_rh = f.read().splitlines()\n", 321 | "with open(os.path.join(root_dir,'docs','phenotypes_sc.txt')) as f:\n", 322 | " idp_ids_sc = f.read().splitlines()\n", 323 | "\n", 324 | "# we choose here to process all idps\n", 325 | "idp_ids = idp_ids_lh + idp_ids_rh + idp_ids_sc" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "280731ad-47d8-43e2-8cb5-4eccfd9f3f81", 331 | "metadata": { 332 | "id": "280731ad-47d8-43e2-8cb5-4eccfd9f3f81" 333 | }, 334 | "source": [ 335 | "... or alternatively, we could just specify a list of the brain regions we are interested in. " 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "id": "8b74d75f-77a5-474a-9c9b-29aab1ce53a2", 342 | "metadata": { 343 | "id": "8b74d75f-77a5-474a-9c9b-29aab1ce53a2" 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "idp_ids = [ 'Left-Thalamus-Proper', 'Left-Lateral-Ventricle', 'rh_MeanThickness_thickness']" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "56ee1f7f-8684-4f1c-b142-a68176407029", 353 | "metadata": { 354 | "id": "56ee1f7f-8684-4f1c-b142-a68176407029" 355 | }, 356 | "source": [ 357 | "### Configure covariates \n", 358 | "\n", 359 | "Now, we configure some parameters to fit the model. First, we choose which columns of the pandas dataframe contain the covariates (age and sex). 
The site parameters are configured automatically later on by the `configure_design_matrix()` function, when we loop through the IDPs in the list\n", 360 | "\n", 361 | "The supplied coefficients are derived from a 'warped' Bayesian linear regression model, which uses a nonlinear warping function to model non-Gaussianity (`sinarcsinh`) plus a non-linear basis expansion (a cubic b-spline basis set with 5 knot points, which is the default value in the PCNtoolkit package). Since we are sticking with the default value, we do not need to specify any parameters for this, but we do need to specify the limits. We choose to pad the input by a few years either side of the input range. We will also set a couple of options that control the estimation of the model\n", 362 | "\n", 363 | "For further details about the likelihood warping approach, see the accompanying paper and [Fraza et al 2021](https://www.biorxiv.org/content/10.1101/2021.04.05.438429v1)." 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "62312b8e-4972-4238-abf9-87d9bb33cc10", 370 | "metadata": { 371 | "id": "62312b8e-4972-4238-abf9-87d9bb33cc10" 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "# which data columns do we wish to use as covariates? \n", 376 | "cols_cov = ['age','sex']\n", 377 | "\n", 378 | "# limits for cubic B-spline basis \n", 379 | "xmin = -5 \n", 380 | "xmax = 110\n", 381 | "\n", 382 | "# Absolute Z threshold above which a sample is considered to be an outlier (without fitting any model)\n", 383 | "outlier_thresh = 7" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "id": "42bc1072-e9ed-4f2a-9fdd-cbd626a61542", 389 | "metadata": { 390 | "id": "42bc1072-e9ed-4f2a-9fdd-cbd626a61542" 391 | }, 392 | "source": [ 393 | "### Make predictions\n", 394 | "\n", 395 | "This will make predictions for each IDP separately. This is done by extracting a column from the dataframe (i.e. 
specifying the IDP as the response variable) and saving it as a numpy array. Then, we configure the covariates, which is a numpy data array having the number of rows equal to the number of datapoints in the test set. The columns are specified as follows: \n", 396 | "\n", 397 | "- A global intercept (column of ones)\n", 398 | "- The covariate columns (here age and sex, coded as 0=female/1=male)\n", 399 | "- Dummy coded columns for the sites in the training set (one column per site)\n", 400 | "- Columns for the basis expansion (seven columns for the default parameterisation)\n", 401 | "\n", 402 | "Once these are saved as numpy arrays in ascii format (as here) or (alternatively) in pickle format, these are passed as inputs to the `predict()` method in the PCNtoolkit normative modelling framework. These are written in the same format to the location specified by `idp_dir`. At the end of this step, we have a set of predictions and Z-statistics for the test dataset that we can take forward to further analysis.\n", 403 | "\n", 404 | "Note that when we need to make predictions on new data, the procedure is more involved, since we need to prepare, process and store covariates, response variables and site ids for the adaptation data. 
" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "07b7471b-c334-464f-8273-b409b7acaac2", 411 | "metadata": { 412 | "colab": { 413 | "base_uri": "https://localhost:8080/" 414 | }, 415 | "id": "07b7471b-c334-464f-8273-b409b7acaac2", 416 | "outputId": "b5345b37-8335-47c6-c962-47d17a41c384" 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "for idp_num, idp in enumerate(idp_ids): \n", 421 | " print('Running IDP', idp_num, idp, ':')\n", 422 | " idp_dir = os.path.join(out_dir, idp)\n", 423 | " os.chdir(idp_dir)\n", 424 | " \n", 425 | " # extract and save the response variables for the test set\n", 426 | " y_te = df_te[idp].to_numpy()\n", 427 | " \n", 428 | " # save the variables\n", 429 | " resp_file_te = os.path.join(idp_dir, 'resp_te.txt') \n", 430 | " np.savetxt(resp_file_te, y_te)\n", 431 | " \n", 432 | " # configure and save the design matrix\n", 433 | " cov_file_te = os.path.join(idp_dir, 'cov_bspline_te.txt')\n", 434 | " X_te = create_design_matrix(df_te[cols_cov], \n", 435 | " site_ids = df_te['site'],\n", 436 | " all_sites = site_ids_tr,\n", 437 | " basis = 'bspline', \n", 438 | " xmin = xmin, \n", 439 | " xmax = xmax)\n", 440 | " np.savetxt(cov_file_te, X_te)\n", 441 | " \n", 442 | " # check whether all sites in the test set are represented in the training set\n", 443 | " if all(elem in site_ids_tr for elem in site_ids_te):\n", 444 | " print('All sites are present in the training data')\n", 445 | " \n", 446 | " # just make predictions\n", 447 | " yhat_te, s2_te, Z = predict(cov_file_te, \n", 448 | " alg='blr', \n", 449 | " respfile=resp_file_te, \n", 450 | " model_path=os.path.join(idp_dir,'Models'))\n", 451 | " else:\n", 452 | " print('Some sites missing from the training data. 
Adapting model')\n", 453 | " \n", 454 | " # save the covariates for the adaptation data\n", 455 | " X_ad = create_design_matrix(df_ad[cols_cov], \n", 456 | " site_ids = df_ad['site'],\n", 457 | " all_sites = site_ids_tr,\n", 458 | " basis = 'bspline', \n", 459 | " xmin = xmin, \n", 460 | " xmax = xmax)\n", 461 | " cov_file_ad = os.path.join(idp_dir, 'cov_bspline_ad.txt') \n", 462 | " np.savetxt(cov_file_ad, X_ad)\n", 463 | " \n", 464 | " # save the responses for the adaptation data\n", 465 | " resp_file_ad = os.path.join(idp_dir, 'resp_ad.txt') \n", 466 | " y_ad = df_ad[idp].to_numpy()\n", 467 | " np.savetxt(resp_file_ad, y_ad)\n", 468 | " \n", 469 | " # save the site ids for the adaptation data\n", 470 | " sitenum_file_ad = os.path.join(idp_dir, 'sitenum_ad.txt') \n", 471 | " site_num_ad = df_ad['sitenum'].to_numpy(dtype=int)\n", 472 | " np.savetxt(sitenum_file_ad, site_num_ad)\n", 473 | " \n", 474 | " # save the site ids for the test data \n", 475 | " sitenum_file_te = os.path.join(idp_dir, 'sitenum_te.txt')\n", 476 | " site_num_te = df_te['sitenum'].to_numpy(dtype=int)\n", 477 | " np.savetxt(sitenum_file_te, site_num_te)\n", 478 | " \n", 479 | " yhat_te, s2_te, Z = predict(cov_file_te, \n", 480 | " alg = 'blr', \n", 481 | " respfile = resp_file_te, \n", 482 | " model_path = os.path.join(idp_dir,'Models'),\n", 483 | " adaptrespfile = resp_file_ad,\n", 484 | " adaptcovfile = cov_file_ad,\n", 485 | " adaptvargroupfile = sitenum_file_ad,\n", 486 | " testvargroupfile = sitenum_file_te)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "id": "75210821-ccb8-4bd2-82f3-641708811b21", 492 | "metadata": { 493 | "id": "75210821-ccb8-4bd2-82f3-641708811b21" 494 | }, 495 | "source": [ 496 | "### Preparing dummy data for plotting\n", 497 | "\n", 498 | "Now, we plot the centiles of variation estimated by the normative model. 
\n", 499 | "\n", 500 | "We do this by making use of a set of dummy covariates that span the whole range of the input space (for age) for a fixed value of the other covariates (e.g. sex) so that we can make predictions for these dummy data points, then plot them. We configure these dummy predictions using the same procedure as we used for the real data. We can use the same dummy data for all the IDPs we wish to plot" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "id": "2d0743d8-28ca-4a14-8ef0-99bf40434b5b", 507 | "metadata": { 508 | "colab": { 509 | "base_uri": "https://localhost:8080/" 510 | }, 511 | "id": "2d0743d8-28ca-4a14-8ef0-99bf40434b5b", 512 | "outputId": "7d4c8f2e-ca79-46e1-e5a8-0733503fde94" 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "# which sex do we want to plot? \n", 517 | "sex = 1 # 1 = male 0 = female\n", 518 | "if sex == 1: \n", 519 | " clr = 'blue';\n", 520 | "else:\n", 521 | " clr = 'red'\n", 522 | "\n", 523 | "# create dummy data for visualisation\n", 524 | "print('configuring dummy data ...')\n", 525 | "xx = np.arange(xmin, xmax, 0.5)\n", 526 | "X0_dummy = np.zeros((len(xx), 2))\n", 527 | "X0_dummy[:,0] = xx\n", 528 | "X0_dummy[:,1] = sex\n", 529 | "\n", 530 | "# create the design matrix\n", 531 | "X_dummy = create_design_matrix(X0_dummy, xmin=xmin, xmax=xmax, site_ids=None, all_sites=site_ids_tr)\n", 532 | "\n", 533 | "# save the dummy covariates\n", 534 | "cov_file_dummy = os.path.join(out_dir,'cov_bspline_dummy_mean.txt')\n", 535 | "np.savetxt(cov_file_dummy, X_dummy)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "id": "126323a3-2270-4796-97c4-94629730ddf7", 541 | "metadata": { 542 | "id": "126323a3-2270-4796-97c4-94629730ddf7" 543 | }, 544 | "source": [ 545 | "### Plotting the normative models\n", 546 | "\n", 547 | "Now we loop through the IDPs, plotting each one separately. 
The outputs of this step are a set of quantitative regression metrics for each IDP and a set of centile curves which we plot the test data against. \n", 548 | "\n", 549 | "This part of the code is relatively complex because we need to keep track of many quantities for the plotting. We also need to remember whether the data need to be warped or not. By default in PCNtoolkit, predictions in the form of `yhat, s2` are always in the warped (Gaussian) space. If we want predictions in the input (non-Gaussian) space, then we need to warp them with the inverse of the estimated warping function. This can be done using the function `nm.blr.warp.warp_predictions()`. \n", 550 | "\n", 551 | "**Note:** it is necessary to update the intercept for each of the sites. For purposes of visualisation, here we do this by adjusting the median of the data to match the dummy predictions, but note that all the quantitative metrics are estimated using the predictions that are adjusted properly using a learned offset (or adjusted using a hold-out adaptation set, as above). Note also that for the calibration data we require at least two data points of the same sex in each site to be able to estimate the variance. Of course, in a real example, you would want many more than just two since we need to get a reliable estimate of the variance for each site. 
" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "id": "cdd68cc6-212b-4149-b86a-24e842078e1a", 558 | "metadata": { 559 | "id": "cdd68cc6-212b-4149-b86a-24e842078e1a" 560 | }, 561 | "outputs": [], 562 | "source": [ 563 | "sns.set(style='whitegrid')\n", 564 | "\n", 565 | "for idp_num, idp in enumerate(idp_ids): \n", 566 | " print('Running IDP', idp_num, idp, ':')\n", 567 | " idp_dir = os.path.join(out_dir, idp)\n", 568 | " os.chdir(idp_dir)\n", 569 | " \n", 570 | " # load the true data points\n", 571 | " yhat_te = load_2d(os.path.join(idp_dir, 'yhat_predict.txt'))\n", 572 | " s2_te = load_2d(os.path.join(idp_dir, 'ys2_predict.txt'))\n", 573 | " y_te = load_2d(os.path.join(idp_dir, 'resp_te.txt'))\n", 574 | " \n", 575 | " # set up the covariates for the dummy data\n", 576 | " print('Making predictions with dummy covariates (for visualisation)')\n", 577 | " yhat, s2 = predict(cov_file_dummy, \n", 578 | " alg = 'blr', \n", 579 | " respfile = None, \n", 580 | " model_path = os.path.join(idp_dir,'Models'), \n", 581 | " outputsuffix = '_dummy')\n", 582 | " \n", 583 | " # load the normative model\n", 584 | " with open(os.path.join(idp_dir,'Models', 'NM_0_0_estimate.pkl'), 'rb') as handle:\n", 585 | " nm = pickle.load(handle) \n", 586 | " \n", 587 | " # get the warp and warp parameters\n", 588 | " W = nm.blr.warp\n", 589 | " warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params()+1] \n", 590 | " \n", 591 | " # first, we warp predictions for the true data and compute evaluation metrics\n", 592 | " med_te = W.warp_predictions(np.squeeze(yhat_te), np.squeeze(s2_te), warp_param)[0]\n", 593 | " med_te = med_te[:, np.newaxis]\n", 594 | " print('metrics:', evaluate(y_te, med_te))\n", 595 | " \n", 596 | " # then, we warp dummy predictions to create the plots\n", 597 | " med, pr_int = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2), warp_param)\n", 598 | " \n", 599 | " # extract the different variance components to visualise\n", 600 | " 
beta, junk1, junk2 = nm.blr._parse_hyps(nm.blr.hyp, X_dummy)\n", 601 | " s2n = 1/beta # variation (aleatoric uncertainty)\n", 602 | " s2s = s2-s2n # modelling uncertainty (epistemic uncertainty)\n", 603 | " \n", 604 | " # plot the data points\n", 605 | " y_te_rescaled_all = np.zeros_like(y_te)\n", 606 | " for sid, site in enumerate(site_ids_te):\n", 607 | " # plot the true test data points \n", 608 | " if all(elem in site_ids_tr for elem in site_ids_te):\n", 609 | " # all data in the test set are present in the training set\n", 610 | " \n", 611 | " # first, we select the data points belonging to this particular site\n", 612 | " idx = np.where(np.bitwise_and(X_te[:,2] == sex, X_te[:,sid+len(cols_cov)+1] !=0))[0]\n", 613 | " if len(idx) == 0:\n", 614 | " print('No data for site', sid, site, 'skipping...')\n", 615 | " continue\n", 616 | " \n", 617 | " # then directly adjust the data\n", 618 | " idx_dummy = np.bitwise_and(X_dummy[:,1] > X_te[idx,1].min(), X_dummy[:,1] < X_te[idx,1].max())\n", 619 | " y_te_rescaled = y_te[idx] - np.median(y_te[idx]) + np.median(med[idx_dummy])\n", 620 | " else:\n", 621 | " # we need to adjust the data based on the adaptation dataset \n", 622 | " \n", 623 | " # first, select the data point belonging to this particular site\n", 624 | " idx = np.where(np.bitwise_and(X_te[:,2] == sex, (df_te['site'] == site).to_numpy()))[0]\n", 625 | " \n", 626 | " # load the adaptation data\n", 627 | " y_ad = load_2d(os.path.join(idp_dir, 'resp_ad.txt'))\n", 628 | " X_ad = load_2d(os.path.join(idp_dir, 'cov_bspline_ad.txt'))\n", 629 | " idx_a = np.where(np.bitwise_and(X_ad[:,2] == sex, (df_ad['site'] == site).to_numpy()))[0]\n", 630 | " if len(idx) < 2 or len(idx_a) < 2:\n", 631 | " print('Insufficent data for site', sid, site, 'skipping...')\n", 632 | " continue\n", 633 | " \n", 634 | " # adjust and rescale the data\n", 635 | " y_te_rescaled, s2_rescaled = nm.blr.predict_and_adjust(nm.blr.hyp, \n", 636 | " X_ad[idx_a,:], \n", 637 | " 
np.squeeze(y_ad[idx_a]), \n", 638 | " Xs=None, \n", 639 | " ys=np.squeeze(y_te[idx]))\n", 640 | " # plot the (adjusted) data points\n", 641 | " plt.scatter(X_te[idx,1], y_te_rescaled, s=4, color=clr, alpha = 0.1)\n", 642 | " \n", 643 | " # plot the median of the dummy data\n", 644 | " plt.plot(xx, med, clr)\n", 645 | " \n", 646 | " # fill the gaps in between the centiles\n", 647 | " junk, pr_int25 = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2), warp_param, percentiles=[0.25,0.75])\n", 648 | " junk, pr_int95 = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2), warp_param, percentiles=[0.05,0.95])\n", 649 | " junk, pr_int99 = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2), warp_param, percentiles=[0.01,0.99])\n", 650 | " plt.fill_between(xx, pr_int25[:,0], pr_int25[:,1], alpha = 0.1,color=clr)\n", 651 | " plt.fill_between(xx, pr_int95[:,0], pr_int95[:,1], alpha = 0.1,color=clr)\n", 652 | " plt.fill_between(xx, pr_int99[:,0], pr_int99[:,1], alpha = 0.1,color=clr)\n", 653 | " \n", 654 | " # make the width of each centile proportional to the epistemic uncertainty\n", 655 | " junk, pr_int25l = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2-0.5*s2s), warp_param, percentiles=[0.25,0.75])\n", 656 | " junk, pr_int95l = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2-0.5*s2s), warp_param, percentiles=[0.05,0.95])\n", 657 | " junk, pr_int99l = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2-0.5*s2s), warp_param, percentiles=[0.01,0.99])\n", 658 | " junk, pr_int25u = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2+0.5*s2s), warp_param, percentiles=[0.25,0.75])\n", 659 | " junk, pr_int95u = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2+0.5*s2s), warp_param, percentiles=[0.05,0.95])\n", 660 | " junk, pr_int99u = W.warp_predictions(np.squeeze(yhat), np.squeeze(s2+0.5*s2s), warp_param, percentiles=[0.01,0.99]) \n", 661 | " plt.fill_between(xx, pr_int25l[:,0], pr_int25u[:,0], alpha = 0.3,color=clr)\n", 662 | " plt.fill_between(xx, pr_int95l[:,0], 
pr_int95u[:,0], alpha = 0.3,color=clr)\n", 663 | " plt.fill_between(xx, pr_int99l[:,0], pr_int99u[:,0], alpha = 0.3,color=clr)\n", 664 | " plt.fill_between(xx, pr_int25l[:,1], pr_int25u[:,1], alpha = 0.3,color=clr)\n", 665 | " plt.fill_between(xx, pr_int95l[:,1], pr_int95u[:,1], alpha = 0.3,color=clr)\n", 666 | " plt.fill_between(xx, pr_int99l[:,1], pr_int99u[:,1], alpha = 0.3,color=clr)\n", 667 | "\n", 668 | " # plot actual centile lines\n", 669 | " plt.plot(xx, pr_int25[:,0],color=clr, linewidth=0.5)\n", 670 | " plt.plot(xx, pr_int25[:,1],color=clr, linewidth=0.5)\n", 671 | " plt.plot(xx, pr_int95[:,0],color=clr, linewidth=0.5)\n", 672 | " plt.plot(xx, pr_int95[:,1],color=clr, linewidth=0.5)\n", 673 | " plt.plot(xx, pr_int99[:,0],color=clr, linewidth=0.5)\n", 674 | " plt.plot(xx, pr_int99[:,1],color=clr, linewidth=0.5)\n", 675 | " \n", 676 | " plt.xlabel('Age')\n", 677 | " plt.ylabel(idp) \n", 678 | " plt.title(idp)\n", 679 | " plt.xlim((0,90))\n", 680 | " plt.savefig(os.path.join(idp_dir, 'centiles_' + str(sex)), bbox_inches='tight')\n", 681 | " plt.show()\n", 682 | "\n", 683 | "os.chdir(out_dir)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "OMUyOWOLmU1b", 690 | "metadata": { 691 | "colab": { 692 | "base_uri": "https://localhost:8080/" 693 | }, 694 | "id": "OMUyOWOLmU1b", 695 | "outputId": "b0111629-5919-40a4-cde7-dbf5eaf9f692" 696 | }, 697 | "outputs": [], 698 | "source": [ 699 | "# explore an example output folder of a single model (one ROI)\n", 700 | "# think about what each of these output files represents. \n", 701 | "# Hint: look at the variable names and comments in the code block above\n", 702 | "! 
ls rh_MeanThickness_thickness/" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "id": "TJIFVhQ5zKBw", 709 | "metadata": { 710 | "colab": { 711 | "base_uri": "https://localhost:8080/" 712 | }, 713 | "id": "TJIFVhQ5zKBw", 714 | "outputId": "d9f6e492-7ec1-40af-bcb4-1ea94eaed09e" 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "# check that the number of deviation scores matches the number of subjects in the test set\n", 719 | "# there should be one deviation score per subject (one line per subject), so we can\n", 720 | "# verify by counting the line numbers in the Z_predict.txt file\n", 721 | "! cat rh_MeanThickness_thickness/Z_predict.txt | wc" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "id": "hZEs7Ej4-qGi", 727 | "metadata": { 728 | "id": "hZEs7Ej4-qGi" 729 | }, 730 | "source": [ 731 | "The deviation scores are output as a text file in separate folders. We want to summarize the deviation scores across all models estimates so we can organize them into a single file, and merge the deviation scores into the original data file. " 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "id": "L-OauNfc5Jrx", 738 | "metadata": { 739 | "id": "L-OauNfc5Jrx" 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "! mkdir deviation_scores" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "id": "ZEgnixDd5KgK", 750 | "metadata": { 751 | "id": "ZEgnixDd5KgK" 752 | }, 753 | "outputs": [], 754 | "source": [ 755 | "! 
for i in *; do if [[ -e ${i}/Z_predict.txt ]]; then cp ${i}/Z_predict.txt deviation_scores/${i}_Z_predict.txt; fi; done" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "id": "10gP5z-t7-ZC", 762 | "metadata": { 763 | "id": "10gP5z-t7-ZC" 764 | }, 765 | "outputs": [], 766 | "source": [ 767 | "z_dir = '/content/braincharts/models/lifespan_57K_82sites/deviation_scores/'\n", 768 | "filelist = [name for name in os.listdir(z_dir)]" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "id": "Q2GAFv5F8TFa", 775 | "metadata": { 776 | "id": "Q2GAFv5F8TFa" 777 | }, 778 | "outputs": [], 779 | "source": [ 780 | "os.chdir(z_dir)\n", 781 | "Z_df = pd.concat([pd.read_csv(item, names=[item[:-4]]) for item in filelist], axis=1)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "id": "FHcx4vsj8eMf", 788 | "metadata": { 789 | "id": "FHcx4vsj8eMf" 790 | }, 791 | "outputs": [], 792 | "source": [ 793 | "df_te.reset_index(inplace=True)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "9werTREu8c0P", 800 | "metadata": { 801 | "id": "9werTREu8c0P" 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "Z_df['sub_id'] = df_te['sub_id']" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "id": "WgAJ86wy9U5A", 812 | "metadata": { 813 | "id": "WgAJ86wy9U5A" 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "df_te_Z = pd.merge(df_te, Z_df, on='sub_id', how='inner')" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "id": "bn6I12zh9t1g", 824 | "metadata": { 825 | "id": "bn6I12zh9t1g" 826 | }, 827 | "outputs": [], 828 | "source": [ 829 | "df_te_Z.to_csv('fcon1000_te_Z.csv', index=False)" 830 | ] 831 | } 832 | ], 833 | "metadata": { 834 | "colab": { 835 | "name": "apply_normative_models.ipynb", 836 | "provenance": [] 837 | }, 838 | "kernelspec": { 839 | 
"display_name": "Python 3 (ipykernel)", 840 | "language": "python", 841 | "name": "python3" 842 | }, 843 | "language_info": { 844 | "codemirror_mode": { 845 | "name": "ipython", 846 | "version": 3 847 | }, 848 | "file_extension": ".py", 849 | "mimetype": "text/x-python", 850 | "name": "python", 851 | "nbconvert_exporter": "python", 852 | "pygments_lexer": "ipython3", 853 | "version": "3.9.7" 854 | } 855 | }, 856 | "nbformat": 4, 857 | "nbformat_minor": 5 858 | } 859 | -------------------------------------------------------------------------------- /tasks/3_Visualizations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "HWR8M_FM0kCa" 7 | }, 8 | "source": [ 9 | "# Brain Space Visualization of Deviation Scores" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "nEkEuf7H0kCb" 16 | }, 17 | "source": [ 18 | "## Count the number of extreme (positive & negative) deviations at each brain region and visualize the count for each hemisphere." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "id": "SdVyEOWVJNyy" 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "! 
git clone https://github.com/saigerutherford/CPC_ML_tutorial.git" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "6c0O3oKQ0kCW" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import pandas as pd\n", 42 | "import numpy as np\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "import seaborn as sns" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "XwQOtrKmKd-T" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "os.chdir('/content/CPC_ML_tutorial')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "bruuGS8Z0kCb" 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "Z_df = pd.read_csv('data/Z_long_format.csv')" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "id": "CtDtz47p0kCn" 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "# Change this threshold to view more or less extreme deviations.\n", 78 | "# Discuss with your partner what you think is an appropriate threshold and adjust the below variables accordingly.\n", 79 | "Z_positive = Z_df.query('value > 2')\n", 80 | "Z_negative = Z_df.query('value < -2')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "id": "OCrM8a-c0kCn" 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "positive_left_z = Z_positive.query('hemi == \"left\"')\n", 92 | "positive_right_z = Z_positive.query('hemi == \"right\"')\n", 93 | "positive_sc_z = Z_positive.query('hemi == \"subcortical\"')\n", 94 | "negative_left_z = Z_negative.query('hemi == \"left\"')\n", 95 | "negative_right_z = Z_negative.query('hemi == \"right\"')\n", 96 | "negative_sc_z = Z_negative.query('hemi == \"subcortical\"')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "id": "2oeEd6Ay0kCo" 104 | }, 105 | "outputs": 
[], 106 | "source": [ 107 | "positive_left_z2 = positive_left_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')\n", 108 | "positive_right_z2 = positive_right_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')\n", 109 | "positive_sc_z2 = positive_sc_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')\n", 110 | "negative_left_z2 = negative_left_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')\n", 111 | "negative_right_z2 = negative_right_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')\n", 112 | "negative_sc_z2 = negative_sc_z['ROI_name'].value_counts().rename_axis('ROI').reset_index(name='counts')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "NlTVUuR6TyXq" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "positive_left_z2.describe()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "id": "ugzcZHLDT8ve" 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "positive_right_z2.describe()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "id": "ugzcZHLDT8ve" 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "positive_sc_z2.describe()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "id": "NlTVUuR6TyXq" 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "negative_left_z2.describe()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "id": "ugzcZHLDT8ve" 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "negative_right_z2.describe()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "id": "ugzcZHLDT8ve" 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | 
"negative_sc_z2.describe()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "id": "BikyDuO_K_I3" 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "! pip install nilearn" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "id": "mzYmi_cK0kCo" 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "from nilearn import plotting\n", 201 | "import nibabel as nib\n", 202 | "from nilearn import datasets" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "id": "jxRXeYHd0kCp" 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "destrieux_atlas = datasets.fetch_atlas_surf_destrieux()\n", 214 | "fsaverage = datasets.fetch_surf_fsaverage()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "id": "18hyJU3Z0kCp" 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "# The parcellation is already loaded into memory\n", 226 | "parcellation_l = destrieux_atlas['map_left']\n", 227 | "parcellation_r = destrieux_atlas['map_right']" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "id": "9ewObTkj0kCp" 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "nl = pd.read_csv('data/nilearn_order.csv')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "id": "RoAgRhiO0kCq" 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "atlas_r = destrieux_atlas['map_right']\n", 250 | "atlas_l = destrieux_atlas['map_left']" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "id": "gvN_Slut0kCq" 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "nl_ROI = nl['ROI'].to_list()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "id": "T8wirzP50kCq" 268 | }, 
269 | "source": [ 270 | "# Extreme positive deviation viz" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "id": "sVur0mfY0kCq" 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "nl_positive_left = pd.merge(nl, positive_left_z2, on='ROI', how='left')\n", 282 | "nl_positive_right = pd.merge(nl, positive_right_z2, on='ROI', how='left')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "id": "V1b4dXlB0kCq" 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "nl_positive_left['counts'] = nl_positive_left['counts'].fillna(0)\n", 294 | "nl_positive_right['counts'] = nl_positive_right['counts'].fillna(0)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "id": "6TIc-8JE0kCr" 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "nl_positive_left = nl_positive_left['counts'].to_numpy()\n", 306 | "nl_positive_right = nl_positive_right['counts'].to_numpy()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "id": "dpFgV9610kCr" 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "a_list = list(range(1, 76))\n", 318 | "parcellation_positive_l = atlas_l\n", 319 | "for i, j in enumerate(a_list):\n", 320 | " parcellation_positive_l = np.where(parcellation_positive_l == j, nl_positive_left[i], parcellation_positive_l)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "id": "LEb86JQP0kCr" 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "a_list = list(range(1, 76))\n", 332 | "parcellation_positive_r = atlas_r\n", 333 | "for i, j in enumerate(a_list):\n", 334 | " parcellation_positive_r = np.where(parcellation_positive_r == j, nl_positive_right[i], parcellation_positive_r)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 
341 | "id": "Uvo8bs0J0kCr" 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "# you can click around in 3D space on this visualization. Scroll in/out, move the brain around, etc. Have fun with it :) \n", 346 | "view = plotting.view_surf(fsaverage.infl_right, parcellation_positive_r, threshold=None, symmetric_cmap=False, cmap='plasma', bg_map=fsaverage.sulc_right)\n", 347 | "\n", 348 | "view" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "id": "knItaGcv0kCr" 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "view = plotting.view_surf(fsaverage.infl_left, parcellation_positive_l, threshold=None, symmetric_cmap=False, cmap='plasma', bg_map=fsaverage.sulc_left)\n", 360 | "\n", 361 | "view" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "k-ASdN3T0kCr" 368 | }, 369 | "source": [ 370 | "# Extreme negative deviation viz" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "id": "8jEf15fg0kCr" 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "nl_negative_left = pd.merge(nl, negative_left_z2, on='ROI', how='left')\n", 382 | "nl_negative_right = pd.merge(nl, negative_right_z2, on='ROI', how='left')" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "id": "TcPzp2ZC0kCr" 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "nl_negative_left['counts'] = nl_negative_left['counts'].fillna(0)\n", 394 | "nl_negative_right['counts'] = nl_negative_right['counts'].fillna(0)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": { 401 | "id": "KIoAvRlN0kCs" 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "nl_negative_left = nl_negative_left['counts'].to_numpy()\n", 406 | "nl_negative_right = nl_negative_right['counts'].to_numpy()" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | 
"execution_count": null, 412 | "metadata": { 413 | "id": "Ksv8QuLW0kCs" 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "a_list = list(range(1, 76))\n", 418 | "parcellation_negative_l = atlas_l\n", 419 | "for i, j in enumerate(a_list):\n", 420 | " parcellation_negative_l = np.where(parcellation_negative_l == j, nl_negative_left[i], parcellation_negative_l)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "id": "QXmjD4jQ0kCs" 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "a_list = list(range(1, 76))\n", 432 | "parcellation_negative_r = atlas_r\n", 433 | "for i, j in enumerate(a_list):\n", 434 | " parcellation_negative_r = np.where(parcellation_negative_r == j, nl_negative_right[i], parcellation_negative_r)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": { 441 | "id": "suomHT4C0kCs" 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "view = plotting.view_surf(fsaverage.infl_right, parcellation_negative_r, threshold=None, symmetric_cmap=False, cmap='plasma', bg_map=fsaverage.sulc_right)\n", 446 | "\n", 447 | "view" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "id": "FbbhY4L80kCs" 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "view = plotting.view_surf(fsaverage.infl_left, parcellation_negative_l, threshold=None, symmetric_cmap=False, cmap='plasma', bg_map=fsaverage.sulc_left)\n", 459 | "\n", 460 | "view" 461 | ] 462 | } 463 | ], 464 | "metadata": { 465 | "colab": { 466 | "name": "3_Visualizations.ipynb", 467 | "provenance": [], 468 | "toc_visible": true 469 | }, 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 
483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.8.8" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 4 490 | } 491 | -------------------------------------------------------------------------------- /tasks/4_post_hoc_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "hnjOHEPnSgqZ" 7 | }, 8 | "source": [ 9 | "# SVM classification SZ vs. HC. 5-fold cross validation" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "T1dgnArWand-" 16 | }, 17 | "source": [ 18 | "Classify schizophrenia group from controls using cortical thickness deviation scores (z-scores) and then the true cortical thickness data to see which type of data better separates the groups." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "id": "dl-cWCkhU5OH" 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "! 
git clone https://github.com/saigerutherford/CPC_ML_tutorial.git" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "oer08RX7Sgqc" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "import numpy as np\n", 42 | "import os\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "os.chdir('/content/CPC_ML_tutorial/')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "kBA6wv5_Sgqd" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "Z_df = pd.read_csv('data/fcon1000_te_Z.csv')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "_AtT_a9QSgqe" 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "from sklearn import svm\n", 67 | "from sklearn.metrics import auc\n", 68 | "from sklearn.metrics import plot_roc_curve\n", 69 | "from sklearn.model_selection import StratifiedKFold" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "id": "0m3frZSqWHFt" 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "Z_df.dropna(subset=['group'], inplace=True)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "id": "reWNrhN6Wge0" 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "Z_df['group'] = Z_df['group'].replace(\"SZ\",0)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "id": "LuddguUsW_UI" 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "Z_df['group'] = Z_df['group'].replace(\"Control\",1)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "id": "wBuQvJKqVz0p" 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "deviations = Z_df.loc[:, Z_df.columns.str.contains('Z_predict')]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | 
"metadata": { 120 | "id": "QZvu0iXlZg7P" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "cortical_thickness = Z_df.loc[:, Z_df.columns.str.endswith('_thickness')]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "id": "HECqM4rZVcD9" 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# Data IO and generation\n", 136 | "X1 = deviations\n", 137 | "X2 = cortical_thickness\n", 138 | "y = Z_df['group']\n", 139 | "n_samples, n_features = X1.shape\n", 140 | "random_state = np.random.RandomState(0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "id": "iho4wkAESgqf" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X1 = X1.to_numpy()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "id": "zi7v5e8vZ0Ms" 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "X2 = X2.to_numpy()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "id": "xcA4w73TSgqf" 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "y = y.astype(int)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "id": "mKcM-dA3ZG_u" 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "y = y.to_numpy()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "id": "NNRcb-pvSgqf" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# #############################################################################\n", 196 | "# Classification and ROC analysis\n", 197 | "\n", 198 | "# Run classifier with cross-validation and plot ROC curves\n", 199 | "cv = StratifiedKFold(n_splits=5)\n", 200 | "classifier = svm.SVC(kernel='linear', probability=True,\n", 201 | " random_state=random_state)\n", 202 | "\n", 203 | "tprs = []\n", 204 | "aucs = []\n", 205 | 
"mean_fpr = np.linspace(0, 1, 100)\n", 206 | "\n", 207 | "fig, ax = plt.subplots(figsize=(15,15))\n", 208 | "parameters = {'axes.labelsize': 20,\n", 209 | " 'axes.titlesize': 25, 'xtick.labelsize':16,'ytick.labelsize':16,'legend.fontsize':14,'legend.title_fontsize':16}\n", 210 | "plt.rcParams.update(parameters)\n", 211 | "\n", 212 | "for i, (train, test) in enumerate(cv.split(X1, y)):\n", 213 | " classifier.fit(X1[train], y[train])\n", 214 | " viz = plot_roc_curve(classifier, X1[test], y[test],\n", 215 | " name='ROC fold {}'.format(i),\n", 216 | " alpha=0.3, lw=1, ax=ax)\n", 217 | " interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n", 218 | " interp_tpr[0] = 0.0\n", 219 | " tprs.append(interp_tpr)\n", 220 | " aucs.append(viz.roc_auc)\n", 221 | "\n", 222 | "ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',\n", 223 | " label='Chance', alpha=.8)\n", 224 | "\n", 225 | "mean_tpr = np.mean(tprs, axis=0)\n", 226 | "mean_tpr[-1] = 1.0\n", 227 | "mean_auc = auc(mean_fpr, mean_tpr)\n", 228 | "std_auc = np.std(aucs)\n", 229 | "ax.plot(mean_fpr, mean_tpr, color='b',\n", 230 | " label=r'Mean ROC (AUC = %0.2f $\\pm$ %0.2f)' % (mean_auc, std_auc),\n", 231 | " lw=2, alpha=.8)\n", 232 | "\n", 233 | "std_tpr = np.std(tprs, axis=0)\n", 234 | "tprs_upper = np.minimum(mean_tpr + std_tpr, 1)\n", 235 | "tprs_lower = np.maximum(mean_tpr - std_tpr, 0)\n", 236 | "ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,\n", 237 | " label=r'$\\pm$ 1 std. dev.')\n", 238 | "\n", 239 | "ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])\n", 240 | "ax.set_title('Receiver operating characteristic SZ vs. 
HC (deviations)', fontweight=\"bold\", size=20)\n", 241 | "ax.legend(loc=\"lower right\")\n", 242 | "plt.show()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "id": "WYPilmZOaNgs" 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# #############################################################################\n", 254 | "# Classification and ROC analysis\n", 255 | "\n", 256 | "# Run classifier with cross-validation and plot ROC curves\n", 257 | "cv = StratifiedKFold(n_splits=5)\n", 258 | "classifier = svm.SVC(kernel='linear', probability=True,\n", 259 | " random_state=random_state)\n", 260 | "\n", 261 | "tprs = []\n", 262 | "aucs = []\n", 263 | "mean_fpr = np.linspace(0, 1, 100)\n", 264 | "\n", 265 | "fig, ax = plt.subplots(figsize=(15,15))\n", 266 | "parameters = {'axes.labelsize': 20,\n", 267 | " 'axes.titlesize': 25, 'xtick.labelsize':16,'ytick.labelsize':16,'legend.fontsize':14,'legend.title_fontsize':16}\n", 268 | "plt.rcParams.update(parameters)\n", 269 | "\n", 270 | "for i, (train, test) in enumerate(cv.split(X2, y)):\n", 271 | " classifier.fit(X2[train], y[train])\n", 272 | " viz = plot_roc_curve(classifier, X2[test], y[test],\n", 273 | " name='ROC fold {}'.format(i),\n", 274 | " alpha=0.3, lw=1, ax=ax)\n", 275 | " interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n", 276 | " interp_tpr[0] = 0.0\n", 277 | " tprs.append(interp_tpr)\n", 278 | " aucs.append(viz.roc_auc)\n", 279 | "\n", 280 | "ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',\n", 281 | " label='Chance', alpha=.8)\n", 282 | "\n", 283 | "mean_tpr = np.mean(tprs, axis=0)\n", 284 | "mean_tpr[-1] = 1.0\n", 285 | "mean_auc = auc(mean_fpr, mean_tpr)\n", 286 | "std_auc = np.std(aucs)\n", 287 | "ax.plot(mean_fpr, mean_tpr, color='b',\n", 288 | " label=r'Mean ROC (AUC = %0.2f $\\pm$ %0.2f)' % (mean_auc, std_auc),\n", 289 | " lw=2, alpha=.8)\n", 290 | "\n", 291 | "std_tpr = np.std(tprs, axis=0)\n", 292 | "tprs_upper = 
np.minimum(mean_tpr + std_tpr, 1)\n", 293 | "tprs_lower = np.maximum(mean_tpr - std_tpr, 0)\n", 294 | "ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,\n", 295 | " label=r'$\\pm$ 1 std. dev.')\n", 296 | "\n", 297 | "ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])\n", 298 | "ax.set_title('Receiver operating characteristic SZ vs. HC (cortical thickness)', fontweight=\"bold\", size=20)\n", 299 | "ax.legend(loc=\"lower right\")\n", 300 | "plt.show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "Y9iIgxR1YMzq" 307 | }, 308 | "source": [ 309 | "Which brain feature leads to a better classification between SZ & HC? " 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "colab": { 315 | "name": "4_post_hoc_analysis.ipynb", 316 | "provenance": [] 317 | }, 318 | "kernelspec": { 319 | "display_name": "Python 3 (ipykernel)", 320 | "language": "python", 321 | "name": "python3" 322 | }, 323 | "language_info": { 324 | "codemirror_mode": { 325 | "name": "ipython", 326 | "version": 3 327 | }, 328 | "file_extension": ".py", 329 | "mimetype": "text/x-python", 330 | "name": "python", 331 | "nbconvert_exporter": "python", 332 | "pygments_lexer": "ipython3", 333 | "version": "3.9.7" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 4 338 | } 339 | --------------------------------------------------------------------------------