├── .gitignore ├── DIGDriver ├── __init__.py ├── auxilaries │ ├── __init__.py │ ├── concat_data.py │ ├── unzip_h5.py │ └── utils.py ├── data │ ├── dndscv_gene_cds.bed.gz │ ├── genes.MARTINCORENA.bed │ ├── genes_CGC_ALL.txt │ ├── genes_CGC_ALL.txt.bck │ ├── genes_CGC_ONC.txt │ ├── genes_CGC_TSG.txt │ ├── genes_MSK_230.txt │ ├── genes_MSK_341.txt │ ├── genes_MSK_410.txt │ ├── genes_MSK_468.txt │ ├── genes_metabric_173.txt │ ├── genes_ucla_1202.txt │ └── refcds_hg19.rda ├── data_tools │ ├── DIG_auto.py │ ├── __init__.py │ ├── auto_runner.py │ ├── mappability_tools.py │ ├── mutation_tools.py │ └── track_selector.py ├── driver_model │ ├── __init__.py │ ├── onthefly_tools.py │ └── transfer_tools.py ├── region_model │ ├── .DS_Store │ ├── __init__.py │ ├── autoencoders │ │ ├── AE_vec_predictors.py │ │ ├── ae_nets │ │ │ ├── CNNs.py │ │ │ └── fc_nets.py │ │ └── autoencoder_main.py │ ├── data_aux │ │ ├── dataset_generator.py │ │ └── mut_dataset.py │ ├── feature_vectors │ │ ├── gaussian_process.py │ │ ├── get_feature_vectors.py │ │ └── get_heldout_feature_vectors.py │ ├── kfold_mutations_main.py │ ├── mutations_main.py │ ├── nets │ │ ├── __init__.py │ │ ├── cnn_predictors.py │ │ ├── densenet.py │ │ ├── resnet.py │ │ └── rnn_predictors.py │ ├── perturbations_confidance │ │ ├── __init__.py │ │ ├── confidance_perturbations_estimate.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── config_confidance.json │ │ │ └── config_confidance_kfold.json │ │ └── kfold_test_model_confidance.py │ ├── region_model_tools.py │ ├── train_nn.sh │ └── trainers │ │ ├── __init__.py │ │ ├── gp_trainer.py │ │ └── nn_trainer.py └── sequence_model │ ├── __init__.py │ ├── genic_driver_tools.py │ ├── gp_tools.py │ ├── nb_model.py │ └── sequence_tools.py ├── LICENSE ├── README.md ├── __init__.py ├── conda-recipe └── meta.yaml ├── examples ├── README.md ├── gene_driver.sh ├── mutation_driver.sh └── noncoding_driver.sh ├── requirements.txt ├── scripts ├── DataExtractor.py ├── DigDriver.py ├── DigPreprocess.py ├── DigPretrain.py ├── filter_hypermut.py └── mutationFunction.R └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/models 2 | **/runs 3 | **/__pycache__ 4 | .git 5 | **.ipynb_checkpoints* 6 | **/._* 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /DIGDriver/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | __all__ = ['sequence_model', 'data_tools', 'region_model', 'driver_model', 'auxilaries'] 3 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/concat_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import h5py 5 | import numpy as np 6 | import pickle as pkl 7 | 8 | 9 | 10 | def main(): 11 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 12 | if len(sys.argv) < 2: 13 | print('No input was given. 
Dataset directory must be given.') 14 | else: 15 | dir_path = sys.argv[1] 16 | 17 | idxs_relative_path = '../data_indices' 18 | data_lst = sorted([f for f in os.listdir(dir_path)]) 19 | idx_lst = sorted([f for f in os.listdir(os.path.join(dir_path, idxs_relative_path))]) 20 | data_arr = [] 21 | idx_arr = [] 22 | 23 | print('Loading all data and index files...') 24 | for data_file, idx_file in zip(data_lst, idx_lst): 25 | hf = h5py.File(os.path.join(dir_path, data_file), 'r') 26 | data_arr.append(hf['x_data'][:]) # returns a numpy array as long as the dataset's ID is 'x_data' 27 | with open(os.path.join(dir_path, idxs_relative_path, idx_file), 'rb') as f: 28 | idx_arr.append(pkl.load(f)) 29 | 30 | print('Saving indices file to ./all_indices.pkl...') 31 | with open('all_indices.pkl', 'wb') as f: 32 | pkl.dump(idx_arr, f) 33 | 34 | print('Saving data file to ./all_data.h5...') 35 | h5f = h5py.File('all_data.h5', 'w') 36 | h5f.create_dataset('x_data', data=np.concatenate(data_arr)) 37 | h5f.close() 38 | 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/unzip_h5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import h5py 6 | 7 | if len(sys.argv) <= 1: 8 | raise Exception('Expected at least 1 input argument but found {}'.format(len(sys.argv) - 1)) 9 | 10 | zipped_file_path = sys.argv[1] 11 | 12 | print('Opening zipped h5 file...') 13 | zipped_h5f = h5py.File(zipped_file_path, 'r') 14 | 15 | split_path = zipped_file_path.split('/') 16 | unzipped_file_path = os.path.join('/'.join(split_path[:-1]), 'unzipped_{}'.format(split_path[-1])) 17 | unzipped_h5f = h5py.File(unzipped_file_path, 'w') 18 | 19 | print('Loading unzipped data to {}...'.format(unzipped_file_path)) 20 | for k in zipped_h5f.keys(): 21 | print('Unzipping {}'.format(k)) 22 | unzipped_h5f[k] = zipped_h5f[k][:] 23 | 24 | print('Done!') 25 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | def get_cpus(): 4 | try: 5 | c = min(max(1, mp.cpu_count() - 2), 20) 6 | except: 7 | c = 5 8 | return c 9 | -------------------------------------------------------------------------------- /DIGDriver/data/dndscv_gene_cds.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/data/dndscv_gene_cds.bed.gz -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ALL.txt: -------------------------------------------------------------------------------- 1 | A1CF 2 | ABI1 3 | ABL1 4 | ABL2 5 | ACKR3 6 | ACSL3 7 | ACSL6 8 | ACVR1 9 | ACVR2A 10 | AFDN 11 | AFF1 12 | AFF3 13 | AFF4 14 | AKAP9 15 | AKT1 16 | AKT2 17 | AKT3 18 | ALDH2 19 | ALK 20 | AMER1 21 | ANK1 22 | APC 23 | APOBEC3B 24 | AR 25 | ARAF 26 | ARHGAP26 27 | ARHGAP5 28 | ARHGEF10 29 | ARHGEF10L 30 | ARHGEF12 31 | ARID1A 32 | ARID1B 33 | ARID2 34 | ARNT 35 | ASPSCR1 36 | ASXL1 37 | ASXL2 38 | ATF1 39 | ATIC 40 | ATM 41 | ATP1A1 42 | ATP2B3 43 | ATR 44 | ATRX 45 | AXIN1 46 | AXIN2 47 | B2M 48 | BAP1 49 | BARD1 50 | BAX 51 | BAZ1A 52 | BCL10 53 | BCL11A 54 | BCL11B 55 | BCL2 56 | BCL2L12 57 | BCL3 58 | BCL6 59 | BCL7A 60 | 
BCL9 61 | BCL9L 62 | BCLAF1 63 | BCOR 64 | BCORL1 65 | BCR 66 | BIRC3 67 | BIRC6 68 | BLM 69 | BMP5 70 | BMPR1A 71 | BRAF 72 | BRCA1 73 | BRCA2 74 | BRD3 75 | BRD4 76 | BRIP1 77 | BTG1 78 | BTK 79 | BUB1B 80 | C15orf65 81 | CACNA1D 82 | CALR 83 | CAMTA1 84 | CANT1 85 | CARD11 86 | CARS 87 | CASP3 88 | CASP8 89 | CASP9 90 | CBFA2T3 91 | CBFB 92 | CBL 93 | CBLB 94 | CBLC 95 | CCDC6 96 | CCNB1IP1 97 | CCNC 98 | CCND1 99 | CCND2 100 | CCND3 101 | CCNE1 102 | CCR4 103 | CCR7 104 | CD209 105 | CD274 106 | CD28 107 | CD74 108 | CD79A 109 | CD79B 110 | CDC73 111 | CDH1 112 | CDH10 113 | CDH11 114 | CDH17 115 | CDK12 116 | CDK4 117 | CDK6 118 | CDKN1A 119 | CDKN1B 120 | CDKN2A 121 | CDKN2C 122 | CDX2 123 | CEBPA 124 | CEP89 125 | CHCHD7 126 | CHD2 127 | CHD4 128 | CHEK2 129 | CHIC2 130 | CHST11 131 | CIC 132 | CIITA 133 | CLIP1 134 | CLP1 135 | CLTC 136 | CLTCL1 137 | CNBD1 138 | CNBP 139 | CNOT3 140 | CNTNAP2 141 | CNTRL 142 | COL1A1 143 | COL2A1 144 | COL3A1 145 | COX6C 146 | CPEB3 147 | CREB1 148 | CREB3L1 149 | CREB3L2 150 | CREBBP 151 | CRLF2 152 | CRNKL1 153 | CRTC1 154 | CRTC3 155 | CSF1R 156 | CSF3R 157 | CSMD3 158 | CTCF 159 | CTNNA2 160 | CTNNB1 161 | CTNND1 162 | CTNND2 163 | CUL3 164 | CUX1 165 | CXCR4 166 | CYLD 167 | CYP2C8 168 | CYSLTR2 169 | DAXX 170 | DCAF12L2 171 | DCC 172 | DCTN1 173 | DDB2 174 | DDIT3 175 | DDR2 176 | DDX10 177 | DDX3X 178 | DDX5 179 | DDX6 180 | DEK 181 | DGCR8 182 | DICER1 183 | DNAJB1 184 | DNM2 185 | DNMT3A 186 | DROSHA 187 | DUX4L1 188 | EBF1 189 | ECT2L 190 | EED 191 | EGFR 192 | EIF1AX 193 | EIF3E 194 | EIF4A2 195 | ELF3 196 | ELF4 197 | ELK4 198 | ELL 199 | ELN 200 | EML4 201 | EP300 202 | EPAS1 203 | EPHA3 204 | EPHA7 205 | EPS15 206 | ERBB2 207 | ERBB3 208 | ERBB4 209 | ERC1 210 | ERCC2 211 | ERCC3 212 | ERCC4 213 | ERCC5 214 | ERG 215 | ESR1 216 | ETNK1 217 | ETV1 218 | ETV4 219 | ETV5 220 | ETV6 221 | EWSR1 222 | EXT1 223 | EXT2 224 | EZH2 225 | EZR 226 | FAM131B 227 | FAM135B 228 | FAM47C 229 | FANCA 230 | FANCC 231 | FANCD2 232 | FANCE 233 | FANCF 234 | FANCG 235 | FAS 236 | FAT1 237 | FAT3 238 | FAT4 239 | FBLN2 240 | FBXO11 241 | FBXW7 242 | FCGR2B 243 | FCRL4 244 | FEN1 245 | FES 246 | FEV 247 | FGFR1 248 | FGFR1OP 249 | FGFR2 250 | FGFR3 251 | FGFR4 252 | FH 253 | FHIT 254 | FIP1L1 255 | FKBP9 256 | FLCN 257 | FLI1 258 | FLNA 259 | FLT3 260 | FLT4 261 | FNBP1 262 | FOXA1 263 | FOXL2 264 | FOXO1 265 | FOXO3 266 | FOXO4 267 | FOXP1 268 | FOXR1 269 | FSTL3 270 | FUBP1 271 | FUS 272 | GAS7 273 | GATA1 274 | GATA2 275 | GATA3 276 | GLI1 277 | GMPS 278 | GNA11 279 | GNAQ 280 | GNAS 281 | GOLGA5 282 | GOPC 283 | GPC3 284 | GPC5 285 | GPHN 286 | GRIN2A 287 | GRM3 288 | H3F3A 289 | H3F3B 290 | HERPUD1 291 | HEY1 292 | HIF1A 293 | HIP1 294 | HIST1H3B 295 | HIST1H4I 296 | HLA-A 297 | HLF 298 | HMGA1 299 | HMGA2 300 | HMGN2P46 301 | HNF1A 302 | HNRNPA2B1 303 | HOOK3 304 | HOXA11 305 | HOXA13 306 | HOXA9 307 | HOXC11 308 | HOXC13 309 | HOXD11 310 | HOXD13 311 | HRAS 312 | HSP90AA1 313 | HSP90AB1 314 | ID3 315 | IDH1 316 | IDH2 317 | IGF2BP2 318 | IGH 319 | IGK 320 | IGL 321 | IKBKB 322 | IKZF1 323 | IL2 324 | IL21R 325 | IL6ST 326 | IL7R 327 | IRF4 328 | IRS4 329 | ISX 330 | ITGAV 331 | ITK 332 | JAK1 333 | JAK2 334 | JAK3 335 | JAZF1 336 | JUN 337 | KAT6A 338 | KAT6B 339 | KAT7 340 | KCNJ5 341 | KDM5A 342 | KDM5C 343 | KDM6A 344 | KDR 345 | KDSR 346 | KEAP1 347 | KIAA1549 348 | KIF5B 349 | KIT 350 | KLF4 351 | KLF6 352 | KLK2 353 | KMT2A 354 | KMT2C 355 | KMT2D 356 | KNL1 357 | KNSTRN 358 | KRAS 359 | KTN1 360 | LARP4B 361 | LASP1 362 | LATS1 363 | LATS2 
364 | LCK 365 | LCP1 366 | LEF1 367 | LEPROTL1 368 | LHFPL6 369 | LIFR 370 | LMNA 371 | LMO1 372 | LMO2 373 | LPP 374 | LRIG3 375 | LRP1B 376 | LSM14A 377 | LYL1 378 | LZTR1 379 | MACC1 380 | MAF 381 | MAFB 382 | MALAT1 383 | MALT1 384 | MAML2 385 | MAP2K1 386 | MAP2K2 387 | MAP2K4 388 | MAP3K1 389 | MAP3K13 390 | MAPK1 391 | MAX 392 | MB21D2 393 | MDM2 394 | MDM4 395 | MDS2 396 | MECOM 397 | MED12 398 | MEN1 399 | MET 400 | MGMT 401 | MITF 402 | MLF1 403 | MLH1 404 | MLLT1 405 | MLLT10 406 | MLLT11 407 | MLLT3 408 | MLLT6 409 | MN1 410 | MNX1 411 | MPL 412 | MRTFA 413 | MSH2 414 | MSH6 415 | MSI2 416 | MSN 417 | MTCP1 418 | MTOR 419 | MUC1 420 | MUC16 421 | MUC4 422 | MUTYH 423 | MYB 424 | MYC 425 | MYCL 426 | MYCN 427 | MYD88 428 | MYH11 429 | MYH9 430 | MYO5A 431 | MYOD1 432 | N4BP2 433 | NAB2 434 | NACA 435 | NBEA 436 | NBN 437 | NCKIPSD 438 | NCOA1 439 | NCOA2 440 | NCOA4 441 | NCOR1 442 | NCOR2 443 | NDRG1 444 | NF1 445 | NF2 446 | NFATC2 447 | NFE2L2 448 | NFIB 449 | NFKB2 450 | NFKBIE 451 | NIN 452 | NKX2-1 453 | NONO 454 | NOTCH1 455 | NOTCH2 456 | NPM1 457 | NR4A3 458 | NRAS 459 | NRG1 460 | NSD1 461 | NSD2 462 | NSD3 463 | NT5C2 464 | NTHL1 465 | NTRK1 466 | NTRK3 467 | NUMA1 468 | NUP214 469 | NUP98 470 | NUTM1 471 | NUTM2B 472 | NUTM2D 473 | OLIG2 474 | OMD 475 | P2RY8 476 | PABPC1 477 | PAFAH1B2 478 | PALB2 479 | PATZ1 480 | PAX3 481 | PAX5 482 | PAX7 483 | PAX8 484 | PBRM1 485 | PBX1 486 | PCBP1 487 | PCM1 488 | PDCD1LG2 489 | PDE4DIP 490 | PDGFB 491 | PDGFRA 492 | PDGFRB 493 | PER1 494 | PHF6 495 | PHOX2B 496 | PICALM 497 | PIK3CA 498 | PIK3CB 499 | PIK3R1 500 | PIM1 501 | PLAG1 502 | PLCG1 503 | PML 504 | PMS1 505 | PMS2 506 | POLD1 507 | POLE 508 | POLG 509 | POLQ 510 | POT1 511 | POU2AF1 512 | POU5F1 513 | PPARG 514 | PPFIBP1 515 | PPM1D 516 | PPP2R1A 517 | PPP6C 518 | PRCC 519 | PRDM1 520 | PRDM16 521 | PRDM2 522 | PREX2 523 | PRF1 524 | PRKACA 525 | PRKAR1A 526 | PRKCB 527 | PRPF40B 528 | PRRX1 529 | PSIP1 530 | PTCH1 531 | PTEN 532 | PTK6 533 | PTPN11 534 | PTPN13 535 | PTPN6 536 | PTPRB 537 | PTPRC 538 | PTPRD 539 | PTPRK 540 | PTPRT 541 | PWWP2A 542 | QKI 543 | RABEP1 544 | RAC1 545 | RAD17 546 | RAD21 547 | RAD51B 548 | RAF1 549 | RALGDS 550 | RANBP2 551 | RAP1GDS1 552 | RARA 553 | RB1 554 | RBM10 555 | RBM15 556 | RECQL4 557 | REL 558 | RET 559 | RFWD3 560 | RGPD3 561 | RGS7 562 | RHOA 563 | RHOH 564 | RMI2 565 | RNF213 566 | RNF43 567 | ROBO2 568 | ROS1 569 | RPL10 570 | RPL22 571 | RPL5 572 | RPN1 573 | RSPO2 574 | RSPO3 575 | RUNX1 576 | RUNX1T1 577 | S100A7 578 | SALL4 579 | SBDS 580 | SDC4 581 | SDHA 582 | SDHAF2 583 | SDHB 584 | SDHC 585 | SDHD 586 | SEPT5 587 | SEPT6 588 | SEPT9 589 | SET 590 | SETBP1 591 | SETD1B 592 | SETD2 593 | SETDB1 594 | SF3B1 595 | SFPQ 596 | SFRP4 597 | SGK1 598 | SH2B3 599 | SH3GL1 600 | SHTN1 601 | SIRPA 602 | SIX1 603 | SIX2 604 | SKI 605 | SLC34A2 606 | SLC45A3 607 | SMAD2 608 | SMAD3 609 | SMAD4 610 | SMARCA4 611 | SMARCB1 612 | SMARCD1 613 | SMARCE1 614 | SMC1A 615 | SMO 616 | SND1 617 | SNX29 618 | SOCS1 619 | SOX2 620 | SOX21 621 | SPECC1 622 | SPEN 623 | SPOP 624 | SRC 625 | SRGAP3 626 | SRSF2 627 | SRSF3 628 | SS18 629 | SS18L1 630 | SSX1 631 | SSX2 632 | SSX4 633 | STAG1 634 | STAG2 635 | STAT3 636 | STAT5B 637 | STAT6 638 | STIL 639 | STK11 640 | STRN 641 | SUFU 642 | SUZ12 643 | SYK 644 | TAF15 645 | TAL1 646 | TAL2 647 | TBL1XR1 648 | TBX3 649 | TCEA1 650 | TCF12 651 | TCF3 652 | TCF7L2 653 | TCL1A 654 | TEC 655 | TENT5C 656 | TERT 657 | TET1 658 | TET2 659 | TFE3 660 | TFEB 
661 | TFG 662 | TFPT 663 | TFRC 664 | TGFBR2 665 | THRAP3 666 | TLX1 667 | TLX3 668 | TMEM127 669 | TMPRSS2 670 | TNC 671 | TNFAIP3 672 | TNFRSF14 673 | TNFRSF17 674 | TOP1 675 | TP53 676 | TP63 677 | TPM3 678 | TPM4 679 | TPR 680 | TRA 681 | TRAF7 682 | TRB 683 | TRD 684 | TRIM24 685 | TRIM27 686 | TRIM33 687 | TRIP11 688 | TRRAP 689 | TSC1 690 | TSC2 691 | TSHR 692 | U2AF1 693 | UBR5 694 | USP44 695 | USP6 696 | USP8 697 | VAV1 698 | VHL 699 | VTI1A 700 | WAS 701 | WDCP 702 | WIF1 703 | WNK2 704 | WRN 705 | WT1 706 | WWTR1 707 | XPA 708 | XPC 709 | XPO1 710 | YWHAE 711 | ZBTB16 712 | ZCCHC8 713 | ZEB1 714 | ZFHX3 715 | ZMYM2 716 | ZMYM3 717 | ZNF331 718 | ZNF384 719 | ZNF429 720 | ZNF479 721 | ZNF521 722 | ZNRF3 723 | ZRSR2 724 | CDKN2A.p14arf 725 | CDKN2A.p16INK4a 726 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ALL.txt.bck: -------------------------------------------------------------------------------- 1 | A1CF 2 | ABI1 3 | ABL1 4 | ABL2 5 | ACKR3 6 | ACSL3 7 | ACSL6 8 | ACVR1 9 | ACVR2A 10 | AFF1 11 | AFF3 12 | AFF4 13 | AKAP9 14 | AKT1 15 | AKT2 16 | AKT3 17 | ALDH2 18 | ALK 19 | ANK1 20 | APC 21 | APOBEC3B 22 | ARHGAP26 23 | ARHGAP5 24 | ARHGEF10 25 | ARHGEF10L 26 | ARHGEF12 27 | ARID1A 28 | ARID1B 29 | ARID2 30 | ARNT 31 | ASPSCR1 32 | ASXL1 33 | ASXL2 34 | ATF1 35 | ATIC 36 | ATM 37 | ATP1A1 38 | ATR 39 | AXIN1 40 | AXIN2 41 | B2M 42 | BAP1 43 | BARD1 44 | BAX 45 | BAZ1A 46 | BCL10 47 | BCL11A 48 | BCL11B 49 | BCL2 50 | BCL2L12 51 | BCL3 52 | BCL6 53 | BCL7A 54 | BCL9 55 | BCL9L 56 | BCLAF1 57 | BCR 58 | BIRC3 59 | BIRC6 60 | BLM 61 | BMP5 62 | BMPR1A 63 | BRAF 64 | BRCA1 65 | BRCA2 66 | BRD3 67 | BRD4 68 | BRIP1 69 | BTG1 70 | BUB1B 71 | C15orf65 72 | CACNA1D 73 | CALR 74 | CAMTA1 75 | CANT1 76 | CARD11 77 | CARS 78 | CASP3 79 | CASP8 80 | CASP9 81 | CBFA2T3 82 | CBFB 83 | CBL 84 | CBLB 85 | CBLC 86 | CCDC6 87 | CCNB1IP1 88 | CCNC 89 | CCND1 90 | CCND2 91 | CCND3 92 | CCNE1 93 | CCR4 94 | CCR7 95 | CD209 96 | CD274 97 | CD28 98 | CD74 99 | CD79A 100 | CD79B 101 | CDC73 102 | CDH1 103 | CDH10 104 | CDH11 105 | CDH17 106 | CDK12 107 | CDK4 108 | CDK6 109 | CDKN1A 110 | CDKN1B 111 | CDKN2C 112 | CDX2 113 | CEBPA 114 | CEP89 115 | CHCHD7 116 | CHD2 117 | CHD4 118 | CHEK2 119 | CHIC2 120 | CHST11 121 | CIC 122 | CIITA 123 | CLIP1 124 | CLP1 125 | CLTC 126 | CLTCL1 127 | CNBD1 128 | CNBP 129 | CNOT3 130 | CNTNAP2 131 | CNTRL 132 | COL1A1 133 | COL2A1 134 | COL3A1 135 | COX6C 136 | CPEB3 137 | CREB1 138 | CREB3L1 139 | CREB3L2 140 | CREBBP 141 | CRNKL1 142 | CRTC1 143 | CRTC3 144 | CSF1R 145 | CSF3R 146 | CSMD3 147 | CTCF 148 | CTNNA2 149 | CTNNB1 150 | CTNND1 151 | CTNND2 152 | CUL3 153 | CUX1 154 | CXCR4 155 | CYLD 156 | CYP2C8 157 | CYSLTR2 158 | DAXX 159 | DCC 160 | DCTN1 161 | DDB2 162 | DDIT3 163 | DDR2 164 | DDX10 165 | DDX5 166 | DDX6 167 | DEK 168 | DGCR8 169 | DICER1 170 | DNAJB1 171 | DNM2 172 | DNMT3A 173 | DROSHA 174 | EBF1 175 | ECT2L 176 | EED 177 | EGFR 178 | EIF3E 179 | EIF4A2 180 | ELF3 181 | ELK4 182 | ELL 183 | ELN 184 | EML4 185 | EP300 186 | EPAS1 187 | EPHA3 188 | EPHA7 189 | EPS15 190 | ERBB2 191 | ERBB3 192 | ERBB4 193 | ERC1 194 | ERCC2 195 | ERCC3 196 | ERCC4 197 | ERCC5 198 | ERG 199 | ESR1 200 | ETNK1 201 | ETV1 202 | ETV4 203 | ETV5 204 | ETV6 205 | EWSR1 206 | EXT1 207 | EXT2 208 | EZH2 209 | EZR 210 | FAM131B 211 | FAM135B 212 | FANCA 213 | FANCC 214 | FANCD2 215 | FANCE 216 | FANCF 217 | FANCG 218 | FAS 219 | FAT1 220 | FAT3 221 | FAT4 222 | FBLN2 223 | FBXO11 224 | FBXW7 225 | FCGR2B 226 | 
FCRL4 227 | FEN1 228 | FES 229 | FEV 230 | FGFR1 231 | FGFR1OP 232 | FGFR2 233 | FGFR3 234 | FGFR4 235 | FH 236 | FHIT 237 | FIP1L1 238 | FKBP9 239 | FLCN 240 | FLI1 241 | FLT3 242 | FLT4 243 | FNBP1 244 | FOXA1 245 | FOXL2 246 | FOXO1 247 | FOXO3 248 | FOXP1 249 | FOXR1 250 | FSTL3 251 | FUBP1 252 | FUS 253 | GAS7 254 | GATA2 255 | GATA3 256 | GLI1 257 | GMPS 258 | GNA11 259 | GNAQ 260 | GNAS 261 | GOLGA5 262 | GOPC 263 | GPC5 264 | GPHN 265 | GRIN2A 266 | GRM3 267 | H3F3A 268 | H3F3B 269 | HERPUD1 270 | HEY1 271 | HIF1A 272 | HIP1 273 | HIST1H3B 274 | HIST1H4I 275 | HLA-A 276 | HLF 277 | HMGA1 278 | HMGA2 279 | HNF1A 280 | HNRNPA2B1 281 | HOOK3 282 | HOXA11 283 | HOXA13 284 | HOXA9 285 | HOXC11 286 | HOXC13 287 | HOXD11 288 | HOXD13 289 | HRAS 290 | HSP90AA1 291 | HSP90AB1 292 | ID3 293 | IDH1 294 | IDH2 295 | IGF2BP2 296 | IKBKB 297 | IKZF1 298 | IL2 299 | IL21R 300 | IL6ST 301 | IL7R 302 | IRF4 303 | ISX 304 | ITGAV 305 | ITK 306 | JAK1 307 | JAK2 308 | JAK3 309 | JAZF1 310 | JUN 311 | KAT6A 312 | KAT6B 313 | KAT7 314 | KCNJ5 315 | KDM5A 316 | KDR 317 | KDSR 318 | KEAP1 319 | KIAA1549 320 | KIF5B 321 | KIT 322 | KLF4 323 | KLF6 324 | KLK2 325 | KMT2A 326 | KMT2C 327 | KMT2D 328 | KNSTRN 329 | KRAS 330 | KTN1 331 | LARP4B 332 | LASP1 333 | LATS1 334 | LATS2 335 | LCK 336 | LCP1 337 | LEF1 338 | LEPROTL1 339 | LIFR 340 | LMNA 341 | LMO1 342 | LMO2 343 | LPP 344 | LRIG3 345 | LRP1B 346 | LSM14A 347 | LYL1 348 | LZTR1 349 | MACC1 350 | MAF 351 | MAFB 352 | MALT1 353 | MAML2 354 | MAP2K1 355 | MAP2K2 356 | MAP2K4 357 | MAP3K1 358 | MAP3K13 359 | MAPK1 360 | MAX 361 | MB21D2 362 | MDM2 363 | MDM4 364 | MDS2 365 | MECOM 366 | MEN1 367 | MET 368 | MGMT 369 | MITF 370 | MLF1 371 | MLH1 372 | MLLT1 373 | MLLT10 374 | MLLT11 375 | MLLT3 376 | MLLT6 377 | MN1 378 | MNX1 379 | MPL 380 | MSH2 381 | MSH6 382 | MSI2 383 | MTOR 384 | MUC1 385 | MUC16 386 | MUC4 387 | MUTYH 388 | MYB 389 | MYC 390 | MYCL 391 | MYCN 392 | MYD88 393 | MYH11 394 | MYH9 395 | MYO5A 396 | MYOD1 397 | N4BP2 398 | NAB2 399 | NACA 400 | NBEA 401 | NBN 402 | NCKIPSD 403 | NCOA1 404 | NCOA2 405 | NCOA4 406 | NCOR1 407 | NCOR2 408 | NDRG1 409 | NF1 410 | NF2 411 | NFATC2 412 | NFE2L2 413 | NFIB 414 | NFKB2 415 | NFKBIE 416 | NIN 417 | NKX2-1 418 | NOTCH1 419 | NOTCH2 420 | NPM1 421 | NR4A3 422 | NRAS 423 | NRG1 424 | NSD1 425 | NT5C2 426 | NTHL1 427 | NTRK1 428 | NTRK3 429 | NUMA1 430 | NUP214 431 | NUP98 432 | NUTM1 433 | NUTM2B 434 | NUTM2D 435 | OLIG2 436 | OMD 437 | PABPC1 438 | PAFAH1B2 439 | PALB2 440 | PATZ1 441 | PAX3 442 | PAX5 443 | PAX7 444 | PAX8 445 | PBRM1 446 | PBX1 447 | PCBP1 448 | PCM1 449 | PDCD1LG2 450 | PDE4DIP 451 | PDGFB 452 | PDGFRA 453 | PDGFRB 454 | PER1 455 | PHOX2B 456 | PICALM 457 | PIK3CA 458 | PIK3CB 459 | PIK3R1 460 | PIM1 461 | PLAG1 462 | PLCG1 463 | PML 464 | PMS1 465 | PMS2 466 | POLD1 467 | POLE 468 | POLG 469 | POLQ 470 | POT1 471 | POU2AF1 472 | POU5F1 473 | PPARG 474 | PPFIBP1 475 | PPM1D 476 | PPP2R1A 477 | PPP6C 478 | PRCC 479 | PRDM1 480 | PRDM16 481 | PRDM2 482 | PREX2 483 | PRF1 484 | PRKACA 485 | PRKAR1A 486 | PRKCB 487 | PRPF40B 488 | PRRX1 489 | PSIP1 490 | PTCH1 491 | PTEN 492 | PTK6 493 | PTPN11 494 | PTPN13 495 | PTPN6 496 | PTPRB 497 | PTPRC 498 | PTPRD 499 | PTPRK 500 | PTPRT 501 | PWWP2A 502 | QKI 503 | RABEP1 504 | RAC1 505 | RAD17 506 | RAD21 507 | RAD51B 508 | RAF1 509 | RALGDS 510 | RANBP2 511 | RAP1GDS1 512 | RARA 513 | RB1 514 | RBM15 515 | RECQL4 516 | REL 517 | RET 518 | RFWD3 519 | RGPD3 520 | RGS7 521 | RHOA 522 | RHOH 523 | RMI2 524 | RNF213 525 | RNF43 526 | ROBO2 
527 | ROS1 528 | RPL22 529 | RPL5 530 | RPN1 531 | RSPO2 532 | RSPO3 533 | RUNX1 534 | RUNX1T1 535 | S100A7 536 | SALL4 537 | SBDS 538 | SDC4 539 | SDHA 540 | SDHAF2 541 | SDHB 542 | SDHC 543 | SDHD 544 | SET 545 | SETBP1 546 | SETD1B 547 | SETD2 548 | SETDB1 549 | SF3B1 550 | SFPQ 551 | SFRP4 552 | SGK1 553 | SH2B3 554 | SH3GL1 555 | SIRPA 556 | SIX1 557 | SIX2 558 | SKI 559 | SLC34A2 560 | SLC45A3 561 | SMAD2 562 | SMAD3 563 | SMAD4 564 | SMARCA4 565 | SMARCB1 566 | SMARCD1 567 | SMARCE1 568 | SMO 569 | SND1 570 | SNX29 571 | SOCS1 572 | SOX2 573 | SOX21 574 | SPECC1 575 | SPEN 576 | SPOP 577 | SRC 578 | SRGAP3 579 | SRSF2 580 | SRSF3 581 | SS18 582 | SS18L1 583 | STAG1 584 | STAT3 585 | STAT5B 586 | STAT6 587 | STIL 588 | STK11 589 | STRN 590 | SUFU 591 | SUZ12 592 | SYK 593 | TAF15 594 | TAL1 595 | TAL2 596 | TBL1XR1 597 | TBX3 598 | TCEA1 599 | TCF12 600 | TCF3 601 | TCF7L2 602 | TCL1A 603 | TEC 604 | TERT 605 | TET1 606 | TET2 607 | TFEB 608 | TFG 609 | TFPT 610 | TFRC 611 | TGFBR2 612 | THRAP3 613 | TLX1 614 | TLX3 615 | TMEM127 616 | TMPRSS2 617 | TNC 618 | TNFAIP3 619 | TNFRSF14 620 | TNFRSF17 621 | TOP1 622 | TP53 623 | TP63 624 | TPM3 625 | TPM4 626 | TPR 627 | TRAF7 628 | TRIM24 629 | TRIM27 630 | TRIM33 631 | TRIP11 632 | TRRAP 633 | TSC1 634 | TSC2 635 | TSHR 636 | U2AF1 637 | UBR5 638 | USP44 639 | USP6 640 | USP8 641 | VAV1 642 | VHL 643 | VTI1A 644 | WIF1 645 | WNK2 646 | WRN 647 | WT1 648 | WWTR1 649 | XPA 650 | XPC 651 | XPO1 652 | YWHAE 653 | ZBTB16 654 | ZCCHC8 655 | ZEB1 656 | ZFHX3 657 | ZMYM2 658 | ZNF331 659 | ZNF384 660 | ZNF429 661 | ZNF479 662 | ZNF521 663 | ZNRF3 664 | CDKN2A.p14arf 665 | CDKN2A.p16INK4a 666 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ONC.txt: -------------------------------------------------------------------------------- 1 | RARA 2 | STAT6 3 | PTPRS 4 | SHQ1 5 | PDGFRA 6 | MPL 7 | JAK3 8 | ABL1 9 | AFF3 10 | CDK4 11 | MLLT10 12 | NTRK2 13 | FEV 14 | PARK2 15 | BCL9 16 | KIT 17 | PIK3C2G 18 | MAF 19 | CBLC 20 | GATA3 21 | KEAP1 22 | TNFAIP3 23 | TSC2 24 | MTOR 25 | NT5C2 26 | CDC73 27 | NOTCH1 28 | DEK 29 | DNMT1 30 | DIS3 31 | ZNF521 32 | SND1 33 | MALT1 34 | HNRNPA2B1 35 | KCNJ5 36 | CHEK2 37 | ABL2 38 | MDM4 39 | HOXA13 40 | SUFU 41 | CDC42EP2 42 | H3F3A 43 | TBK1 44 | ATM 45 | MSH6 46 | PREX2 47 | ERBB2 48 | NF2 49 | HIP1 50 | DDX5 51 | CSF1R 52 | ALK 53 | PIK3R2 54 | NFE2L2 55 | CDKN2C 56 | NF1 57 | KRAS 58 | MAML2 59 | NKX2-1 60 | FOXA1 61 | FLT3 62 | MCL1 63 | NUP98 64 | JUN 65 | MAP2K1 66 | SSX4 67 | LPP 68 | POU2AF1 69 | BRCA1 70 | NOTCH4 71 | USP6 72 | TEK 73 | SETBP1 74 | TNFRSF17 75 | FLCN 76 | IRS1 77 | NOTCH3 78 | MYD88 79 | LMO1 80 | CHEK1 81 | EPHB4 82 | SOX2 83 | TLX1 84 | EWSR1 85 | FLT4 86 | SSX1 87 | UBR5 88 | KMT2C 89 | ERBB4 90 | CCND1 91 | ETV6 92 | ATF1 93 | TRRAP 94 | CDH1 95 | PIK3R3 96 | PTEN 97 | MAP3K8 98 | NFATC2 99 | PARP1 100 | ALOX12B 101 | CDK8 102 | CRKL 103 | FGFR2 104 | BCL11A 105 | CRTC1 106 | RAC1 107 | PIK3CB 108 | CBL 109 | H3F3B 110 | IDH1 111 | EZH2 112 | ERBB3 113 | SRSF2 114 | RET 115 | SF3B1 116 | XPO1 117 | CYLD 118 | EPHB1 119 | SET 120 | PDGFB 121 | SMO 122 | NFKB1 123 | IL6ST 124 | TGFBR2 125 | TSHR 126 | SRC 127 | TCF7L2 128 | PBX1 129 | HIF1A 130 | TFE3 131 | ARID1A 132 | BCL6 133 | MYC 134 | MLST8 135 | CEBPA 136 | TSC1 137 | CHD4 138 | DDIT3 139 | GNAS 140 | FLT1 141 | FCGR2B 142 | PSIP1 143 | CCND2 144 | PDGFRB 145 | CBLB 146 | DNMT3A 147 | LDHA 148 | ERG 149 | SMARCA4 150 | TMPRSS2 151 | YES1 152 | PTPRD 153 | PLCG1 154 | 
WT1 155 | CDKN2A.p16INK4a 156 | AKT1 157 | MSI2 158 | REL 159 | PPM1D 160 | LYL1 161 | POU5F1 162 | WAS 163 | BRD3 164 | KAT6A 165 | SIX1 166 | WWTR1 167 | HSP90AA1 168 | FAS 169 | IGF1R 170 | OLIG2 171 | IKZF1 172 | MYCN 173 | TP53 174 | CDK6 175 | PRKCI 176 | LMO2 177 | NSD3 178 | CDKN2B 179 | IKBKB 180 | VHL 181 | PTPN11 182 | ACKR3 183 | FAM46C 184 | PNRC1 185 | BIRC2 186 | AR 187 | HOXC13 188 | CXCR4 189 | BAP1 190 | HOXD13 191 | SMARCB1 192 | NTRK3 193 | MAGI2 194 | ELK4 195 | EP300 196 | GNA11 197 | TRIM27 198 | HRAS 199 | FSTL3 200 | BCL2 201 | ETV4 202 | KDR 203 | CSF3R 204 | CCNE1 205 | HNF1A 206 | KLF6 207 | PAK7 208 | JAK2 209 | NSD2 210 | CREB3L2 211 | RSPO3 212 | IRS2 213 | USP8 214 | EPHA5 215 | STAT3 216 | YAP1 217 | TFEB 218 | SDHB 219 | NTRK1 220 | TLX3 221 | HOXC11 222 | AFDN 223 | RAF1 224 | FBXW7 225 | MAP2K2 226 | PLAG1 227 | SSX2 228 | SYK 229 | PIK3CD 230 | TET2 231 | SALL4 232 | NOTCH2 233 | FGFR4 234 | PRKACA 235 | ROS1 236 | PTCH1 237 | HLF 238 | GOLPH3 239 | SMAD4 240 | EPHA3 241 | TOP1 242 | ETV5 243 | CD79B 244 | NR4A3 245 | SPOP 246 | AKT2 247 | CD79A 248 | FGFR1 249 | CACNA1D 250 | AURKA 251 | SRSF3 252 | NCOA2 253 | TERT 254 | EPHA6 255 | MLH1 256 | RPTOR 257 | TAF15 258 | AKT3 259 | PIK3CG 260 | PIK3R1 261 | PAX5 262 | ETV1 263 | DNMT3B 264 | TP63 265 | IL7R 266 | CDKN2A.p14arf 267 | PRDM16 268 | RB1 269 | DICER1 270 | PBRM1 271 | RUNX1 272 | PIK3CA 273 | CREB1 274 | SOCS1 275 | EPHA7 276 | CTNNB1 277 | FOXL2 278 | FCRL4 279 | KMT2D 280 | FH 281 | HEY1 282 | LCK 283 | EPHB6 284 | P2RY8 285 | DDX6 286 | SH3GL1 287 | NPM1 288 | PAX3 289 | GRIN2A 290 | MITF 291 | LGR6 292 | TCL1A 293 | SETD2 294 | CRLF2 295 | JAK1 296 | CREBBP 297 | TET1 298 | TAL1 299 | GNAQ 300 | BRAF 301 | BCL3 302 | RICTOR 303 | IGFBP7 304 | GATA2 305 | STK11 306 | TAL2 307 | FLI1 308 | HOXD11 309 | EGFR 310 | MSH2 311 | IDH2 312 | RAP1GDS1 313 | AFF4 314 | EIF4EBP1 315 | STIL 316 | NFKB2 317 | HDAC2 318 | MECOM 319 | FGFR3 320 | MAP2K4 321 | ASXL1 322 | CARD11 323 | MET 324 | MAPK1 325 | HIST1H3B 326 | HMGA1 327 | BRCA2 328 | KDM5A 329 | EPHA8 330 | CCND3 331 | PHOX2B 332 | MYCL 333 | FOXP1 334 | ACVR1 335 | DDR2 336 | IKBKE 337 | BCL2L1 338 | MTCP1 339 | KMT2A 340 | MYOD1 341 | PIM1 342 | HMGA2 343 | NRAS 344 | FUBP1 345 | MDM2 346 | CD74 347 | GSK3B 348 | U2AF1 349 | PDCD1LG2 350 | CALR 351 | PRKAR1A 352 | NUTM1 353 | MEN1 354 | MYB 355 | DAXX 356 | BRD4 357 | PLK2 358 | MN1 359 | INSR 360 | MAFB 361 | ARHGAP26 362 | ESR1 363 | APC 364 | CDKN2A.p14arf 365 | CDKN2A.p16INK4a 366 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_TSG.txt: -------------------------------------------------------------------------------- 1 | ABI1 2 | ACVR2A 3 | AMER1 4 | APC 5 | ARHGAP26 6 | ARHGEF12 7 | ARID1A 8 | ARID1B 9 | ARID2 10 | ASXL1 11 | ATM 12 | ATP2B3 13 | ATR 14 | ATRX 15 | AXIN1 16 | AXIN2 17 | B2M 18 | BAP1 19 | BARD1 20 | BAX 21 | BCL10 22 | BCOR 23 | BLM 24 | BRCA1 25 | BRCA2 26 | BRIP1 27 | BTG1 28 | BUB1B 29 | CAMTA1 30 | CARS 31 | CASP8 32 | CBFA2T3 33 | CBFB 34 | CBLB 35 | CCDC6 36 | CCNB1IP1 37 | CD274 38 | CDC73 39 | CDH1 40 | CDH11 41 | CDK12 42 | CDKN1B 43 | CDKN2A 44 | CDKN2C 45 | CDX2 46 | CEBPA 47 | CHEK2 48 | CIITA 49 | CLTC 50 | CLTCL1 51 | CNBP 52 | CNOT3 53 | CREB3L1 54 | CTCF 55 | CYLD 56 | DDX10 57 | DDX3X 58 | DICER1 59 | DNM2 60 | DNMT3A 61 | DROSHA 62 | EBF1 63 | EIF3E 64 | ELL 65 | EP300 66 | EPS15 67 | ERCC2 68 | ERCC3 69 | ERCC4 70 | ERCC5 71 | ETNK1 72 | ETV6 73 | EXT1 74 | EXT2 75 | FANCA 76 | FANCC 77 | 
FANCD2 78 | FANCE 79 | FANCF 80 | FANCG 81 | FAS 82 | FAT1 83 | FAT4 84 | FBXO11 85 | FBXW7 86 | FH 87 | FHIT 88 | FLCN 89 | FUS 90 | GRIN2A 91 | HNF1A 92 | IKZF1 93 | KAT6B 94 | KDM5C 95 | KEAP1 96 | KLF6 97 | KMT2C 98 | KNL1 99 | LATS1 100 | LATS2 101 | LRIG3 102 | LRP1B 103 | LZTR1 104 | MAX 105 | MED12 106 | MEN1 107 | MLF1 108 | MLH1 109 | MSH2 110 | MSH6 111 | MUTYH 112 | MYH9 113 | NAB2 114 | NBN 115 | NCOA4 116 | NCOR1 117 | NCOR2 118 | NDRG1 119 | NF1 120 | NF2 121 | NFKBIE 122 | NRG1 123 | PALB2 124 | PATZ1 125 | PBRM1 126 | PER1 127 | PHF6 128 | PHOX2B 129 | PIK3R1 130 | PML 131 | PMS2 132 | POLD1 133 | POLE 134 | POT1 135 | PPARG 136 | PPP2R1A 137 | PPP6C 138 | PRDM1 139 | PRF1 140 | PTCH1 141 | PTEN 142 | PTPN13 143 | PTPRB 144 | PTPRC 145 | PTPRK 146 | PTPRT 147 | RAD51B 148 | RANBP2 149 | RB1 150 | RBM10 151 | RHOH 152 | RMI2 153 | RNF43 154 | RPL10 155 | RPL22 156 | RPL5 157 | RSPO2 158 | SBDS 159 | SDHA 160 | SDHAF2 161 | SDHB 162 | SDHC 163 | SDHD 164 | SETD2 165 | SFPQ 166 | SFRP4 167 | SH2B3 168 | SLC34A2 169 | SMAD2 170 | SMAD3 171 | SMAD4 172 | SMARCA4 173 | SMARCB1 174 | SMARCD1 175 | SMARCE1 176 | SOCS1 177 | SPEN 178 | SPOP 179 | STAG2 180 | STK11 181 | SUFU 182 | TENT5C 183 | TET2 184 | TGFBR2 185 | TMEM127 186 | TNFAIP3 187 | TNFRSF14 188 | TPM3 189 | TRAF7 190 | TRIM33 191 | TSC1 192 | TSC2 193 | VHL 194 | WIF1 195 | WRN 196 | XPA 197 | XPC 198 | YWHAE 199 | ZBTB16 200 | ZFHX3 201 | ZNF331 202 | ZRSR2 203 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_230.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ABL2 3 | AKT1 4 | AKT2 5 | AKT3 6 | ALK 7 | ALOX12B 8 | APC 9 | AR 10 | ARAF 11 | ARHGAP26 12 | ARID1A 13 | ASXL1 14 | ATM 15 | ATRX 16 | AURKA 17 | BAP1 18 | BCL2L1 19 | BCL6 20 | BIRC2 21 | BRAF 22 | BRCA1 23 | BRCA2 24 | CARD11 25 | CBL 26 | CBLB 27 | CBLC 28 | CCND1 29 | CCNE1 30 | CD79B 31 | CDC42EP2 32 | CDC73 33 | CDH1 34 | CDK4 35 | CDK6 36 | CDK8 37 | CDKN2A 38 | CDKN2B 39 | CDKN2C 40 | CEBPA 41 | CHEK1 42 | CHEK2 43 | CREBBP 44 | CRKL 45 | CRLF2 46 | CSF1R 47 | CTNNB1 48 | CYLD 49 | DAXX 50 | DDR2 51 | DICER1 52 | DIS3 53 | DNMT1 54 | DNMT3A 55 | DNMT3B 56 | EGFR 57 | EIF4EBP1 58 | EP300 59 | EPHA3 60 | EPHA5 61 | EPHA6 62 | EPHA7 63 | EPHA8 64 | EPHB1 65 | EPHB4 66 | EPHB6 67 | ERBB2 68 | ERBB3 69 | ERBB4 70 | ERG 71 | ESR1 72 | ETV1 73 | ETV6 74 | EZH2 75 | FAM123B 76 | FAM46C 77 | FAS 78 | FBXW7 79 | FGFR1 80 | FGFR2 81 | FGFR3 82 | FGFR4 83 | FH 84 | FLCN 85 | FLT1 86 | FLT3 87 | FOXL2 88 | GATA1 89 | GATA2 90 | GATA3 91 | GNA11 92 | GNAQ 93 | GNAS 94 | GOLPH3 95 | GRIN2A 96 | GSK3B 97 | HDAC2 98 | HIF1A 99 | HMGA2 100 | HNF1A 101 | HRAS 102 | HSP90AA1 103 | IDH1 104 | IDH2 105 | IGF1R 106 | IGFBP7 107 | IKBKE 108 | IKZF1 109 | INSR 110 | IRS1 111 | IRS2 112 | JAK1 113 | JAK2 114 | JAK3 115 | JUN 116 | KDM5C 117 | KDM6A 118 | KDR 119 | KEAP1 120 | KIT 121 | KLF6 122 | KMT2A 123 | KMT2C 124 | KMT2D 125 | KRAS 126 | LDHA 127 | LGR6 128 | MAGI2 129 | MAP2K1 130 | MAP2K2 131 | MAP2K4 132 | MAP3K8 133 | MCL1 134 | MDM2 135 | MDM4 136 | MEN1 137 | MET 138 | MITF 139 | MLH1 140 | MLST8 141 | MPL 142 | MSH2 143 | MSH6 144 | MTOR 145 | MYB 146 | MYC 147 | MYCL1 148 | MYCN 149 | NCOA2 150 | NF1 151 | NF2 152 | NFE2L2 153 | NFKB1 154 | NFKB2 155 | NKX2-1 156 | NOTCH1 157 | NOTCH2 158 | NOTCH3 159 | NOTCH4 160 | NPM1 161 | NRAS 162 | NTRK1 163 | NTRK2 164 | NTRK3 165 | PAK7 166 | PARK2 167 | PARP1 168 | PAX5 169 | PBRM1 170 | PDGFRA 171 | PDGFRB 172 | 
PHOX2B 173 | PIK3C2G 174 | PIK3CA 175 | PIK3CB 176 | PIK3CD 177 | PIK3CG 178 | PIK3R1 179 | PIK3R2 180 | PIK3R3 181 | PKM2 182 | PLK2 183 | PNRC1 184 | PREX2 185 | PRKAR1A 186 | PRKCI 187 | PTCH1 188 | PTEN 189 | PTPN11 190 | PTPRD 191 | PTPRS 192 | RAF1 193 | RARA 194 | RB1 195 | REL 196 | RET 197 | RICTOR 198 | RPTOR 199 | RUNX1 200 | SDHB 201 | SETD2 202 | SHQ1 203 | SMAD4 204 | SMARCA4 205 | SMARCB1 206 | SMO 207 | SOCS1 208 | SOX2 209 | SPOP 210 | SRC 211 | STK11 212 | SUFU 213 | TBK1 214 | TEK 215 | TERT 216 | TET1 217 | TET2 218 | TGFBR2 219 | TMPRSS2 220 | TNFAIP3 221 | TOP1 222 | TP53 223 | TP63 224 | TSC1 225 | TSC2 226 | TSHR 227 | VHL 228 | WT1 229 | YAP1 230 | YES1 231 | CDKN2A.p14arf 232 | CDKN2A.p16INK4a 233 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_341.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | AKT1 3 | AKT2 4 | AKT3 5 | ALK 6 | ALOX12B 7 | APC 8 | AR 9 | ARAF 10 | ARID1A 11 | ARID1B 12 | ARID2 13 | ARID5B 14 | ASXL1 15 | ASXL2 16 | ATM 17 | ATR 18 | ATRX 19 | AURKA 20 | AURKB 21 | AXIN1 22 | AXIN2 23 | AXL 24 | B2M 25 | BAP1 26 | BARD1 27 | BBC3 28 | BCL2 29 | BCL2L1 30 | BCL2L11 31 | BCL6 32 | BCOR 33 | BLM 34 | BMPR1A 35 | BRAF 36 | BRCA1 37 | BRCA2 38 | BRD4 39 | BRIP1 40 | BTK 41 | CARD11 42 | CASP8 43 | CBFB 44 | CBL 45 | CCND1 46 | CCND2 47 | CCND3 48 | CCNE1 49 | CD274 50 | CD276 51 | CD79B 52 | CDC73 53 | CDH1 54 | CDK12 55 | CDK4 56 | CDK6 57 | CDK8 58 | CDKN1A 59 | CDKN1B 60 | CDKN2A.p14arf 61 | CDKN2A.p16INK4a 62 | CDKN2B 63 | CDKN2C 64 | CHEK1 65 | CHEK2 66 | CIC 67 | CREBBP 68 | CRKL 69 | CRLF2 70 | CSF1R 71 | CTCF 72 | CTLA4 73 | CTNNB1 74 | CUL3 75 | DAXX 76 | DCUN1D1 77 | DDR2 78 | DICER1 79 | DIS3 80 | DNMT1 81 | DNMT3A 82 | DNMT3B 83 | DOT1L 84 | E2F3 85 | EED 86 | EGFL7 87 | EGFR 88 | EIF1AX 89 | EP300 90 | EPCAM 91 | EPHA3 92 | EPHA5 93 | EPHB1 94 | ERBB2 95 | ERBB3 96 | ERBB4 97 | ERCC2 98 | ERCC3 99 | ERCC4 100 | ERCC5 101 | ERG 102 | ESR1 103 | ETV1 104 | ETV6 105 | EZH2 106 | FAM123B 107 | FAM175A 108 | FAM46C 109 | FANCA 110 | FANCC 111 | FAT1 112 | FBXW7 113 | FGF19 114 | FGF3 115 | FGF4 116 | FGFR1 117 | FGFR2 118 | FGFR3 119 | FGFR4 120 | FH 121 | FLCN 122 | FLT1 123 | FLT3 124 | FLT4 125 | FOXA1 126 | FOXL2 127 | FOXP1 128 | FUBP1 129 | GATA1 130 | GATA2 131 | GATA3 132 | GNA11 133 | GNAQ 134 | GNAS 135 | GREM1 136 | GRIN2A 137 | GSK3B 138 | H3F3C 139 | HGF 140 | HIST1H1C 141 | HIST1H2BD 142 | HIST1H3B 143 | HNF1A 144 | HRAS 145 | ICOSLG 146 | IDH1 147 | IDH2 148 | IFNGR1 149 | IGF1 150 | IGF1R 151 | IGF2 152 | IKBKE 153 | IKZF1 154 | IL10 155 | IL7R 156 | INPP4A 157 | INPP4B 158 | INSR 159 | IRF4 160 | IRS1 161 | IRS2 162 | JAK1 163 | JAK2 164 | JAK3 165 | JUN 166 | KDM5A 167 | KDM5C 168 | KDM6A 169 | KDR 170 | KEAP1 171 | KIT 172 | KLF4 173 | KRAS 174 | LATS1 175 | LATS2 176 | LMO1 177 | MAP2K1 178 | MAP2K2 179 | MAP2K4 180 | MAP3K1 181 | MAP3K13 182 | MAPK1 183 | MAX 184 | MCL1 185 | MDC1 186 | MDM2 187 | MDM4 188 | MED12 189 | MEF2B 190 | MEN1 191 | MET 192 | MITF 193 | MLH1 194 | MLL 195 | MLL2 196 | MLL3 197 | MPL 198 | MRE11A 199 | MSH2 200 | MSH6 201 | MTOR 202 | MUTYH 203 | MYC 204 | MYCL1 205 | MYCN 206 | MYD88 207 | MYOD1 208 | NBN 209 | NCOR1 210 | NF1 211 | NF2 212 | NFE2L2 213 | NKX2-1 214 | NKX3-1 215 | NOTCH1 216 | NOTCH2 217 | NOTCH3 218 | NOTCH4 219 | NPM1 220 | NRAS 221 | NSD1 222 | NTRK1 223 | NTRK2 224 | NTRK3 225 | PAK1 226 | PAK7 227 | PALB2 228 | PARK2 229 | PARP1 230 | PAX5 231 | PBRM1 232 | PDCD1 
233 | PDGFRA 234 | PDGFRB 235 | PDPK1 236 | PHOX2B 237 | PIK3C2G 238 | PIK3C3 239 | PIK3CA 240 | PIK3CB 241 | PIK3CD 242 | PIK3CG 243 | PIK3R1 244 | PIK3R2 245 | PIK3R3 246 | PIM1 247 | PLK2 248 | PMAIP1 249 | PMS1 250 | PMS2 251 | PNRC1 252 | POLE 253 | PPP2R1A 254 | PRDM1 255 | PRKAR1A 256 | PTCH1 257 | PTEN 258 | PTPN11 259 | PTPRD 260 | PTPRS 261 | PTPRT 262 | RAC1 263 | RAD50 264 | RAD51 265 | RAD51B 266 | RAD51C 267 | RAD51D 268 | RAD52 269 | RAD54L 270 | RAF1 271 | RARA 272 | RASA1 273 | RB1 274 | RBM10 275 | RECQL4 276 | REL 277 | RET 278 | RFWD2 279 | RHOA 280 | RICTOR 281 | RIT1 282 | RNF43 283 | ROS1 284 | RPS6KA4 285 | RPS6KB2 286 | RPTOR 287 | RUNX1 288 | RYBP 289 | SDHA 290 | SDHAF2 291 | SDHB 292 | SDHC 293 | SDHD 294 | SETD2 295 | SF3B1 296 | SH2D1A 297 | SHQ1 298 | SMAD2 299 | SMAD3 300 | SMAD4 301 | SMARCA4 302 | SMARCB1 303 | SMARCD1 304 | SMO 305 | SOCS1 306 | SOX17 307 | SOX2 308 | SOX9 309 | SPEN 310 | SPOP 311 | SRC 312 | STAG2 313 | STK11 314 | STK40 315 | SUFU 316 | SUZ12 317 | SYK 318 | TBX3 319 | TERT 320 | TET1 321 | TET2 322 | TGFBR1 323 | TGFBR2 324 | TMEM127 325 | TMPRSS2 326 | TNFAIP3 327 | TNFRSF14 328 | TOP1 329 | TP53 330 | TP63 331 | TRAF7 332 | TSC1 333 | TSC2 334 | TSHR 335 | U2AF1 336 | VHL 337 | VTCN1 338 | WT1 339 | XIAP 340 | XPO1 341 | YAP1 342 | YES1 343 | CDKN2AP14ARF 344 | CDKN2AP16INK4A 345 | MYB 346 | NFIB 347 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_410.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ACVR1 3 | AKT1 4 | AKT2 5 | AKT3 6 | ALK 7 | ALOX12B 8 | ANKRD11 9 | APC 10 | AR 11 | ARAF 12 | ARID1A 13 | ARID1B 14 | ARID2 15 | ARID5B 16 | ASXL1 17 | ASXL2 18 | ATM 19 | ATR 20 | ATRX 21 | AURKA 22 | AURKB 23 | AXIN1 24 | AXIN2 25 | AXL 26 | B2M 27 | BAP1 28 | BARD1 29 | BBC3 30 | BCL10 31 | BCL2 32 | BCL2L1 33 | BCL2L11 34 | BCL6 35 | BCOR 36 | BIRC3 37 | BLM 38 | BMPR1A 39 | BRAF 40 | BRCA1 41 | BRCA2 42 | BRD4 43 | BRIP1 44 | BTK 45 | CALR 46 | CARD11 47 | CASP8 48 | CBFB 49 | CBL 50 | CCND1 51 | CCND2 52 | CCND3 53 | CCNE1 54 | CD274 55 | CD276 56 | CD79A 57 | CD79B 58 | CDC73 59 | CDH1 60 | CDK12 61 | CDK4 62 | CDK6 63 | CDK8 64 | CDKN1A 65 | CDKN1B 66 | CDKN2A.p14arf 67 | CDKN2A.p16INK4a 68 | CDKN2B 69 | CDKN2C 70 | CEBPA 71 | CENPA 72 | CHEK1 73 | CHEK2 74 | CIC 75 | CREBBP 76 | CRKL 77 | CRLF2 78 | CSF1R 79 | CSF3R 80 | CTCF 81 | CTLA4 82 | CTNNB1 83 | CUL3 84 | CXCR4 85 | DAXX 86 | DCUN1D1 87 | DDR2 88 | DICER1 89 | DIS3 90 | DNAJB1 91 | DNMT1 92 | DNMT3A 93 | DNMT3B 94 | DOT1L 95 | E2F3 96 | EED 97 | EGFL7 98 | EGFR 99 | EIF1AX 100 | EIF4A2 101 | EIF4E 102 | EP300 103 | EPCAM 104 | EPHA3 105 | EPHA5 106 | EPHA7 107 | EPHB1 108 | ERBB2 109 | ERBB3 110 | ERBB4 111 | ERCC2 112 | ERCC3 113 | ERCC4 114 | ERCC5 115 | ERG 116 | ERRFI1 117 | ESR1 118 | ETV1 119 | ETV6 120 | EZH2 121 | FAM123B 122 | FAM175A 123 | FAM46C 124 | FANCA 125 | FANCC 126 | FAT1 127 | FBXW7 128 | FGF19 129 | FGF3 130 | FGF4 131 | FGFR1 132 | FGFR2 133 | FGFR3 134 | FGFR4 135 | FH 136 | FLCN 137 | FLT1 138 | FLT3 139 | FLT4 140 | FOXA1 141 | FOXL2 142 | FOXO1 143 | FOXP1 144 | FUBP1 145 | FYN 146 | GATA1 147 | GATA2 148 | GATA3 149 | GLI1 150 | GNA11 151 | GNAQ 152 | GNAS 153 | GPS2 154 | GREM1 155 | GRIN2A 156 | GSK3B 157 | H3F3A 158 | H3F3B 159 | H3F3C 160 | HGF 161 | HIST1H1C 162 | HIST1H2BD 163 | HIST1H3A 164 | HIST1H3B 165 | HIST1H3C 166 | HIST1H3D 167 | HIST1H3E 168 | HIST1H3F 169 | HIST1H3G 170 | HIST1H3H 171 | HIST1H3I 172 | HIST1H3J 173 | 
HIST2H3C 174 | HIST2H3D 175 | HIST3H3 176 | HLA-A 177 | HNF1A 178 | HOXB13 179 | HRAS 180 | ICOSLG 181 | ID3 182 | IDH1 183 | IDH2 184 | IFNGR1 185 | IGF1 186 | IGF1R 187 | IGF2 188 | IKBKE 189 | IKZF1 190 | IL10 191 | IL7R 192 | INHA 193 | INHBA 194 | INPP4A 195 | INPP4B 196 | INSR 197 | IRF4 198 | IRS1 199 | IRS2 200 | JAK1 201 | JAK2 202 | JAK3 203 | JUN 204 | KDM5A 205 | KDM5C 206 | KDM6A 207 | KDR 208 | KEAP1 209 | KIT 210 | KLF4 211 | KRAS 212 | LATS1 213 | LATS2 214 | LMO1 215 | MALT1 216 | MAP2K1 217 | MAP2K2 218 | MAP2K4 219 | MAP3K1 220 | MAP3K13 221 | MAP3K14 222 | MAPK1 223 | MAPK3 224 | MAX 225 | MCL1 226 | MDC1 227 | MDM2 228 | MDM4 229 | MED12 230 | MEF2B 231 | MEN1 232 | MET 233 | MGA 234 | MITF 235 | MLH1 236 | MLL 237 | MLL2 238 | MLL3 239 | MPL 240 | MRE11A 241 | MSH2 242 | MSH6 243 | MST1 244 | MST1R 245 | MTOR 246 | MUTYH 247 | MYC 248 | MYCL1 249 | MYCN 250 | MYD88 251 | MYOD1 252 | NBN 253 | NCOA3 254 | NCOR1 255 | NEGR1 256 | NF1 257 | NF2 258 | NFE2L2 259 | NFKBIA 260 | NKX2-1 261 | NKX3-1 262 | NOTCH1 263 | NOTCH2 264 | NOTCH3 265 | NOTCH4 266 | NPM1 267 | NRAS 268 | NSD1 269 | NTRK1 270 | NTRK2 271 | NTRK3 272 | NUP93 273 | PAK1 274 | PAK7 275 | PALB2 276 | PARK2 277 | PARP1 278 | PAX5 279 | PBRM1 280 | PDCD1 281 | PDGFRA 282 | PDGFRB 283 | PDPK1 284 | PGR 285 | PHOX2B 286 | PIK3C2G 287 | PIK3C3 288 | PIK3CA 289 | PIK3CB 290 | PIK3CD 291 | PIK3CG 292 | PIK3R1 293 | PIK3R2 294 | PIK3R3 295 | PIM1 296 | PLCG2 297 | PLK2 298 | PMAIP1 299 | PMS1 300 | PMS2 301 | PNRC1 302 | POLD1 303 | POLE 304 | PPM1D 305 | PPP2R1A 306 | PPP6C 307 | PRDM1 308 | PRKAR1A 309 | PTCH1 310 | PTEN 311 | PTPN11 312 | PTPRD 313 | PTPRS 314 | PTPRT 315 | RAB35 316 | RAC1 317 | RAD21 318 | RAD50 319 | RAD51 320 | RAD51B 321 | RAD51C 322 | RAD51D 323 | RAD52 324 | RAD54L 325 | RAF1 326 | RARA 327 | RASA1 328 | RB1 329 | RBM10 330 | RECQL4 331 | REL 332 | RET 333 | RFWD2 334 | RHEB 335 | RHOA 336 | RICTOR 337 | RIT1 338 | RNF43 339 | ROS1 340 | RPS6KA4 341 | RPS6KB2 342 | RPTOR 343 | RUNX1 344 | RYBP 345 | SDHA 346 | SDHAF2 347 | SDHB 348 | SDHC 349 | SDHD 350 | SETD2 351 | SF3B1 352 | SH2B3 353 | SH2D1A 354 | SHQ1 355 | SMAD2 356 | SMAD3 357 | SMAD4 358 | SMARCA4 359 | SMARCB1 360 | SMARCD1 361 | SMO 362 | SOCS1 363 | SOX17 364 | SOX2 365 | SOX9 366 | SPEN 367 | SPOP 368 | SRC 369 | SRSF2 370 | STAG2 371 | STAT3 372 | STAT5A 373 | STAT5B 374 | STK11 375 | STK40 376 | SUFU 377 | SUZ12 378 | SYK 379 | TBX3 380 | TCEB1 381 | TCF3 382 | TCF7L2 383 | TERT 384 | TET1 385 | TET2 386 | TGFBR1 387 | TGFBR2 388 | TMEM127 389 | TMPRSS2 390 | TNFAIP3 391 | TNFRSF14 392 | TOP1 393 | TP53 394 | TP63 395 | TRAF2 396 | TRAF7 397 | TSC1 398 | TSC2 399 | TSHR 400 | U2AF1 401 | VEGFA 402 | VHL 403 | VTCN1 404 | WT1 405 | XIAP 406 | XPO1 407 | XRCC2 408 | YAP1 409 | YES1 410 | ZFHX3 411 | ZRSR2 412 | CDKN2AP16INK4A 413 | CDKN2AP14ARF 414 | MYB 415 | NFIB 416 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_468.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ACVR1 3 | AGO2 4 | AKT1 5 | AKT2 6 | AKT3 7 | ALK 8 | ALOX12B 9 | ANKRD11 10 | APC 11 | AR 12 | ARAF 13 | ARID1A 14 | ARID1B 15 | ARID2 16 | ARID5B 17 | ASXL1 18 | ASXL2 19 | ATM 20 | ATR 21 | ATRX 22 | AURKA 23 | AURKB 24 | AXIN1 25 | AXIN2 26 | AXL 27 | B2M 28 | BABAM1 29 | BAP1 30 | BARD1 31 | BBC3 32 | BCL10 33 | BCL2 34 | BCL2L1 35 | BCL2L11 36 | BCL6 37 | BCOR 38 | BIRC3 39 | BLM 40 | BMPR1A 41 | BRAF 42 | BRCA1 43 | BRCA2 44 | BRD4 45 | BRIP1 
46 | BTK 47 | CALR 48 | CARD11 49 | CARM1 50 | CASP8 51 | CBFB 52 | CBL 53 | CCND1 54 | CCND2 55 | CCND3 56 | CCNE1 57 | CD274 58 | CD276 59 | CD79A 60 | CD79B 61 | CDC42 62 | CDC73 63 | CDH1 64 | CDK12 65 | CDK4 66 | CDK6 67 | CDK8 68 | CDKN1A 69 | CDKN1B 70 | CDKN2A 71 | CDKN2A.p14arf 72 | CDKN2A.p16INK4a 73 | CDKN2B 74 | CDKN2C 75 | CEBPA 76 | CENPA 77 | CHEK1 78 | CHEK2 79 | CIC 80 | CREBBP 81 | CRKL 82 | CRLF2 83 | CSDE1 84 | CSF1R 85 | CSF3R 86 | CTCF 87 | CTLA4 88 | CTNNB1 89 | CUL3 90 | CXCR4 91 | CYLD 92 | CYSLTR2 93 | DAXX 94 | DCUN1D1 95 | DDR2 96 | DICER1 97 | DIS3 98 | DNAJB1 99 | DNMT1 100 | DNMT3A 101 | DNMT3B 102 | DOT1L 103 | DROSHA 104 | DUSP4 105 | E2F3 106 | EED 107 | EGFL7 108 | EGFR 109 | EIF1AX 110 | EIF4A2 111 | EIF4E 112 | ELF3 113 | EP300 114 | EPAS1 115 | EPCAM 116 | EPHA3 117 | EPHA5 118 | EPHA7 119 | EPHB1 120 | ERBB2 121 | ERBB3 122 | ERBB4 123 | ERCC2 124 | ERCC3 125 | ERCC4 126 | ERCC5 127 | ERF 128 | ERG 129 | ERRFI1 130 | ESR1 131 | ETV1 132 | ETV6 133 | EZH1 134 | EZH2 135 | FAM123B 136 | FAM175A 137 | FAM46C 138 | FAM58A 139 | FANCA 140 | FANCC 141 | FAT1 142 | FBXW7 143 | FGF19 144 | FGF3 145 | FGF4 146 | FGFR1 147 | FGFR2 148 | FGFR3 149 | FGFR4 150 | FH 151 | FLCN 152 | FLT1 153 | FLT3 154 | FLT4 155 | FOXA1 156 | FOXL2 157 | FOXO1 158 | FOXP1 159 | FUBP1 160 | FYN 161 | GATA1 162 | GATA2 163 | GATA3 164 | GLI1 165 | GNA11 166 | GNAQ 167 | GNAS 168 | GPS2 169 | GREM1 170 | GRIN2A 171 | GSK3B 172 | H3F3A 173 | H3F3B 174 | H3F3C 175 | HGF 176 | HIST1H1C 177 | HIST1H2BD 178 | HIST1H3A 179 | HIST1H3B 180 | HIST1H3C 181 | HIST1H3D 182 | HIST1H3E 183 | HIST1H3F 184 | HIST1H3G 185 | HIST1H3H 186 | HIST1H3I 187 | HIST1H3J 188 | HIST2H3C 189 | HIST2H3D 190 | HIST3H3 191 | HLA-A 192 | HLA-B 193 | HNF1A 194 | HOXB13 195 | HRAS 196 | ICOSLG 197 | ID3 198 | IDH1 199 | IDH2 200 | IFNGR1 201 | IGF1 202 | IGF1R 203 | IGF2 204 | IKBKE 205 | IKZF1 206 | IL10 207 | IL7R 208 | INHA 209 | INHBA 210 | INPP4A 211 | INPP4B 212 | INPPL1 213 | INSR 214 | IRF4 215 | IRS1 216 | IRS2 217 | JAK1 218 | JAK2 219 | JAK3 220 | JUN 221 | KDM5A 222 | KDM5C 223 | KDM6A 224 | KDR 225 | KEAP1 226 | KIT 227 | KLF4 228 | KMT2B 229 | KMT5A 230 | KNSTRN 231 | KRAS 232 | LATS1 233 | LATS2 234 | LMO1 235 | LYN 236 | MALT1 237 | MAP2K1 238 | MAP2K2 239 | MAP2K4 240 | MAP3K1 241 | MAP3K13 242 | MAP3K14 243 | MAPK1 244 | MAPK3 245 | MAPKAP1 246 | MAX 247 | MCL1 248 | MDC1 249 | MDM2 250 | MDM4 251 | MED12 252 | MEF2B 253 | MEN1 254 | MET 255 | MGA 256 | MITF 257 | MLH1 258 | MLL 259 | MLL2 260 | MLL3 261 | MPL 262 | MRE11A 263 | MSH2 264 | MSH3 265 | MSH6 266 | MSI1 267 | MSI2 268 | MST1 269 | MST1R 270 | MTOR 271 | MUTYH 272 | MYC 273 | MYCL1 274 | MYCN 275 | MYD88 276 | MYOD1 277 | NBN 278 | NCOA3 279 | NCOR1 280 | NEGR1 281 | NF1 282 | NF2 283 | NFE2L2 284 | NFKBIA 285 | NKX2-1 286 | NKX3-1 287 | NOTCH1 288 | NOTCH2 289 | NOTCH3 290 | NOTCH4 291 | NPM1 292 | NRAS 293 | NSD1 294 | NTHL1 295 | NTRK1 296 | NTRK2 297 | NTRK3 298 | NUF2 299 | NUP93 300 | PAK1 301 | PAK7 302 | PALB2 303 | PARK2 304 | PARP1 305 | PAX5 306 | PBRM1 307 | PDCD1 308 | PDCD1LG2 309 | PDGFRA 310 | PDGFRB 311 | PDPK1 312 | PGR 313 | PHOX2B 314 | PIK3C2G 315 | PIK3C3 316 | PIK3CA 317 | PIK3CB 318 | PIK3CD 319 | PIK3CG 320 | PIK3R1 321 | PIK3R2 322 | PIK3R3 323 | PIM1 324 | PLCG2 325 | PLK2 326 | PMAIP1 327 | PMS1 328 | PMS2 329 | PNRC1 330 | POLD1 331 | POLE 332 | PPARG 333 | PPM1D 334 | PPP2R1A 335 | PPP4R2 336 | PPP6C 337 | PRDM1 338 | PRDM14 339 | PREX2 340 | PRKAR1A 341 | PRKCI 342 | PRKD1 343 | PTCH1 344 | PTEN 345 | 
PTP4A1 346 | PTPN11 347 | PTPRD 348 | PTPRS 349 | PTPRT 350 | RAB35 351 | RAC1 352 | RAC2 353 | RAD21 354 | RAD50 355 | RAD51 356 | RAD51C 357 | RAD51L1 358 | RAD51L3 359 | RAD52 360 | RAD54L 361 | RAF1 362 | RARA 363 | RASA1 364 | RB1 365 | RBM10 366 | RECQL 367 | RECQL4 368 | REL 369 | RET 370 | RFWD2 371 | RHEB 372 | RHOA 373 | RICTOR 374 | RIT1 375 | RNF43 376 | ROS1 377 | RPS6KA4 378 | RPS6KB2 379 | RPTOR 380 | RRAGC 381 | RRAS 382 | RRAS2 383 | RTEL1 384 | RUNX1 385 | RXRA 386 | RYBP 387 | SDHA 388 | SDHAF2 389 | SDHB 390 | SDHC 391 | SDHD 392 | SESN1 393 | SESN2 394 | SESN3 395 | SETD2 396 | SF3B1 397 | SH2B3 398 | SH2D1A 399 | SHOC2 400 | SHQ1 401 | SLX4 402 | SMAD2 403 | SMAD3 404 | SMAD4 405 | SMARCA4 406 | SMARCB1 407 | SMARCD1 408 | SMO 409 | SMYD3 410 | SOCS1 411 | SOS1 412 | SOX17 413 | SOX2 414 | SOX9 415 | SPEN 416 | SPOP 417 | SPRED1 418 | SRC 419 | SRSF2 420 | STAG2 421 | STAT3 422 | STAT5A 423 | STAT5B 424 | STK11 425 | STK19 426 | STK40 427 | SUFU 428 | SUZ12 429 | SYK 430 | TAP1 431 | TAP2 432 | TBX3 433 | TCEB1 434 | TCF3 435 | TCF7L2 436 | TEK 437 | TERT 438 | TET1 439 | TET2 440 | TGFBR1 441 | TGFBR2 442 | TMEM127 443 | TMPRSS2 444 | TNFAIP3 445 | TNFRSF14 446 | TOP1 447 | TP53 448 | TP53BP1 449 | TP63 450 | TRAF2 451 | TRAF7 452 | TSC1 453 | TSC2 454 | TSHR 455 | U2AF1 456 | UPF1 457 | VEGFA 458 | VHL 459 | VTCN1 460 | WHSC1 461 | WHSC1L1 462 | WT1 463 | WWTR1 464 | XIAP 465 | XPO1 466 | XRCC2 467 | YAP1 468 | YES1 469 | ZFHX3 470 | ZRSR2 471 | CDKN2AP16INK4A 472 | CDKN2AP14ARF 473 | MYB 474 | NFIB 475 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_metabric_173.txt: -------------------------------------------------------------------------------- 1 | TP53 2 | FOXO3 3 | NCOR1 4 | PIK3CA 5 | SETD2 6 | BIRC6 7 | TG 8 | GATA3 9 | ARID2 10 | NCOR2 11 | CBFB 12 | BAP1 13 | STAB2 14 | MUC16 15 | FOXP1 16 | RYR2 17 | FANCD2 18 | KMT2C 19 | CDH1 20 | NF1 21 | USH2A 22 | MTAP 23 | ERBB3 24 | MAP3K1 25 | SF3B1 26 | MLL2 27 | RB1 28 | COL6A3 29 | UTRN 30 | PTEN 31 | BRCA2 32 | CASP8 33 | AHNAK 34 | ALK 35 | KDM6A 36 | AGMO 37 | SYNE1 38 | ARID1A 39 | AKT1 40 | LIPI 41 | ASXL2 42 | TAF1 43 | APC 44 | SETD1A 45 | AKAP9 46 | UBR5 47 | LAMA2 48 | MAP2K4 49 | BRIP1 50 | PRKCE 51 | PIK3R1 52 | HERC2 53 | FBXW7 54 | AHNAK2 55 | GPS2 56 | THSD7A 57 | MYH9 58 | ZFP36L1 59 | GPR32 60 | GH1 61 | L1CAM 62 | SMAD4 63 | NOTCH1 64 | JAK1 65 | DNAH2 66 | COL22A1 67 | TBX3 68 | COL12A1 69 | DNAH5 70 | CTCF 71 | KRAS 72 | CACNA2D3 73 | TTYH1 74 | ERBB4 75 | MBL2 76 | SIK1 77 | AKT2 78 | ARID5B 79 | THADA 80 | FRMD3 81 | ATR 82 | RUNX1 83 | BRCA1 84 | EGFR 85 | PRKCQ 86 | LIFR 87 | SMARCC2 88 | MEN1 89 | ROS1 90 | LAMB3 91 | USP9X 92 | RPGR 93 | AFF2 94 | CDKN1B 95 | PRPS2 96 | PALLD 97 | SHANK2 98 | PTPRM 99 | PTPRD 100 | ASXL1 101 | GPR124 102 | CHEK2 103 | ERBB2 104 | CDKN2A 105 | MLLT4 106 | PDE4DIP 107 | SMARCC1 108 | CTNNA3 109 | MAP3K10 110 | LARGE 111 | SETDB1 112 | ARID1B 113 | DCAF4L2 114 | NCOA3 115 | DNAH11 116 | MAP3K13 117 | BCAS3 118 | PBRM1 119 | NRG3 120 | HDAC9 121 | ACVRL1 122 | NDFIP1 123 | USP28 124 | CHD1 125 | OR6A2 126 | CLK3 127 | PRKCZ 128 | MYO3A 129 | PRKG1 130 | FLT3 131 | BRAF 132 | PRR16 133 | PRKACG 134 | FAM20C 135 | KDM3A 136 | NPNT 137 | NEK1 138 | NF2 139 | FANCA 140 | MYO1A 141 | PPP2R2A 142 | STK11 143 | EP300 144 | CTNNA1 145 | FOXO1 146 | SGCD 147 | SBNO1 148 | HIST1H2BC 149 | SPACA1 150 | SIK2 151 | DTWD2 152 | GLDC 153 | NR2F1 154 | MAGEA8 155 | KLRG1 156 | TAF4B 157 | HRAS 158 | RASGEF1B 159 | 
SMAD2 160 | NR3C1 161 | LDLRAP1 162 | NT5E 163 | PTPN22 164 | CLRN2 165 | CCND3 166 | SMARCB1 167 | TBL1XR1 168 | PPP2CB 169 | SIAH1 170 | SMARCD1 171 | STMN2 172 | NRAS 173 | AGTR2 174 | CDKN2A.p14arf 175 | CDKN2A.p16INK4a 176 | -------------------------------------------------------------------------------- /DIGDriver/data/refcds_hg19.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/data/refcds_hg19.rda -------------------------------------------------------------------------------- /DIGDriver/data_tools/DIG_auto.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from pathlib import Path 7 | from multiprocessing.pool import Pool 8 | import h5py 9 | 10 | sys.path.append('../data_tools/') 11 | sys.path.append('../region_model/') 12 | sys.path.append('../sequence_model/') 13 | import DataExtractor 14 | import kfold_mutations_main 15 | import SequenceModel 16 | import GenicDriver 17 | 18 | 19 | def parse_args(text=None): 20 | parser = argparse.ArgumentParser(description="Automation tool for running DIG pipeline") 21 | subparsers = parser.add_subparsers(help='DIG sub-commands') 22 | 23 | parser_a = subparsers.add_parser('runDIG', help='Run DIG model') 24 | 25 | ## required 26 | parser_a.add_argument('--out-dir', type=str, dest='out_dir', required = True, help='Base Directory of DIG run. All intermediate files will be saved relative to this location') 27 | parser_a.add_argument('--map-ref', type=str, dest='map_ref', help='path to mappability file') 28 | parser_a.add_argument('--window-size', type=int, default=10000, dest='window', help='desired window size for DIG model regions') 29 | parser_a.add_argument('--min-map', type=float, default=0.50, dest='min_mapp', help='minimum mappability for windows') 30 | parser_a.add_argument('--ref-file', type=str, dest='ref_file', help='path to reference hg19 genome') 31 | parser_a.add_argument('--mut-file', type=str, dest='mut_file', required = True, help='path to mutations file') 32 | parser_a.add_argument('--N-procs', type = int, dest='n_procs', default = 20, help= 'number of processes to run') 33 | 34 | ## partial runs 35 | parser_a.add_argument('--map-file', type = str, dest = 'map_file', help = 'map to precomputed mappability file') 36 | parser_a.add_argument('--epi-dir', type=str, dest='epi_dir', help='path to epigenomics files') 37 | parser_a.add_argument('--split_idx', type=str, dest='split_dir', help='path to split index dir') 38 | parser_a.add_argument('--epi-matrix_dir', type=str, dest='epi_matrix_dir', help='path to constructed epigenome matrix h5 file') 39 | parser_a.add_argument('--fmodel-dir', type=str, dest='fmodel_dir', help='path to constructed genome context frequency file') 40 | parser_a.add_argument('--gp-results-base', type=str, dest='gp_res', help='path to generic file name of gp results fold') 41 | 42 | ##optional arguments 43 | parser_a.add_argument('-c', '--cancer-key', type = str, dest = 'cancer_key', help = 'key name for cancer targets') 44 | parser_a.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 45 | default='all', help='GPUs devices (all/comma separted list)') 46 | 47 | parser_a.set_defaults(func=run) 48 | 49 | if text: 50 | args = parser.parse_args(text.split()) 51 | else: 52 | args = parser.parse_args() 53 
| 54 | return args 55 | 56 | # inputs are epi-genome tracks and mutation file 57 | 58 | 59 | 60 | def run(args): 61 | if args.gp_res is None: 62 | if args.epi_matrix_dir is None: 63 | if args.epi_dir is None: 64 | print('Error: need to provide either an epi_track dir or an epi_matrix_dir') 65 | return 66 | else: 67 | map_file_name = "high_mapp_{}_{}_{}".format(args.min_mapp, args.window, 0) 68 | mapp_file_path = os.path.join(args.out_dir, map_file_name) 69 | if args.map_file is None: 70 | print('Finding mappable windows...') 71 | mapp_args = DataExtractor.parse_args('mappability {} --out-dir {} --window {} --overlap {} --min-map {}'.format(args.map_ref, args.out_dir, args.window, 0, args.min_mapp)) 72 | DataExtractor.mappability(mapp_args) 73 | print('map file saved at: ' + mapp_file_path) 74 | 75 | print('creating split index...') 76 | 77 | if args.split_dir is None: 78 | split_path = os.path.join(args.out_dir, 'splitIdx_{}'.format(args.window)) 79 | if not os.path.exists(split_path): 80 | os.mkdir(split_path) 81 | split_args = DataExtractor.parse_args('splitDataIdx --base-dir {} --out-dir {} --chunk-size {} --window {} --overlap {} --min-map {}'.format(args.out_dir, split_path, 10000, args.window, 0, args.min_mapp)) 82 | DataExtractor.split_data_idx(split_args) 83 | print('splitIdx files saved at ' + split_path) 84 | else: 85 | split_path = args.split_dir 86 | 87 | print('creating matrix chunks...') 88 | chunks_path = os.path.join(args.out_dir, 'matrix_chunks_{}'.format(args.window)) 89 | print(chunks_path) 90 | if not os.path.exists(chunks_path): 91 | os.mkdir(chunks_path) 92 | p = Pool(args.n_procs) 93 | path = Path(split_path).glob('**/*') 94 | files = [str(x) for x in path if x.is_file()] 95 | res = [] 96 | for f in files: 97 | res.append(p.apply_async(chunk_runner, (f, chunks_path, args.ref_file, args.epi_dir, args.mut_file, args.window, args.cancer_key))) 98 | p.close() 99 | p.join() 100 | _ = [r.get() for r in res] 101 | print('chunks saved') 102 | 103 | print('concatenating chunks...') 104 | concat_args = DataExtractor.parse_args('concatH5 {} --out-dir {}'.format(chunks_path, args.out_dir)) 105 | DataExtractor.concatH5(concat_args) 106 | 107 | print('adding mappability track') 108 | epi_matrix_fname = os.path.join(args.out_dir, 'data_matrices' + '_{}_0_{}'.format(args.window, args.min_mapp) + '.h5') 109 | addMap_args = DataExtractor.parse_args('addMappability {} {}'.format(epi_matrix_fname, args.map_ref)) 110 | DataExtractor.add_mappability(addMap_args) 111 | print('epi track done!') 112 | else: 113 | print('running NN model') 114 | epi_matrix_fname = args.epi_matrix_dir 115 | 116 | kfold_args = kfold_mutations_main.get_cmd_arguments('-c {} -d {} -o {} -m {} -g {}'.format(args.cancer_key, epi_matrix_fname, args.out_dir, args.min_mapp, args.gpus)) 117 | kfold_mutations_main.main(kfold_args) 118 | print('finished NN model') 119 | directory = os.path.join(args.out_dir, 'kfold/{}'.format(args.cancer_key)) 120 | date_dir = max([os.path.join(directory,d) for d in os.listdir(directory)], key=os.path.getmtime) 121 | gp_results_base = os.path.join(date_dir, 'gp_results_fold_{}.h5') 122 | else: 123 | gp_results_base = args.gp_res 124 | mapp_file_path = args.map_file 125 | # we assume that you either don't have anything, have the genome counts but not the mutation counts (or annotations) or have everything 126 | if args.fmodel_dir is None: 127 | f_model_path = os.path.join(args.out_dir, 'fmodel_{}_trinuc_192.h5'.format(args.window)) 128 | genome_context_args = 
SequenceModel.parse_args('countGenomeContext {} {} {} {} --up {} --down {} --n-procs {}'.format(mapp_file_path, args.window, args.ref_file, f_model_path, 1, 1, args.n_procs)) 129 | SequenceModel.countGenomeContext(genome_context_args) 130 | else: 131 | f_model_path = args.fmodel_dir 132 | 133 | fmodel = h5py.File(f_model_path, 'r') 134 | if args.cancer_key + '_mutation_counts' in fmodel.keys(): 135 | run_canc = False 136 | else: 137 | run_canc = True 138 | fmodel.close() 139 | 140 | if run_canc: 141 | annot_name = os.path.basename(args.mut_file).split('txt.gz')[0] + 'trinuc.txt' 142 | annot_path = os.path.join(args.out_dir, annot_name) 143 | print(annot_path) 144 | annot_args = SequenceModel.parse_args('annotateMutationFile {} {} {} {} --n-procs {}'.format(args.mut_file, f_model_path, args.ref_file, annot_path, args.n_procs)) 145 | SequenceModel.annotateMutationFile(annot_args) 146 | annot_path = annot_path + '.gz' 147 | 148 | count_contexts_args = SequenceModel.parse_args('countMutationContext {} {} {} {} {} --n-procs {} '.format(mapp_file_path, annot_path, f_model_path, args.window, args.cancer_key, args.n_procs)) 149 | SequenceModel.countMutationContext(count_contexts_args) 150 | else: 151 | annot_path = args.mut_file 152 | 153 | #run models 154 | print('running models') 155 | submap_path = gp_results_base.split('gp_results')[0] + 'sub_mapp_results_fold_{}.h5' 156 | 157 | # for fold in range(5): 158 | # apply_seq_args = SequenceModel.parse_args('applySequenceModel {} {} {} {} {} --cancer {} --key-prefix {} --key {} --n-procs {} --bins {} --run ensemble'.format(gp_results_base.format(fold), f_model_path, annot_path, args.ref_file, args.window, args.cancer_key, args.cancer_key, args.cancer_key, args.n_procs, 50)) 159 | # SequenceModel.applySequenceModel(apply_seq_args) 160 | 161 | results_path = os.path.join(args.out_dir, 'results') 162 | if not os.path.exists(results_path): 163 | os.mkdir(results_path) 164 | 165 | # concat_sequence_results(gp_results_base, args.cancer_key, os.path.join(results_path, 'hotspot_results_{}.h5'.format(args.cancer_key))) 166 | genic_out = os.path.join(results_path, 'genicDetect_{}_{}_{}.h5'.format(args.cancer_key, args.window, args.min_mapp)) 167 | 168 | genic_args = GenicDriver.parse_args('genicDetectParallel {} {} {} {} -c {} -N {} -m {} -u {}'.format(annot_path, gp_results_base, f_model_path, genic_out, args.cancer_key, args.n_procs, args.min_mapp, submap_path)) 169 | 170 | GenicDriver.genicDetectParallel(genic_args) 171 | 172 | nonc_out = os.path.join(results_path, 'noncDetect_{}_{}_{}.h5'.format(args.cancer_key, args.window, args.min_mapp)) 173 | nonc_args = GenicDriver.parse_args('noncDetectParallel {} {} {} {} -c {} -N {} -m {} -u {} -t both'.format(annot_path, gp_results_base, f_model_path, nonc_out, args.cancer_key, args.n_procs, args.min_mapp, submap_path)) 174 | GenicDriver.noncodingDetectParallel(nonc_args) 175 | 176 | def main(): 177 | args = parse_args() 178 | args.func(args) 179 | print('Done!') 180 | 181 | def chunk_runner(f, chunks_path, ref_file, epi_dir, mut_file, window, cancer_key): 182 | chunk_args = DataExtractor.parse_args('createChunk {} --out-dir {} --ref-file {} --epi-dir {} --mut-file {} --window {} --bins {} --cancer-key {}'.format(f, chunks_path, ref_file, epi_dir, mut_file, window, 100, cancer_key)) 183 | DataExtractor.create_chunk(chunk_args) 184 | 185 | def concat_sequence_results(base_results, cancer, out_path): 186 | fout = h5py.File(out_path, 'a') 187 | #keys = [k for k in f[cancer]['test'].keys() if 'nb_model' in k] 188 | 
keys = ['nb_model_up1_down1_binsize50_run_ensemble'] 189 | if len(keys) == 0: 190 | return -1 191 | for k in keys: 192 | print('working on {}'.format(k)) 193 | df_lst = [] 194 | for run in range(5): 195 | run_res = pd.read_hdf(base_results.format(run), key='{}/test/{}'.format(cancer, k)) 196 | run_res = run_res.astype({'CHROM': 'int32', 'POS': 'float64', 'OBS': 'int32', 'EXP': 'float64','PVAL': 'float64','Pi': 'float64','MU': 'float64','SIGMA': 'float64','REGION': 'object'}) 197 | df_lst.append(run_res) 198 | complete = pd.concat(df_lst) 199 | complete.to_hdf(out_path, key=k, format='fixed') 200 | fout.close() 201 | 202 | if __name__ == '__main__': 203 | main() 204 | 205 | 206 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/auto_runner.py: -------------------------------------------------------------------------------- 1 | import DIG_auto 2 | import sys 3 | import traceback 4 | 5 | cancers = ['Head-SCC_SNV', 'Adenocarcinoma_tumors_SNV_msi_low','Liver-HCC_SNV', 'Biliary-AdenoCA_SNV', 6 | 'Bladder-TCC_SNV', 'Lung-SCC_SNV', 'Bone-Osteosarc_SNV', 'Lung_tumors_SNV','Breast-AdenoCa_SNV', 7 | 'Ovary-AdenoCA_SNV','Carcinoma_tumors_SNV_msi_low','Panc-AdenoCA_SNV', 8 | 'CNS-GBM_SNV', 'Pancan_SNV', 'CNS_tumors_SNV','Prost-AdenoCA_SNV', 9 | 'ColoRect-AdenoCA_SNV', 'Sarcoma_tumors_SNV','ColoRect-AdenoCA_SNV_msi_low', 'Skin-Melanoma_SNV', 10 | 'Digestive_tract_tumors_SNV','Squamous_tumors_SNV','Digestive_tract_tumors_SNV_msi_low', 'Thy-AdenoCA_SNV', 11 | 'Eso-AdenoCa_SNV', 'Uterus-AdenoCA_SNV','Female_reproductive_system_tumors_SNV_msi_low','Uterus-AdenoCA_SNV_msi_low'] 12 | 13 | for c in cancers: 14 | try: 15 | fmut_str = '/data/cb/maxas/data/projects/cancer_mutations/cancer_mutations_PCAWG/DIG_FILES/' + c + '.annot.txt.gz' 16 | gp_str = '/scratch2/dig/full_pcawg/' + c + '/gp_results_fold_{}.h5' 17 | dig_args = DIG_auto.parse_args('runDIG --out-dir {} --window-size {} --min-map {} --ref-file {} --mut-file {} --N-procs {} --map-file {} --fmodel-dir {} --gp-results-base {} -c {}'.format('/scratch1/priebeo/PCAWG_full_results/v1_final_results', 10000, 0.5, '/scratch1/maxas/ICGC_Roadmap/reference_genome/hg19.fasta', fmut_str, 30, '/scratch1/priebeo/neurIPS/10kb_map_0', '/scratch1/priebeo/PCAWG_full_results/v1_final_results/fmodel_10000_trinuc_192.h5', gp_str, c)) 18 | DIG_auto.run(dig_args) 19 | except Exception: 20 | print("Unexpected error:") 21 | traceback.print_exc() 22 | print('failed: ' + c) 23 | print('skipping...') 24 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/mappability_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import bbi 4 | import pysam 5 | ## only mappability_by_idx is called from the top level 6 | def load_chromsizes(f_bw): 7 | chroms = bbi.chromsizes(f_bw) 8 | chroms.pop('chrM') 9 | chroms.pop('chrX') 10 | chroms.pop('chrY') 11 | 12 | return chroms 13 | 14 | def mappability_by_window(f_mapp, window, overlap=0): 15 | chroms = load_chromsizes(f_mapp) 16 | 17 | mapp_lst = [] 18 | for chr_id, chr_size in chroms.items(): 19 | print(chr_id, end=' ') 20 | i = 0 21 | while i + window < chr_size: 22 | # print(i) 23 | mapp = bbi.fetch(f_mapp, chr_id, i, i + window, bins=1)[0] 24 | 
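            # Editorial note: bbi.fetch(..., bins=1) summarizes the bigWig signal
            # over [i, i + window) into a single (mean) value, so each appended
            # row holds the average mappability of one genomic window.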
mapp_lst.append([chr_id, i, i+window, mapp]) 25 | i += window - overlap 26 | 27 | return pd.DataFrame(np.array(mapp_lst), 28 | columns=['CHROM', 'START', 'END', 'MAPP']) 29 | 30 | def mappability_by_idx(f_mapp, idx): 31 | 32 | mapp_lst = [] 33 | chr_prev = '' 34 | for row in idx: 35 | chr_id = 'chr{}'.format(row[0]) 36 | start = row[1] 37 | end = row[2] 38 | 39 | if chr_id != chr_prev: 40 | print(chr_id) 41 | 42 | mapp = bbi.fetch(f_mapp, chr_id, start, end, bins=1)[0] 43 | mapp_lst.append([row[0], start, end, mapp]) 44 | chr_prev = chr_id 45 | 46 | return mapp_lst 47 | 48 | def P_bases_by_window(f_fasta, window, overlap=0): 49 | fasta = pysam.FastaFile(f_fasta) 50 | sizes = fasta.lengths 51 | chroms = fasta.references 52 | 53 | mapp_lst = [] 54 | for chr_id, chr_size in zip(chroms, sizes): 55 | print(chr_id, end=' ') 56 | i = 0 57 | while i + window < chr_size: 58 | seq = fasta.fetch(chr_id, i, i + window) 59 | mapp = seq.count('P') / window 60 | mapp_lst.append([chr_id, i, i+window, mapp]) 61 | i += window - overlap 62 | 63 | return pd.DataFrame(np.array(mapp_lst), 64 | columns=['CHROM', 'START', 'END', 'MAPP']) 65 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/track_selector.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import pickle as pkl 3 | import numpy as np 4 | import pandas as pd 5 | import argparse 6 | 7 | def get_cmd_arguments(): 8 | ap = argparse.ArgumentParser() 9 | ap.add_argument('-t', '--track-list', required=True, nargs='?', action='store', type=str, dest='track_lst_path',help= 'path to list of tracks being used') 10 | ap.add_argument('-o', '--out-dir', nargs='?', default = './', action='store', type=str, dest='out_dir',help= 'path to save track selection file') 11 | ap.add_argument('-stemcells', action = 'store_true', help='Include Stem cells [ESC, ESC_derived, IPSC, Placental]') 12 | ap.add_argument('-general', action = 'store_true', help='Include general tracks [fibroblasts, stromal cells, adipose tissue]') 13 | ap.add_argument('-other', action = 'store_true', help='Include all other misc tracks') 14 | ap.add_argument('-lung', action = 'store_true', help='Include lung tracks') 15 | ap.add_argument('-breast', action = 'store_true', help='Include breast tracks') 16 | ap.add_argument('-blood', action = 'store_true', help='Include blood tracks') 17 | ap.add_argument('-skin', action = 'store_true', help='Include skin tracks') 18 | ap.add_argument('-liver', action = 'store_true', help='Include liver tracks') 19 | ap.add_argument('-stomach', action = 'store_true', help='Include stomach tracks (limited selection)') 20 | ap.add_argument('-GC', action = 'store_true', help='Include GC content track') 21 | ap.add_argument('-HiC', action = 'store_true', help='Include all HiC tracks') 22 | ap.add_argument('-repli_chip', action = 'store_true', help='Include all repli-seq tracks') 23 | ap.add_argument('-cons', action = 'store_true', help='Include conservation tracks (included in general)') 24 | ap.add_argument('-seq', action = 'store_true', help='Include sequence context tracks (included in general)') 25 | return ap.parse_args() 26 | 27 | def main(): 28 | args = get_cmd_arguments() 29 | meta = pd.read_csv(open('/scratch1/priebeo/neurIPS/new_tracks_meta.csv', 'r')) 30 | track_lst = pkl.load(open(args.track_lst_path, 'rb')) 31 | track_lst = np.array([t.split('/')[-1].split('.')[0] for t in track_lst]) 32 | 33 | meta['track_pos'] = -1 34 | for i, l in 
enumerate(track_lst): 35 | meta.loc[meta['File accession'] == l, 'track_pos'] = i 36 | meta = meta.astype({'track_pos': int}) 37 | meta = meta.set_index('track_pos') 38 | meta.sort_index(inplace=True) 39 | 40 | track_accumulator = set() 41 | if args.stemcells: 42 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'StemCells'].index))) 43 | if args.general: 44 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'General'].index))) 45 | if args.other: 46 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Other'].index))) 47 | if args.lung: 48 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Lung'].index))) 49 | if args.breast: 50 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Breast'].index))) 51 | if args.blood: 52 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Blood'].index))) 53 | if args.skin: 54 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Skin'].index))) 55 | if args.liver: 56 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Liver'].index))) 57 | if args.stomach: 58 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Stomach'].index))) 59 | 60 | if args.GC: 61 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'GC_content'].index))) 62 | if args.seq: 63 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'hg19'].index))) 64 | if args.HiC: 65 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'GC_content'].index))) # FIXME: appears to be a copy-paste error -- this matches the GC_content track rather than the HiC tracks 66 | if args.cons: 67 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Assay'] == 'conservation'].index))) 68 | if args.repli_chip: 69 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Assay'] == 'Repli-chip'].index))) 70 | 71 | to_add = np.array(sorted(track_accumulator)) 72 | if len(to_add[to_add < 0]) > 0: 73 | print('Some requested tracks are not present in the track list') 74 | to_add = to_add[to_add >= 0] 75 | print('adding {} tracks'.format(to_add.shape[0])) 76 | out_dir = os.path.join(args.out_dir, 'track_selection.txt') 77 | np.savetxt(out_dir, to_add, fmt='%i') 78 | 79 | print('Done!') 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /DIGDriver/driver_model/__init__.py: -------------------------------------------------------------------------------- 1 | ## init file for python module 2 | -------------------------------------------------------------------------------- /DIGDriver/driver_model/onthefly_tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pysam 4 | import multiprocessing as mp 5 | import pybedtools 6 | import pkg_resources 7 | import h5py 8 | import scipy 9 | import tempfile 10 | import os 11 | 12 | from DIGDriver.sequence_model import genic_driver_tools 13 | from DIGDriver.sequence_model import sequence_tools 14 | from DIGDriver.sequence_model import nb_model 15 | from DIGDriver.driver_model import transfer_tools 16 | from DIGDriver.data_tools import mutation_tools 17 | 18 | 19 | def region_str_to_params(region_str): 20 | col_split = region_str.split(":") 21 | chrom = 
col_split[0].lstrip("chr") 22 | #chrom = col_split[0] 23 | pos_split = col_split[1].split("-") 24 | start = int(pos_split[0]) 25 | end = int(pos_split[1]) 26 | return chrom, start, end 27 | 28 | def DIG_onthefly(f_pretrained, f_mut, f_fasta, f_elts_bed=None, region_str=None, 29 | scale_factor=None, scale_factor_indel=None, scale_type="genome", scale_by_expectation=True, 30 | max_muts_per_sample=3e9, max_muts_per_elt_per_sample=3e9, skip_pvals=False): 31 | assert f_elts_bed or region_str, "ERROR: you must provide --f-bed or --region_str." 32 | 33 | if region_str: 34 | temp_file, temp_name = tempfile.mkstemp() 35 | 36 | CHROM,START,END = region_str_to_params(region_str) 37 | os.write(temp_file, "{}\t{}\t{}\tUserELT\t0\t+\t0\t0\t.\t1\t{},\t0,".format(CHROM,START,END,END-START).encode()) 38 | os.close(temp_file) 39 | f_elts_bed = temp_name 40 | 41 | print('Tabulating mutations') 42 | df_mut_tab, blacklist = mutation_tools.tabulate_mutations_in_element(f_mut, f_elts_bed, bed12=True, drop_duplicates=True, all_elements = True, 43 | max_muts_per_sample=max_muts_per_sample, max_muts_per_elt_per_sample=max_muts_per_elt_per_sample, return_blacklist=True 44 | ) 45 | if scale_by_expectation: 46 | print('scaling by expected number of mutations') 47 | df_gene = transfer_tools.load_pretrained_model(f_pretrained) 48 | df_mut = transfer_tools.read_mutations_cds(f_mut) 49 | df_mut = df_mut[~df_mut.SAMPLE.isin(blacklist)] 50 | df_syn = df_mut[(df_mut.ANNOT == 'Synonymous') & (df_mut.GENE != 'TP53')].drop_duplicates() 51 | exp_syn = (df_gene[df_gene.index != 'TP53'].MU * df_gene[df_gene.index != 'TP53'].Pi_SYN).sum() 52 | cj = len(df_syn) / exp_syn 53 | 54 | ## INDEL scaling factor 55 | f_panel = 'data/genes_CGC_ALL.txt' 56 | genes = pd.read_table(pkg_resources.resource_stream('DIGDriver', f_panel), names=['GENE']) 57 | all_cosmic = genes.GENE.to_list() + ['CDKN2A.p14arf', 'CDKN2A.p16INK4a'] 58 | df_gene_null = df_gene[~df_gene.index.isin(all_cosmic)] 59 | df_mut_null = df_mut[~df_mut.index.isin(all_cosmic)] 60 | EXP_INDEL_UNIF = (df_gene_null.Pi_INDEL * df_gene_null.ALPHA_INDEL * df_gene_null.THETA_INDEL).sum() 61 | OBS_INDEL = len(df_mut_null[df_mut_null.ANNOT == 'INDEL']) 62 | cj_indel = OBS_INDEL / EXP_INDEL_UNIF 63 | elif scale_factor: 64 | cj = scale_factor 65 | cj_indel = scale_factor_indel 66 | else: 67 | print('Calculating scale factor') 68 | cj, cj_indel = transfer_tools.calc_scale_factor_efficient(f_mut, f_pretrained, scale_type=scale_type) 69 | 70 | L_contexts = sequence_tools.precount_region_contexts_parallel( 71 | f_elts_bed, f_fasta, 10, 10000, sub_elts = True, n_up=1, n_down=1) 72 | 73 | 74 | all_windows_df = pd.read_hdf(f_pretrained, 'region_params') 75 | window = all_windows_df.iloc[0][2]-all_windows_df.iloc[0][1] 76 | window_key = 'window_{}'.format(window) 77 | 78 | df_mut = pd.read_hdf(f_pretrained, key='sequence_model_192') 79 | mut_model_idx = [r[1] + '>' + r[1][0] + r[0][2] + r[1][2] for r in zip(df_mut.MUT_TYPE, df_mut.CONTEXT)] 80 | subst_idx = sorted(mut_model_idx) 81 | revc_subst_idx = [sequence_tools.reverse_complement(sub.split('>')[0]) + '>' + sequence_tools.reverse_complement(sub.split('>')[\ 82 | -1]) for sub in subst_idx] 83 | revc_dic = dict(zip(subst_idx, revc_subst_idx)) 84 | 85 | d_pr = pd.DataFrame(df_mut.FREQ.values, mut_model_idx) 86 | d_pr = d_pr.sort_index()[0].values 87 | 88 | df_elts = mutation_tools.bed12_boundaries(f_elts_bed) 89 | 90 | 91 | elt_lst = [] 92 | mu_lst = [] 93 | sigma_lst = [] 94 | R_obs_lst = [] 95 | alpha_lst = [] 96 | theta_lst = [] 97 | p_mut_lst 
= [] 98 | flag_lst = [] 99 | 100 | mu_ind_lst = [] 101 | sigma_ind_lst = [] 102 | R_size_lst = [] 103 | elt_len_lst = [] 104 | alpha_ind_lst = [] 105 | theta_ind_lst = [] 106 | p_ind_lst = [] 107 | R_ind_lst = [] 108 | 109 | for _, row in df_elts.iterrows(): 110 | 111 | chrom = row['CHROM'] 112 | elt = row['ELT'] 113 | strand = row['STRAND'] 114 | block_starts = row['BLOCK_STARTS'] 115 | block_ends = row['BLOCK_ENDS'] 116 | elts_as_intervals = np.vstack((block_starts, block_ends)) 117 | overlaps = genic_driver_tools.get_ideal_overlaps(chrom, elts_as_intervals, window) 118 | 119 | chrom_lst, start_lst, end_lst = ['chr' + str(r[0]) for r in overlaps], [r[1] for r in overlaps], [r[2] for r in overlaps] 120 | region_df = sequence_tools.count_contexts_by_regions(f_fasta, chrom_lst, start_lst, end_lst, n_up=1, n_down=1) 121 | region_counts = np.array([np.repeat(region, 3) for region in region_df.values]).sum(axis=0) 122 | 123 | # if negative strand, take the reverse complement of the region counts 124 | if strand == '-1' or strand == '-': 125 | region_counts = np.array([r[1] for r in sorted(enumerate(region_counts), key=lambda k: revc_dic[subst_idx[k[0]]])]) 126 | 127 | L = np.zeros((192)) 128 | for start, end in zip(block_starts, block_ends): 129 | L += L_contexts.loc['chr{}:{}-{}'.format(chrom, start,end)].values 130 | 131 | prob_sum = region_counts * d_pr 132 | 133 | t_pi = d_pr / prob_sum.sum() 134 | 135 | p_mut = (t_pi * L).sum() 136 | 137 | p_mut_lst.append(p_mut) 138 | mu, sigma, R_obs, FLAG = genic_driver_tools.get_region_params_direct(all_windows_df, overlaps, window) 139 | alpha, theta = nb_model.normal_params_to_gamma(mu, sigma) 140 | theta = theta * cj 141 | 142 | flag_lst.append(FLAG) 143 | R_size_lst.append(int(region_counts.sum() / 3)) ## length of region containing gene 144 | 145 | elt_len_lst.append(int(np.sum(L) / 3)) 146 | p_ind_lst.append(elt_len_lst[-1] / R_size_lst[-1]) 147 | 148 | 149 | mu_ind, sigma_ind, R_ind = mu, sigma, R_obs 150 | alpha_ind, theta_ind = nb_model.normal_params_to_gamma(mu_ind, sigma_ind) 151 | theta_ind = theta_ind * cj_indel 152 | 153 | alpha_ind_lst.append(alpha_ind) 154 | theta_ind_lst.append(theta_ind) 155 | mu_ind_lst.append(mu_ind) 156 | sigma_ind_lst.append(sigma_ind) 157 | 158 | R_ind_lst.append(R_ind) 159 | elt_lst.append(elt) 160 | mu_lst.append(mu) 161 | sigma_lst.append(sigma) 162 | R_obs_lst.append(R_obs) 163 | alpha_lst.append(alpha) 164 | theta_lst.append(theta) 165 | 166 | 167 | pretrain_df = pd.DataFrame({'ELT_SIZE':elt_len_lst, 'FLAG': flag_lst, 'R_SIZE':R_size_lst, 'R_OBS':R_obs_lst, 'R_INDEL':R_ind_lst, 168 | 'MU':mu_lst, 'SIGMA':sigma_lst, 'ALPHA':alpha_lst, 'THETA':theta_lst, 169 | 'MU_INDEL': mu_ind_lst, 'SIGMA_INDEL':sigma_ind_lst, 'ALPHA_INDEL':alpha_ind_lst, 'THETA_INDEL':theta_ind_lst, 170 | 'Pi_SUM':p_mut_lst, 'Pi_INDEL':p_ind_lst 171 | }, index=elt_lst) 172 | 173 | df_model = df_mut_tab.merge(pretrain_df, left_on='ELT', right_index=True) 174 | df_model = transfer_tools.element_expected_muts_nb(df_model) 175 | 176 | 177 | 178 | if not skip_pvals: 179 | df_model = transfer_tools.element_pvalue_burden_nb(df_model) 180 | df_model = transfer_tools.element_pvalue_burden_nb_by_sample(df_model) 181 | df_model = transfer_tools.element_pvalue_indel(df_model, cj_indel) 182 | df_model['PVAL_MUT_BURDEN'] = [ 183 | scipy.stats.combine_pvalues([row.PVAL_SNV_BURDEN, row.PVAL_INDEL_BURDEN], 184 | method='fisher' 185 | )[1] 186 | for i, row in df_model.iterrows() 187 | ] 188 | if 
region_str: 189 | os.remove(temp_name) 190 | return df_model 191 | -------------------------------------------------------------------------------- /DIGDriver/region_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/region_model/.DS_Store -------------------------------------------------------------------------------- /DIGDriver/region_model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/ae_nets/CNNs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, transpose 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | 6 | class ResNetEncoder(nn.Module): 7 | def __init__(self, shape): 8 | super().__init__() 9 | self.inp_len = shape[1] 10 | self.inp_size = shape[2] 11 | 12 | self.hidden_dim = 128 13 | self.fc2_dim = 128 14 | self.fc3_dim = 16 15 | 16 | 17 | self.conv11 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=5, padding=1, stride=1) 18 | self.bn11 = nn.BatchNorm1d(128) 19 | self.conv12 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 20 | self.bn12 = nn.BatchNorm1d(256) 21 | 22 | self.conv21 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 23 | self.bn21 = nn.BatchNorm1d(256) 24 | self.conv22 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 25 | self.bn22 = nn.BatchNorm1d(256) 26 | 27 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1, stride=2) 28 | self.bn3 = nn.BatchNorm1d(512) 29 | 30 | self.conv41 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 31 | self.bn41 = nn.BatchNorm1d(512) 32 | self.conv42 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 33 | self.bn42 = nn.BatchNorm1d(512) 34 | 35 | self.conv5 = nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=3, padding=1, stride=2) 36 | self.bn5 = nn.BatchNorm1d(1024) 37 | 38 | self.conv61 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 39 | self.bn61 = nn.BatchNorm1d(1024) 40 | self.conv62 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 41 | self.bn62 = nn.BatchNorm1d(1024) 42 | 43 | self.fc1 = nn.Linear(in_features=int(1024 * 13), out_features=self.fc2_dim) 44 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc3_dim) 45 | self.fc3 = nn.Linear(in_features=self.fc3_dim, out_features=16) 46 | 47 | #decoding network 48 | self.dfc3 = nn.Linear(in_features=16, out_features=self.fc3_dim) 49 | self.dfc2 = nn.Linear(in_features=self.fc3_dim, out_features=self.fc2_dim) 50 | self.dfc1 = nn.Linear(in_features=self.fc2_dim, out_features=int(1024 * 13)) 51 | 52 | def forward(self, x): 53 | x = transpose(x, 1, 2) 54 | 55 | x = F.relu(self.bn11(self.conv11(x))) 56 | x = F.relu(self.bn12(self.conv12(x))) 57 | res = x 58 | x = F.relu(self.bn21(self.conv21(x))) 59 | x = F.relu(self.bn22(self.conv22(x))) 60 | x += res 61 | x = F.relu(self.bn3(self.conv3(x))) 62 | res = x 63 | x = F.relu(self.bn41(self.conv41(x))) 64 | x = F.relu(self.bn42(self.conv42(x))) 65 | x += res 66 | x = F.relu(self.bn5(self.conv5(x))) 67 
| res = x 68 | x = F.relu(self.bn61(self.conv61(x))) 69 | x = F.relu(self.bn62(self.conv62(x))) 70 | x += res 71 | 72 | x = x.view(-1, int(1024 * 13)) 73 | 74 | x = F.relu(self.fc1(x)) 75 | x = F.relu(self.fc2(x)) 76 | x = self.fc3(x) 77 | 78 | return x 79 | 80 | class ResNet_NoBN_Encoder(nn.Module): 81 | def __init__(self, shape): 82 | super().__init__() 83 | self.inp_len = shape[1] 84 | self.inp_size = shape[2] 85 | 86 | self.hidden_dim = 128 87 | self.fc2_dim = 128 88 | self.fc3_dim = 16 89 | 90 | 91 | self.conv11 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=5, padding=1, stride=1) 92 | self.conv12 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 93 | 94 | self.conv21 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 95 | self.conv22 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 96 | 97 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1, stride=2) 98 | 99 | self.conv41 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 100 | self.conv42 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 101 | 102 | self.conv5 = nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=3, padding=1, stride=2) 103 | 104 | self.conv61 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 105 | self.conv62 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 106 | 107 | self.fc1 = nn.Linear(in_features=int(1024 * 13), out_features=self.fc2_dim) 108 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc3_dim) 109 | self.fc3 = nn.Linear(in_features=self.fc3_dim, out_features=16) 110 | 111 | def forward(self, x): 112 | x = transpose(x, 1, 2) 113 | 114 | x = F.relu(self.conv11(x)) 115 | x = F.relu(self.conv12(x)) 116 | res = x 117 | x = F.relu(self.conv21(x)) 118 | x = F.relu(self.conv22(x)) 119 | x += res 120 | x = F.relu(self.conv3(x)) 121 | res = x 122 | x = F.relu(self.conv41(x)) 123 | x = F.relu(self.conv42(x)) 124 | x += res 125 | x = F.relu(self.conv5(x)) 126 | res = x 127 | x = F.relu(self.conv61(x)) 128 | x = F.relu(self.conv62(x)) 129 | x += res 130 | 131 | x = x.view(-1, int(1024 * 13)) 132 | 133 | x = F.relu(self.fc1(x)) 134 | x = F.relu(self.fc2(x)) 135 | x = self.fc3(x) 136 | 137 | return x 138 | 139 | class ResNetDecoder(nn.Module): 140 | def __init__(self, shape): 141 | super().__init__() 142 | self.inp_len = shape[1] 143 | self.inp_size = shape[2] 144 | 145 | self.hidden_dim = 128 146 | self.fc2_dim = 128 147 | self.fc3_dim = 16 148 | 149 | #decoding network 150 | self.dfc3 = nn.Linear(in_features=16, out_features=self.fc3_dim) 151 | self.dfc2 = nn.Linear(in_features=self.fc3_dim, out_features=self.fc2_dim) 152 | self.dfc1 = nn.Linear(in_features=self.fc2_dim, out_features=int(1024 * 13)) 153 | 154 | #self.dbn62 = nn.BatchNorm1d(1024) 155 | self.dconv62 = nn.ConvTranspose1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 156 | #self.dbn61 = nn.BatchNorm1d(1024) 157 | self.dconv61 = nn.ConvTranspose1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 158 | 159 | #self.dbn5 = nn.BatchNorm1d(1024) 160 | self.dconv5 = nn.ConvTranspose1d(in_channels=1024, out_channels=512, kernel_size=3, padding=1, stride=2) 161 | 162 | #self.dbn42 = nn.BatchNorm1d(512) 163 | self.dconv42 = nn.ConvTranspose1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, 
stride=1) 164 | #self.dbn41 = nn.BatchNorm1d(512) 165 | self.dconv41 = nn.ConvTranspose1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 166 | 167 | #self.dbn3 = nn.BatchNorm1d(512) 168 | self.dconv3 = nn.ConvTranspose1d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=2) 169 | 170 | #self.dbn22 = nn.BatchNorm1d(256) 171 | self.dconv22 = nn.ConvTranspose1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 172 | #self.dbn21 = nn.BatchNorm1d(256) 173 | self.dconv21 = nn.ConvTranspose1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 174 | 175 | #self.dbn12 = nn.BatchNorm1d(256) 176 | self.dconv12 = nn.ConvTranspose1d(in_channels=256, out_channels=128, kernel_size=3, padding=1, stride=2) 177 | #self.dbn11 = nn.BatchNorm1d(128) 178 | self.dconv11 = nn.ConvTranspose1d(in_channels=128, out_channels=self.inp_size, kernel_size=5, padding=1, stride=1) 179 | 180 | self.last = torch.nn.ConvTranspose1d(in_channels = self.inp_size, out_channels = self.inp_size, kernel_size = 4, padding = 1, stride =1) 181 | 182 | def forward(self, x): 183 | x = F.relu(self.dfc3(x)) 184 | x = F.relu(self.dfc2(x)) 185 | x = F.relu(self.dfc1(x)) 186 | 187 | x = x.view(-1, 1024, 13) 188 | 189 | x = F.relu(self.dconv62(x)) 190 | x = F.relu(self.dconv61(x)) 191 | 192 | x = F.relu(self.dconv5(x)) 193 | 194 | x = F.relu(self.dconv42(x)) 195 | x = F.relu(self.dconv41(x)) 196 | 197 | x = F.relu(self.dconv3(x)) 198 | 199 | x = F.relu(self.dconv22(x)) 200 | x = F.relu(self.dconv21(x)) 201 | 202 | x = F.relu(self.dconv12(x)) 203 | x = F.relu(self.dconv11(x)) 204 | x = F.relu(self.last(x)) 205 | x = transpose(x, 1, 2) 206 | return x 207 | 208 | class ResNetLinearDecoder(nn.Module): 209 | def __init__(self, shape): 210 | super().__init__() 211 | self.inp_len = shape[1] 212 | self.inp_size = shape[2] 213 | 214 | self.hidden_dim = 128 215 | self.fc2_dim = 128 216 | self.fc3_dim = 16 217 | 218 | #decoding network 219 | self.decoder = nn.Sequential( 220 | nn.Linear(in_features=16, out_features=64), 221 | nn.ReLU(), 222 | nn.Linear(in_features=64, out_features=128), 223 | nn.ReLU(), 224 | nn.Linear(in_features=128, out_features=256), 225 | nn.ReLU(), 226 | nn.Linear(in_features=256, out_features=2048), 227 | nn.ReLU(), 228 | nn.Linear(in_features=2048, out_features=self.inp_len * self.inp_size), 229 | ) 230 | 231 | def forward(self, x): 232 | x = self.decoder(x) 233 | x = x.view(-1, 100, 734) 234 | return x 235 | 236 | class ResNetShallowLinearDecoder(nn.Module): 237 | def __init__(self, shape): 238 | super().__init__() 239 | self.inp_len = shape[1] 240 | self.inp_size = shape[2] 241 | 242 | self.hidden_dim = 128 243 | self.fc2_dim = 128 244 | self.fc3_dim = 16 245 | 246 | #decoding network 247 | self.decoder = nn.Sequential( 248 | nn.Linear(in_features=16, out_features=256), 249 | nn.ReLU(), 250 | nn.Linear(in_features=256, out_features=self.inp_len * self.inp_size), 251 | ) 252 | 253 | def forward(self, x): 254 | x = self.decoder(x) 255 | x = x.view(-1, 100, 734) 256 | return x 257 | 258 | class ResNetAE(nn.Module): 259 | def __init__(self, shape): 260 | super().__init__() 261 | self.inp_len = shape[1] 262 | self.inp_size = shape[2] 263 | 264 | self.hidden_dim = 128 265 | self.fc2_dim = 128 266 | self.fc3_dim = 16 267 | 268 | self.encoder = ResNetEncoder(shape) 269 | self.decoder = ResNetDecoder(shape) 270 | 271 | def forward(self, x): 272 | encoded = self.encoder(x) 273 | x = self.decoder(encoded) 274 | return encoded, x 275 | 276 | def 
embeding(self, x): 277 | x = self.encoder(x) 278 | return x 279 | class ResNetAE_LD(nn.Module): 280 | def __init__(self, shape): 281 | super().__init__() 282 | self.inp_len = shape[1] 283 | self.inp_size = shape[2] 284 | 285 | self.hidden_dim = 128 286 | self.fc2_dim = 128 287 | self.fc3_dim = 16 288 | 289 | self.encoder = ResNetEncoder(shape) 290 | self.decoder = ResNetLinearDecoder(shape) 291 | 292 | def forward(self, x): 293 | x = self.encoder(x) 294 | x = self.decoder(x) 295 | return x 296 | 297 | def embeding(self, x): 298 | x = self.encoder(x) 299 | return x 300 | 301 | class ResNetAE_SLD(nn.Module): 302 | def __init__(self, shape): 303 | super().__init__() 304 | self.inp_len = shape[1] 305 | self.inp_size = shape[2] 306 | 307 | self.hidden_dim = 128 308 | self.fc2_dim = 128 309 | self.fc3_dim = 16 310 | 311 | self.encoder = ResNetEncoder(shape) 312 | self.decoder = ResNetShallowLinearDecoder(shape) 313 | 314 | def forward(self, x): 315 | x = self.encoder(x) 316 | x = self.decoder(x) 317 | return x 318 | 319 | def embeding(self, x): 320 | x = self.encoder(x) 321 | return x 322 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/ae_nets/fc_nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Autoencoder_FC(nn.Module): 5 | def __init__(self, in_shape): 6 | super().__init__() 7 | bs,w,tracks = in_shape 8 | self.encoder = nn.Sequential( 9 | nn.Linear(w * tracks, 512), 10 | nn.ReLU(), 11 | nn.Linear(512, 128), 12 | nn.ReLU(), 13 | nn.Linear(128, 32), 14 | nn.ReLU(), 15 | nn.Linear(32, 16), 16 | ) 17 | self.decoder = nn.Sequential( 18 | nn.Linear(16, 32), 19 | nn.ReLU(), 20 | nn.Linear(32, 128), 21 | nn.ReLU(), 22 | nn.Linear(128, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, w * tracks), 25 | ) 26 | 27 | def forward(self, x): 28 | x = self.encoder(x) 29 | x = self.decoder(x) 30 | return x 31 | 32 | def embeding(self, x): 33 | x = self.encoder(x) 34 | return x 35 | 36 | class Mean_Vec_Autoencoder_FC(nn.Module): 37 | def __init__(self, in_shape): 38 | super().__init__() 39 | bs,w,tracks = in_shape 40 | self.encoder = nn.Sequential( 41 | nn.Linear(tracks, 512), 42 | nn.ReLU(), 43 | nn.Linear(512, 128), 44 | nn.ReLU(), 45 | nn.Linear(128, 32), 46 | nn.ReLU(), 47 | nn.Linear(32, 16), 48 | ) 49 | self.decoder = nn.Sequential( 50 | nn.Linear(16, 32), 51 | nn.ReLU(), 52 | nn.Linear(32, 128), 53 | nn.ReLU(), 54 | nn.Linear(128, 512), 55 | nn.ReLU(), 56 | nn.Linear(512, tracks), 57 | ) 58 | 59 | def forward(self, x): 60 | x = self.encoder(x) 61 | x = self.decoder(x) 62 | return x 63 | 64 | def embeding(self, x): 65 | x = self.encoder(x) 66 | return x 67 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/autoencoder_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import h5py 5 | import copy 6 | import argparse 7 | import datetime 8 | import numpy as np 9 | import pandas as pd 10 | from torch import nn, optim 11 | from tensorboardX import SummaryWriter 12 | from torch.utils.data import DataLoader 13 | 14 | py_file_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | file_path = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | sys.path.append(os.path.join(py_file_path, 'data_aux')) 18 | sys.path.append(os.path.join(py_file_path, 'trainers')) 19 | 
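# [Editorial sketch -- not part of the original file.] The fully connected
# autoencoder defined in fc_nets.py above compresses a flattened epigenetic
# window down to a 16-dim code. A minimal smoke test, assuming the 100-bin x
# 734-track window shape that CNNs.py hard-codes elsewhere in this module
# (note the original method spelling 'embeding'):
def _example_fc_autoencoder():
    import torch
    from fc_nets import Autoencoder_FC  # resolvable once ae_nets is on sys.path
    x = torch.randn(8, 100, 734)             # (batch, window bins, tracks)
    model = Autoencoder_FC(x.shape)          # in_shape = (bs, w, tracks)
    flat = x.view(8, 1, 100 * 734)           # 'fc' nets expect flattened windows
    recon = model(flat)                      # reconstruction of the input
    code = model.embeding(flat)              # 16-dim latent embedding
    return recon.shape, code.shape           # (8, 1, 73400), (8, 1, 16)
# Editorial note: several CLI flags below (e.g. --gaussian, --save-model) use
# type=bool, and --save-training uses type=float; argparse converts any
# non-empty string to True, so action='store_true' is the conventional way to
# express these boolean switches.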
sys.path.append(os.path.join(file_path, 'ae_nets')) 20 | 21 | from dataset_generator import * 22 | from fc_nets import * 23 | from gp_trainer import * 24 | from CNNs import * 25 | 26 | 27 | def get_cmd_arguments(): 28 | ap = argparse.ArgumentParser() 29 | 30 | # Required cancer type argument 31 | ap.add_argument('-c', '--cancer-id', required=True, nargs='*', action='store', type=str, dest='label_ids', 32 | help='A list of the h5 file mutation count dataset IDs (e.g. SNV_skin_melanoma_MELAU_AU)') 33 | 34 | # Path arguments 35 | ap.add_argument('-d', "--data", required=False, nargs='?', action='store', type=str, dest='data_file', 36 | default='/storage/datasets/cancer/unzipped_data_matrices_PCAWG_10000_0_0.0.h5', help='Path to h5 data file') 37 | ap.add_argument('-o', "--out-dir", required=False, nargs='?', action='store', type=str, dest='out_dir', 38 | default='/storage/yaari/mutation-density-outputs', help='Path to output directory') 39 | ap.add_argument('-u', "--held-out", required=False, nargs='?', action='store', type=str, dest='heldout_file', 40 | default=None, help='Path to file of held-out samples file') 41 | 42 | # Run type parameters 43 | ap.add_argument('-s', "--split", required=False, nargs='?', action='store', type=str, dest='split_method', 44 | default='random', help='Dataset split method (random/chr)') 45 | ap.add_argument('-m', "--mappability", required=False, nargs='?', action='store', type=float, dest='mappability', 46 | default=0.7, help='Mappability lower bound') 47 | ap.add_argument('-gp', "--gaussian", required=False, nargs='?', action='store', type=bool, dest='run_gaussian', 48 | default=False, help='True: train gaussian process regression on the best performing model') 49 | ap.add_argument('-n', "--network", required=False, nargs='?', action='store', type=str, dest='net', 50 | default='cnn', help='The type of neural network model to use (\'fc\' or \'cnn\')') 51 | 52 | # Train parameters 53 | ap.add_argument('-r', "--train-ratio", required=False, nargs='?', action='store', type=float, dest='train_ratio', 54 | default=0.8, help='Train set split size ratio') 55 | ap.add_argument('-ho', "--heldout-ratio", required=False, nargs='?', action='store', type=float, dest='heldout_ratio', 56 | default=0.2, help='Held-out set split size ratio (will be extracted prior to train validation split)') 57 | ap.add_argument('-e', "--epochs", required=False, nargs='?', action='store', type=int, dest='epochs', 58 | default=20, help='Number of epochs') 59 | ap.add_argument('-b', "--batch", required=False, nargs='?', action='store', type=int, dest='bs', 60 | default=128, help='Batch size') 61 | ap.add_argument('-re', "--reruns", required=False, nargs='?', action='store', type=int, dest='nn_reruns', 62 | default=1, help='Number of NN reinitializations and training runs') 63 | ap.add_argument('-gr', "--gp-reruns", required=False, nargs='?', action='store', type=int, dest='gp_reruns', 64 | default=1, help='Number of GP reinitializations and training runs') 65 | 66 | ap.add_argument('-lr', "--learning-rate", required = False, nargs='?', action='store', type = float, dest = 'lr', default=1e-3, help = 'learning rate for training') 67 | 68 | # Run management parameters 69 | ap.add_argument('-sm', "--save-model", required=False, nargs='?', action='store', type=bool, dest='save_model', 70 | default=False, help='True: save best model across all reruns') 71 | ap.add_argument('-st', "--save-training", required=False, nargs='?', action='store', type=float, dest='save_training', 72 | default=False, 
help='True: save training process and results to Tensorboard file') 73 | ap.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 74 | default='all', help='GPUs devices (all/comma separted list)') 75 | 76 | return ap.parse_args() 77 | 78 | def train(model, device, epoch, train_ds, loss_func, optimizer, net_type, writer = None): 79 | model.train() 80 | batch_num = len(train_ds) 81 | loss_sum = 0 82 | for batch_idx, (X, y) in enumerate(train_ds): 83 | #flatten 84 | bs, w, tracks = X.size() 85 | if net_type == 'fc': 86 | X = X.view(bs,-1, w * tracks) 87 | X = X.to(device) 88 | decoded = model(X) 89 | 90 | loss = loss_func(decoded, X) 91 | optimizer.zero_grad() 92 | loss.backward() 93 | 94 | optimizer.step() 95 | 96 | loss_sum += loss.item() 97 | ### LOGGING 98 | if not batch_idx % 50: 99 | print ('Epoch: %d | Batch %03d/%03d | Loss: %.4f' 100 | %(epoch, batch_idx, len(train_ds), loss)) 101 | epoch_loss = loss_sum / batch_num 102 | if writer is not None: 103 | writer.add_scalar('train_loss', epoch_loss, epoch) 104 | 105 | def embed(model, device, data_ds, label_ids, net_type): 106 | model.eval() 107 | data_loader = DataLoader(data_ds, batch_size=2048, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 108 | all_features = [[] for _ in range(len(label_ids))] 109 | all_true = [[] for _ in range(len(label_ids))] 110 | for j, (X, t_lst) in enumerate(data_loader): 111 | bs, w, tracks = X.size() 112 | if net_type == 'fc': 113 | X = X.view(bs,-1, w * tracks) 114 | X = X.to(device) 115 | features_lst = model.module.embeding(X) 116 | with torch.no_grad(): 117 | for i, t in enumerate(t_lst): 118 | if net_type == 'fc': 119 | feature_vecs = features_lst[:,0,:] 120 | else: 121 | feature_vecs = features_lst 122 | all_features[i].append(feature_vecs.cpu().detach().numpy()) 123 | all_true[i].extend(t.data.cpu().numpy().tolist()) 124 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))] 125 | return all_features, all_true 126 | 127 | def eval(model, device, data_ds, loss_fn, label_ids, net_type, writer = None): 128 | model.eval() 129 | batch_num = len(data_ds) 130 | loss_sum = 0 131 | 132 | for j, (X, t_lst) in enumerate(data_ds): 133 | bs, w, tracks = X.size() 134 | if net_type == 'fc': 135 | X = X.view(bs,-1, w * tracks) 136 | X = X.to(device) 137 | decoded = model(X) 138 | with torch.no_grad(): 139 | loss_sum += loss_fn(decoded, X)# + torch.norm(attention, p=1, dim=(1,2)).mean() 140 | test_loss = loss_sum / batch_num 141 | 142 | print('====> Test set loss: {}'.format(test_loss)) 143 | if writer is not None: 144 | writer.add_scalar('test_loss', test_loss, epoch) 145 | 146 | return test_loss 147 | 148 | def main(): 149 | args = get_cmd_arguments() 150 | labels_str = '-'.join(args.label_ids) 151 | out_dir = os.path.join(args.out_dir) 152 | print('Generating prediction for cancer types: {}'.format(args.label_ids)) 153 | 154 | # Configure GPUs 155 | if args.gpus is None: 156 | print('Using CPU device.') 157 | device = torch.device('cpu') 158 | else: 159 | print('Using GPU device: \'{}\''.format(args.gpus)) 160 | device = torch.device('cuda') 161 | if args.gpus != 'all': 162 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 163 | 164 | data_generator = DatasetGenerator(args.data_file, 165 | args.label_ids, 166 | args.mappability, 167 | args.heldout_ratio, 168 | heldout_file=args.heldout_file,) 169 | 170 | bs = args.bs 171 | net_type = args.net 172 | train_ds, test_ds = data_generator.get_datasets(args.split_method, 
args.train_ratio) 173 | ho_ds = data_generator.get_heldout_dataset() 174 | train_dataloader = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False, num_workers=16) 175 | test_dataloader = DataLoader(test_ds, batch_size=bs, shuffle=False, drop_last=False, num_workers=16) 176 | 177 | if net_type == 'fc': 178 | model = Autoencoder_FC(train_ds.get_data_shape()) 179 | elif net_type == 'cnn_ld': 180 | model = ResNetAE_LD(train_ds.get_data_shape()) 181 | elif net_type == 'cnn_sld': 182 | model = ResNetAE_SLD(train_ds.get_data_shape()) 183 | else: 184 | model = ResNetAE(train_ds.get_data_shape()) 185 | 186 | if args.gpus is not None: model = nn.DataParallel(model) 187 | model.to(device) 188 | 189 | print('Running {} AE model'.format(net_type)) 190 | optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=False) 191 | loss_fn = nn.MSELoss() 192 | 193 | epochs = args.epochs 194 | # Create output directory 195 | if args.save_model or args.save_training or args.run_gaussian: 196 | print('Saving results under: \'{}\''.format(out_dir)) 197 | os.makedirs(out_dir) 198 | args_dict = vars(args) 199 | with open(os.path.join(out_dir, 'run_params.txt'), 'w') as f: 200 | [f.write('{}: {}\n'.format(k, args_dict[k])) for k in args_dict.keys()] 201 | 202 | if args.save_training: 203 | writer = SummaryWriter(logdir=out_dir, comment=labels_str) 204 | writer.add_text('configurations', str(args), 0) 205 | writer.add_text('model', str(model), 0) 206 | else: 207 | writer = None 208 | for epoch in range(1, epochs + 1): 209 | print('Running epoch {}/{}'.format(epoch, epochs)) 210 | train(model, device, epoch, train_dataloader, loss_fn, optimizer, net_type, writer) 211 | eval(model, device, test_dataloader, loss_fn, args.label_ids, net_type) 212 | 213 | print('Done Training!') 214 | 215 | if args.save_model: 216 | print('Saving model') 217 | torch.save(model.state_dict(), os.path.join(out_dir, 'saved_model_{}_e{}_{}.h5'.format(net_type, epochs, 218 | args.label_ids[i]))) 219 | train_features, train_labels = embed(model,device, train_ds, args.label_ids, net_type) 220 | test_features, test_labels = embed(model, device, test_ds, args.label_ids, net_type) 221 | ho_features, ho_labels = embed(model, device, ho_ds, args.label_ids, net_type) 222 | 223 | #run gaussian 224 | for i in range(len(args.label_ids)): 225 | print('Running gaussian process model for {}...'.format(args.label_ids[i])) 226 | train_set = (np.array(train_features[i]), np.array(train_labels[i]), train_ds.get_chromosome_locations()) 227 | test_set = (np.array(test_features[i]), np.array(test_labels[i]), test_ds.get_chromosome_locations()) 228 | ho_set = (np.array(ho_features[i]), np.array(ho_labels[i]), ho_ds.get_chromosome_locations()) 229 | best_r2 = 0 230 | for j in range(args.gp_reruns): 231 | print('GP run {}/{}...'.format(j, args.gp_reruns)) 232 | run_successeed = False 233 | n_inducing = 2000 234 | while not run_successeed and n_inducing > 0: 235 | gp_trainer = GPTrainer(device, train_set, test_set, heldout_tup=ho_set, n_inducing=n_inducing) 236 | try: 237 | print('Running GP with {} inducing points...'.format(n_inducing)) 238 | gp_test_results, gp_ho_results = gp_trainer.run() 239 | except RuntimeError as err: 240 | print('Run failed with {} inducing points. 
Encountered run-time error in training: {}' 241 | .format(n_inducing, err)) 242 | n_inducing -= 200 243 | continue 244 | run_successeed = True 245 | if gp_test_results['r2'] > best_r2: 246 | best_test_results, best_ho_results = gp_test_results, gp_ho_results 247 | best_r2 = gp_test_results['r2'] 248 | gp_out_path = os.path.join(out_dir, 'model_{}_e{}_gp_results_{}.h5'.format(net_type, epochs, args.label_ids[i])) 249 | if best_r2 > 0: 250 | gp_trainer.save_results(gp_out_path, best_test_results, best_ho_results) 251 | else: 252 | gp_trainer.save_results(gp_out_path, gp_test_results, gp_ho_results) 253 | 254 | 255 | 256 | if __name__ == '__main__': 257 | main() 258 | -------------------------------------------------------------------------------- /DIGDriver/region_model/data_aux/mut_dataset.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import numpy as np 4 | from torch.utils.data import Dataset 5 | 6 | class SimpleDataset(Dataset): 7 | 8 | def __init__(self, data, labels_lst): 9 | self.data = data 10 | self.labels_lst = [lbl for lbl in labels_lst] 11 | 12 | def __len__(self): 13 | return self.data.shape[0] 14 | 15 | def __getitem__(self, idx): 16 | X = torch.tensor(self.data[idx]).float() 17 | y_lst = [torch.tensor(l[idx]).float() for l in self.labels_lst] 18 | return X, y_lst 19 | 20 | def get_data_shape(self): 21 | return self.data.shape 22 | 23 | def get_train_set_length(self, train_ratio): 24 | return int(train_ratio * self.data.shape[0]) 25 | 26 | 27 | class BaseDatasetFromH5(Dataset): 28 | def __init__(self, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks): 29 | self.preprocessed_idxs = preprocessed_idxs 30 | self.chr_locations = chr_locations 31 | self.selected_tracks = selected_tracks 32 | self.mappability = mappability 33 | self.quantiles = quantiles 34 | 35 | def __len__(self): 36 | return len(self.preprocessed_idxs) 37 | 38 | def get_set_indices(self): 39 | return self.preprocessed_idxs 40 | 41 | def get_chromosome_locations(self): 42 | return self.chr_locations[self.preprocessed_idxs] 43 | 44 | def get_mappability_values(self): 45 | return self.mappability[self.preprocessed_idxs] 46 | 47 | def get_quantile_values(self): 48 | return self.quantiles[self.preprocessed_idxs] 49 | 50 | 51 | class SimpleDatasetFromH5(BaseDatasetFromH5): 52 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id): 53 | super(SimpleDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks) 54 | print('Loading data and labels from file {}...'.format(h5_file)) 55 | with h5py.File(h5_file, 'r') as h5f: 56 | self.data = torch.tensor(h5f[data_id][np.sort(self.preprocessed_idxs)]).float() 57 | self.labels_lst = [torch.tensor(h5f[l][np.sort(self.preprocessed_idxs)]).float() for l in label_ids] 58 | print('Loaded input data of size: {}'.format(self.data.shape)) 59 | 60 | def __getitem__(self, idx): 61 | X = self.data[idx, :, self.selected_tracks] 62 | y_lst = [l[idx] for l in self.labels_lst] 63 | return X, y_lst 64 | 65 | def get_data_shape(self): 66 | return self.data.shape 67 | 68 | 69 | class LazyLoadDatasetFromH5(BaseDatasetFromH5): 70 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id, auto_context=None): 71 | super(LazyLoadDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, 
selected_tracks) 72 | self.h5_file = h5_file 73 | self.label_ids = label_ids 74 | self.data_id = data_id 75 | 76 | def __getitem__(self, idx): 77 | data_idx = self.preprocessed_idxs[idx] 78 | with h5py.File(self.h5_file,'r') as db: 79 | X = torch.tensor(db[self.data_id][data_idx, :, self.selected_tracks]).float() 80 | y_lst = [torch.tensor(db[l][data_idx]).float() for l in self.label_ids] 81 | return X, y_lst 82 | 83 | def get_data_shape(self): 84 | with h5py.File(self.h5_file,'r') as db: 85 | return (len(self.preprocessed_idxs), db[self.data_id].shape[1], len(self.selected_tracks)) 86 | 87 | 88 | class AutoregressiveDatasetFromH5(BaseDatasetFromH5): 89 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id, auto_context=1): 90 | super(AutoregressiveDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks) 91 | self.h5_file = h5_file 92 | self.label_ids = label_ids 93 | self.data_id = data_id 94 | self.auto_context = auto_context 95 | 96 | def get_context(self, c_idx, s_idx, e_idx): 97 | s = s_idx if s_idx >= 0 else 0 98 | e = e_idx if e_idx < len(self.chr_locations) else len(self.chr_locations) - 1 99 | return np.arange(s, e)[np.where(self.chr_locations[np.arange(s, e), 0] == self.chr_locations[c_idx, 0])[0]] 100 | 101 | def __getitem__(self, idx): 102 | data_idx = self.preprocessed_idxs[idx] 103 | pre_context = self.get_context(data_idx, data_idx-self.auto_context, data_idx) 104 | post_context = self.get_context(data_idx, data_idx+1, data_idx+self.auto_context+1) 105 | with h5py.File(self.h5_file,'r') as db: 106 | X = torch.tensor(db[self.data_id][data_idx, :, self.selected_tracks]).float() 107 | X_auto = [torch.tensor([db[l][pre_context].sum(), db[l][post_context].sum()]).float() for l in self.label_ids] 108 | y_lst = [torch.tensor(db[l][data_idx]).float() for l in self.label_ids] 109 | return X, X_auto, y_lst 110 | 111 | def get_data_shape(self): 112 | with h5py.File(self.h5_file,'r') as db: 113 | return (len(self.preprocessed_idxs), db[self.data_id].shape[1], len(self.selected_tracks)) 114 | -------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/gaussian_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import gpytorch 7 | from sklearn.metrics import r2_score 8 | from sklearn.preprocessing import StandardScaler 9 | # import matplotlib.pyplot as plt 10 | # import seaborn as sns 11 | import h5py 12 | import scipy.stats 13 | import tqdm 14 | import argparse 15 | 16 | class SparseGP(gpytorch.models.ExactGP): 17 | def __init__(self, train_x, train_y, likelihood, n_inducing=2000): 18 | super(SparseGP, self).__init__(train_x, train_y, likelihood) 19 | 20 | self.mean_module = gpytorch.means.ConstantMean() 21 | base_cov_module = gpytorch.kernels.ScaleKernel( 22 | gpytorch.kernels.RBFKernel() 23 | ) 24 | 25 | self.covar_module = gpytorch.kernels.InducingPointKernel( 26 | base_cov_module, 27 | inducing_points = train_x[:n_inducing, :], 28 | likelihood=likelihood 29 | ) 30 | 31 | def forward(self, x): 32 | mean_x = self.mean_module(x) 33 | covar_x = self.covar_module(x) 34 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 35 | 36 | def fit_params(self, train_x, train_y, likelihood, n_iter=100): 37 | pass 38 | 39 | def predict(self, test_x): 40 | pass 41 | 42 | 
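# [Editorial sketch -- not part of the original file.] load() and run() below
# expect an HDF5 file with 'train', 'test', and optionally 'held-out' groups,
# each holding an 'X' feature matrix and a 'y' target vector. One minimal way
# to build such an input (the shapes here are illustrative assumptions only):
def _example_make_gp_input(fname='gp_data.h5', n_feat=16):
    import h5py
    import numpy as np
    with h5py.File(fname, 'w') as f:
        for name, n in [('train', 1000), ('test', 200), ('held-out', 100)]:
            grp = f.create_group(name)
            grp.create_dataset('X', data=np.random.randn(n, n_feat))  # NN feature vectors
            grp.create_dataset('y', data=np.random.randn(n))          # regression targets
    return fname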
def load(fname, dataset, idx_feat=np.array([])): 43 | f = h5py.File(fname, 'r') 44 | 45 | if dataset not in f.keys(): 46 | f.close() 47 | return np.array([]), np.array([]), idx_feat 48 | 49 | X = f[dataset]['X'][:] 50 | Y = f[dataset]['y'][:] 51 | # X = f[dataset]['features'][0, :, :] 52 | # Y = f[dataset]['true'][0, :] 53 | 54 | if not idx_feat.any(): 55 | idx_feat = np.where(np.abs(X).mean(axis=0) > 0)[0] 56 | 57 | X = X[:, idx_feat] 58 | 59 | f.close() 60 | 61 | return X, Y, idx_feat 62 | 63 | def standardize(X, Y, scaler=None, y_mean=None, y_std=None): 64 | 65 | if not scaler: 66 | scaler = StandardScaler() 67 | scaler.fit(X) 68 | 69 | if not y_mean: 70 | y_mean = Y.mean() 71 | y_std = Y.std() 72 | 73 | x = scaler.transform(X) 74 | y = (Y - y_mean) / y_std 75 | 76 | return x, y, scaler, y_mean, y_std 77 | 78 | def train_model(train_x, train_y, n_iter=100, n_inducing=2000): 79 | # train_x = torch.FloatTensor(train_x).contiguous().cuda() 80 | # train_y = torch.FloatTensor(train_y).contiguous().cuda() 81 | 82 | # if torch.cuda.is_available(): 83 | # train_x, train_y = train_x.cuda(), train_y.cuda(); 84 | 85 | likelihood = to_gpu( 86 | gpytorch.likelihoods.GaussianLikelihood() 87 | ) 88 | model = to_gpu( 89 | SparseGP(train_x, train_y, likelihood, n_inducing=n_inducing) 90 | ) 91 | 92 | # if torch.cuda.is_available(): 93 | # model, likelihood = model.cuda(), likelihood.cuda() 94 | 95 | model.train() 96 | likelihood.train() 97 | 98 | print(f'Training model with {n_iter} iterations.') 99 | # model.fit_params(train_x, train_y, likelihood, n_iter=n_iter) 100 | optimizer = torch.optim.Adam([ 101 | {'params': model.parameters()}, 102 | ], lr=0.8) 103 | 104 | # "Loss" for GPs - the marginal log likelihood 105 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) 106 | 107 | # with gpytorch.settings.max_cg_iterations(10000): 108 | # with gpytorch.settings.fast_computations(covar_root_decomposition=False, log_prob=False, solves=False): 109 | # with gpytorch.settings.max_preconditioner_size(80): 110 | iterator = tqdm.tqdm(range(n_iter), desc='GP training') 111 | for i in iterator: 112 | optimizer.zero_grad() 113 | output = model(train_x) 114 | loss = -mll(output, train_y) 115 | loss.backward() 116 | iterator.set_postfix(loss=loss.item()) 117 | optimizer.step() 118 | 119 | print(f"Finished training on {train_x.size(0)} samples.") 120 | print("Final values - Loss: %.3f lengthscale: %.3f outputscale: %.3f noise: %.3f" % ( 121 | loss.item(), 122 | model.covar_module.base_kernel.base_kernel.lengthscale.item(), 123 | model.covar_module.base_kernel.outputscale.item(), 124 | likelihood.noise_covar.noise.item() 125 | )) 126 | 127 | return model, likelihood, loss.item() 128 | 129 | def predict(model, likelihood, test_x): 130 | model.eval() 131 | likelihood.eval() 132 | 133 | # test_x = torch.FloatTensor(test_x).contiguous() 134 | # if torch.cuda.is_available(): 135 | # print('cuda') 136 | # test_x = test_x.cuda() 137 | 138 | print(f'Predicting over {test_x.size(0)} test samples.') 139 | with torch.no_grad(), gpytorch.settings.fast_pred_var(): 140 | # with gpytorch.settings.max_preconditioner_size(10), torch.no_grad(): 141 | # with gpytorch.settings.max_root_decomposition_size(30), gpytorch.settings.fast_pred_var(): 142 | with gpytorch.settings.max_cg_iterations(10000): 143 | y_pred = model(test_x) 144 | 145 | y_hat = y_pred.mean.cpu().numpy() 146 | y_std = y_pred.stddev.cpu().numpy() 147 | 148 | return y_hat, y_std 149 | 150 | def save(fname, dataset, y_hat, y_std, loss, r2, params): 151 | f = 
h5py.File(fname, 'r+') 152 | data = f[dataset] 153 | keys = [key for key in data.keys()] 154 | 155 | keys_mean = [key for key in keys if key.startswith('gp_mean')] 156 | if keys_mean: 157 | suffix_lst = [int(key.split('_')[-1]) for key in keys_mean] 158 | sfx = max(suffix_lst) + 1 159 | 160 | else: 161 | sfx = 1 162 | 163 | print('Saving GP results into {} gp_*_{:02d}'.format(dataset, sfx)) 164 | data.create_dataset('gp_mean_{:02d}'.format(sfx), data=y_hat) 165 | data.create_dataset('gp_std_{:02d}'.format(sfx), data=y_std) 166 | data.create_dataset('gp_params_{:02d}'.format(sfx), data=params) 167 | data.attrs['gp_loss_{:02d}'.format(sfx)] = loss 168 | data.attrs['gp_R2_{:02d}'.format(sfx)] = r2 169 | 170 | def to_torch(data): 171 | return torch.FloatTensor(data).contiguous() 172 | 173 | def to_gpu(data): 174 | if torch.cuda.is_available(): 175 | return data.cuda() 176 | 177 | def parse_args(): 178 | parser = argparse.ArgumentParser(description='Fit a sparse Gaussian Process') 179 | 180 | parser.add_argument('data', help='h5 file containing train and test data') 181 | parser.add_argument('--n_iter', type=int, default=100, help='number of training iterations') 182 | parser.add_argument('--n_inducing', type=int, default=2000, help='number of inducing points') 183 | parser.add_argument('--n_runs', type=int, default=5, help='number of runs to train the model') 184 | parser.add_argument('--save-train', action='store_true', default=False, help='save training data') 185 | 186 | return parser.parse_args() 187 | 188 | def run(): 189 | args = parse_args() 190 | 191 | ## Load data 192 | train_X, train_Y, idx_feat = load(args.data, 'train') 193 | test_X, test_Y, _ = load(args.data, 'test', idx_feat) 194 | held_X, held_Y, _ = load(args.data, 'held-out', idx_feat) 195 | # print(held_Y[0:5]) 196 | 197 | ## Standardize data 198 | train_X, train_Y, scaler, y_mean, y_std = standardize(train_X, train_Y) 199 | test_X, test_Y, _, _, _ = standardize(test_X, test_Y, scaler, y_mean, y_std) 200 | 201 | train_x, train_y, test_x = to_torch(train_X), to_torch(train_Y), to_torch(test_X) 202 | train_x, train_y, test_x = to_gpu(train_x), to_gpu(train_y), to_gpu(test_x) 203 | 204 | ## Train model 205 | model, likelihood, loss = train_model(train_x, train_y, 206 | n_iter=args.n_iter, n_inducing=args.n_inducing 207 | ) 208 | 209 | ## Validate model 210 | gp_mean, gp_std = predict(model, likelihood, test_x) 211 | r2 = r2_score(test_Y, gp_mean) 212 | print(f'R^2 of model: {r2}') 213 | 214 | params = np.array([model.covar_module.base_kernel.base_kernel.lengthscale.item(), 215 | model.covar_module.base_kernel.outputscale.item(), 216 | likelihood.noise_covar.noise.item() 217 | ]) 218 | 219 | save(args.data, 'test', gp_mean*y_std + y_mean, gp_std * y_std, loss, r2, params) 220 | 221 | if args.save_train: 222 | print('Saving training data') 223 | train_mean, train_std = predict(model, likelihood, train_x) 224 | r2 = r2_score(train_Y, train_mean) 225 | print(r2) 226 | save(args.data, 'train', train_mean*y_std + y_mean, train_std * y_std, loss, r2, params) 227 | 228 | if held_X.any(): 229 | print('Applying GP to heldout data') 230 | held_X, held_Y, _, _, _ = standardize(held_X, held_Y, scaler, y_mean, y_std) 231 | held_x = to_gpu(to_torch(held_X)) 232 | 233 | hld_mean, hld_std = predict(model, likelihood, held_x) 234 | r2 = r2_score(held_Y, hld_mean) 235 | print(r2) 236 | save(args.data, 'held-out', hld_mean*y_std + y_mean, hld_std * y_std, loss, r2, params) 237 | 238 | if __name__ == "__main__": 239 | run() 240 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/get_feature_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | import sys 5 | import h5py 6 | import numpy as np 7 | import torch 8 | from torch import nn 9 | from torch.utils.data import DataLoader 10 | from sklearn.metrics import r2_score 11 | 12 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 13 | sys.path.append('/storage/yaari/mutation_density/pytorch/trainers/') 14 | sys.path.append('/storage/yaari/mutation_density/pytorch/data_aux/') 15 | 16 | from cnn_predictors import * 17 | from mut_dataset import * 18 | 19 | def tokens_match(strg, search=re.compile(r'[^:0-9]').search): 20 | return not bool(search(strg)) 21 | 22 | def load_track_selection_file(file_path): 23 | with open(file_path, 'r') as f: 24 | lines = f.readlines() 25 | track_lst = [] 26 | for i, l in enumerate(lines): 27 | if l.startswith(('\n', '#')): continue 28 | l = l.rstrip() # remove trailing '\n' 29 | assert tokens_match(l), \ 30 | 'Expected track selection lines to contain only digits and colons. Found: {} in line #{}.'.format(l, i) 31 | 32 | split_l = l.split(':') 33 | assert len(split_l) <= 2, \ 34 | 'Expected track selection lines to contain only one colon. Found: {} in line #{}.'.format(l, i) 35 | assert np.all([split_l[j].isdigit() for j in range(len(split_l))]), \ 36 | 'Expected to have a number in both sides of the colon. Found: {} in line #{}.'.format(l, i) 37 | 38 | if len(split_l) == 1: 39 | track_lst.append(int(split_l[0])) 40 | elif len(split_l) == 2: 41 | assert int(split_l[0]) < int(split_l[1]), 'Expected x < y in pair x:y. Found: {} in line #{}.'.format(l, i) 42 | track_lst.extend(np.arange(int(split_l[0]), int(split_l[1])).tolist()) 43 | 44 | print('Selected {} tracks: \n{}'.format(len(track_lst), track_lst)) 45 | return track_lst 46 | 47 | def predict(model, data_loader, label_ids): 48 | corr_coef_sums = np.zeros(len(label_ids)) 49 | all_preds = [[] for _ in range(len(label_ids))] 50 | all_features = [[] for _ in range(len(label_ids))] 51 | all_true = [[] for _ in range(len(label_ids))] 52 | for j, (X, t_lst) in enumerate(data_loader): 53 | y_lst, features_lst, _ = model(X.cuda()) 54 | with torch.no_grad(): 55 | for i, t in enumerate(t_lst): 56 | y = y_lst[i] 57 | feature_vecs = features_lst[i] 58 | all_features[i].append(feature_vecs.cpu().detach().numpy()) 59 | all_preds[i].extend(y.data.cpu().numpy().tolist()) 60 | all_true[i].extend(t.data.cpu().numpy().tolist()) 61 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))] 62 | return all_preds, all_true, all_features, [r2_score(all_true[i], all_preds[i]) for i in range(len(label_ids))] 63 | 64 | def main(): 65 | assert len(sys.argv) == 3, 'Usage: get_feature_vectors.py <models_dir> <run_id>' 66 | models_dir = sys.argv[1] 67 | run_id = sys.argv[2] 68 | 69 | with open(os.path.join(models_dir, 'run_params.txt'), 'r') as f: 70 | config_lst = [(l.split(':')) for l in f.read().split('\n')] 71 | config_dict = {x[0].strip(): x[1].strip() for x in config_lst if len(x) > 1} 72 | 73 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices_fold_{}.npy'.format(run_id)))) 74 | label_ids = config_dict['label_ids'].replace('[\'', '').replace('\']', '').split(', ') 75 | 76 | file_path = config_dict['data_file'] 77 | with h5py.File(file_path, 'r') as h5f: 78 | chr_idxs = h5f['idx'][:] 79 | pred_h = 
h5f['x_data'].shape[2] 80 | 81 | track_file = config_dict['track_file'] 82 | if track_file != 'None': 83 | selected_tracks = load_track_selection_file(os.path.join(os.path.dirname(__file__), track_file)) 84 | else: 85 | selected_tracks = np.arange(pred_h) 86 | 87 | test_chr_idxs = chr_idxs[test_idxs] 88 | test_ds = LazyLoadDatasetFromH5(file_path, label_ids, test_idxs, test_chr_idxs, selected_tracks, 'x_data') 89 | test_dl = DataLoader(test_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 90 | train_idxs = np.delete(np.arange(len(chr_idxs)), test_idxs) 91 | train_chr_idxs = chr_idxs[train_idxs] 92 | train_ds = LazyLoadDatasetFromH5(file_path, label_ids, train_idxs, train_chr_idxs, selected_tracks, 'x_data') 93 | train_dl = DataLoader(train_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 94 | samp_num = len(test_ds) 95 | 96 | print('Loading model...') 97 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids))).cuda() 98 | state_dict = torch.load(os.path.join(models_dir, 'best_model_fold_{}.pt'.format(run_id))) 99 | model.load_state_dict(state_dict) 100 | model.eval() 101 | 102 | print('Computing {} train set features...'.format(train_ds.get_data_shape()[0])) 103 | train_preds, train_labels, train_features, acc = predict(model, train_dl, label_ids) 104 | print('Model train accuracy: {}'.format(acc)) 105 | 106 | print('Computing {} test set features...'.format(test_ds.get_data_shape()[0])) 107 | test_preds, test_labels, test_features, acc = predict(model, test_dl, label_ids) 108 | print('Model test accuracy: {}'.format(acc)) 109 | 110 | print('Saving features, predictions and true labels...') 111 | with h5py.File(os.path.join(models_dir, 'gaussian_process_data_{}.h5'.format(run_id)), 'w') as h5f: 112 | train_group = h5f.create_group('train') 113 | train_group.create_dataset('true', data=np.array(train_labels)) 114 | train_group.create_dataset('predicted', data=np.array(train_preds)) 115 | train_group.create_dataset('idxs', data=np.array(train_chr_idxs)) 116 | train_group.create_dataset('features', data=np.array(train_features)) 117 | test_group = h5f.create_group('test') 118 | test_group.create_dataset('true', data=np.array(test_labels)) 119 | test_group.create_dataset('predicted', data=np.array(test_preds)) 120 | test_group.create_dataset('idxs', data=np.array(test_chr_idxs)) 121 | test_group.create_dataset('features', data=np.array(test_features)) 122 | 123 | print('Done!') 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/get_heldout_feature_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | import sys 5 | import json 6 | import copy 7 | import h5py 8 | import numpy as np 9 | import pandas as pd 10 | from types import SimpleNamespace 11 | import torch 12 | from torch import nn 13 | from torch.utils.data import DataLoader 14 | from sklearn.metrics import r2_score 15 | 16 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 17 | sys.path.append('/storage/yaari/mutation_density/pytorch/trainers/') 18 | sys.path.append('/storage/yaari/mutation_density/pytorch/data_aux/') 19 | 20 | from cnn_predictors import * 21 | from mut_dataset import * 22 | 23 | 24 | def tokens_match(strg, search=re.compile(r'[^:0-9]').search): 25 | return not bool(search(strg)) 
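# The track-selection files parsed below contain one entry per line: either a single
# 0-based track index, or a half-open range 'x:y' expanded with np.arange (so track y
# itself is excluded). Blank lines and lines starting with '#' are ignored.
# A hypothetical selection file picking tracks 0-4 and 12:
#
#   0:5
#   12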
 26 | 
 27 | def load_track_selection_file(file_path):
 28 | with open(file_path, 'r') as f:
 29 | lines = f.readlines()
 30 | track_lst = []
 31 | for i, l in enumerate(lines):
 32 | if l.startswith(('\n', '#')): continue
 33 | l = l.rstrip() # remove trailing '\n'
 34 | assert tokens_match(l), \
 35 | 'Expected track selection lines to contain only digits and colons. Found: {} in line #{}.'.format(l, i)
 36 | 
 37 | split_l = l.split(':')
 38 | assert len(split_l) <= 2, \
 39 | 'Expected track selection lines to contain only one colon. Found: {} in line #{}.'.format(l, i)
 40 | assert np.all([split_l[j].isdigit() for j in range(len(split_l))]), \
 41 | 'Expected to have a number in both sides of the colon. Found: {} in line #{}.'.format(l, i)
 42 | 
 43 | if len(split_l) == 1:
 44 | track_lst.append(int(split_l[0]))
 45 | elif len(split_l) == 2:
 46 | assert int(split_l[0]) < int(split_l[1]), 'Expected x < y in pair x:y. Found: {} in line #{}.'.format(l, i)
 47 | track_lst.extend(np.arange(int(split_l[0]), int(split_l[1])).tolist())
 48 | 
 49 | print('Selected {} tracks: \n{}'.format(len(track_lst), track_lst))
 50 | return track_lst
 51 | 
 52 | def predict(model, data_loader, label_ids):
 53 | corr_coef_sums = np.zeros(len(label_ids))
 54 | all_preds = [[] for _ in range(len(label_ids))]
 55 | all_features = [[] for _ in range(len(label_ids))]
 56 | all_true = [[] for _ in range(len(label_ids))]
 57 | for j, (X, t_lst) in enumerate(data_loader):
 58 | y_lst, features_lst, _ = model(X.cuda())
 59 | with torch.no_grad():
 60 | for i, t in enumerate(t_lst):
 61 | y = y_lst[i]
 62 | feature_vecs = features_lst[i]
 63 | all_features[i].append(feature_vecs.cpu().detach().numpy())
 64 | all_preds[i].extend(y.data.cpu().numpy().tolist())
 65 | all_true[i].extend(t.data.cpu().numpy().tolist())
 66 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))]
 67 | return all_preds, all_true, all_features, [r2_score(all_true[i], all_preds[i]) for i in range(len(label_ids))]
 68 | 
 69 | def main():
 70 | assert len(sys.argv) >= 3, 'Usage: get_heldout_feature_vectors.py <models_dir> <label_id> [<label_id> ...]'
 71 | 
 72 | models_dir = sys.argv[1]
 73 | label_ids = sys.argv[2:]
 74 | 
 75 | with open(os.path.join(models_dir, 'run_params.txt'), 'r') as f:
 76 | config_lst = [(l.split(':')) for l in f.read().split('\n')]
 77 | config_dict = {x[0].strip(): x[1].strip() for x in config_lst if len(x) > 1}
 78 | 
 79 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices.npy')))
 80 | heldout_idxs = np.sort(np.load(os.path.join(models_dir, 'heldout_indices.npy')))
 81 | 
 82 | file_path = config_dict['data_file']
 83 | with h5py.File(file_path, 'r') as h5f:
 84 | chr_idxs = h5f['idx'][:]
 85 | pred_h = h5f['x_data'].shape[2]
 86 | 
 87 | track_file = config_dict['track_file']
 88 | if track_file != 'None':
 89 | selected_tracks = load_track_selection_file(os.path.join(os.path.dirname(__file__), track_file))
 90 | else:
 91 | selected_tracks = np.arange(pred_h)
 92 | 
 93 | test_chr_idxs = chr_idxs[test_idxs]
 94 | test_ds = LazyLoadDatasetFromH5(file_path, label_ids, test_idxs, test_chr_idxs, selected_tracks, 'x_data')
 95 | test_dl = DataLoader(test_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
 96 | train_idxs = np.delete(np.arange(len(chr_idxs)), test_idxs)
 97 | train_chr_idxs = chr_idxs[train_idxs]
 98 | train_ds = LazyLoadDatasetFromH5(file_path, label_ids, train_idxs, train_chr_idxs, selected_tracks, 'x_data')
 99 | train_dl = DataLoader(train_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
 100 | 
heldout_chr_idxs = chr_idxs[heldout_idxs]
101 | heldout_ds = LazyLoadDatasetFromH5(file_path, label_ids, heldout_idxs, heldout_chr_idxs, selected_tracks, 'x_data')
102 | heldout_dl = DataLoader(heldout_ds, batch_size=len(heldout_idxs), shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
103 | 
104 | print('Loading model...')
105 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids), get_feature_vecs=True)).cuda()
106 | state_dict = torch.load(os.path.join(models_dir, 'best_model.pt'))
107 | model.load_state_dict(state_dict)
108 | model.eval()
109 | 
110 | print('Predicting train set features...')
111 | train_preds, train_labels, train_features, acc = predict(model, train_dl, label_ids)
112 | print('Model train accuracy: {}'.format(acc))
113 | 
114 | print('Predicting test set features...')
115 | test_preds, test_labels, test_features, acc = predict(model, test_dl, label_ids)
116 | print('Model test accuracy: {}'.format(acc))
117 | 
118 | print('Predicting heldout set features...')
119 | heldout_preds, heldout_labels, heldout_features, acc = predict(model, heldout_dl, label_ids)
120 | print('Model held-out accuracy: {}'.format(acc))
121 | 
122 | 
123 | print('Saving features, predictions and true labels...')
124 | with h5py.File(os.path.join(models_dir, 'heldout_gaussian_process_data.h5'), 'w') as h5f:
125 | train_group = h5f.create_group('train')
126 | train_group.create_dataset('true', data=np.array(train_labels))
127 | train_group.create_dataset('predicted', data=np.array(train_preds))
128 | train_group.create_dataset('idxs', data=np.array(train_chr_idxs))
129 | train_group.create_dataset('features', data=np.array(train_features))
130 | test_group = h5f.create_group('test')
131 | test_group.create_dataset('true', data=np.array(test_labels))
132 | test_group.create_dataset('predicted', data=np.array(test_preds))
133 | test_group.create_dataset('idxs', data=np.array(test_chr_idxs))
134 | test_group.create_dataset('features', data=np.array(test_features))
135 | heldout_group = h5f.create_group('heldout')
136 | heldout_group.create_dataset('true', data=np.array(heldout_labels))
137 | heldout_group.create_dataset('predicted', data=np.array(heldout_preds))
138 | heldout_group.create_dataset('idxs', data=np.array(heldout_chr_idxs))
139 | heldout_group.create_dataset('features', data=np.array(heldout_features))
140 | 
141 | print('Done!')
142 | 
143 | if __name__ == '__main__':
144 | main()
145 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/kfold_mutations_main.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import h5py
5 | import copy
6 | import argparse
7 | import numpy as np
8 | import pandas as pd
9 | from torch import nn, optim
10 | from tensorboardX import SummaryWriter
11 | from datetime import datetime
12 | 
13 | file_path = os.path.dirname(os.path.abspath(__file__))
14 | sys.path.append(os.path.join(file_path, 'nets'))
15 | sys.path.append(os.path.join(file_path, 'trainers'))
16 | sys.path.append(os.path.join(file_path, 'data_aux'))
17 | sys.path.append(os.path.join(file_path, '../sequence_model'))
18 | 
19 | #from rnn_predictors import *
20 | from cnn_predictors import *
21 | 
22 | from nn_trainer import *
23 | from gp_trainer import *
24 | from dataset_generator import *
25 | from mutations_main import OutputGenerator
26 | from gp_tools import *
27 | 
28 | def 
get_cmd_arguments(text=None):
29 | ap = argparse.ArgumentParser()
30 | 
31 | # Required cancer type argument
32 | ap.add_argument('-c', '--cancer-id', required=True, nargs='*', action='store', type=str, dest='label_ids',
33 | help='A list of the h5 file mutation count dataset IDs (e.g. SNV_skin_melanoma_MELAU_AU)')
34 | 
35 | # Path arguments
36 | ap.add_argument('-d', "--data", required=False, nargs='?', action='store', type=str, dest='data_file',
37 | default='/storage/datasets/cancer/unzipped_data_matrices_pcawg_10k.h5', help='Path to h5 data file')
38 | ap.add_argument('-o', "--out-dir", required=False, nargs='?', action='store', type=str, dest='out_dir',
39 | default='/storage/yaari/mutation-density-outputs', help='Path to output directory')
40 | ap.add_argument('-t', "--tracks", required=False, nargs='?', action='store', type=str, dest='track_file',
41 | default=None, help='Path to predictor tracks selection file')
42 | 
43 | # Run type parameters
44 | ap.add_argument('-s', "--split", required=False, nargs='?', action='store', type=str, dest='split_method',
45 | default='random', help='Dataset split method (random/chr)')
46 | ap.add_argument('-m', "--mappability", required=False, nargs='?', action='store', type=float, dest='mappability',
47 | default=0.5, help='Mappability lower bound')
48 | ap.add_argument('-cq', "--count-quantile", required=False, nargs='?', action='store', type=float, dest='count_quantile',
49 | default=0.999, help='Region mutation count quantile threshold.')
50 | ap.add_argument('-a', "--attention", required=False, action='store_true', dest='get_attention',
51 | help='True: train with attention maps and save them')
52 | ap.add_argument('-gp', "--gaussian", required=False, nargs='?', action='store', type=int, dest='run_gaussian',
53 | default=5, help='Number of GP training runs on the best performing model (0 skips the GP)')
54 | ap.add_argument('-as', "--autoregressive-size", required=False, nargs='?', action='store', type=int,
55 | dest='autoregressive_size', default=0, help='number of neighbouring regions for autoregressive features')
56 | # Train parameters
57 | ap.add_argument('-k', required=False, nargs='?', action='store', type=int, dest='k',
58 | default=5, help='Number of folds')
59 | ap.add_argument('-gr', "--gp-reruns", required=False, nargs='?', action='store', type=int, dest='gp_reruns',
60 | default=3, help='GP maximum reinitializations for convergence')
61 | ap.add_argument('-gd', "--gp-delta", required=False, nargs='?', action='store', type=float, dest='gp_delta',
62 | default=0.03, help='Maximum difference between a fold NN and GP scores')
63 | ap.add_argument('-re', "--nn-reruns", required=False, nargs='?', action='store', type=int, dest='nn_reruns',
64 | default=1, help='Number of model reinitializations and training runs')
65 | ap.add_argument('-mr', "--max-nn-reruns", required=False, nargs='?', action='store', type=int, dest='max_nn_reruns',
66 | default=3, help='NN maximum reinitializations for GP to succeed')
67 | ap.add_argument('-vr', "--val-ratio", required=False, nargs='?', action='store', type=float, dest='val_ratio',
68 | default=0.2, help='Validation set split size ratio')
69 | ap.add_argument('-e', "--epochs", required=False, nargs='?', action='store', type=int, dest='epochs',
70 | default=20, help='Number of epochs')
71 | ap.add_argument('-b', "--batch", required=False, nargs='?', action='store', type=int, dest='bs',
72 | default=128, help='Batch size')
73 | ap.add_argument('-nd', "--n-inducing", required=False, 
nargs='?', action='store', type=int, dest='n_inducing', 74 | default=400, help='Number of GP inducing points') 75 | ap.add_argument('-nt', "--n-iter", required=False, nargs='?', action='store', type=int, dest='n_iter', 76 | default=50, help='Number of GP iterations') 77 | 78 | # Run management parameters 79 | ap.add_argument('-sm', "--save-model", required=False, action='store_true', dest='save_model', 80 | help='True: save best model across all reruns') 81 | ap.add_argument('-st', "--save-training", required=False, action='store_true', dest='save_training', 82 | help='True: save training process and results to Tensorboard file') 83 | ap.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 84 | default='all', help='GPUs devices (all/comma separted list)') 85 | ap.add_argument('-u', "--sub_mapp", required=False, action='store_true', dest='sub_mapp', 86 | help='True: run model on regions below mappability threshold') 87 | 88 | if text: 89 | args = ap.parse_args(text.split()) 90 | else: 91 | args = ap.parse_args() 92 | 93 | return args 94 | 95 | 96 | def main(input_args=None): 97 | if input_args is None: 98 | args = get_cmd_arguments() 99 | else: 100 | args = input_args 101 | 102 | labels_str = '-'.join(args.label_ids) 103 | out_dir = os.path.join(args.out_dir, 'kfold', labels_str, str(datetime.now())) 104 | print('Generating prediction for cancer types: {}'.format(args.label_ids)) 105 | 106 | if args.gpus is None: 107 | print('Using CPU device.') 108 | device = torch.device('cpu') 109 | else: 110 | print('Using GPU device: \'{}\''.format(args.gpus)) 111 | device = torch.device('cuda') 112 | if args.gpus != 'all': 113 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 114 | 115 | out_pred = OutputGenerator(args, device, out_dir) 116 | 117 | os.makedirs(out_dir) 118 | args_dict = vars(args) 119 | with open(os.path.join(out_dir, 'run_params.txt'), 'w') as f: 120 | [f.write('{}: {}\n'.format(k, args_dict[k])) for k in args_dict.keys()] 121 | 122 | best_model_file = os.path.join(out_dir, 'best_model_fold_{}.pt') 123 | val_set_file = os.path.join(out_dir, 'val_indices_fold_{}') 124 | 125 | if args.save_model or args.save_training: 126 | print('Saving results under: \'{}\''.format(out_dir)) 127 | 128 | data_generator = KFoldDatasetGenerator(args) 129 | is_autoreg = args.autoregressive_size > 0 130 | model_func = AutoregressiveMultiTaskResNet if is_autoreg else SimpleMultiTaskResNet 131 | print('Running {}-fold prediction...'.format(args.k)) 132 | 133 | k, re = 0, 0 134 | gp_succeed = False 135 | while k < args.k and re < args.max_nn_reruns: 136 | train_ds, val_ds, ho_ds = data_generator.get_datasets(k) 137 | best_overall_acc = -np.inf 138 | for r in range(args.nn_reruns): 139 | print('Setting model and optimizers for run {}/{} and fold {}/{}...'.format(r + 1, args.nn_reruns, k + 1, args.k)) 140 | model = model_func(train_ds.get_data_shape(), len(args.label_ids), get_attention_maps=args.get_attention) 141 | optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=False) 142 | loss_fn = nn.MSELoss() 143 | if args.gpus is not None: model = nn.DataParallel(model) 144 | 145 | if args.save_training: 146 | writer = SummaryWriter(logdir=out_dir, comment=labels_str) 147 | writer.add_text('configurations', str(args), 0) 148 | writer.add_text('model', str(model), 0) 149 | else: 150 | writer = None 151 | trainer = NNTrainer(model, 152 | optimizer, 153 | loss_fn, 154 | args.bs, 155 | args.label_ids, 156 | train_ds, 157 | val_ds, 158 | device, 159 | writer, 160 | 
get_attention_maps=args.get_attention) 161 | 162 | best_run_acc = -np.inf 163 | for epoch in range(1, args.epochs + 1): 164 | print('Running epoch {}/{}'.format(epoch, args.epochs)) 165 | train_losses, train_accs, train_features_lst, train_pred_lst, train_true_lst = \ 166 | trainer.train(epoch, r, autoreg=is_autoreg) 167 | val_losses, val_accs, val_features_lst, val_pred_lst, val_true_lst, val_attention = \ 168 | trainer.test(epoch, r, autoreg=is_autoreg) 169 | 170 | # Keep only the best model with > 2 non-zero features according to test performance 171 | non_zero_features = np.where(np.abs(train_features_lst[0]).mean(axis=0) > 0)[0] 172 | print('#non-zero features: {}'.format(len(non_zero_features))) 173 | if val_accs[0] > best_run_acc and len(non_zero_features) > 1: 174 | print('Changing run model since best R2 was {} compared to previous {}'.format(val_accs[0], best_run_acc)) 175 | best_run_acc = val_accs[0] 176 | best_run_model, best_run_att = copy.deepcopy(model), val_attention 177 | train_dict = {'feat': train_features_lst, 'lbls': train_true_lst, 'ds': train_ds} 178 | val_dict = {'feat': val_features_lst, 'lbls': val_true_lst, 'ds': val_ds} 179 | 180 | if best_run_acc > best_overall_acc: 181 | best_overall_acc = best_run_acc 182 | best_overall_model = best_run_model 183 | best_train_dict, best_val_dict = train_dict, val_dict 184 | 185 | print(bcolors.OKCYAN + 'Best epoch validation accuracy for run {}/{} was: {}.'.format(r + 1, args.nn_reruns, best_run_acc) + bcolors.ENDC) 186 | print(bcolors.OKCYAN + 'Best overall validation accuracy over {} reruns was: {}.'.format(args.nn_reruns, best_overall_acc) + bcolors.ENDC) 187 | 188 | # Save attention maps from best overall model 189 | if args.get_attention: 190 | out_pred.save_attetnion_maps('attention_maps_{}.h5'.format(k), best_run_att, val_ds, val_pred_lst, val_true_lst) 191 | 192 | # Save best run model 193 | if args.save_model: 194 | print('Saving model and validation indices for future evaluations to {}...'.format(val_set_file)) 195 | np.save(val_set_file.format(k), val_ds.get_set_indices()) 196 | torch.save(best_overall_model.state_dict(), best_model_file.format(k)) 197 | 198 | # Run GP on best overall model 199 | if args.run_gaussian > 0: 200 | print('Computing {} validation set features...'.format(ho_ds.get_data_shape()[0])) 201 | ho_preds, ho_labels, ho_features, ho_acc, ho_att = out_pred.predict(best_overall_model, ho_ds) 202 | ho_dict = {'feat': ho_features, 'lbls': ho_labels, 'ds': ho_ds} 203 | print(bcolors.OKCYAN + 'Model held-out accuracy: {}'.format(ho_acc) + bcolors.ENDC) 204 | 205 | gp_succeed = out_pred.run_gp('gp_results_fold_{}.h5'.format(k), train_dict, val_dict, ho_dict, best_overall_acc, k) 206 | 207 | if args.sub_mapp: 208 | sub_ds = data_generator.get_below_mapp() 209 | print('Computing {} sub-theshold features...'.format(sub_ds.get_data_shape()[0])) 210 | sub_preds, sub_labels, sub_features, sub_acc, sub_att = out_pred.predict(best_overall_model, sub_ds) 211 | sub_dict = {'feat': sub_features, 'lbls': sub_labels, 'ds': sub_ds} 212 | print(bcolors.OKCYAN + 'Model sub-mappable accuracy: {}'.format(sub_acc) + bcolors.ENDC) 213 | 214 | # Save attention maps from unmappable regions 215 | sub_att_path = os.path.join(out_dir, 'attention_maps_submapp.h5') 216 | if args.get_attention and not os.path.exists(sub_att_path): 217 | out_pred.save_attetnion_maps('attention_maps_submapp.h5', sub_att, sub_ds, sub_preds, sub_labels) 218 | 219 | out_pred.run_gp('sub_mapp_results_fold_{}.h5'.format(k), train_dict, val_dict, 
sub_dict, best_overall_acc, k, prefix='sub')
220 | 
221 | if args.run_gaussian > 0 and not gp_succeed:
222 | re += 1
223 | print(bcolors.FAIL + 'GP run failed! Rerunning NN, attempt {}/{}'.format(re + 1, args.max_nn_reruns) + bcolors.ENDC)
224 | else:
225 | k += 1
226 | re = 0
227 | 
228 | assert gp_succeed, 'GP failed at fold {} after {} NN reruns'.format(k, re)
229 | print('Done!')
230 | 
231 | 
232 | if __name__ == '__main__':
233 | startTime = datetime.now()
234 | main()
235 | print('Time elapsed: {}'.format(datetime.now() - startTime))
236 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/nets/__init__.py: --------------------------------------------------------------------------------
1 | 
2 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/nets/densenet.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | 
5 | 
6 | class Dense_Block(nn.Module):
7 | def __init__(self, in_channels):
8 | super(Dense_Block, self).__init__()
9 | self.relu = nn.ReLU(inplace = True)
10 | self.bn = nn.BatchNorm1d(num_features = in_channels)
11 | 
12 | self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=32, kernel_size=3, stride=1, padding=1)
13 | self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1)
14 | self.conv3 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1)
15 | self.conv4 = nn.Conv1d(in_channels=96, out_channels=32, kernel_size=3, stride=1, padding=1)
16 | self.conv5 = nn.Conv1d(in_channels=128, out_channels=32, kernel_size=3, stride=1, padding=1)
17 | 
18 | def forward(self, x):
19 | bn = self.bn(x)
20 | conv1 = self.relu(self.conv1(bn))
21 | conv2 = self.relu(self.conv2(conv1))
22 | # Concatenate in channel dimension
23 | c2_dense = self.relu(torch.cat([conv1, conv2], 1))
24 | conv3 = self.relu(self.conv3(c2_dense))
25 | c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))
26 | 
27 | conv4 = self.relu(self.conv4(c3_dense))
28 | c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))
29 | 
30 | conv5 = self.relu(self.conv5(c4_dense))
31 | c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))
32 | 
33 | return c5_dense
34 | 
35 | 
36 | class Transition_Layer(nn.Module):
37 | def __init__(self, in_channels, out_channels):
38 | super(Transition_Layer, self).__init__()
39 | 
40 | self.relu = nn.ReLU(inplace=True)
41 | self.bn = nn.BatchNorm1d(num_features=out_channels)
42 | self.conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False)
43 | self.avg_pool = nn.AvgPool1d(kernel_size=2, stride=2, padding=0)
44 | 
45 | def forward(self, x):
46 | bn = self.bn(self.relu(self.conv(x)))
47 | out = self.avg_pool(bn)
48 | return out
49 | 
50 | 
51 | class SingleTaskDenseNet(nn.Module):
52 | def __init__(self, nr_classes):
53 | super(SingleTaskDenseNet, self).__init__()
54 | 
55 | self.lowconv = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=7, padding=3, bias=False)
56 | self.relu = nn.ReLU()
57 | 
58 | # Make Dense Blocks
59 | self.denseblock1 = self._make_dense_block(Dense_Block, 64)
60 | self.denseblock2 = self._make_dense_block(Dense_Block, 128)
61 | self.denseblock3 = self._make_dense_block(Dense_Block, 128) # Make transition Layers
62 | self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=128)
63 | 
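# Each Dense_Block concatenates five 32-channel convolutions, so it emits 5 * 32 = 160 channels; hence in_channels=160 for every transition layer here.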
self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=128) 64 | self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=64) # Classifier 65 | self.bn = nn.BatchNorm1d(num_features=64) 66 | self.pre_classifier = nn.Linear(64*4*4, 512) 67 | self.classifier = nn.Linear(512, nr_classes) 68 | 69 | def _make_dense_block(self, block, in_channels): 70 | layers = [] 71 | layers.append(block(in_channels)) 72 | return nn.Sequential(*layers) 73 | 74 | def _make_transition_layer(self, layer, in_channels, out_channels): 75 | modules = [] 76 | modules.append(layer(in_channels, out_channels)) 77 | return nn.Sequential(*modules) 78 | 79 | def forward(self, x): 80 | out = self.relu(self.lowconv(x)) 81 | out = self.denseblock1(out) 82 | out = self.transitionLayer1(out) 83 | out = self.denseblock2(out) 84 | out = self.transitionLayer2(out) 85 | out = self.denseblock3(out) 86 | out = self.transitionLayer3(out) 87 | 88 | out = self.bn(out) 89 | out = out.view(-1, 64*4*4) 90 | 91 | out = self.pre_classifier(out) 92 | out = self.classifier(out) 93 | return out 94 | -------------------------------------------------------------------------------- /DIGDriver/region_model/nets/resnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, in_planes, planes, stride=1): 11 | super(BasicBlock, self).__init__() 12 | self.conv1 = nn.Conv1d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) 13 | self.bn1 = nn.BatchNorm1d(planes) 14 | self.conv2 = nn.Conv1d(planes, planes, kernel_size=3, stride=1, padding=1, bias=True) 15 | self.bn2 = nn.BatchNorm1d(planes) 16 | 17 | self.shortcut = nn.Sequential() 18 | if stride != 1 or in_planes != self.expansion*planes: 19 | self.shortcut = nn.Sequential( 20 | nn.Conv1d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=True), 21 | nn.BatchNorm1d(self.expansion*planes)) 22 | 23 | def forward(self, x): 24 | out = F.relu(self.bn1(self.conv1(x))) 25 | out = self.bn2(self.conv2(out)) 26 | out += self.shortcut(x) 27 | out = F.relu(out) 28 | return out 29 | 30 | 31 | class Bottleneck(nn.Module): 32 | expansion = 2 33 | 34 | def __init__(self, in_planes, planes, stride=1): 35 | super(Bottleneck, self).__init__() 36 | self.conv1 = nn.Conv1d(in_planes, planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm1d(planes) 38 | self.conv2 = nn.Conv1d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm1d(planes) 40 | self.conv3 = nn.Conv1d(planes, self.expansion*planes, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm1d(self.expansion*planes) 42 | 43 | self.shortcut = nn.Sequential() 44 | if stride != 1 or in_planes != self.expansion*planes: 45 | self.shortcut = nn.Sequential( 46 | nn.Conv1d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 47 | nn.BatchNorm1d(self.expansion*planes)) 48 | 49 | def forward(self, x): 50 | out = F.relu(self.bn1(self.conv1(x))) 51 | out = F.relu(self.bn2(self.conv2(out))) 52 | out = self.bn3(self.conv3(out)) 53 | out += self.shortcut(x) 54 | out = F.relu(out) 55 | return out 56 | 57 | 58 | class SingleTaskResNet(nn.Module): 59 | def __init__(self, in_shape, task_num, num_blocks=[3,3,3], strides=[2,2,2], block=Bottleneck): 60 | 
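# in_shape is (batch, length, tracks): forward() transposes to (batch, tracks, length), so conv1 reads in_shape[2] input channels, and net_out_len below accounts for the downsampling of in_shape[1] by the strides.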
super(SingleTaskResNet, self).__init__() 61 | assert len(num_blocks) == len(strides), \ 62 | 'Expected number of blocks and strides lists to be of equal length but found {} and {}'.format(len(num_blocks), len(strides)) 63 | in_len = in_shape[1] 64 | in_width = in_shape[2] 65 | self.in_planes = 64 66 | 67 | self.conv1 = nn.Conv1d(in_width, 64, kernel_size=5, stride=1, padding=1, bias=False) 68 | self.bn1 = nn.BatchNorm1d(64) 69 | 70 | conv_blocks = [self._make_layer(block, 64 * 2**i, layer_num, stride=stride) for i, (layer_num, stride) in enumerate(zip(num_blocks, strides))] 71 | self.net = nn.Sequential(*conv_blocks) 72 | 73 | net_out_len = int(block.expansion * 64 * 2**(len(strides)-1) * np.ceil(in_len / np.prod(strides))) 74 | self.linear1 = nn.Linear(net_out_len, 128) 75 | self.linear2 = nn.Linear(128, 1) 76 | 77 | def _make_layer(self, block, planes, num_blocks, stride): 78 | strides = [stride] + [1]*(num_blocks-1) 79 | layers = [] 80 | for s in strides: 81 | layers.append(block(self.in_planes, planes, s)) 82 | self.in_planes = planes * block.expansion 83 | return nn.Sequential(*layers) 84 | 85 | def forward(self, x): 86 | out = F.relu(self.bn1(self.conv1(torch.transpose(x, 1, 2)))) 87 | out = self.net(out) 88 | #out = F.avg_pool1d(out, 4) 89 | out = out.view(out.size(0), -1) 90 | out = self.linear1(out) 91 | out = self.linear2(out) 92 | return [out.reshape(-1)] 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /DIGDriver/region_model/nets/rnn_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn, transpose 2 | from torch.autograd import Variable 3 | from torch.nn import functional as F 4 | 5 | 6 | class MultiTaskLinear(nn.Module): 7 | def __init__(self, shape, task_num): 8 | super(MultiTaskLinear, self).__init__() 9 | self.inp_len = shape[1] 10 | self.inp_size = shape[2] 11 | self.task_num = task_num 12 | 13 | self.hidden_dim = 128 14 | self.fc2_dim = 128 15 | 16 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 17 | self.bn1 = nn.BatchNorm1d(128) 18 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 19 | self.bn2 = nn.BatchNorm1d(256) 20 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 21 | self.bn3 = nn.BatchNorm1d(256) 22 | 23 | self.birnn = nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True) 24 | 25 | self.fc1_lst = nn.ModuleList() 26 | self.fc2_lst = nn.ModuleList() 27 | for _ in range(self.task_num): 28 | self.fc1_lst.append(nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim)) 29 | self.fc2_lst.append(nn.Linear(in_features=self.fc2_dim, out_features=1)) 30 | 31 | def forward(self, x: Variable) -> (Variable): 32 | self.birnn.flatten_parameters() 33 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 34 | x = F.relu(self.bn2(self.conv2(x))) 35 | x = F.relu(self.bn3(self.conv3(x))) 36 | x = self.birnn(transpose(x, 1, 2)) 37 | 38 | outputs = [] 39 | for i in range(self.task_num): 40 | task_x = F.relu(self.fc1_lst[i](x[0][:, -1, :])) 41 | outputs.append(self.fc2_lst[i](task_x).reshape(-1)) 42 | 43 | return outputs 44 | 45 | class MultiTaskRNN(nn.Module): 46 | def __init__(self, shape, task_num): 47 | super(MultiTaskRNN, self).__init__() 48 | self.inp_len = shape[1] 49 | self.inp_size = shape[2] 50 | self.task_num = task_num 51 | 52 | 
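# Unlike MultiTaskLinear above, which shares a single bi-LSTM across tasks, this variant gives every task its own bi-LSTM head on top of the shared convolutional trunk.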
self.hidden_dim = 128 53 | self.fc2_dim = 128 54 | 55 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 56 | self.bn1 = nn.BatchNorm1d(128) 57 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 58 | self.bn2 = nn.BatchNorm1d(256) 59 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 60 | self.bn3 = nn.BatchNorm1d(256) 61 | 62 | self.rnn_lst = nn.ModuleList() 63 | self.fc1_lst = nn.ModuleList() 64 | self.fc2_lst = nn.ModuleList() 65 | for _ in range(self.task_num): 66 | self.rnn_lst.append(nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True)) 67 | self.fc1_lst.append(nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim)) 68 | self.fc2_lst.append(nn.Linear(in_features=self.fc2_dim, out_features=1)) 69 | 70 | def forward(self, x: Variable) -> (Variable): 71 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 72 | x = F.relu(self.bn2(self.conv2(x))) 73 | x = F.relu(self.bn3(self.conv3(x))) 74 | 75 | outputs = [] 76 | for i in range(self.task_num): 77 | self.rnn_lst[i].flatten_parameters() 78 | task_x = self.rnn_lst[i](transpose(x, 1, 2)) 79 | task_x = F.relu(self.fc1_lst[i](task_x[0][:, -1, :])) 80 | outputs.append(self.fc2_lst[i](task_x).reshape(-1)) 81 | 82 | return outputs 83 | 84 | class MultiTaskHierarchicalLinear(nn.Module): 85 | def __init__(self, shape, task_num): 86 | super(MultiTaskHierarchicalLinear, self).__init__() 87 | self.inp_len = shape[1] 88 | self.inp_size = shape[2] 89 | self.task_num = task_num 90 | 91 | self.hidden_dim = 128 92 | self.fc2_dim = 128 93 | 94 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 95 | self.bn1 = nn.BatchNorm1d(128) 96 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 97 | self.bn2 = nn.BatchNorm1d(256) 98 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 99 | self.bn3 = nn.BatchNorm1d(256) 100 | 101 | self.birnn = nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True) 102 | 103 | self.fc1 = nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim) 104 | self.t1_out = nn.Linear(in_features=self.fc2_dim, out_features=1) 105 | 106 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc2_dim) 107 | self.t2_out = nn.Linear(in_features=self.fc2_dim, out_features=1) 108 | 109 | def forward(self, x: Variable) -> (Variable): 110 | self.birnn.flatten_parameters() 111 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 112 | x = F.relu(self.bn2(self.conv2(x))) 113 | x = F.relu(self.bn3(self.conv3(x))) 114 | x = self.birnn(transpose(x, 1, 2)) 115 | x = F.relu(self.fc1(x[0][:, -1, :])) 116 | out1 = self.t1_out(x).reshape(-1) 117 | 118 | x = F.relu(self.fc2(x)) 119 | out2 = self.t2_out(x).reshape(-1) 120 | 121 | return [out1, out2] 122 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/confidance_perturbations_estimate.py: -------------------------------------------------------------------------------- 1 | 
import os
2 | import sys
3 | import json
4 | import copy
5 | import h5py
6 | import numpy as np
7 | import torch
8 | from types import SimpleNamespace
9 | from torch import nn
10 | from sklearn.metrics import r2_score
11 | 
12 | #from nets.nets import *
13 | #from nets.trainer import *
14 | 
15 | def add_noise_to_model(model, noise):
16 | tmp_model = copy.deepcopy(model).cuda()
17 | with torch.no_grad():
18 | for param in tmp_model.parameters():
19 | # perturb each parameter in place with zero-mean Gaussian noise of std `noise`
20 | param.add_(torch.normal(0, noise, param.size()).cuda())
21 | return tmp_model
22 | 
23 | 
24 | def compute_confidance(preds, labels):
25 | confs = np.empty((preds.shape[0], preds.shape[2]))
26 | means = np.empty((preds.shape[0], preds.shape[2]))
27 | accs = np.empty(preds.shape[0])
28 | for i in range(preds.shape[0]):
29 | for j in range(preds.shape[2]):
30 | confs[i, j] = np.std(preds[i, :, j])
31 | means[i, j] = np.mean(preds[i, :, j])
32 | accs[i] = r2_score(labels, means[i])
33 | return means, confs, accs
34 | 
35 | 
36 | def test_confidance(model, data, labels, loss_fn, params, verbose=False):
37 | # toggle model to test / inference mode
38 | model.eval()
39 | 
40 | # round sample num to full batches
41 | samp_num = len(labels) - len(labels) % params.bs
42 | 
43 | preds = np.empty((len(params.alphas), params.reps, samp_num))
44 | for i, alpha in enumerate(params.alphas):
45 | for rep in range(params.reps):
46 | loss_sum = 0
47 | acc_sum = 0
48 | tmp_model = add_noise_to_model(model, alpha)
49 | for b_samp in range(0, samp_num, params.bs):
50 | x = torch.tensor(data[b_samp:b_samp + params.bs]).float().cuda()
51 | with torch.no_grad():
52 | y = tmp_model(x)
53 | t = torch.tensor(labels[b_samp:b_samp + params.bs]).float().cuda()
54 | 
55 | loss_sum += loss_fn(y, t).item()
56 | acc_sum += r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
57 | preds[i, rep, b_samp:b_samp + params.bs] = y.data.cpu().numpy()
58 | 
59 | if verbose:
60 | print('Repetition {} alpha: {}, loss: {:.4f}, accuracy: {:.4f}'.format(rep, alpha, loss_sum / (samp_num / params.bs), acc_sum / (samp_num / params.bs)))
61 | 
62 | print('Accuracy for alpha: {} over {} repetitions is: {}'.format(alpha, params.reps, r2_score(labels[:samp_num], np.mean(preds[i], axis=0))))
63 | 
64 | return compute_confidance(preds, labels[:samp_num])
65 | 
66 | 
67 | def main():
68 | cur_dir = os.path.dirname(os.path.realpath(__file__))
69 | if len(sys.argv) < 2:
70 | config_file = os.path.join(cur_dir, "configs/config_confidance.json")
71 | print('No input was given, using {} as configuration file.'.format(config_file))
72 | else:
73 | config_file = sys.argv[1]
74 | 
75 | with open(config_file, 'r') as f:
76 | config = json.load(f)
77 | 
78 | params = SimpleNamespace()
79 | params.reps = config['repetitions']
80 | params.alphas = config['alphas']
81 | params.bs = config['bs']
82 | 
83 | 
84 | data_file = os.path.join('models', 'test_data_' + config['model_file'] + '.h5')
85 | print('Loading data and labels from file {}...'.format(data_file))
86 | h5f = h5py.File(data_file, 'r')
87 | labels = h5f['labels'][:]
88 | data = h5f['data'][:]
89 | 
90 | print('Loading model...')
91 | model = torch.load(os.path.join('models', 'best_model_' + config['model_file'] + '.pt')).cuda()
92 | loss_fn = nn.MSELoss()
93 | 
94 | with torch.no_grad():
95 | for name, param in model.named_parameters():
96 | print(name, np.mean(param.detach().cpu().numpy()), np.std(param.detach().cpu().numpy()))
97 | 
98 | print('Computing prediction and confidence...')
99 | preds, confidance, accs = test_confidance(model, data, labels, loss_fn, 
params) 100 | 101 | #TODO: add downstream task logic 102 | 103 | print('Done!') 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/config_confidance.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_file": "MELA_AU_100_ALL_TRACKS", 3 | "repetitions": 10, 4 | "alphas": [0.01], 5 | "bs": 128 6 | } 7 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/config_confidance_kfold.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_file": "/storage/datasets/cancer/unziped_data_matrices_10000_0_0.8.h5", 3 | "base_path": "/storage/yaari/mutation-density-outputs/kfold", 4 | "repetitions": 1000, 5 | "alpha": 0.01, 6 | "bs": 8192, 7 | "k": 5 8 | } 9 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/kfold_test_model_confidance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import json 5 | import copy 6 | import h5py 7 | import numpy as np 8 | import pandas as pd 9 | from types import SimpleNamespace 10 | import torch 11 | from torch import nn 12 | from torch.utils.data import DataLoader 13 | from sklearn.metrics import r2_score 14 | 15 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 16 | sys.path.append('/storage/yaari/mutation_density/pytorch/') 17 | 18 | from cnn_predictors import * 19 | from mut_dataset import * 20 | 21 | def add_noise_to_model(model, noise): 22 | tmp_model = copy.deepcopy(model).cuda() 23 | with torch.no_grad(): 24 | for param in tmp_model.parameters(): 25 | param.add_(torch.normal(0, noise, param.size()).cuda()) 26 | return tmp_model 27 | 28 | def predict(model, data_loader, label_ids): 29 | corr_coef_sums = np.zeros(len(label_ids)) 30 | all_preds = [[] for _ in range(len(label_ids))] 31 | all_true = [[] for _ in range(len(label_ids))] 32 | for j, (X, t_lst) in enumerate(data_loader): 33 | y_lst = model(X.cuda()) 34 | with torch.no_grad(): 35 | for i, t in enumerate(t_lst): 36 | y = y_lst[i] 37 | all_preds[i].extend(y.data.cpu().numpy().tolist()) 38 | all_true[i].extend(t.data.cpu().numpy().tolist()) 39 | return all_preds, all_true, [r2_score(all_preds[i], all_true[i]) for i in range(len(label_ids))] 40 | 41 | 42 | def test_with_perturbations(model, data_loader, label_ids, samp_num, params, fold, verbose=True): 43 | preds = np.empty((samp_num, params.reps)) 44 | for rep in range(params.reps): 45 | tmp_model = add_noise_to_model(model, params.alpha) 46 | tmp_preds, _, acc = predict(tmp_model, data_loader, label_ids) 47 | preds[:, rep] = tmp_preds[0] 48 | 49 | if verbose and rep % 10 == 0: 50 | print('Fold {}, repetition {}, accuracy: {}'.format(fold, rep, acc)) 51 | return preds 52 | 53 | 54 | def main(): 55 | assert len(sys.argv) >= 4, 'Usage: kfold_test_model_confidance.py ' 56 | 57 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 58 | config_path = os.path.join(cur_dir, "../configs/config_confidance_kfold.json") 
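# Config fields (see configs/config_confidance_kfold.json above): 'data_file' is the h5 dataset,
# 'base_path' the root of the kfold model outputs, 'repetitions' the number of weight-noise draws
# per fold, 'alpha' the std of the Gaussian perturbation, 'bs' the batch size, 'k' the fold count.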
59 | with open(config_path, 'r') as f: config = json.load(f)
60 | 
61 | run_id = sys.argv[1]
62 | label_ids = sys.argv[3:]
63 | labels_str = '-'.join(label_ids)
64 | models_dir = os.path.join(config['base_path'], labels_str, sys.argv[2])
65 | 
66 | file_path = config['data_file']
67 | with h5py.File(file_path, 'r') as h5f:
68 | chr_idxs = h5f['idx'][:]
69 | 
70 | k = config['k']
71 | params = SimpleNamespace()
72 | params.reps = config['repetitions']
73 | params.alpha = config['alpha']
74 | params.bs = config['bs']
75 | 
76 | pred_df = pd.DataFrame()
77 | idx = 0
78 | for i in range(k):
79 | print('Running iteration {} out of {} folds...'.format(i + 1, k))
80 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices_fold_{}.npy'.format(i))))
81 | 
82 | test_ds = SimpleDatasetFromH5(file_path, label_ids, test_idxs, chr_idxs[test_idxs], 'x_data')
83 | test_dl = DataLoader(test_ds, batch_size=params.bs, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
84 | samp_num = len(test_ds)
85 | test_chr_idxs = chr_idxs[test_idxs]
86 | 
87 | print('Loading model...')
88 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids))).cuda()
89 | state_dict = torch.load(os.path.join(models_dir, 'best_model_fold_{}.pt'.format(i)))
90 | model.load_state_dict(state_dict)
91 | model.eval()
92 | 
93 | print('Computing prediction and confidence...')
94 | preds, labels, acc = predict(model, test_dl, label_ids)
95 | perturp_preds = test_with_perturbations(model, test_dl, label_ids, samp_num, params, i)
96 | 
97 | print('Model accuracy: {}'.format(acc))
98 | print('Storing predictions...')
99 | 
100 | fold_pred_df = pd.DataFrame(data=perturp_preds)
101 | fold_pred_df['chr'] = test_chr_idxs[:,0]
102 | fold_pred_df['s_idx'] = test_chr_idxs[:,1]
103 | fold_pred_df['e_idx'] = test_chr_idxs[:,2]
104 | fold_pred_df['obs_mut'] = labels[0]
105 | fold_pred_df['pred_mut'] = preds[0]
106 | pred_df = pd.concat([pred_df, fold_pred_df], ignore_index=True)
107 | 
108 | out_dir = os.path.join(models_dir, run_id)
109 | out_path = os.path.join(out_dir, 'perturb_predictions.csv')
110 | if not os.path.exists(out_dir):
111 | os.makedirs(out_dir)
112 | print('Saving predictions to {}...'.format(out_path))
113 | pred_df.to_csv(out_path)
114 | 
115 | print('Done!')
116 | 
117 | if __name__ == '__main__':
118 | main()
119 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/region_model_tools.py: --------------------------------------------------------------------------------
1 | import h5py
2 | import pandas as pd
3 | import numpy as np
4 | import scipy
5 | import scipy.stats
6 | 
7 | def _load_fold_avg_DEPRECATED(f, cancer, test_idx=[], key='held-out'):
8 | """ Low-level loading of a single fold, removing outlier runs
9 | """
10 | hf = h5py.File(f, 'r')
11 | dset = hf[cancer]
12 | 
13 | if key not in dset.keys():
14 | #print("WARNING: {} is not a key in the dataset. 
Defaulting to 'test'".format(key)) 15 | key = 'test' 16 | 17 | runs = [int(key) for key in dset[key].keys() if key.isdigit()] 18 | test_Y = dset[key]['y_true'][:].reshape(-1, 1) 19 | if not len(test_idx): 20 | test_idx = dset[key]['chr_locs'][:] 21 | 22 | # print(test_idx.shape) 23 | test_Yhat_lst = [] 24 | test_std_lst = [] 25 | r2_lst = [] 26 | run_lst = [] 27 | for run in runs: 28 | y_hat = dset[key]['{}'.format(run)]['mean'][:].reshape(-1, 1) 29 | #gets rid of runs with all means predicted the same (casuses nan pearsonr) 30 | # if (y_hat-y_hat.mean()).sum() == 0: 31 | # continue 32 | r2 = scipy.stats.pearsonr(test_Y.squeeze(), y_hat.squeeze())[0]**2 33 | 34 | if np.isnan(r2): 35 | continue 36 | 37 | r2_lst.append(r2) 38 | test_Yhat_lst.append(y_hat) 39 | test_std_lst.append(dset[key]['{}'.format(run)]['std'][:].reshape(-1, 1)) 40 | run_lst.append(run) 41 | # print(r2_lst[-1]) 42 | 43 | hf.close() 44 | r2s = np.array(r2_lst) 45 | # print(r2s) 46 | med = np.median(r2s) 47 | mad = np.median(np.abs(r2s - med)) 48 | 49 | # idx = np.array(run_lst)[r2s > (med - 2*mad)] 50 | idx = np.where(r2s > (np.max(r2s) - 2*mad)) 51 | if not len(idx[0]): 52 | idx = np.arange(len(test_Yhat_lst)) 53 | 54 | test_Yhat = np.array(test_Yhat_lst)[idx].mean(axis = 0) 55 | test_std = np.array(test_std_lst)[idx].mean(axis = 0) 56 | vals = np.hstack([test_idx, test_Y, test_Yhat, test_std]) 57 | df = pd.DataFrame(vals, 58 | columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'] 59 | ) 60 | # print(df[0:5]) 61 | 62 | return df 63 | 64 | def _load_fold_avg(f, cancer, key='held-out', fold=None): 65 | """ Low-level loading of a single fold 66 | """ 67 | h5 = h5py.File(f, 'r') 68 | out_h5 = h5[cancer] 69 | # if not fold: 70 | # fold = int(f.split('.h5')[0].split('_')[-1]) 71 | 72 | assert key in out_h5, 'Cannot compute pretrained model with no saved held-out set. 
Existing fields are: {}'.format(out_h5.keys())
73 | ds = out_h5[key]
74 | 
75 | runs = [key for key in ds.keys() if key.isdigit()]
76 | # test_Y = dset[key]['y_true'][:].reshape(-1, 1)
77 | # test_idx = dset[key]['chr_locs'][:]
78 | 
79 | chr_locs = ds['chr_locs'][:]
80 | mapps = ds['mappability'][:].reshape(-1, 1)
81 | quants = ds['quantiles'][:].reshape(-1, 1)
82 | y_true = ds['y_true'][:].reshape(-1, 1)
83 | mean_lst = []
84 | std_lst = []
85 | 
86 | for i in runs:
87 | mean_lst.append(ds[i]['mean'][:])
88 | std_lst.append(ds[i]['std'][:])
89 | 
90 | means = np.array(mean_lst).mean(axis=0).reshape(-1, 1)
91 | stds = np.array(std_lst).mean(axis=0).reshape(-1, 1)
92 | 
93 | vals = np.hstack([chr_locs, y_true, means, stds, mapps, quants])
94 | df = pd.DataFrame(vals,
95 | columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD', 'MAPP', 'QUANT']
96 | )
97 | # df['FOLD'] = fold
98 | # print(df[0:5])
99 | 
100 | return df
101 | 
102 | def kfold_supmap_results(kfold_path, cancer_str, key='held-out', drop_pos_cols=False, sort=True):
103 | """ Load kfold results for regions above the user-defined mappability threshold
104 | """
105 | fold_files = sorted(kfold_path.glob("gp_results_fold*.h5"))
106 | df_lst = [_load_fold_avg(str(fold), cancer=cancer_str, key=key) for fold in fold_files]
107 | df = pd.concat(df_lst).astype({'CHROM':int,
108 | 'START':int,
109 | 'END':int,
110 | 'Y_TRUE':int,
111 | 'Y_PRED':float,
112 | 'STD':float,
113 | 'MAPP': float,
114 | 'QUANT': float})
115 | # window = int(df.iloc[0]['END'] - df.iloc[0]['START'])
116 | df['FLAG'] = False
117 | df['Region'] = ['chr{}:{}-{}'.format(row[0], row[1], row[2]) \
118 | for row in zip(df.CHROM, df.START, df.END)]
119 | 
120 | if sort:
121 | df = df.sort_values(by=['CHROM', 'START'])
122 | 
123 | if drop_pos_cols:
124 | df = df.drop(['CHROM', 'START', 'END'], axis = 1)
125 | 
126 | df.set_index('Region', inplace=True)
127 | 
128 | return df
129 | 
130 | def kfold_submap_results(kfold_path, cancer_str, key='held-out', drop_pos_cols=False, sort=True):
131 | """ Load kfold results for regions below the mappability threshold
132 | """
133 | fold_files = sorted(kfold_path.glob("sub_mapp_results_fold*.h5"))
134 | df_lst = [_load_fold_avg(str(fold), cancer=cancer_str, key=key) for fold in fold_files]
135 | 
136 | a_mean = np.array([df.Y_PRED.values for df in df_lst])
137 | mean = np.mean(a_mean, axis=0)
138 | 
139 | a_std = np.array([df.STD.values for df in df_lst])
140 | std = np.mean(a_std, axis=0)
141 | 
142 | df = pd.DataFrame({'CHROM': df_lst[0].CHROM.values,
143 | 'START': df_lst[0].START.values,
144 | 'END': df_lst[0].END.values,
145 | 'Y_TRUE': df_lst[0].Y_TRUE.values,
146 | 'Y_PRED': mean,
147 | 'STD': std,
148 | 'MAPP': df_lst[0].MAPP.values,
149 | 'QUANT': df_lst[0].QUANT.values,
150 | }
151 | ).astype({'CHROM':int, 'START':int, 'END':int, 'Y_TRUE':int,
152 | 'Y_PRED':float, 'STD':float, 'MAPP':float, 'QUANT':float})
153 | 
154 | # window = int(df.iloc[0]['END'] - df.iloc[0]['START'])
155 | df['FLAG'] = True
156 | df['Region'] = ['chr{}:{}-{}'.format(row[0], row[1], row[2]) \
157 | for row in zip(df.CHROM, df.START, df.END)]
158 | 
159 | if sort:
160 | df = df.sort_values(by=['CHROM', 'START'])
161 | 
162 | if drop_pos_cols:
163 | df = df.drop(['CHROM', 'START', 'END'], axis = 1)
164 | 
165 | df.set_index('Region', inplace=True)
166 | 
167 | return df #, window
168 | 
169 | def kfold_results(kfold_path, cohort_name, key='held-out'):
170 | """ Load kfold results and remove outlier runs
171 | """
172 | try:
173 | df_sup = 
kfold_supmap_results(kfold_path, cohort_name, key=key) 174 | df_sub = kfold_submap_results(kfold_path, cohort_name, key=key) 175 | except: 176 | # except KeyError as e: 177 | raise Exception('ERROR: failed to load kfold {}. You should rerun the CNN+GP kfold.'.format(kfold_path)) 178 | # print('FAIL: {}'.format(kfold_path)) 179 | # print('\nERROR: uh oh there was an error loading the kfold results.') 180 | # print('This probably means a CNN+GP run crashed (it happens).') 181 | # print('Rerunning the CNN+GP kfold should fix the problem') 182 | 183 | # print(scipy.stats.pearsonr(df_sup.Y_TRUE, df_sup.Y_PRED)[0]**2) 184 | # print(scipy.stats.pearsonr(df_sub.Y_TRUE, df_sub.Y_PRED)[0]**2) 185 | 186 | df = pd.concat([df_sup, df_sub]).sort_values(by=['CHROM', 'START']) 187 | df_dedup = df.drop_duplicates(['CHROM', 'START', 'END']) 188 | assert len(df) == len(df_dedup), \ 189 | "Oh snap! There are duplicate entries in the folds. You should rerun this kfold." 190 | 191 | print(scipy.stats.pearsonr(df[~df.FLAG].Y_TRUE, df[~df.FLAG].Y_PRED)[0]**2) 192 | 193 | return df 194 | -------------------------------------------------------------------------------- /DIGDriver/region_model/train_nn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## This is an **example** command to fit neural network predictions for 10kb regions of the PCAWG pan-cancer cohort. 4 | ## 5 | ## NOTE: THE PROCESS REQUIRES INPUT DATA TOO LARGE TO BE INCLUDED IN THIS GITHUB REPO. 6 | ## CONTACT THE AUTHORS TO ENSURE YOU HAVE THE NECESSARY INPUT FILES AND COMPUTE RESROUCES 7 | ## IF YOU WANT TO CREATE MUTATION RATE MAPS FROM YOUR OWN WGS DATASETS. 8 | 9 | python mutations_main.py -c Pancan_SNV 10 | -------------------------------------------------------------------------------- /DIGDriver/region_model/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/trainers/gp_trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import torch 4 | import gpytorch 5 | from sklearn.preprocessing import StandardScaler 6 | import warnings 7 | 8 | warnings.filterwarnings("ignore", category=RuntimeWarning) 9 | 10 | 11 | class bcolors: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKCYAN = '\033[96m' 15 | OKGREEN = '\033[92m' 16 | WARNING = '\033[93m' 17 | FAIL = '\033[91m' 18 | ENDC = '\033[0m' 19 | BOLD = '\033[1m' 20 | UNDERLINE = '\033[4m' 21 | 22 | 23 | def r2_score(y_true, y_pred): 24 | r2 = scipy.stats.pearsonr(y_true, y_pred)[0]**2 25 | return r2 if not np.isnan(r2) else 0 26 | 27 | 28 | class SparseGP(gpytorch.models.ExactGP): 29 | def __init__(self, train_x, train_y, likelihood, n_inducing=2000): 30 | super(SparseGP, self).__init__(train_x, train_y, likelihood) 31 | 32 | self.mean_module = gpytorch.means.ConstantMean() 33 | 34 | base_cov_module = gpytorch.kernels.ScaleKernel( 35 | gpytorch.kernels.RBFKernel()) 36 | 37 | self.covar_module = gpytorch.kernels.InducingPointKernel( 38 | base_cov_module, 39 | inducing_points=train_x[:n_inducing, :], 40 | likelihood=likelihood) 41 | 42 | def forward(self, x): 43 | mean_x = self.mean_module(x) 44 | covar_x = self.covar_module(x) 45 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 46 | 47 | def fit_params(self, train_x, train_y, likelihood, n_iter=100): 48 | pass 49 | 50 | def 
predict(self, val_x):
51 | pass
52 | 
53 | 
54 | class GPTrainer:
55 | samp_bound = int(1.5e5)
56 | 
57 | def __init__(self, device, train_tup, val_tup, heldout_tup=None, n_iter=50, n_inducing=500):
58 | self.device = device
59 | self.n_iter = n_iter
60 | self.n_inducing = n_inducing
61 | self.org_train_x = train_tup[0]
62 | self.org_train_y = train_tup[1]
63 | self.train_chr_locations = train_tup[2]
64 | self.train_mappability = train_tup[3]
65 | self.train_quantiles = train_tup[4]
66 | self.org_val_x = val_tup[0]
67 | self.org_val_y = val_tup[1]
68 | self.val_chr_locations = val_tup[2]
69 | self.val_mappability = val_tup[3]
70 | self.val_quantiles = val_tup[4]
71 | 
72 | self.train_x, self.train_y, scaler, self.y_mean, self.y_std = self.standardize(train_tup[0], train_tup[1])
73 | self.val_x, self.val_y, _, _, _ = self.standardize(val_tup[0],
74 | val_tup[1],
75 | scaler,
76 | self.y_mean,
77 | self.y_std)
78 | 
79 | self.idx_feat = np.where(np.abs(self.train_x).mean(axis=0) > 0)[0]
80 | train_size = self.train_x.shape[0]
81 | if train_size > self.samp_bound: # upper bound number of samples to fit on GPU memory
82 | samp_idxs = np.random.choice(self.train_x.shape[0], size=self.samp_bound, replace=False)
83 | assert len(np.unique(samp_idxs)) == len(samp_idxs)
84 | self.train_x = self.train_x[samp_idxs]
85 | self.train_y = self.train_y[samp_idxs]
86 | print('Reduced train set size from {} to {}, to stay within memory limits'.format(train_size, self.samp_bound))
87 | 
88 | self.train_x = self.train_x[:, self.idx_feat]
89 | self.val_x = self.val_x[:, self.idx_feat]
90 | print('After zero features reduction feature vectors are now of size: {}'.format(self.train_x.shape[1]))
91 | 
92 | if heldout_tup is not None:
93 | self.org_ho_x = heldout_tup[0]
94 | self.org_ho_y = heldout_tup[1]
95 | self.ho_chr_locations = heldout_tup[2]
96 | self.ho_mappability = heldout_tup[3]
97 | self.ho_quantiles = heldout_tup[4]
98 | self.held_x, self.held_y, _, _, _ = self.standardize(heldout_tup[0],
99 | heldout_tup[1],
100 | scaler,
101 | self.y_mean,
102 | self.y_std)
103 | self.held_x = self.held_x[:, self.idx_feat]
104 | else:
105 | self.held_x, self.held_y = None, None
106 | 
107 | def standardize(self, X, Y, scaler=None, y_mean=None, y_std=None):
108 | 
109 | if scaler is None:
110 | scaler = StandardScaler()
111 | scaler.fit(X)
112 | 
113 | if y_mean is None:
114 | y_mean = Y.mean()
115 | y_std = Y.std()
116 | 
117 | x = scaler.transform(X)
118 | y = (Y - y_mean) / y_std
119 | 
120 | return x, y, scaler, y_mean, y_std
121 | 
122 | def train_model(self):
123 | X = torch.tensor(self.train_x).float().contiguous().to(self.device)
124 | y = torch.tensor(self.train_y).float().contiguous().to(self.device)
125 | likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device)
126 | model = SparseGP(X, y, likelihood, n_inducing=self.n_inducing).to(self.device)
127 | model.train()
128 | likelihood.train()
129 | 
130 | optimizer = torch.optim.Adam([{'params': model.parameters()}], lr=0.8)
131 | 
132 | # "Loss" for GPs - the marginal log likelihood
133 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
134 | 
135 | for i in range(self.n_iter):
136 | optimizer.zero_grad()
137 | y_pred = model(X)
138 | loss = -mll(y_pred, y)
139 | loss.backward()
140 | optimizer.step()
141 | 
142 | # delete variables to clear memory
143 | del X
144 | del y
145 | del loss
146 | del optimizer
147 | del mll
148 | return model, likelihood
149 | 
150 | def predict(self, model, likelihood, x, y):
151 | model.eval()
152 | likelihood.eval()
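# In eval mode the forward pass returns the posterior predictive distribution; the
# gpytorch.settings.fast_pred_var() context used below speeds up the variance computation.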
153 | 
154 |         # "Loss" for GPs - the marginal log likelihood
155 |         mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
156 | 
157 |         X = torch.tensor(x).float().contiguous().to(self.device)
158 |         y_true = torch.tensor(y).float().contiguous().to(self.device)
159 |         print('Predicting over {} samples.'.format(X.size(0)))
160 |         with torch.no_grad(), gpytorch.settings.fast_pred_var():
161 |             y_pred = model(X)
162 |             loss = -mll(y_pred, y_true)
163 |             y_hat = y_pred.mean.cpu().numpy()
164 |             y_std = y_pred.stddev.cpu().numpy()
165 | 
166 |         # delete variables to clear memory
167 |         del X
168 |         return y_hat, y_std, loss.item()
169 | 
170 |     @staticmethod
171 |     def get_results_dict(mean, std, r2, loss, params):
172 |         return {'gp_mean': mean, 'gp_std': std, 'r2': r2, 'loss': loss, 'params': params}
173 | 
174 |     def run(self):
175 |         torch.cuda.empty_cache()
176 | 
177 |         # Train model
178 |         #with gpytorch.settings.cg_tolerance(1e9), gpytorch.settings.debug(False):
179 |         model, likelihood = self.train_model()
180 | 
181 |         # Validate model
182 |         #with gpytorch.settings.eval_cg_tolerance(1e6):
183 |         val_mean, val_std, val_loss = self.predict(model, likelihood, self.val_x, self.val_y)
184 |         val_r2 = r2_score(self.val_y, val_mean)
185 |         print(bcolors.OKCYAN + 'Validation set R2: {}'.format(val_r2) + bcolors.ENDC)
186 | 
187 |         params = np.array([model.covar_module.base_kernel.base_kernel.lengthscale.item(),
188 |                            model.covar_module.base_kernel.outputscale.item(),
189 |                            likelihood.noise_covar.noise.item()])
190 | 
191 |         val_res = self.get_results_dict(val_mean * self.y_std + self.y_mean,
192 |                                         val_std * self.y_std,
193 |                                         val_r2, val_loss, params)
194 | 
195 |         if self.held_x is not None:
196 |             #with gpytorch.settings.eval_cg_tolerance(1e6):
197 |             hld_mean, hld_std, hld_loss = self.predict(model, likelihood, self.held_x, self.held_y)
198 |             hld_r2 = r2_score(self.held_y, hld_mean)
199 |             print(bcolors.OKCYAN + 'Held-out set R2: {}'.format(hld_r2) + bcolors.ENDC)
200 |             hld_res = self.get_results_dict(hld_mean * self.y_std + self.y_mean,
201 |                                             hld_std * self.y_std,
202 |                                             hld_r2, hld_loss,
203 |                                             params)
204 |             return val_res, hld_res
205 |         return val_res, None
206 | 
207 |     def save_results(self, val_res_dict, held_res_dict, h5_file, run_id):
208 |         print('Saving GP {} results'.format(int(run_id) + 1))
209 |         if 'train' not in h5_file:
210 |             train_grp = h5_file.create_group('train')
211 |             train_grp.create_dataset('nn_features', data=self.org_train_x)
212 |             train_grp.create_dataset('y_true', data=self.org_train_y)
213 |             train_grp.create_dataset('chr_locs', data=np.array(self.train_chr_locations))
214 |             train_grp.create_dataset('mappability', data=np.array(self.train_mappability))
215 |             train_grp.create_dataset('quantiles', data=np.array(self.train_quantiles))
216 |         if 'val' not in h5_file:
217 |             val_grp = h5_file.create_group('val')
218 |             val_grp.create_dataset('nn_features', data=self.org_val_x)
219 |             val_grp.create_dataset('y_true', data=self.org_val_y)
220 |             val_grp.create_dataset('chr_locs', data=np.array(self.val_chr_locations))
221 |             val_grp.create_dataset('mappability', data=np.array(self.val_mappability))
222 |             val_grp.create_dataset('quantiles', data=np.array(self.val_quantiles))
223 | 
224 |         val_run_grp = h5_file['val'].create_group(run_id)
225 |         val_run_grp.create_dataset('mean', data=val_res_dict['gp_mean'])
226 |         val_run_grp.create_dataset('std', data=val_res_dict['gp_std'])
227 |         val_run_grp.create_dataset('params', data=val_res_dict['params'])
228 |         val_run_grp.attrs['R2'] = val_res_dict['r2']
229 |         val_run_grp.attrs['loss'] = val_res_dict['loss']
230 | 
231 |         if held_res_dict is not None:
232 |             if 'held-out' not in h5_file:
233 |                 ho_grp = h5_file.create_group('held-out')
234 |                 ho_grp.create_dataset('nn_features', data=self.org_ho_x)
235 |                 ho_grp.create_dataset('y_true', data=self.org_ho_y)
236 |                 ho_grp.create_dataset('chr_locs', data=np.array(self.ho_chr_locations))
237 |                 ho_grp.create_dataset('mappability', data=np.array(self.ho_mappability))
238 |                 ho_grp.create_dataset('quantiles', data=np.array(self.ho_quantiles))
239 | 
240 |             ho_run_grp = h5_file['held-out'].create_group(run_id)
241 |             ho_run_grp.create_dataset('mean', data=held_res_dict['gp_mean'])
242 |             ho_run_grp.create_dataset('std', data=held_res_dict['gp_std'])
243 |             ho_run_grp.create_dataset('params', data=held_res_dict['params'])
244 |             ho_run_grp.attrs['R2'] = held_res_dict['r2']
245 |             ho_run_grp.attrs['loss'] = held_res_dict['loss']
246 |         return val_res_dict['r2'], held_res_dict['r2']
247 | 
248 |     def compute_pretrained(self, out_h5, runs_num):
249 |         assert 'held-out' in out_h5, 'Cannot compute pretrained model with no saved held-out set. Existing fields are: {}'.format(out_h5.keys())
250 |         ds = out_h5['held-out']
251 |         chr_locs = ds['chr_locs'][:]
252 |         mapps = ds['mappability'][:]
253 |         quants = ds['quantiles'][:]
254 |         y_true = ds['y_true'][:]
255 |         mean_lst = []
256 |         std_lst = []
257 |         for i in np.arange(runs_num).astype(str):
258 |             mean_lst.append(ds[i]['mean'][:])
259 |             std_lst.append(ds[i]['std'][:])
260 |         means = np.array(mean_lst).mean(axis=0)
261 |         stds = np.array(std_lst).mean(axis=0)
262 |         return chr_locs, mapps, quants, y_true, means, stds
263 | 
--------------------------------------------------------------------------------
/DIGDriver/region_model/trainers/nn_trainer.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import scipy.stats
  4 | import torch.utils.data
  5 | # from sklearn.metrics import r2_score
  6 | from torch.utils.data import DataLoader
  7 | import matplotlib.pyplot as plt
  8 | plt.switch_backend('agg')
  9 | 
 10 | 
 11 | def r2_score(y_true, y_pred):
 12 |     r2 = scipy.stats.pearsonr(y_true, y_pred)[0]**2
 13 |     return r2 if not np.isnan(r2) else 0
 14 | 
 15 | 
 16 | class NNTrainer:
 17 |     def __init__(self, model, optimizer, loss_fn, bs, label_ids, train_ds, test_ds, device, writer=None, get_attention_maps=False):
 18 |         self.device = device
 19 |         self.model = model.to(self.device)
 20 |         self.optimizer = optimizer
 21 |         self.loss_fn = loss_fn
 22 |         self.bs = bs
 23 | 
 24 |         self.train_dataloader = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False, num_workers=16)
 25 |         self.test_dataloader = DataLoader(test_ds, batch_size=bs, shuffle=False, drop_last=False, num_workers=16)
 26 | 
 27 |         self.label_ids = label_ids
 28 |         self.get_attention_maps = get_attention_maps
 29 | 
 30 |         self.writer = writer
 31 | 
 32 |         '''
 33 |         if writer is not None:
 34 |             shape = train_ds.get_data_shape()
 35 |             dummy_input = (torch.zeros(1, shape[1], shape[2]),)
 36 |             print(dummy_input[0].size())
 37 |             self.writer.add_graph(model(), dummy_input, True)
 38 |         '''
 39 | 
 40 |     def train(self, epoch, run, print_interval=10, autoreg=False):
 41 |         # toggle model to train mode
 42 |         self.model.train()
 43 | 
 44 |         samp_ctr = 0
 45 |         batch_num = len(self.train_dataloader)
 46 |         loss_sums = np.zeros(len(self.label_ids))
 47 |         corr_coef_sums = np.zeros(len(self.label_ids))
 48 |         all_preds = [[] for _ in range(len(self.label_ids))]
 49 |         all_true = [[] for _ in range(len(self.label_ids))]
 50 |         all_features_lst = [[] for _ in range(len(self.label_ids))]
 51 |         print('Training epoch {}'.format(epoch))
 52 |         for j, batch in enumerate(self.train_dataloader):
 53 |             t_lst = batch[-1]
 54 |             if autoreg:
 55 |                 y_lst, fv_lst, _ = self.model(batch[0].to(self.device), torch.cat(batch[1], dim=1).to(self.device))
 56 |             else:
 57 |                 y_lst, fv_lst, _ = self.model(batch[0].to(self.device))
 58 |             samp_ctr += batch[0].size()[0]
 59 |             loss_lst = []
 60 |             for i, t in enumerate(t_lst):
 61 |                 y = y_lst[i]
 62 |                 all_preds[i].extend(y.data.cpu().numpy().tolist())
 63 |                 all_true[i].extend(t.data.cpu().numpy().tolist())
 64 |                 all_features_lst[i].extend(fv_lst[i].data.cpu().numpy())
 65 |                 task_loss = self.loss_fn(y, t.to(self.device))  # + torch.norm(attention, p=1, dim=(1,2)).mean()
 66 |                 loss_lst.append(task_loss)
 67 |                 loss_sums[i] += task_loss.item()
 68 |                 corr_coef = r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
 69 |                 corr_coef_sums[i] += corr_coef
 70 | 
 71 |             loss = torch.sum(torch.stack(loss_lst))
 72 |             self.optimizer.zero_grad()
 73 |             loss.backward()
 74 |             self.optimizer.step()
 75 | 
 76 |             if j % max(1, int(batch_num * print_interval / 100)) == 0 and j > 0:  # print progress every print_interval%
 77 |                 print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {}\tAccuracy: {}'.format(
 78 |                     epoch, j, batch_num, 100. * j / batch_num,
 79 |                     loss_sums / (samp_ctr / self.bs), corr_coef_sums / (samp_ctr / self.bs)))
 80 | 
 81 |         train_accs = corr_coef_sums / batch_num
 82 |         train_losses = loss_sums / batch_num
 83 | 
 84 |         print('====> Epoch: {}, Average loss: {}, Average accuracy: {}'.format(epoch, train_losses, train_accs))
 85 | 
 86 |         if self.writer is not None:
 87 |             for i in range(len(self.label_ids)):
 88 |                 self.writer.add_scalar('Train_{}/Loss_{}'.format(run, self.label_ids[i]), train_losses[i], epoch)
 89 |                 self.writer.add_scalar('Train_{}/R^2_{}'.format(run, self.label_ids[i]), train_accs[i], epoch)
 90 | 
 91 |         return train_losses, train_accs, all_features_lst, all_preds, all_true
 92 | 
 93 |     def predict(self, model, dataloader, epoch, run, set_id='Test', autoreg=False):
 94 |         # toggle model to test / inference mode
 95 |         model.eval()
 96 | 
 97 |         batch_num = len(dataloader)
 98 |         loss_sums = np.zeros(len(self.label_ids))
 99 |         corr_coef_sums = np.zeros(len(self.label_ids))
100 |         all_preds = [[] for _ in range(len(self.label_ids))]
101 |         all_true = [[] for _ in range(len(self.label_ids))]
102 |         all_features_lst = [[] for _ in range(len(self.label_ids))]
103 |         all_att = []
104 |         for j, batch in enumerate(dataloader):
105 |             t_lst = batch[-1]
106 |             if autoreg:
107 |                 y_lst, fv_lst, attention = model(batch[0].to(self.device), torch.cat(batch[1], dim=1).to(self.device))
108 |             else:
109 |                 y_lst, fv_lst, attention = model(batch[0].to(self.device))
110 | 
111 |             if self.get_attention_maps: all_att.append(attention.cpu().detach().numpy())
112 |             with torch.no_grad():
113 |                 for i, t in enumerate(t_lst):
114 |                     y = y_lst[i]
115 |                     all_features_lst[i].append(fv_lst[i].cpu().detach().numpy())
116 |                     all_preds[i].extend(y.data.cpu().numpy().tolist())
117 |                     all_true[i].extend(t.data.cpu().numpy().tolist())
118 |                     corr_coef_sums[i] += r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
119 |                     loss_sums[i] += self.loss_fn(y, t.to(self.device)).item()  # + torch.norm(attention, p=1, dim=(1,2)).mean()
120 |         all_features = [np.concatenate(all_features_lst[j], axis=0) for j in range(len(all_features_lst))]
121 |         test_accs = corr_coef_sums / batch_num
122 |         test_losses = loss_sums / batch_num
123 | 
124 |         print('====> Test set loss: {}, accuracy: {}'.format(test_losses, test_accs))
125 | 
126 |         if self.writer is not None:
127 |             for i in range(len(self.label_ids)):
128 |                 self.writer.add_scalar('{}_{}/Loss_{}'.format(set_id, run, self.label_ids[i]), test_losses[i], epoch)
129 |                 self.writer.add_scalar('{}_{}/R^2_{}'.format(set_id, run, self.label_ids[i]), test_accs[i], epoch)
130 | 
131 |             for name, param in self.model.named_parameters():
132 |                 if 'bn' not in name:
133 |                     self.writer.add_histogram(name, param, epoch)
134 | 
135 |             self.plot_prediction_scatter(dataloader, all_preds, '{}/run_{}/epoch_{}'.format(set_id, run, epoch), test_accs)
136 |             self.plot_prediction_histogram(dataloader, all_preds, '{}/run_{}/epoch_{}'.format(set_id, run, epoch), test_accs)
137 | 
138 |         if self.get_attention_maps:
139 |             return test_losses, test_accs, all_features, all_preds, all_true, np.concatenate(all_att, axis=0)
140 |         else:
141 |             return test_losses, test_accs, all_features, all_preds, all_true, None
142 | 
143 |     def test(self, epoch, run, autoreg=False):
144 |         return self.predict(self.model, self.test_dataloader, epoch, run, autoreg=autoreg)
145 | 
146 |     def plot_prediction_scatter(self, dataloader, preds, writer_id, accs, confidence=99.9):
147 |         for i in range(len(preds)):
148 |             t = np.concatenate([l[i].data.cpu().numpy() for (_, l) in dataloader])
149 |             fig = plt.figure()
150 |             ax = plt.gca()
151 |             y = np.array(preds[i])
152 |             ax.scatter(t, y, alpha=0.3)
153 |             x = np.linspace(*ax.get_xlim())
154 |             ax.plot(x, x)
155 |             ax.set_ylim(0, np.percentile(y, confidence) + 1)
156 |             ax.set_xlim(0, np.percentile(t, confidence) + 1)
157 |             ax.set_xlabel('True')
158 |             ax.set_ylabel('Predicted')
159 |             ax.set_title('Accuracy: {}'.format(np.round(accs[i], 3)))
160 |             self.writer.add_figure('{}/{}/Scatter'
161 |                                    .format(self.label_ids[i], writer_id), fig)
162 | 
163 |     def plot_prediction_histogram(self, dataloader, preds, writer_id, accs, confidence=99.85):
164 |         for i in range(len(preds)):
165 |             t = np.concatenate([l[i].data.cpu().numpy() for (_, l) in dataloader])
166 |             fig = plt.figure()
167 |             ax = plt.gca()
168 |             y = np.array(preds[i])
169 |             y_max_bin = int(np.percentile(y, confidence) + 1)
170 |             t_max_bin = int(np.percentile(t, confidence) + 1)
171 |             max_bin = max(y_max_bin, t_max_bin)
172 |             ax.hist(t, max_bin, (0, max_bin), alpha=0.5)
173 |             ax.hist(y, max_bin, (0, max_bin), alpha=0.5)
174 |             ax.set_xlabel('Mutation Count')
175 |             ax.set_ylabel('Window #')
176 |             ax.set_title('Accuracy: {}'.format(np.round(accs[i], 3)))
177 |             ax.legend(['True', 'Predicted'])
178 |             self.writer.add_figure('{}/{}/Histogram'
179 |                                    .format(self.label_ids[i], writer_id), fig)
180 | 
181 | 
--------------------------------------------------------------------------------
/DIGDriver/sequence_model/__init__.py:
--------------------------------------------------------------------------------
 1 | ## init file for python module
 2 | 
--------------------------------------------------------------------------------
/DIGDriver/sequence_model/gp_tools.py:
--------------------------------------------------------------------------------
  1 | ## Entire module seems to be deprecated; superseded by region_model_tools.
  2 | 
  3 | import pandas as pd
  4 | import numpy as np
  5 | import scipy.stats
  6 | import h5py
  7 | import seaborn as sns
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
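## A minimal notebook-style sketch of how the deprecated loader below was used (the file path and
## cohort key here are hypothetical, not files shipped with the repo):
##     train_idx, y_true, idx, gp_mean, gp_std = load_ensemble('kfold_results.h5', cancer='Pancan_SNV', split='test')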
 11 | ##TODELETE
 12 | def load_ensemble(f, cancer=None, split='test'):
 13 |     ## Load data
 14 |     data_pred = h5py.File(f, 'r')
 15 |     if cancer:
 16 |         dset = data_pred[cancer]
 17 |     else:
 18 |         dset = data_pred
 19 | 
 20 |     try:
 21 |         runs = [key for key in dset[split].keys() if key.isdigit()]  ## NOTE: bad way to find integers used as keys
 22 |         train_idx = dset['train']['chr_locs'][:]
 23 |         y_true = dset[split]['y_true'][:].reshape(-1, 1)
 24 |         idx = dset[split]['chr_locs'][:]
 25 |         gp_mean_lst = [dset[split][str(i)]['mean'][:] for i in runs]
 26 |         gp_std_lst = [dset[split][str(i)]['std'][:] for i in runs]
 27 | 
 28 |     except KeyError:  # fall back to the older results layout
 29 |         reruns = len([key for key in dset[split].keys() if key.startswith('gp_mean')])
 30 |         train_idx = dset['train']['idxs'][:]
 31 |         y_true = dset[split]['true'][0, :].reshape(-1, 1)
 32 |         idx = dset[split]['idxs'][:]
 33 |         gp_mean_lst = [dset[split]['gp_mean_{:02d}'.format(run)][:] for run in range(1, reruns-1)]
 34 |         gp_std_lst = [dset[split]['gp_std_{:02d}'.format(run)][:] for run in range(1, reruns-1)]
 35 | 
 36 |     gp_mean_nd = np.vstack(gp_mean_lst)
 37 |     gp_mean = np.median(gp_mean_nd, axis=0).reshape(-1, 1)
 38 | 
 39 |     gp_std_nd = np.vstack(gp_std_lst)
 40 |     gp_std = np.median(gp_std_nd, axis=0).reshape(-1, 1)
 41 | 
 42 |     data_pred.close()
 43 | 
 44 |     return train_idx, y_true, idx, gp_mean, gp_std
 45 | 
 46 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
 47 | ##TODELETE
 48 | def load_run(f, run, cancer=None, split='test'):
 49 |     hf = h5py.File(f, 'r')
 50 |     if cancer:
 51 |         dset = hf[cancer]
 52 |     else:
 53 |         dset = hf
 54 | 
 55 |     try:
 56 |         train_idx = dset['train']['chr_locs'][:]
 57 |         test_Y = dset[split]['y_true'][:].reshape(-1, 1)
 58 |         test_idx = dset[split]['chr_locs'][:]
 59 |         test_Yhat = dset[split]['{}'.format(run)]['mean'][:].reshape(-1, 1)
 60 |         test_std = dset[split]['{}'.format(run)]['std'][:].reshape(-1, 1)
 61 |     except KeyError:  # fall back to the older results layout
 62 |         train_idx = dset['train']['idxs'][:]
 63 |         test_Y = dset[split]['true'][0, :].reshape(-1, 1)
 64 |         test_idx = dset[split]['idxs'][:]
 65 |         test_Yhat = dset[split]['gp_mean_{:02d}'.format(run)][:].reshape(-1, 1)
 66 |         test_std = dset[split]['gp_std_{:02d}'.format(run)][:].reshape(-1, 1)
 67 | 
 68 |     hf.close()
 69 |     return train_idx, test_Y, test_idx, test_Yhat, test_std
 70 | 
 71 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
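## A notebook-style sketch of the main entry point below (hypothetical file path); load_fold returns one
## window-level DataFrame with columns CHROM, START, END, Y_TRUE, Y_PRED, STD:
##     df = load_fold('kfold_results.h5', cancer='Pancan_SNV', run='ensemble', split='test')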
 72 | ##TODELETE
 73 | def load_fold(f, cancer=None, run=None, split='test', reruns=10):
 74 |     if run is None:
 75 |         run = pick_gp_by_calibration(f, cancer=cancer, dataset=split)
 76 | 
 77 |     if run == 'ensemble':
 78 |         train_idx, test_Y, test_idx, test_Yhat, test_std = load_ensemble(f, cancer=cancer, split=split)
 79 | 
 80 |     else:
 81 |         train_idx, test_Y, test_idx, test_Yhat, test_std = load_run(f, run, cancer=cancer, split=split)
 82 | 
 83 |     vals = np.hstack([test_idx, test_Y, test_Yhat, test_std])
 84 |     df = pd.DataFrame(vals, columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
 85 | 
 86 |     return df
 87 | 
 88 | def plot_qq_log(pvals, label='', ax=None, rasterized=False, color=None):
 89 |     if not ax:
 90 |         f, ax = plt.subplots(1, 1)
 91 |     exp = -np.log10(np.arange(1, len(pvals) + 1) / len(pvals))
 92 |     pvals_log10_sort = -np.log10(np.sort(pvals))
 93 | 
 94 |     if not color:
 95 |         color = sns.color_palette()[0]
 96 | 
 97 |     ax.plot(exp, pvals_log10_sort, '.', label=label, rasterized=rasterized, color=color)
 98 |     ax.plot(exp, exp, 'k-')
 99 |     # ax.plot(exp, exp, 'r-')
100 | 
101 |     if label:
102 |         ax.legend()
103 | 
104 | def plot_qq(pvals, label='', ax=None, rasterized=False):
105 |     if not ax:
106 |         f, ax = plt.subplots(1, 1)
107 |     exp = (np.arange(1, len(pvals) + 1) / len(pvals))
108 |     pvals_sort = np.sort(pvals)
109 | 
110 |     ax.plot(exp, pvals_sort, '.', label=label, rasterized=rasterized)
111 |     ax.plot(exp, exp, 'r-')
112 | 
113 |     if label:
114 |         ax.legend()
115 | 
116 | 
117 | def calibration_score_by_pvals(pvals):
118 |     alpha = [0.05, 0.01, 0.001, 0.0001]
119 |     alpha_emp = [len(pvals[pvals < a]) / len(pvals) for a in alpha]
120 | 
121 |     return sum([(a-ap)**2 for a, ap in zip(alpha, alpha_emp)])
122 | 
123 | 
124 | # def merge_windows(df, start, end, new_size):
125 | def merge_windows(df, idx_new):
126 |     # bins = np.concatenate([np.arange(start, end, new_size), [end]])
127 | 
128 |     Y_merge = np.array([df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].Y_TRUE.sum() \
129 |                         for row in idx_new])
130 |     Yhat_merge = np.array([df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].Y_PRED.sum() \
131 |                            for row in idx_new])
132 |     std_merge = np.array([np.sqrt((df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].STD**2).sum()) \
133 |                           for row in idx_new])
134 | 
135 |     # Y_merge = np.array([df[(df.START >= v1) & (df.START < v2)].Y_TRUE.sum() \
136 |     #                     for v1, v2 in zip(bins[:-1], bins[1:])])
137 |     # Yhat_merge = np.array([df[(df.START >= v1) & (df.START < v2)].Y_PRED.sum() \
138 |     #                        for v1, v2 in zip(bins[:-1], bins[1:])])
139 |     # std_merge = np.array([np.sqrt((df[(df.START >= v1) & (df.START < v2)].STD**2).sum()) \
140 |     #                       for v1, v2 in zip(bins[:-1], bins[1:])])
141 | 
142 |     a_merge = np.hstack([idx_new,
143 |                          Y_merge.reshape(-1, 1),
144 |                          Yhat_merge.reshape(-1, 1),
145 |                          std_merge.reshape(-1, 1)
146 |                          ]
147 |                         )
148 |     # a_merge = np.hstack([bins[:-1].reshape(-1, 1),
149 |     #                      bins[1:].reshape(-1, 1),
150 |     #                      Y_merge.reshape(-1, 1),
151 |     #                      Yhat_merge.reshape(-1, 1),
152 |     #                      std_merge.reshape(-1, 1)
153 |     #                      ]
154 |     #                     )
155 | 
156 |     df_merge = pd.DataFrame(a_merge, columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
157 |     # df_merge = pd.DataFrame(a_merge, columns=['START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
158 |     # df_merge.insert(0, 'CHROM', df.CHROM.iloc[0])
159 | 
160 |     return df_merge
161 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2021, Adam Yaari, Maxwell Sherman, Oliver Priebe, Bonnie Berger
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![conda badge](https://anaconda.org/mutation_density/digdriver/badges/installer/conda.svg)
 2 | 
 3 | # Welcome to Dig
 4 | Dig builds genome-wide maps of somatic mutation rates in cancer genomes and allows any set of candidate mutations to be tested for an excess of observed mutations compared to the number expected based on the neutral mutation rate.
 5 | 
 6 | ## Web-browseable mutation maps
 7 | Want to visually explore somatic mutation rates across the genome? Check out our [genome browser](https://resgen.io/maxsh/Cancer_Mutation_Maps/views) with maps of predicted and observed mutation counts for 37 types of cancer.
 8 | 
 9 | ## Getting started
10 | See our [wiki](https://github.com/maxwellsh/DIGDriver/wiki) for installation instructions and tutorials.
11 | 
12 | ## Data files
13 | All necessary data files are available from our [data portal](http://cb.csail.mit.edu/cb/DIG/downloads/).
14 | 
15 | ## Citation
16 | Want to learn more about Dig and its biological applications? Check out our preprint [Sherman et al. 2021](https://www.biorxiv.org/content/10.1101/2021.08.03.454669v1).
17 | 
18 | Really want to get into the weeds of the deep-learning model? Check out our [ICLR paper](https://openreview.net/forum?id=KtH8W3S_RE).
19 | 
20 | Please cite both papers if you make use of our resources.
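
## Quick example
The scripts in the `examples/` directory download all required files and run Dig end to end. As a minimal sketch, once the pretrained map and annotated mutation file are downloaded, gene-level driver detection on the PCAWG pan-cancer cohort is a single command (this is the command run by `examples/gene_driver.sh`):

```bash
DigDriver.py geneDriver \
    Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
    Pancan_SNV_MNV_INDEL.Pretrained.h5 \
    --outdir . \
    --outpfx Pancan_SNV_MNV_INDEL.genes
```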
21 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
--------------------------------------------------------------------------------
/conda-recipe/meta.yaml:
--------------------------------------------------------------------------------
  1 | package:
  2 |   name: digdriver
  3 |   version: 0.1
  4 | 
  5 | source:
  6 |   # simply where it sits in git
  7 |   #git_url: https://github.com/AdamYaari/mutation_density.git
  8 |   path: ../
  9 | 
 10 | build: # from HPC support guy
 11 |   number: 0
 12 |   preserve_egg_dir: True
 13 |   script: $PYTHON setup.py install --single-version-externally-managed --record=record.txt
 14 | 
 15 | 
 16 | requirements:
 17 |   # My understanding is that requirements needs to include the package
 18 |   # dependencies (aka other packages called in the package I am uploading)
 19 |   #
 20 |   # package can be built by calling conda build . from within the package folder
 21 |   # given that the folder contains a meta.yaml file like this one and the build.sh file
 22 | 
 23 |   # then I can run
 24 |   # conda convert --platform all /opt/anaconda3/conda-bld/osx-64/r-hdatds-0.1.0-r35_0.tar.bz2
 25 |   # to make the package usable across different platforms
 26 |   build:
 27 |     - r-base=3.5
 28 |     - python=3.7.1
 29 |   host:
 30 |     - python=3.7.1
 31 |     - r-base=3.5
 32 |     - pip=21.1.3
 33 |     - setuptools=49.6.0
 34 |     - bedtools=2.30.0
 35 | 
 36 |     - r-seqinr=3.6_1
 37 |     - r-MASS=7.3_51.6
 38 |     - bioconductor-genomicranges=1.34.0
 39 |     - bioconductor-biostrings=2.50.2
 40 |     - bioconductor-iranges=2.16.0
 41 |     - bioconductor-rsamtools=1.34.0
 42 |     - r-poilog=0.4
 43 |     - r-plyr=1.8.6
 44 |     - numpy=1.21.0
 45 |     - scipy=1.5.3
 46 |     - statsmodels=0.12.2
 47 |     - pandas=1.3.0
 48 |     - h5py=3.1.0
 49 |     - pysam=0.15.3
 50 |     - pybedtools=0.8.1
 51 |     - pybbi=0.3.0
 52 |     - seaborn=0.11.1
 53 |     - pytables=3.6.1
 54 | 
 55 |   run:
 56 |     - python=3.7.1
 57 |     - r-base=3.5
 58 |     - pip=21.1.3
 59 |     - setuptools=49.6.0
 60 |     - bedtools=2.30.0
 61 | 
 62 |     - r-seqinr=3.6_1
 63 |     - r-MASS=7.3_51.6
 64 |     - bioconductor-genomicranges=1.34.0
 65 |     - bioconductor-biostrings=2.50.2
 66 |     - bioconductor-iranges=2.16.0
 67 |     - bioconductor-rsamtools=1.34.0
 68 |     - r-poilog=0.4
 69 |     - r-plyr=1.8.6
 70 |     - numpy=1.21.0
 71 |     - scipy=1.5.3
 72 |     - statsmodels=0.12.2
 73 |     - pandas=1.3.0
 74 |     - h5py=3.1.0
 75 |     - pysam=0.15.3
 76 |     - pybedtools=0.8.1
 77 |     - pybbi=0.3.0
 78 |     - seaborn=0.11.1
 79 |     - pytables=3.6.1
 80 | 
 81 | test:
 82 |   commands:
 83 |     # You can put additional test commands to be run here.
 84 | 
 85 |     # You can also put a file called run_test.py, run_test.sh, or run_test.bat
 86 |     # in the recipe that will be run at test time.
 87 | 
 88 |   # requires:
 89 |     # Put any additional test requirements here.
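    # As a hypothetical illustration (not part of the recipe as shipped), a minimal
    # smoke test here could simply check the CLI entry point:
    #   - DigDriver.py --help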
 90 | 
 91 | 
 92 | 
 93 | about:
 94 |   # user-oriented info to be displayed in anaconda.org
 95 |   home: https://github.com/AdamYaari/mutation_density
 96 |   license: BSD-3-Clause
 97 |   summary: Dig is a computational method that leverages transfer-learning to test for positive selection across arbitrary genomic elements in arbitrary cohorts while requiring only the resources of a personal computer
 98 |   Reference: http://cb.csail.mit.edu/cb/DIG/
 99 |   license_family: BSD
100 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | These scripts automatically run examples from https://github.com/maxwellsh/DIGDriver/wiki/05:-Analyzing-existing-annotations.
 2 | 
 3 | They can be executed in any environment with Bash. They require:
 4 | 
 5 | * Dig to be installed.
 6 | * 4-6 GB of free memory.
 7 | 
 8 | Both `noncoding_driver.sh` and `mutation_driver.sh` can analyze different annotations by commenting in the relevant lines in the script.
 9 | 
--------------------------------------------------------------------------------
/examples/gene_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Check that DigDriver.py is in PATH
 7 | [[ $(type -P "DigDriver.py") ]] ||
 8 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
 9 | 
10 | ## Download files as necessary
11 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
12 | 
13 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
14 | 
15 | ## Run DigDriver
16 | echo -e "Running DigDriver.py...\n"
17 | DigDriver.py geneDriver \
18 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
19 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
20 |     --outdir . \
21 |     --outpfx Pancan_SNV_MNV_INDEL.genes
22 | 
--------------------------------------------------------------------------------
/examples/mutation_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Annotation to be analyzed. Comment-in the desired annotation.
 7 | DRIVERS="grch37.spliceAI_CRYPTIC.noncoding.txt.gz"
 8 | NAME="spliceAI_cryptic_noncoding"
 9 | 
10 | # DRIVERS="grch37.spliceAI_CRYPTIC.txt.gz"
11 | # NAME="spliceAI_cryptic_all"
12 | 
13 | # DRIVERS="grch37.spliceAI_CANONICAL.txt.gz"
14 | # NAME="spliceAI_canonical"
15 | 
16 | # DRIVERS="grch37.spliceAI_CRYPTIC.coding.txt.gz"
17 | # NAME="spliceAI_cryptic_coding"
18 | 
19 | ## Check that DigDriver.py is in PATH
20 | [[ $(type -P "DigDriver.py") ]] ||
21 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
22 | 
23 | ## Download files as necessary
24 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
25 | 
26 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
27 | 
28 | [[ ! -f "$DRIVERS" ]] && { echo -e "Downloading $DRIVERS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/annotions/splicing/$DRIVERS"; echo -e "\n"; }
29 | 
30 | ## Run DigDriver
31 | echo -e "Running DigDriver.py...\n"
32 | DigDriver.py elementDriver \
33 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
34 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
35 |     $NAME \
36 |     --f-sites $DRIVERS \
37 |     --outpfx Pancan_SNV_MNV_INDEL.$NAME \
38 |     --outdir .
39 | 
--------------------------------------------------------------------------------
/examples/noncoding_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Annotation to be analyzed. Comment-in the desired annotation.
 7 | REGION="grch37.PCAWG_noncoding.bed"
 8 | NAME="PCAWG_all_elts"
 9 | 
10 | # REGION="grch37.canonical_5utr_with_splice.bed"
11 | # NAME="utr5_w_splice"
12 | 
13 | # REGION="grch37.TP53_5UTR_exon1.bed"
14 | # NAME="TP53_5UTR"
15 | 
16 | ## Check that DigDriver.py is in PATH
17 | [[ $(type -P "DigDriver.py") ]] ||
18 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
19 | 
20 | ## Download files as necessary
21 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
22 | 
23 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
24 | 
25 | [[ ! -f "$REGION" ]] && { echo -e "Downloading $REGION\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/annotions/noncoding/$REGION"; echo -e "\n"; }
26 | 
27 | ## Run DigDriver
28 | echo -e "Running DigDriver.py...\n"
29 | DigDriver.py elementDriver \
30 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
31 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
32 |     $NAME \
33 |     --f-bed $REGION \
34 |     --outpfx Pancan_SNV_MNV_INDEL.$NAME \
35 |     --outdir .
36 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | scipy
 3 | statsmodels
 4 | pandas
 5 | h5py
 6 | pysam
 7 | pybedtools
 8 | pybbi>=0.2.0
 9 | seaborn
10 | tables
11 | 
--------------------------------------------------------------------------------
/scripts/filter_hypermut.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import pkg_resources
 5 | import pathlib
 6 | import os
 7 | import argparse
 8 | 
 9 | from DIGDriver.data_tools import mutation_tools
10 | 
11 | if __name__ == "__main__":
12 | 
13 |     parser = argparse.ArgumentParser(description='Filter hypermutated samples.')
14 |     parser.add_argument('--suffix', default='annot.txt', help='suffix of Dig mutation files to filter')
15 |     parser.add_argument('--max-muts-per-sample', default=3000, type=int, help='Maximum number of coding mutations allowed per sample. Samples with more coding mutations will be filtered.')
16 |     args = parser.parse_args()
17 | 
18 |     if not os.path.isdir("filter_hypermut"):
19 |         os.mkdir("filter_hypermut")
20 | 
21 |     paths = sorted(pathlib.Path('.').glob('*'+args.suffix))
22 | 
23 |     for f in paths:
24 |         df = mutation_tools.read_mutation_file(str(f), drop_duplicates=True)
25 |         df_mut = df[df.GENE != '.']
26 |         _, sample_blacklist = mutation_tools.filter_hypermut_samples(df_mut,
27 |                                                                      max_muts_per_sample=args.max_muts_per_sample,
28 |                                                                      return_blacklist=True
29 |                                                                      )
30 |         df_out = df[~df.SAMPLE.isin(sample_blacklist)]
31 |         print(f.name, df.shape, df_out.shape)
32 |         f_out = os.path.join("filter_hypermut", f.name.split('.annot.txt')[0] + ".no_hypermut.annot.txt")
33 |         df_out.to_csv(f_out, header=False, index=False, sep="\t")
34 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | import os
 3 | import io
 4 | 
 5 | 
 6 | HERE = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | 
 9 | def read(*parts, **kwargs):
10 |     filepath = os.path.join(HERE, *parts)
11 |     encoding = kwargs.pop("encoding", "utf-8")
12 |     with io.open(filepath, encoding=encoding) as fh:
13 |         text = fh.read()
14 |     return text
15 | 
16 | 
17 | def get_requirements(path):
18 |     content = read(path)
19 |     return [req for req in content.split("\n") if req != "" and not req.startswith("#")]
20 | 
21 | 
22 | # setup_requires = ["numpy"]
23 | 
24 | install_requires = get_requirements("requirements.txt")
25 | 
26 | setup(
27 |     name="DIGDriver",
28 |     version="0.2.0",
29 |     description="Flexible cancer driver element detection",
30 |     author="Maxwell Sherman",
31 |     author_email="msherman997@gmail.com",
32 |     url="",
33 |     packages=find_packages(),
34 |     # packages=["DIGDriver"],
35 |     # setup_requires=setup_requires,
36 |     install_requires=install_requires,
37 |     scripts=["scripts/DataExtractor.py",
38 |              "scripts/DigPretrain.py",
39 |              "scripts/DigPreprocess.py",
40 |              "scripts/mutationFunction.R",
41 |              "scripts/DigDriver.py"
42 |              ],
43 |     include_package_data=True,
44 |     package_data={'': ['data/*']},
45 |     # entry_points={"console_scripts": ["clodius = clodius.cli.aggregate:cli"]},
46 | )
47 | 
--------------------------------------------------------------------------------