├── .gitignore ├── DIGDriver ├── __init__.py ├── auxilaries │ ├── __init__.py │ ├── concat_data.py │ ├── unzip_h5.py │ └── utils.py ├── data │ ├── dndscv_gene_cds.bed.gz │ ├── genes.MARTINCORENA.bed │ ├── genes_CGC_ALL.txt │ ├── genes_CGC_ALL.txt.bck │ ├── genes_CGC_ONC.txt │ ├── genes_CGC_TSG.txt │ ├── genes_MSK_230.txt │ ├── genes_MSK_341.txt │ ├── genes_MSK_410.txt │ ├── genes_MSK_468.txt │ ├── genes_metabric_173.txt │ ├── genes_ucla_1202.txt │ └── refcds_hg19.rda ├── data_tools │ ├── DIG_auto.py │ ├── __init__.py │ ├── auto_runner.py │ ├── mappability_tools.py │ ├── mutation_tools.py │ └── track_selector.py ├── driver_model │ ├── __init__.py │ ├── onthefly_tools.py │ └── transfer_tools.py ├── region_model │ ├── .DS_Store │ ├── __init__.py │ ├── autoencoders │ │ ├── AE_vec_predictors.py │ │ ├── ae_nets │ │ │ ├── CNNs.py │ │ │ └── fc_nets.py │ │ └── autoencoder_main.py │ ├── data_aux │ │ ├── dataset_generator.py │ │ └── mut_dataset.py │ ├── feature_vectors │ │ ├── gaussian_process.py │ │ ├── get_feature_vectors.py │ │ └── get_heldout_feature_vectors.py │ ├── kfold_mutations_main.py │ ├── mutations_main.py │ ├── nets │ │ ├── __init__.py │ │ ├── cnn_predictors.py │ │ ├── densenet.py │ │ ├── resnet.py │ │ └── rnn_predictors.py │ ├── perturbations_confidance │ │ ├── __init__.py │ │ ├── confidance_perturbations_estimate.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── config_confidance.json │ │ │ └── config_confidance_kfold.json │ │ └── kfold_test_model_confidance.py │ ├── region_model_tools.py │ ├── train_nn.sh │ └── trainers │ │ ├── __init__.py │ │ ├── gp_trainer.py │ │ └── nn_trainer.py └── sequence_model │ ├── __init__.py │ ├── genic_driver_tools.py │ ├── gp_tools.py │ ├── nb_model.py │ └── sequence_tools.py ├── LICENSE ├── README.md ├── __init__.py ├── conda-recipe └── meta.yaml ├── examples ├── README.md ├── gene_driver.sh ├── mutation_driver.sh └── noncoding_driver.sh ├── requirements.txt ├── scripts ├── DataExtractor.py ├── DigDriver.py ├── DigPreprocess.py ├── DigPretrain.py ├── filter_hypermut.py └── mutationFunction.R └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/models 2 | **/runs 3 | **/__pycache__ 4 | .git 5 | **.ipynb_checkpoints* 6 | **/._* 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /DIGDriver/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | __all__ = ['sequence_model', 'data_tools', 'region_model', 'driver_model', 'auxilaries'] 3 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/concat_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import h5py 5 | import numpy as np 6 | import pickle as pkl 7 | 8 | 9 | 10 | def main(): 11 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 12 | if len(sys.argv) < 2: 13 | print('No input was given. 
Dataset directory must be given.') 14 | else: 15 | dir_path = sys.argv[1] 16 | 17 | idxs_relative_path = '../data_indices' 18 | data_lst = sorted([f for f in os.listdir(dir_path)]) 19 | idx_lst = sorted([f for f in os.listdir(os.path.join(dir_path, idxs_relative_path))]) 20 | data_arr = [] 21 | idx_arr = [] 22 | 23 | print('Loading all data and index files...') 24 | for data_file, idx_file in zip(data_lst, idx_lst): 25 | hf = h5py.File(os.path.join(dir_path, data_file), 'r') 26 | data_arr.append(hf['x_data'][:]) # returns a numpy array as long as the dataset's ID is 'x_data' 27 | with open(os.path.join(dir_path, idxs_relative_path, idx_file), 'rb') as f: 28 | idx_arr.append(pkl.load(f)) 29 | 30 | print('Saving indices file to ./all_indices.pkl...') 31 | with open('all_indices.pkl', 'wb') as f: 32 | pkl.dump(idx_arr, f) 33 | 34 | print('Saving data file to ./all_data.h5...') 35 | h5f = h5py.File('all_data.h5', 'w') 36 | h5f.create_dataset('x_data', data=np.concatenate(data_arr)) 37 | h5f.close() 38 | 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/unzip_h5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import h5py 6 | 7 | if len(sys.argv) <= 1: 8 | raise Exception('Expected at least 1 input argument but found {}'.format(len(sys.argv) - 1)) 9 | 10 | zipped_file_path = sys.argv[1] 11 | 12 | print('Opening zipped h5 file...') 13 | zipped_h5f = h5py.File(zipped_file_path, 'r') 14 | 15 | split_path = zipped_file_path.split('/') 16 | unzipped_file_path = os.path.join('/'.join(split_path[:-1]), 'unzipped_{}'.format(split_path[-1])) 17 | unzipped_h5f = h5py.File(unzipped_file_path, 'w') 18 | 19 | print('Loading unzipped data to {}...'.format(unzipped_file_path)) 20 | for k in zipped_h5f.keys(): 21 | print('Unzipping {}'.format(k)) 22 | unzipped_h5f[k] = zipped_h5f[k][:] 23 | 24 | print('Done!') 25 | -------------------------------------------------------------------------------- /DIGDriver/auxilaries/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | def get_cpus(): 4 | try: 5 | c = min(max(1, mp.cpu_count() - 2), 20) 6 | except: 7 | c = 5 8 | return c 9 | -------------------------------------------------------------------------------- /DIGDriver/data/dndscv_gene_cds.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/data/dndscv_gene_cds.bed.gz -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ALL.txt: -------------------------------------------------------------------------------- 1 | A1CF 2 | ABI1 3 | ABL1 4 | ABL2 5 | ACKR3 6 | ACSL3 7 | ACSL6 8 | ACVR1 9 | ACVR2A 10 | AFDN 11 | AFF1 12 | AFF3 13 | AFF4 14 | AKAP9 15 | AKT1 16 | AKT2 17 | AKT3 18 | ALDH2 19 | ALK 20 | AMER1 21 | ANK1 22 | APC 23 | APOBEC3B 24 | AR 25 | ARAF 26 | ARHGAP26 27 | ARHGAP5 28 | ARHGEF10 29 | ARHGEF10L 30 | ARHGEF12 31 | ARID1A 32 | ARID1B 33 | ARID2 34 | ARNT 35 | ASPSCR1 36 | ASXL1 37 | ASXL2 38 | ATF1 39 | ATIC 40 | ATM 41 | ATP1A1 42 | ATP2B3 43 | ATR 44 | ATRX 45 | AXIN1 46 | AXIN2 47 | B2M 48 | BAP1 49 | BARD1 50 | BAX 51 | BAZ1A 52 | BCL10 53 | BCL11A 54 | BCL11B 55 | BCL2 56 | BCL2L12 57 | BCL3 58 | BCL6 59 | BCL7A 60 | 
BCL9 61 | BCL9L 62 | BCLAF1 63 | BCOR 64 | BCORL1 65 | BCR 66 | BIRC3 67 | BIRC6 68 | BLM 69 | BMP5 70 | BMPR1A 71 | BRAF 72 | BRCA1 73 | BRCA2 74 | BRD3 75 | BRD4 76 | BRIP1 77 | BTG1 78 | BTK 79 | BUB1B 80 | C15orf65 81 | CACNA1D 82 | CALR 83 | CAMTA1 84 | CANT1 85 | CARD11 86 | CARS 87 | CASP3 88 | CASP8 89 | CASP9 90 | CBFA2T3 91 | CBFB 92 | CBL 93 | CBLB 94 | CBLC 95 | CCDC6 96 | CCNB1IP1 97 | CCNC 98 | CCND1 99 | CCND2 100 | CCND3 101 | CCNE1 102 | CCR4 103 | CCR7 104 | CD209 105 | CD274 106 | CD28 107 | CD74 108 | CD79A 109 | CD79B 110 | CDC73 111 | CDH1 112 | CDH10 113 | CDH11 114 | CDH17 115 | CDK12 116 | CDK4 117 | CDK6 118 | CDKN1A 119 | CDKN1B 120 | CDKN2A 121 | CDKN2C 122 | CDX2 123 | CEBPA 124 | CEP89 125 | CHCHD7 126 | CHD2 127 | CHD4 128 | CHEK2 129 | CHIC2 130 | CHST11 131 | CIC 132 | CIITA 133 | CLIP1 134 | CLP1 135 | CLTC 136 | CLTCL1 137 | CNBD1 138 | CNBP 139 | CNOT3 140 | CNTNAP2 141 | CNTRL 142 | COL1A1 143 | COL2A1 144 | COL3A1 145 | COX6C 146 | CPEB3 147 | CREB1 148 | CREB3L1 149 | CREB3L2 150 | CREBBP 151 | CRLF2 152 | CRNKL1 153 | CRTC1 154 | CRTC3 155 | CSF1R 156 | CSF3R 157 | CSMD3 158 | CTCF 159 | CTNNA2 160 | CTNNB1 161 | CTNND1 162 | CTNND2 163 | CUL3 164 | CUX1 165 | CXCR4 166 | CYLD 167 | CYP2C8 168 | CYSLTR2 169 | DAXX 170 | DCAF12L2 171 | DCC 172 | DCTN1 173 | DDB2 174 | DDIT3 175 | DDR2 176 | DDX10 177 | DDX3X 178 | DDX5 179 | DDX6 180 | DEK 181 | DGCR8 182 | DICER1 183 | DNAJB1 184 | DNM2 185 | DNMT3A 186 | DROSHA 187 | DUX4L1 188 | EBF1 189 | ECT2L 190 | EED 191 | EGFR 192 | EIF1AX 193 | EIF3E 194 | EIF4A2 195 | ELF3 196 | ELF4 197 | ELK4 198 | ELL 199 | ELN 200 | EML4 201 | EP300 202 | EPAS1 203 | EPHA3 204 | EPHA7 205 | EPS15 206 | ERBB2 207 | ERBB3 208 | ERBB4 209 | ERC1 210 | ERCC2 211 | ERCC3 212 | ERCC4 213 | ERCC5 214 | ERG 215 | ESR1 216 | ETNK1 217 | ETV1 218 | ETV4 219 | ETV5 220 | ETV6 221 | EWSR1 222 | EXT1 223 | EXT2 224 | EZH2 225 | EZR 226 | FAM131B 227 | FAM135B 228 | FAM47C 229 | FANCA 230 | FANCC 231 | FANCD2 232 | FANCE 233 | FANCF 234 | FANCG 235 | FAS 236 | FAT1 237 | FAT3 238 | FAT4 239 | FBLN2 240 | FBXO11 241 | FBXW7 242 | FCGR2B 243 | FCRL4 244 | FEN1 245 | FES 246 | FEV 247 | FGFR1 248 | FGFR1OP 249 | FGFR2 250 | FGFR3 251 | FGFR4 252 | FH 253 | FHIT 254 | FIP1L1 255 | FKBP9 256 | FLCN 257 | FLI1 258 | FLNA 259 | FLT3 260 | FLT4 261 | FNBP1 262 | FOXA1 263 | FOXL2 264 | FOXO1 265 | FOXO3 266 | FOXO4 267 | FOXP1 268 | FOXR1 269 | FSTL3 270 | FUBP1 271 | FUS 272 | GAS7 273 | GATA1 274 | GATA2 275 | GATA3 276 | GLI1 277 | GMPS 278 | GNA11 279 | GNAQ 280 | GNAS 281 | GOLGA5 282 | GOPC 283 | GPC3 284 | GPC5 285 | GPHN 286 | GRIN2A 287 | GRM3 288 | H3F3A 289 | H3F3B 290 | HERPUD1 291 | HEY1 292 | HIF1A 293 | HIP1 294 | HIST1H3B 295 | HIST1H4I 296 | HLA-A 297 | HLF 298 | HMGA1 299 | HMGA2 300 | HMGN2P46 301 | HNF1A 302 | HNRNPA2B1 303 | HOOK3 304 | HOXA11 305 | HOXA13 306 | HOXA9 307 | HOXC11 308 | HOXC13 309 | HOXD11 310 | HOXD13 311 | HRAS 312 | HSP90AA1 313 | HSP90AB1 314 | ID3 315 | IDH1 316 | IDH2 317 | IGF2BP2 318 | IGH 319 | IGK 320 | IGL 321 | IKBKB 322 | IKZF1 323 | IL2 324 | IL21R 325 | IL6ST 326 | IL7R 327 | IRF4 328 | IRS4 329 | ISX 330 | ITGAV 331 | ITK 332 | JAK1 333 | JAK2 334 | JAK3 335 | JAZF1 336 | JUN 337 | KAT6A 338 | KAT6B 339 | KAT7 340 | KCNJ5 341 | KDM5A 342 | KDM5C 343 | KDM6A 344 | KDR 345 | KDSR 346 | KEAP1 347 | KIAA1549 348 | KIF5B 349 | KIT 350 | KLF4 351 | KLF6 352 | KLK2 353 | KMT2A 354 | KMT2C 355 | KMT2D 356 | KNL1 357 | KNSTRN 358 | KRAS 359 | KTN1 360 | LARP4B 361 | LASP1 362 | LATS1 363 | LATS2 
364 | LCK 365 | LCP1 366 | LEF1 367 | LEPROTL1 368 | LHFPL6 369 | LIFR 370 | LMNA 371 | LMO1 372 | LMO2 373 | LPP 374 | LRIG3 375 | LRP1B 376 | LSM14A 377 | LYL1 378 | LZTR1 379 | MACC1 380 | MAF 381 | MAFB 382 | MALAT1 383 | MALT1 384 | MAML2 385 | MAP2K1 386 | MAP2K2 387 | MAP2K4 388 | MAP3K1 389 | MAP3K13 390 | MAPK1 391 | MAX 392 | MB21D2 393 | MDM2 394 | MDM4 395 | MDS2 396 | MECOM 397 | MED12 398 | MEN1 399 | MET 400 | MGMT 401 | MITF 402 | MLF1 403 | MLH1 404 | MLLT1 405 | MLLT10 406 | MLLT11 407 | MLLT3 408 | MLLT6 409 | MN1 410 | MNX1 411 | MPL 412 | MRTFA 413 | MSH2 414 | MSH6 415 | MSI2 416 | MSN 417 | MTCP1 418 | MTOR 419 | MUC1 420 | MUC16 421 | MUC4 422 | MUTYH 423 | MYB 424 | MYC 425 | MYCL 426 | MYCN 427 | MYD88 428 | MYH11 429 | MYH9 430 | MYO5A 431 | MYOD1 432 | N4BP2 433 | NAB2 434 | NACA 435 | NBEA 436 | NBN 437 | NCKIPSD 438 | NCOA1 439 | NCOA2 440 | NCOA4 441 | NCOR1 442 | NCOR2 443 | NDRG1 444 | NF1 445 | NF2 446 | NFATC2 447 | NFE2L2 448 | NFIB 449 | NFKB2 450 | NFKBIE 451 | NIN 452 | NKX2-1 453 | NONO 454 | NOTCH1 455 | NOTCH2 456 | NPM1 457 | NR4A3 458 | NRAS 459 | NRG1 460 | NSD1 461 | NSD2 462 | NSD3 463 | NT5C2 464 | NTHL1 465 | NTRK1 466 | NTRK3 467 | NUMA1 468 | NUP214 469 | NUP98 470 | NUTM1 471 | NUTM2B 472 | NUTM2D 473 | OLIG2 474 | OMD 475 | P2RY8 476 | PABPC1 477 | PAFAH1B2 478 | PALB2 479 | PATZ1 480 | PAX3 481 | PAX5 482 | PAX7 483 | PAX8 484 | PBRM1 485 | PBX1 486 | PCBP1 487 | PCM1 488 | PDCD1LG2 489 | PDE4DIP 490 | PDGFB 491 | PDGFRA 492 | PDGFRB 493 | PER1 494 | PHF6 495 | PHOX2B 496 | PICALM 497 | PIK3CA 498 | PIK3CB 499 | PIK3R1 500 | PIM1 501 | PLAG1 502 | PLCG1 503 | PML 504 | PMS1 505 | PMS2 506 | POLD1 507 | POLE 508 | POLG 509 | POLQ 510 | POT1 511 | POU2AF1 512 | POU5F1 513 | PPARG 514 | PPFIBP1 515 | PPM1D 516 | PPP2R1A 517 | PPP6C 518 | PRCC 519 | PRDM1 520 | PRDM16 521 | PRDM2 522 | PREX2 523 | PRF1 524 | PRKACA 525 | PRKAR1A 526 | PRKCB 527 | PRPF40B 528 | PRRX1 529 | PSIP1 530 | PTCH1 531 | PTEN 532 | PTK6 533 | PTPN11 534 | PTPN13 535 | PTPN6 536 | PTPRB 537 | PTPRC 538 | PTPRD 539 | PTPRK 540 | PTPRT 541 | PWWP2A 542 | QKI 543 | RABEP1 544 | RAC1 545 | RAD17 546 | RAD21 547 | RAD51B 548 | RAF1 549 | RALGDS 550 | RANBP2 551 | RAP1GDS1 552 | RARA 553 | RB1 554 | RBM10 555 | RBM15 556 | RECQL4 557 | REL 558 | RET 559 | RFWD3 560 | RGPD3 561 | RGS7 562 | RHOA 563 | RHOH 564 | RMI2 565 | RNF213 566 | RNF43 567 | ROBO2 568 | ROS1 569 | RPL10 570 | RPL22 571 | RPL5 572 | RPN1 573 | RSPO2 574 | RSPO3 575 | RUNX1 576 | RUNX1T1 577 | S100A7 578 | SALL4 579 | SBDS 580 | SDC4 581 | SDHA 582 | SDHAF2 583 | SDHB 584 | SDHC 585 | SDHD 586 | SEPT5 587 | SEPT6 588 | SEPT9 589 | SET 590 | SETBP1 591 | SETD1B 592 | SETD2 593 | SETDB1 594 | SF3B1 595 | SFPQ 596 | SFRP4 597 | SGK1 598 | SH2B3 599 | SH3GL1 600 | SHTN1 601 | SIRPA 602 | SIX1 603 | SIX2 604 | SKI 605 | SLC34A2 606 | SLC45A3 607 | SMAD2 608 | SMAD3 609 | SMAD4 610 | SMARCA4 611 | SMARCB1 612 | SMARCD1 613 | SMARCE1 614 | SMC1A 615 | SMO 616 | SND1 617 | SNX29 618 | SOCS1 619 | SOX2 620 | SOX21 621 | SPECC1 622 | SPEN 623 | SPOP 624 | SRC 625 | SRGAP3 626 | SRSF2 627 | SRSF3 628 | SS18 629 | SS18L1 630 | SSX1 631 | SSX2 632 | SSX4 633 | STAG1 634 | STAG2 635 | STAT3 636 | STAT5B 637 | STAT6 638 | STIL 639 | STK11 640 | STRN 641 | SUFU 642 | SUZ12 643 | SYK 644 | TAF15 645 | TAL1 646 | TAL2 647 | TBL1XR1 648 | TBX3 649 | TCEA1 650 | TCF12 651 | TCF3 652 | TCF7L2 653 | TCL1A 654 | TEC 655 | TENT5C 656 | TERT 657 | TET1 658 | TET2 659 | TFE3 660 | TFEB 
661 | TFG 662 | TFPT 663 | TFRC 664 | TGFBR2 665 | THRAP3 666 | TLX1 667 | TLX3 668 | TMEM127 669 | TMPRSS2 670 | TNC 671 | TNFAIP3 672 | TNFRSF14 673 | TNFRSF17 674 | TOP1 675 | TP53 676 | TP63 677 | TPM3 678 | TPM4 679 | TPR 680 | TRA 681 | TRAF7 682 | TRB 683 | TRD 684 | TRIM24 685 | TRIM27 686 | TRIM33 687 | TRIP11 688 | TRRAP 689 | TSC1 690 | TSC2 691 | TSHR 692 | U2AF1 693 | UBR5 694 | USP44 695 | USP6 696 | USP8 697 | VAV1 698 | VHL 699 | VTI1A 700 | WAS 701 | WDCP 702 | WIF1 703 | WNK2 704 | WRN 705 | WT1 706 | WWTR1 707 | XPA 708 | XPC 709 | XPO1 710 | YWHAE 711 | ZBTB16 712 | ZCCHC8 713 | ZEB1 714 | ZFHX3 715 | ZMYM2 716 | ZMYM3 717 | ZNF331 718 | ZNF384 719 | ZNF429 720 | ZNF479 721 | ZNF521 722 | ZNRF3 723 | ZRSR2 724 | CDKN2A.p14arf 725 | CDKN2A.p16INK4a 726 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ALL.txt.bck: -------------------------------------------------------------------------------- 1 | A1CF 2 | ABI1 3 | ABL1 4 | ABL2 5 | ACKR3 6 | ACSL3 7 | ACSL6 8 | ACVR1 9 | ACVR2A 10 | AFF1 11 | AFF3 12 | AFF4 13 | AKAP9 14 | AKT1 15 | AKT2 16 | AKT3 17 | ALDH2 18 | ALK 19 | ANK1 20 | APC 21 | APOBEC3B 22 | ARHGAP26 23 | ARHGAP5 24 | ARHGEF10 25 | ARHGEF10L 26 | ARHGEF12 27 | ARID1A 28 | ARID1B 29 | ARID2 30 | ARNT 31 | ASPSCR1 32 | ASXL1 33 | ASXL2 34 | ATF1 35 | ATIC 36 | ATM 37 | ATP1A1 38 | ATR 39 | AXIN1 40 | AXIN2 41 | B2M 42 | BAP1 43 | BARD1 44 | BAX 45 | BAZ1A 46 | BCL10 47 | BCL11A 48 | BCL11B 49 | BCL2 50 | BCL2L12 51 | BCL3 52 | BCL6 53 | BCL7A 54 | BCL9 55 | BCL9L 56 | BCLAF1 57 | BCR 58 | BIRC3 59 | BIRC6 60 | BLM 61 | BMP5 62 | BMPR1A 63 | BRAF 64 | BRCA1 65 | BRCA2 66 | BRD3 67 | BRD4 68 | BRIP1 69 | BTG1 70 | BUB1B 71 | C15orf65 72 | CACNA1D 73 | CALR 74 | CAMTA1 75 | CANT1 76 | CARD11 77 | CARS 78 | CASP3 79 | CASP8 80 | CASP9 81 | CBFA2T3 82 | CBFB 83 | CBL 84 | CBLB 85 | CBLC 86 | CCDC6 87 | CCNB1IP1 88 | CCNC 89 | CCND1 90 | CCND2 91 | CCND3 92 | CCNE1 93 | CCR4 94 | CCR7 95 | CD209 96 | CD274 97 | CD28 98 | CD74 99 | CD79A 100 | CD79B 101 | CDC73 102 | CDH1 103 | CDH10 104 | CDH11 105 | CDH17 106 | CDK12 107 | CDK4 108 | CDK6 109 | CDKN1A 110 | CDKN1B 111 | CDKN2C 112 | CDX2 113 | CEBPA 114 | CEP89 115 | CHCHD7 116 | CHD2 117 | CHD4 118 | CHEK2 119 | CHIC2 120 | CHST11 121 | CIC 122 | CIITA 123 | CLIP1 124 | CLP1 125 | CLTC 126 | CLTCL1 127 | CNBD1 128 | CNBP 129 | CNOT3 130 | CNTNAP2 131 | CNTRL 132 | COL1A1 133 | COL2A1 134 | COL3A1 135 | COX6C 136 | CPEB3 137 | CREB1 138 | CREB3L1 139 | CREB3L2 140 | CREBBP 141 | CRNKL1 142 | CRTC1 143 | CRTC3 144 | CSF1R 145 | CSF3R 146 | CSMD3 147 | CTCF 148 | CTNNA2 149 | CTNNB1 150 | CTNND1 151 | CTNND2 152 | CUL3 153 | CUX1 154 | CXCR4 155 | CYLD 156 | CYP2C8 157 | CYSLTR2 158 | DAXX 159 | DCC 160 | DCTN1 161 | DDB2 162 | DDIT3 163 | DDR2 164 | DDX10 165 | DDX5 166 | DDX6 167 | DEK 168 | DGCR8 169 | DICER1 170 | DNAJB1 171 | DNM2 172 | DNMT3A 173 | DROSHA 174 | EBF1 175 | ECT2L 176 | EED 177 | EGFR 178 | EIF3E 179 | EIF4A2 180 | ELF3 181 | ELK4 182 | ELL 183 | ELN 184 | EML4 185 | EP300 186 | EPAS1 187 | EPHA3 188 | EPHA7 189 | EPS15 190 | ERBB2 191 | ERBB3 192 | ERBB4 193 | ERC1 194 | ERCC2 195 | ERCC3 196 | ERCC4 197 | ERCC5 198 | ERG 199 | ESR1 200 | ETNK1 201 | ETV1 202 | ETV4 203 | ETV5 204 | ETV6 205 | EWSR1 206 | EXT1 207 | EXT2 208 | EZH2 209 | EZR 210 | FAM131B 211 | FAM135B 212 | FANCA 213 | FANCC 214 | FANCD2 215 | FANCE 216 | FANCF 217 | FANCG 218 | FAS 219 | FAT1 220 | FAT3 221 | FAT4 222 | FBLN2 223 | FBXO11 224 | FBXW7 225 | FCGR2B 226 | 
FCRL4 227 | FEN1 228 | FES 229 | FEV 230 | FGFR1 231 | FGFR1OP 232 | FGFR2 233 | FGFR3 234 | FGFR4 235 | FH 236 | FHIT 237 | FIP1L1 238 | FKBP9 239 | FLCN 240 | FLI1 241 | FLT3 242 | FLT4 243 | FNBP1 244 | FOXA1 245 | FOXL2 246 | FOXO1 247 | FOXO3 248 | FOXP1 249 | FOXR1 250 | FSTL3 251 | FUBP1 252 | FUS 253 | GAS7 254 | GATA2 255 | GATA3 256 | GLI1 257 | GMPS 258 | GNA11 259 | GNAQ 260 | GNAS 261 | GOLGA5 262 | GOPC 263 | GPC5 264 | GPHN 265 | GRIN2A 266 | GRM3 267 | H3F3A 268 | H3F3B 269 | HERPUD1 270 | HEY1 271 | HIF1A 272 | HIP1 273 | HIST1H3B 274 | HIST1H4I 275 | HLA-A 276 | HLF 277 | HMGA1 278 | HMGA2 279 | HNF1A 280 | HNRNPA2B1 281 | HOOK3 282 | HOXA11 283 | HOXA13 284 | HOXA9 285 | HOXC11 286 | HOXC13 287 | HOXD11 288 | HOXD13 289 | HRAS 290 | HSP90AA1 291 | HSP90AB1 292 | ID3 293 | IDH1 294 | IDH2 295 | IGF2BP2 296 | IKBKB 297 | IKZF1 298 | IL2 299 | IL21R 300 | IL6ST 301 | IL7R 302 | IRF4 303 | ISX 304 | ITGAV 305 | ITK 306 | JAK1 307 | JAK2 308 | JAK3 309 | JAZF1 310 | JUN 311 | KAT6A 312 | KAT6B 313 | KAT7 314 | KCNJ5 315 | KDM5A 316 | KDR 317 | KDSR 318 | KEAP1 319 | KIAA1549 320 | KIF5B 321 | KIT 322 | KLF4 323 | KLF6 324 | KLK2 325 | KMT2A 326 | KMT2C 327 | KMT2D 328 | KNSTRN 329 | KRAS 330 | KTN1 331 | LARP4B 332 | LASP1 333 | LATS1 334 | LATS2 335 | LCK 336 | LCP1 337 | LEF1 338 | LEPROTL1 339 | LIFR 340 | LMNA 341 | LMO1 342 | LMO2 343 | LPP 344 | LRIG3 345 | LRP1B 346 | LSM14A 347 | LYL1 348 | LZTR1 349 | MACC1 350 | MAF 351 | MAFB 352 | MALT1 353 | MAML2 354 | MAP2K1 355 | MAP2K2 356 | MAP2K4 357 | MAP3K1 358 | MAP3K13 359 | MAPK1 360 | MAX 361 | MB21D2 362 | MDM2 363 | MDM4 364 | MDS2 365 | MECOM 366 | MEN1 367 | MET 368 | MGMT 369 | MITF 370 | MLF1 371 | MLH1 372 | MLLT1 373 | MLLT10 374 | MLLT11 375 | MLLT3 376 | MLLT6 377 | MN1 378 | MNX1 379 | MPL 380 | MSH2 381 | MSH6 382 | MSI2 383 | MTOR 384 | MUC1 385 | MUC16 386 | MUC4 387 | MUTYH 388 | MYB 389 | MYC 390 | MYCL 391 | MYCN 392 | MYD88 393 | MYH11 394 | MYH9 395 | MYO5A 396 | MYOD1 397 | N4BP2 398 | NAB2 399 | NACA 400 | NBEA 401 | NBN 402 | NCKIPSD 403 | NCOA1 404 | NCOA2 405 | NCOA4 406 | NCOR1 407 | NCOR2 408 | NDRG1 409 | NF1 410 | NF2 411 | NFATC2 412 | NFE2L2 413 | NFIB 414 | NFKB2 415 | NFKBIE 416 | NIN 417 | NKX2-1 418 | NOTCH1 419 | NOTCH2 420 | NPM1 421 | NR4A3 422 | NRAS 423 | NRG1 424 | NSD1 425 | NT5C2 426 | NTHL1 427 | NTRK1 428 | NTRK3 429 | NUMA1 430 | NUP214 431 | NUP98 432 | NUTM1 433 | NUTM2B 434 | NUTM2D 435 | OLIG2 436 | OMD 437 | PABPC1 438 | PAFAH1B2 439 | PALB2 440 | PATZ1 441 | PAX3 442 | PAX5 443 | PAX7 444 | PAX8 445 | PBRM1 446 | PBX1 447 | PCBP1 448 | PCM1 449 | PDCD1LG2 450 | PDE4DIP 451 | PDGFB 452 | PDGFRA 453 | PDGFRB 454 | PER1 455 | PHOX2B 456 | PICALM 457 | PIK3CA 458 | PIK3CB 459 | PIK3R1 460 | PIM1 461 | PLAG1 462 | PLCG1 463 | PML 464 | PMS1 465 | PMS2 466 | POLD1 467 | POLE 468 | POLG 469 | POLQ 470 | POT1 471 | POU2AF1 472 | POU5F1 473 | PPARG 474 | PPFIBP1 475 | PPM1D 476 | PPP2R1A 477 | PPP6C 478 | PRCC 479 | PRDM1 480 | PRDM16 481 | PRDM2 482 | PREX2 483 | PRF1 484 | PRKACA 485 | PRKAR1A 486 | PRKCB 487 | PRPF40B 488 | PRRX1 489 | PSIP1 490 | PTCH1 491 | PTEN 492 | PTK6 493 | PTPN11 494 | PTPN13 495 | PTPN6 496 | PTPRB 497 | PTPRC 498 | PTPRD 499 | PTPRK 500 | PTPRT 501 | PWWP2A 502 | QKI 503 | RABEP1 504 | RAC1 505 | RAD17 506 | RAD21 507 | RAD51B 508 | RAF1 509 | RALGDS 510 | RANBP2 511 | RAP1GDS1 512 | RARA 513 | RB1 514 | RBM15 515 | RECQL4 516 | REL 517 | RET 518 | RFWD3 519 | RGPD3 520 | RGS7 521 | RHOA 522 | RHOH 523 | RMI2 524 | RNF213 525 | RNF43 526 | ROBO2 
527 | ROS1 528 | RPL22 529 | RPL5 530 | RPN1 531 | RSPO2 532 | RSPO3 533 | RUNX1 534 | RUNX1T1 535 | S100A7 536 | SALL4 537 | SBDS 538 | SDC4 539 | SDHA 540 | SDHAF2 541 | SDHB 542 | SDHC 543 | SDHD 544 | SET 545 | SETBP1 546 | SETD1B 547 | SETD2 548 | SETDB1 549 | SF3B1 550 | SFPQ 551 | SFRP4 552 | SGK1 553 | SH2B3 554 | SH3GL1 555 | SIRPA 556 | SIX1 557 | SIX2 558 | SKI 559 | SLC34A2 560 | SLC45A3 561 | SMAD2 562 | SMAD3 563 | SMAD4 564 | SMARCA4 565 | SMARCB1 566 | SMARCD1 567 | SMARCE1 568 | SMO 569 | SND1 570 | SNX29 571 | SOCS1 572 | SOX2 573 | SOX21 574 | SPECC1 575 | SPEN 576 | SPOP 577 | SRC 578 | SRGAP3 579 | SRSF2 580 | SRSF3 581 | SS18 582 | SS18L1 583 | STAG1 584 | STAT3 585 | STAT5B 586 | STAT6 587 | STIL 588 | STK11 589 | STRN 590 | SUFU 591 | SUZ12 592 | SYK 593 | TAF15 594 | TAL1 595 | TAL2 596 | TBL1XR1 597 | TBX3 598 | TCEA1 599 | TCF12 600 | TCF3 601 | TCF7L2 602 | TCL1A 603 | TEC 604 | TERT 605 | TET1 606 | TET2 607 | TFEB 608 | TFG 609 | TFPT 610 | TFRC 611 | TGFBR2 612 | THRAP3 613 | TLX1 614 | TLX3 615 | TMEM127 616 | TMPRSS2 617 | TNC 618 | TNFAIP3 619 | TNFRSF14 620 | TNFRSF17 621 | TOP1 622 | TP53 623 | TP63 624 | TPM3 625 | TPM4 626 | TPR 627 | TRAF7 628 | TRIM24 629 | TRIM27 630 | TRIM33 631 | TRIP11 632 | TRRAP 633 | TSC1 634 | TSC2 635 | TSHR 636 | U2AF1 637 | UBR5 638 | USP44 639 | USP6 640 | USP8 641 | VAV1 642 | VHL 643 | VTI1A 644 | WIF1 645 | WNK2 646 | WRN 647 | WT1 648 | WWTR1 649 | XPA 650 | XPC 651 | XPO1 652 | YWHAE 653 | ZBTB16 654 | ZCCHC8 655 | ZEB1 656 | ZFHX3 657 | ZMYM2 658 | ZNF331 659 | ZNF384 660 | ZNF429 661 | ZNF479 662 | ZNF521 663 | ZNRF3 664 | CDKN2A.p14arf 665 | CDKN2A.p16INK4a 666 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_ONC.txt: -------------------------------------------------------------------------------- 1 | RARA 2 | STAT6 3 | PTPRS 4 | SHQ1 5 | PDGFRA 6 | MPL 7 | JAK3 8 | ABL1 9 | AFF3 10 | CDK4 11 | MLLT10 12 | NTRK2 13 | FEV 14 | PARK2 15 | BCL9 16 | KIT 17 | PIK3C2G 18 | MAF 19 | CBLC 20 | GATA3 21 | KEAP1 22 | TNFAIP3 23 | TSC2 24 | MTOR 25 | NT5C2 26 | CDC73 27 | NOTCH1 28 | DEK 29 | DNMT1 30 | DIS3 31 | ZNF521 32 | SND1 33 | MALT1 34 | HNRNPA2B1 35 | KCNJ5 36 | CHEK2 37 | ABL2 38 | MDM4 39 | HOXA13 40 | SUFU 41 | CDC42EP2 42 | H3F3A 43 | TBK1 44 | ATM 45 | MSH6 46 | PREX2 47 | ERBB2 48 | NF2 49 | HIP1 50 | DDX5 51 | CSF1R 52 | ALK 53 | PIK3R2 54 | NFE2L2 55 | CDKN2C 56 | NF1 57 | KRAS 58 | MAML2 59 | NKX2-1 60 | FOXA1 61 | FLT3 62 | MCL1 63 | NUP98 64 | JUN 65 | MAP2K1 66 | SSX4 67 | LPP 68 | POU2AF1 69 | BRCA1 70 | NOTCH4 71 | USP6 72 | TEK 73 | SETBP1 74 | TNFRSF17 75 | FLCN 76 | IRS1 77 | NOTCH3 78 | MYD88 79 | LMO1 80 | CHEK1 81 | EPHB4 82 | SOX2 83 | TLX1 84 | EWSR1 85 | FLT4 86 | SSX1 87 | UBR5 88 | KMT2C 89 | ERBB4 90 | CCND1 91 | ETV6 92 | ATF1 93 | TRRAP 94 | CDH1 95 | PIK3R3 96 | PTEN 97 | MAP3K8 98 | NFATC2 99 | PARP1 100 | ALOX12B 101 | CDK8 102 | CRKL 103 | FGFR2 104 | BCL11A 105 | CRTC1 106 | RAC1 107 | PIK3CB 108 | CBL 109 | H3F3B 110 | IDH1 111 | EZH2 112 | ERBB3 113 | SRSF2 114 | RET 115 | SF3B1 116 | XPO1 117 | CYLD 118 | EPHB1 119 | SET 120 | PDGFB 121 | SMO 122 | NFKB1 123 | IL6ST 124 | TGFBR2 125 | TSHR 126 | SRC 127 | TCF7L2 128 | PBX1 129 | HIF1A 130 | TFE3 131 | ARID1A 132 | BCL6 133 | MYC 134 | MLST8 135 | CEBPA 136 | TSC1 137 | CHD4 138 | DDIT3 139 | GNAS 140 | FLT1 141 | FCGR2B 142 | PSIP1 143 | CCND2 144 | PDGFRB 145 | CBLB 146 | DNMT3A 147 | LDHA 148 | ERG 149 | SMARCA4 150 | TMPRSS2 151 | YES1 152 | PTPRD 153 | PLCG1 154 | 
WT1 155 | CDKN2A.p16INK4a 156 | AKT1 157 | MSI2 158 | REL 159 | PPM1D 160 | LYL1 161 | POU5F1 162 | WAS 163 | BRD3 164 | KAT6A 165 | SIX1 166 | WWTR1 167 | HSP90AA1 168 | FAS 169 | IGF1R 170 | OLIG2 171 | IKZF1 172 | MYCN 173 | TP53 174 | CDK6 175 | PRKCI 176 | LMO2 177 | NSD3 178 | CDKN2B 179 | IKBKB 180 | VHL 181 | PTPN11 182 | ACKR3 183 | FAM46C 184 | PNRC1 185 | BIRC2 186 | AR 187 | HOXC13 188 | CXCR4 189 | BAP1 190 | HOXD13 191 | SMARCB1 192 | NTRK3 193 | MAGI2 194 | ELK4 195 | EP300 196 | GNA11 197 | TRIM27 198 | HRAS 199 | FSTL3 200 | BCL2 201 | ETV4 202 | KDR 203 | CSF3R 204 | CCNE1 205 | HNF1A 206 | KLF6 207 | PAK7 208 | JAK2 209 | NSD2 210 | CREB3L2 211 | RSPO3 212 | IRS2 213 | USP8 214 | EPHA5 215 | STAT3 216 | YAP1 217 | TFEB 218 | SDHB 219 | NTRK1 220 | TLX3 221 | HOXC11 222 | AFDN 223 | RAF1 224 | FBXW7 225 | MAP2K2 226 | PLAG1 227 | SSX2 228 | SYK 229 | PIK3CD 230 | TET2 231 | SALL4 232 | NOTCH2 233 | FGFR4 234 | PRKACA 235 | ROS1 236 | PTCH1 237 | HLF 238 | GOLPH3 239 | SMAD4 240 | EPHA3 241 | TOP1 242 | ETV5 243 | CD79B 244 | NR4A3 245 | SPOP 246 | AKT2 247 | CD79A 248 | FGFR1 249 | CACNA1D 250 | AURKA 251 | SRSF3 252 | NCOA2 253 | TERT 254 | EPHA6 255 | MLH1 256 | RPTOR 257 | TAF15 258 | AKT3 259 | PIK3CG 260 | PIK3R1 261 | PAX5 262 | ETV1 263 | DNMT3B 264 | TP63 265 | IL7R 266 | CDKN2A.p14arf 267 | PRDM16 268 | RB1 269 | DICER1 270 | PBRM1 271 | RUNX1 272 | PIK3CA 273 | CREB1 274 | SOCS1 275 | EPHA7 276 | CTNNB1 277 | FOXL2 278 | FCRL4 279 | KMT2D 280 | FH 281 | HEY1 282 | LCK 283 | EPHB6 284 | P2RY8 285 | DDX6 286 | SH3GL1 287 | NPM1 288 | PAX3 289 | GRIN2A 290 | MITF 291 | LGR6 292 | TCL1A 293 | SETD2 294 | CRLF2 295 | JAK1 296 | CREBBP 297 | TET1 298 | TAL1 299 | GNAQ 300 | BRAF 301 | BCL3 302 | RICTOR 303 | IGFBP7 304 | GATA2 305 | STK11 306 | TAL2 307 | FLI1 308 | HOXD11 309 | EGFR 310 | MSH2 311 | IDH2 312 | RAP1GDS1 313 | AFF4 314 | EIF4EBP1 315 | STIL 316 | NFKB2 317 | HDAC2 318 | MECOM 319 | FGFR3 320 | MAP2K4 321 | ASXL1 322 | CARD11 323 | MET 324 | MAPK1 325 | HIST1H3B 326 | HMGA1 327 | BRCA2 328 | KDM5A 329 | EPHA8 330 | CCND3 331 | PHOX2B 332 | MYCL 333 | FOXP1 334 | ACVR1 335 | DDR2 336 | IKBKE 337 | BCL2L1 338 | MTCP1 339 | KMT2A 340 | MYOD1 341 | PIM1 342 | HMGA2 343 | NRAS 344 | FUBP1 345 | MDM2 346 | CD74 347 | GSK3B 348 | U2AF1 349 | PDCD1LG2 350 | CALR 351 | PRKAR1A 352 | NUTM1 353 | MEN1 354 | MYB 355 | DAXX 356 | BRD4 357 | PLK2 358 | MN1 359 | INSR 360 | MAFB 361 | ARHGAP26 362 | ESR1 363 | APC 364 | CDKN2A.p14arf 365 | CDKN2A.p16INK4a 366 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_CGC_TSG.txt: -------------------------------------------------------------------------------- 1 | ABI1 2 | ACVR2A 3 | AMER1 4 | APC 5 | ARHGAP26 6 | ARHGEF12 7 | ARID1A 8 | ARID1B 9 | ARID2 10 | ASXL1 11 | ATM 12 | ATP2B3 13 | ATR 14 | ATRX 15 | AXIN1 16 | AXIN2 17 | B2M 18 | BAP1 19 | BARD1 20 | BAX 21 | BCL10 22 | BCOR 23 | BLM 24 | BRCA1 25 | BRCA2 26 | BRIP1 27 | BTG1 28 | BUB1B 29 | CAMTA1 30 | CARS 31 | CASP8 32 | CBFA2T3 33 | CBFB 34 | CBLB 35 | CCDC6 36 | CCNB1IP1 37 | CD274 38 | CDC73 39 | CDH1 40 | CDH11 41 | CDK12 42 | CDKN1B 43 | CDKN2A 44 | CDKN2C 45 | CDX2 46 | CEBPA 47 | CHEK2 48 | CIITA 49 | CLTC 50 | CLTCL1 51 | CNBP 52 | CNOT3 53 | CREB3L1 54 | CTCF 55 | CYLD 56 | DDX10 57 | DDX3X 58 | DICER1 59 | DNM2 60 | DNMT3A 61 | DROSHA 62 | EBF1 63 | EIF3E 64 | ELL 65 | EP300 66 | EPS15 67 | ERCC2 68 | ERCC3 69 | ERCC4 70 | ERCC5 71 | ETNK1 72 | ETV6 73 | EXT1 74 | EXT2 75 | FANCA 76 | FANCC 77 | 
FANCD2 78 | FANCE 79 | FANCF 80 | FANCG 81 | FAS 82 | FAT1 83 | FAT4 84 | FBXO11 85 | FBXW7 86 | FH 87 | FHIT 88 | FLCN 89 | FUS 90 | GRIN2A 91 | HNF1A 92 | IKZF1 93 | KAT6B 94 | KDM5C 95 | KEAP1 96 | KLF6 97 | KMT2C 98 | KNL1 99 | LATS1 100 | LATS2 101 | LRIG3 102 | LRP1B 103 | LZTR1 104 | MAX 105 | MED12 106 | MEN1 107 | MLF1 108 | MLH1 109 | MSH2 110 | MSH6 111 | MUTYH 112 | MYH9 113 | NAB2 114 | NBN 115 | NCOA4 116 | NCOR1 117 | NCOR2 118 | NDRG1 119 | NF1 120 | NF2 121 | NFKBIE 122 | NRG1 123 | PALB2 124 | PATZ1 125 | PBRM1 126 | PER1 127 | PHF6 128 | PHOX2B 129 | PIK3R1 130 | PML 131 | PMS2 132 | POLD1 133 | POLE 134 | POT1 135 | PPARG 136 | PPP2R1A 137 | PPP6C 138 | PRDM1 139 | PRF1 140 | PTCH1 141 | PTEN 142 | PTPN13 143 | PTPRB 144 | PTPRC 145 | PTPRK 146 | PTPRT 147 | RAD51B 148 | RANBP2 149 | RB1 150 | RBM10 151 | RHOH 152 | RMI2 153 | RNF43 154 | RPL10 155 | RPL22 156 | RPL5 157 | RSPO2 158 | SBDS 159 | SDHA 160 | SDHAF2 161 | SDHB 162 | SDHC 163 | SDHD 164 | SETD2 165 | SFPQ 166 | SFRP4 167 | SH2B3 168 | SLC34A2 169 | SMAD2 170 | SMAD3 171 | SMAD4 172 | SMARCA4 173 | SMARCB1 174 | SMARCD1 175 | SMARCE1 176 | SOCS1 177 | SPEN 178 | SPOP 179 | STAG2 180 | STK11 181 | SUFU 182 | TENT5C 183 | TET2 184 | TGFBR2 185 | TMEM127 186 | TNFAIP3 187 | TNFRSF14 188 | TPM3 189 | TRAF7 190 | TRIM33 191 | TSC1 192 | TSC2 193 | VHL 194 | WIF1 195 | WRN 196 | XPA 197 | XPC 198 | YWHAE 199 | ZBTB16 200 | ZFHX3 201 | ZNF331 202 | ZRSR2 203 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_230.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ABL2 3 | AKT1 4 | AKT2 5 | AKT3 6 | ALK 7 | ALOX12B 8 | APC 9 | AR 10 | ARAF 11 | ARHGAP26 12 | ARID1A 13 | ASXL1 14 | ATM 15 | ATRX 16 | AURKA 17 | BAP1 18 | BCL2L1 19 | BCL6 20 | BIRC2 21 | BRAF 22 | BRCA1 23 | BRCA2 24 | CARD11 25 | CBL 26 | CBLB 27 | CBLC 28 | CCND1 29 | CCNE1 30 | CD79B 31 | CDC42EP2 32 | CDC73 33 | CDH1 34 | CDK4 35 | CDK6 36 | CDK8 37 | CDKN2A 38 | CDKN2B 39 | CDKN2C 40 | CEBPA 41 | CHEK1 42 | CHEK2 43 | CREBBP 44 | CRKL 45 | CRLF2 46 | CSF1R 47 | CTNNB1 48 | CYLD 49 | DAXX 50 | DDR2 51 | DICER1 52 | DIS3 53 | DNMT1 54 | DNMT3A 55 | DNMT3B 56 | EGFR 57 | EIF4EBP1 58 | EP300 59 | EPHA3 60 | EPHA5 61 | EPHA6 62 | EPHA7 63 | EPHA8 64 | EPHB1 65 | EPHB4 66 | EPHB6 67 | ERBB2 68 | ERBB3 69 | ERBB4 70 | ERG 71 | ESR1 72 | ETV1 73 | ETV6 74 | EZH2 75 | FAM123B 76 | FAM46C 77 | FAS 78 | FBXW7 79 | FGFR1 80 | FGFR2 81 | FGFR3 82 | FGFR4 83 | FH 84 | FLCN 85 | FLT1 86 | FLT3 87 | FOXL2 88 | GATA1 89 | GATA2 90 | GATA3 91 | GNA11 92 | GNAQ 93 | GNAS 94 | GOLPH3 95 | GRIN2A 96 | GSK3B 97 | HDAC2 98 | HIF1A 99 | HMGA2 100 | HNF1A 101 | HRAS 102 | HSP90AA1 103 | IDH1 104 | IDH2 105 | IGF1R 106 | IGFBP7 107 | IKBKE 108 | IKZF1 109 | INSR 110 | IRS1 111 | IRS2 112 | JAK1 113 | JAK2 114 | JAK3 115 | JUN 116 | KDM5C 117 | KDM6A 118 | KDR 119 | KEAP1 120 | KIT 121 | KLF6 122 | KMT2A 123 | KMT2C 124 | KMT2D 125 | KRAS 126 | LDHA 127 | LGR6 128 | MAGI2 129 | MAP2K1 130 | MAP2K2 131 | MAP2K4 132 | MAP3K8 133 | MCL1 134 | MDM2 135 | MDM4 136 | MEN1 137 | MET 138 | MITF 139 | MLH1 140 | MLST8 141 | MPL 142 | MSH2 143 | MSH6 144 | MTOR 145 | MYB 146 | MYC 147 | MYCL1 148 | MYCN 149 | NCOA2 150 | NF1 151 | NF2 152 | NFE2L2 153 | NFKB1 154 | NFKB2 155 | NKX2-1 156 | NOTCH1 157 | NOTCH2 158 | NOTCH3 159 | NOTCH4 160 | NPM1 161 | NRAS 162 | NTRK1 163 | NTRK2 164 | NTRK3 165 | PAK7 166 | PARK2 167 | PARP1 168 | PAX5 169 | PBRM1 170 | PDGFRA 171 | PDGFRB 172 | 
PHOX2B 173 | PIK3C2G 174 | PIK3CA 175 | PIK3CB 176 | PIK3CD 177 | PIK3CG 178 | PIK3R1 179 | PIK3R2 180 | PIK3R3 181 | PKM2 182 | PLK2 183 | PNRC1 184 | PREX2 185 | PRKAR1A 186 | PRKCI 187 | PTCH1 188 | PTEN 189 | PTPN11 190 | PTPRD 191 | PTPRS 192 | RAF1 193 | RARA 194 | RB1 195 | REL 196 | RET 197 | RICTOR 198 | RPTOR 199 | RUNX1 200 | SDHB 201 | SETD2 202 | SHQ1 203 | SMAD4 204 | SMARCA4 205 | SMARCB1 206 | SMO 207 | SOCS1 208 | SOX2 209 | SPOP 210 | SRC 211 | STK11 212 | SUFU 213 | TBK1 214 | TEK 215 | TERT 216 | TET1 217 | TET2 218 | TGFBR2 219 | TMPRSS2 220 | TNFAIP3 221 | TOP1 222 | TP53 223 | TP63 224 | TSC1 225 | TSC2 226 | TSHR 227 | VHL 228 | WT1 229 | YAP1 230 | YES1 231 | CDKN2A.p14arf 232 | CDKN2A.p16INK4a 233 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_341.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | AKT1 3 | AKT2 4 | AKT3 5 | ALK 6 | ALOX12B 7 | APC 8 | AR 9 | ARAF 10 | ARID1A 11 | ARID1B 12 | ARID2 13 | ARID5B 14 | ASXL1 15 | ASXL2 16 | ATM 17 | ATR 18 | ATRX 19 | AURKA 20 | AURKB 21 | AXIN1 22 | AXIN2 23 | AXL 24 | B2M 25 | BAP1 26 | BARD1 27 | BBC3 28 | BCL2 29 | BCL2L1 30 | BCL2L11 31 | BCL6 32 | BCOR 33 | BLM 34 | BMPR1A 35 | BRAF 36 | BRCA1 37 | BRCA2 38 | BRD4 39 | BRIP1 40 | BTK 41 | CARD11 42 | CASP8 43 | CBFB 44 | CBL 45 | CCND1 46 | CCND2 47 | CCND3 48 | CCNE1 49 | CD274 50 | CD276 51 | CD79B 52 | CDC73 53 | CDH1 54 | CDK12 55 | CDK4 56 | CDK6 57 | CDK8 58 | CDKN1A 59 | CDKN1B 60 | CDKN2A.p14arf 61 | CDKN2A.p16INK4a 62 | CDKN2B 63 | CDKN2C 64 | CHEK1 65 | CHEK2 66 | CIC 67 | CREBBP 68 | CRKL 69 | CRLF2 70 | CSF1R 71 | CTCF 72 | CTLA4 73 | CTNNB1 74 | CUL3 75 | DAXX 76 | DCUN1D1 77 | DDR2 78 | DICER1 79 | DIS3 80 | DNMT1 81 | DNMT3A 82 | DNMT3B 83 | DOT1L 84 | E2F3 85 | EED 86 | EGFL7 87 | EGFR 88 | EIF1AX 89 | EP300 90 | EPCAM 91 | EPHA3 92 | EPHA5 93 | EPHB1 94 | ERBB2 95 | ERBB3 96 | ERBB4 97 | ERCC2 98 | ERCC3 99 | ERCC4 100 | ERCC5 101 | ERG 102 | ESR1 103 | ETV1 104 | ETV6 105 | EZH2 106 | FAM123B 107 | FAM175A 108 | FAM46C 109 | FANCA 110 | FANCC 111 | FAT1 112 | FBXW7 113 | FGF19 114 | FGF3 115 | FGF4 116 | FGFR1 117 | FGFR2 118 | FGFR3 119 | FGFR4 120 | FH 121 | FLCN 122 | FLT1 123 | FLT3 124 | FLT4 125 | FOXA1 126 | FOXL2 127 | FOXP1 128 | FUBP1 129 | GATA1 130 | GATA2 131 | GATA3 132 | GNA11 133 | GNAQ 134 | GNAS 135 | GREM1 136 | GRIN2A 137 | GSK3B 138 | H3F3C 139 | HGF 140 | HIST1H1C 141 | HIST1H2BD 142 | HIST1H3B 143 | HNF1A 144 | HRAS 145 | ICOSLG 146 | IDH1 147 | IDH2 148 | IFNGR1 149 | IGF1 150 | IGF1R 151 | IGF2 152 | IKBKE 153 | IKZF1 154 | IL10 155 | IL7R 156 | INPP4A 157 | INPP4B 158 | INSR 159 | IRF4 160 | IRS1 161 | IRS2 162 | JAK1 163 | JAK2 164 | JAK3 165 | JUN 166 | KDM5A 167 | KDM5C 168 | KDM6A 169 | KDR 170 | KEAP1 171 | KIT 172 | KLF4 173 | KRAS 174 | LATS1 175 | LATS2 176 | LMO1 177 | MAP2K1 178 | MAP2K2 179 | MAP2K4 180 | MAP3K1 181 | MAP3K13 182 | MAPK1 183 | MAX 184 | MCL1 185 | MDC1 186 | MDM2 187 | MDM4 188 | MED12 189 | MEF2B 190 | MEN1 191 | MET 192 | MITF 193 | MLH1 194 | MLL 195 | MLL2 196 | MLL3 197 | MPL 198 | MRE11A 199 | MSH2 200 | MSH6 201 | MTOR 202 | MUTYH 203 | MYC 204 | MYCL1 205 | MYCN 206 | MYD88 207 | MYOD1 208 | NBN 209 | NCOR1 210 | NF1 211 | NF2 212 | NFE2L2 213 | NKX2-1 214 | NKX3-1 215 | NOTCH1 216 | NOTCH2 217 | NOTCH3 218 | NOTCH4 219 | NPM1 220 | NRAS 221 | NSD1 222 | NTRK1 223 | NTRK2 224 | NTRK3 225 | PAK1 226 | PAK7 227 | PALB2 228 | PARK2 229 | PARP1 230 | PAX5 231 | PBRM1 232 | PDCD1 
233 | PDGFRA 234 | PDGFRB 235 | PDPK1 236 | PHOX2B 237 | PIK3C2G 238 | PIK3C3 239 | PIK3CA 240 | PIK3CB 241 | PIK3CD 242 | PIK3CG 243 | PIK3R1 244 | PIK3R2 245 | PIK3R3 246 | PIM1 247 | PLK2 248 | PMAIP1 249 | PMS1 250 | PMS2 251 | PNRC1 252 | POLE 253 | PPP2R1A 254 | PRDM1 255 | PRKAR1A 256 | PTCH1 257 | PTEN 258 | PTPN11 259 | PTPRD 260 | PTPRS 261 | PTPRT 262 | RAC1 263 | RAD50 264 | RAD51 265 | RAD51B 266 | RAD51C 267 | RAD51D 268 | RAD52 269 | RAD54L 270 | RAF1 271 | RARA 272 | RASA1 273 | RB1 274 | RBM10 275 | RECQL4 276 | REL 277 | RET 278 | RFWD2 279 | RHOA 280 | RICTOR 281 | RIT1 282 | RNF43 283 | ROS1 284 | RPS6KA4 285 | RPS6KB2 286 | RPTOR 287 | RUNX1 288 | RYBP 289 | SDHA 290 | SDHAF2 291 | SDHB 292 | SDHC 293 | SDHD 294 | SETD2 295 | SF3B1 296 | SH2D1A 297 | SHQ1 298 | SMAD2 299 | SMAD3 300 | SMAD4 301 | SMARCA4 302 | SMARCB1 303 | SMARCD1 304 | SMO 305 | SOCS1 306 | SOX17 307 | SOX2 308 | SOX9 309 | SPEN 310 | SPOP 311 | SRC 312 | STAG2 313 | STK11 314 | STK40 315 | SUFU 316 | SUZ12 317 | SYK 318 | TBX3 319 | TERT 320 | TET1 321 | TET2 322 | TGFBR1 323 | TGFBR2 324 | TMEM127 325 | TMPRSS2 326 | TNFAIP3 327 | TNFRSF14 328 | TOP1 329 | TP53 330 | TP63 331 | TRAF7 332 | TSC1 333 | TSC2 334 | TSHR 335 | U2AF1 336 | VHL 337 | VTCN1 338 | WT1 339 | XIAP 340 | XPO1 341 | YAP1 342 | YES1 343 | CDKN2AP14ARF 344 | CDKN2AP16INK4A 345 | MYB 346 | NFIB 347 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_410.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ACVR1 3 | AKT1 4 | AKT2 5 | AKT3 6 | ALK 7 | ALOX12B 8 | ANKRD11 9 | APC 10 | AR 11 | ARAF 12 | ARID1A 13 | ARID1B 14 | ARID2 15 | ARID5B 16 | ASXL1 17 | ASXL2 18 | ATM 19 | ATR 20 | ATRX 21 | AURKA 22 | AURKB 23 | AXIN1 24 | AXIN2 25 | AXL 26 | B2M 27 | BAP1 28 | BARD1 29 | BBC3 30 | BCL10 31 | BCL2 32 | BCL2L1 33 | BCL2L11 34 | BCL6 35 | BCOR 36 | BIRC3 37 | BLM 38 | BMPR1A 39 | BRAF 40 | BRCA1 41 | BRCA2 42 | BRD4 43 | BRIP1 44 | BTK 45 | CALR 46 | CARD11 47 | CASP8 48 | CBFB 49 | CBL 50 | CCND1 51 | CCND2 52 | CCND3 53 | CCNE1 54 | CD274 55 | CD276 56 | CD79A 57 | CD79B 58 | CDC73 59 | CDH1 60 | CDK12 61 | CDK4 62 | CDK6 63 | CDK8 64 | CDKN1A 65 | CDKN1B 66 | CDKN2A.p14arf 67 | CDKN2A.p16INK4a 68 | CDKN2B 69 | CDKN2C 70 | CEBPA 71 | CENPA 72 | CHEK1 73 | CHEK2 74 | CIC 75 | CREBBP 76 | CRKL 77 | CRLF2 78 | CSF1R 79 | CSF3R 80 | CTCF 81 | CTLA4 82 | CTNNB1 83 | CUL3 84 | CXCR4 85 | DAXX 86 | DCUN1D1 87 | DDR2 88 | DICER1 89 | DIS3 90 | DNAJB1 91 | DNMT1 92 | DNMT3A 93 | DNMT3B 94 | DOT1L 95 | E2F3 96 | EED 97 | EGFL7 98 | EGFR 99 | EIF1AX 100 | EIF4A2 101 | EIF4E 102 | EP300 103 | EPCAM 104 | EPHA3 105 | EPHA5 106 | EPHA7 107 | EPHB1 108 | ERBB2 109 | ERBB3 110 | ERBB4 111 | ERCC2 112 | ERCC3 113 | ERCC4 114 | ERCC5 115 | ERG 116 | ERRFI1 117 | ESR1 118 | ETV1 119 | ETV6 120 | EZH2 121 | FAM123B 122 | FAM175A 123 | FAM46C 124 | FANCA 125 | FANCC 126 | FAT1 127 | FBXW7 128 | FGF19 129 | FGF3 130 | FGF4 131 | FGFR1 132 | FGFR2 133 | FGFR3 134 | FGFR4 135 | FH 136 | FLCN 137 | FLT1 138 | FLT3 139 | FLT4 140 | FOXA1 141 | FOXL2 142 | FOXO1 143 | FOXP1 144 | FUBP1 145 | FYN 146 | GATA1 147 | GATA2 148 | GATA3 149 | GLI1 150 | GNA11 151 | GNAQ 152 | GNAS 153 | GPS2 154 | GREM1 155 | GRIN2A 156 | GSK3B 157 | H3F3A 158 | H3F3B 159 | H3F3C 160 | HGF 161 | HIST1H1C 162 | HIST1H2BD 163 | HIST1H3A 164 | HIST1H3B 165 | HIST1H3C 166 | HIST1H3D 167 | HIST1H3E 168 | HIST1H3F 169 | HIST1H3G 170 | HIST1H3H 171 | HIST1H3I 172 | HIST1H3J 173 | 
HIST2H3C 174 | HIST2H3D 175 | HIST3H3 176 | HLA-A 177 | HNF1A 178 | HOXB13 179 | HRAS 180 | ICOSLG 181 | ID3 182 | IDH1 183 | IDH2 184 | IFNGR1 185 | IGF1 186 | IGF1R 187 | IGF2 188 | IKBKE 189 | IKZF1 190 | IL10 191 | IL7R 192 | INHA 193 | INHBA 194 | INPP4A 195 | INPP4B 196 | INSR 197 | IRF4 198 | IRS1 199 | IRS2 200 | JAK1 201 | JAK2 202 | JAK3 203 | JUN 204 | KDM5A 205 | KDM5C 206 | KDM6A 207 | KDR 208 | KEAP1 209 | KIT 210 | KLF4 211 | KRAS 212 | LATS1 213 | LATS2 214 | LMO1 215 | MALT1 216 | MAP2K1 217 | MAP2K2 218 | MAP2K4 219 | MAP3K1 220 | MAP3K13 221 | MAP3K14 222 | MAPK1 223 | MAPK3 224 | MAX 225 | MCL1 226 | MDC1 227 | MDM2 228 | MDM4 229 | MED12 230 | MEF2B 231 | MEN1 232 | MET 233 | MGA 234 | MITF 235 | MLH1 236 | MLL 237 | MLL2 238 | MLL3 239 | MPL 240 | MRE11A 241 | MSH2 242 | MSH6 243 | MST1 244 | MST1R 245 | MTOR 246 | MUTYH 247 | MYC 248 | MYCL1 249 | MYCN 250 | MYD88 251 | MYOD1 252 | NBN 253 | NCOA3 254 | NCOR1 255 | NEGR1 256 | NF1 257 | NF2 258 | NFE2L2 259 | NFKBIA 260 | NKX2-1 261 | NKX3-1 262 | NOTCH1 263 | NOTCH2 264 | NOTCH3 265 | NOTCH4 266 | NPM1 267 | NRAS 268 | NSD1 269 | NTRK1 270 | NTRK2 271 | NTRK3 272 | NUP93 273 | PAK1 274 | PAK7 275 | PALB2 276 | PARK2 277 | PARP1 278 | PAX5 279 | PBRM1 280 | PDCD1 281 | PDGFRA 282 | PDGFRB 283 | PDPK1 284 | PGR 285 | PHOX2B 286 | PIK3C2G 287 | PIK3C3 288 | PIK3CA 289 | PIK3CB 290 | PIK3CD 291 | PIK3CG 292 | PIK3R1 293 | PIK3R2 294 | PIK3R3 295 | PIM1 296 | PLCG2 297 | PLK2 298 | PMAIP1 299 | PMS1 300 | PMS2 301 | PNRC1 302 | POLD1 303 | POLE 304 | PPM1D 305 | PPP2R1A 306 | PPP6C 307 | PRDM1 308 | PRKAR1A 309 | PTCH1 310 | PTEN 311 | PTPN11 312 | PTPRD 313 | PTPRS 314 | PTPRT 315 | RAB35 316 | RAC1 317 | RAD21 318 | RAD50 319 | RAD51 320 | RAD51B 321 | RAD51C 322 | RAD51D 323 | RAD52 324 | RAD54L 325 | RAF1 326 | RARA 327 | RASA1 328 | RB1 329 | RBM10 330 | RECQL4 331 | REL 332 | RET 333 | RFWD2 334 | RHEB 335 | RHOA 336 | RICTOR 337 | RIT1 338 | RNF43 339 | ROS1 340 | RPS6KA4 341 | RPS6KB2 342 | RPTOR 343 | RUNX1 344 | RYBP 345 | SDHA 346 | SDHAF2 347 | SDHB 348 | SDHC 349 | SDHD 350 | SETD2 351 | SF3B1 352 | SH2B3 353 | SH2D1A 354 | SHQ1 355 | SMAD2 356 | SMAD3 357 | SMAD4 358 | SMARCA4 359 | SMARCB1 360 | SMARCD1 361 | SMO 362 | SOCS1 363 | SOX17 364 | SOX2 365 | SOX9 366 | SPEN 367 | SPOP 368 | SRC 369 | SRSF2 370 | STAG2 371 | STAT3 372 | STAT5A 373 | STAT5B 374 | STK11 375 | STK40 376 | SUFU 377 | SUZ12 378 | SYK 379 | TBX3 380 | TCEB1 381 | TCF3 382 | TCF7L2 383 | TERT 384 | TET1 385 | TET2 386 | TGFBR1 387 | TGFBR2 388 | TMEM127 389 | TMPRSS2 390 | TNFAIP3 391 | TNFRSF14 392 | TOP1 393 | TP53 394 | TP63 395 | TRAF2 396 | TRAF7 397 | TSC1 398 | TSC2 399 | TSHR 400 | U2AF1 401 | VEGFA 402 | VHL 403 | VTCN1 404 | WT1 405 | XIAP 406 | XPO1 407 | XRCC2 408 | YAP1 409 | YES1 410 | ZFHX3 411 | ZRSR2 412 | CDKN2AP16INK4A 413 | CDKN2AP14ARF 414 | MYB 415 | NFIB 416 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_MSK_468.txt: -------------------------------------------------------------------------------- 1 | ABL1 2 | ACVR1 3 | AGO2 4 | AKT1 5 | AKT2 6 | AKT3 7 | ALK 8 | ALOX12B 9 | ANKRD11 10 | APC 11 | AR 12 | ARAF 13 | ARID1A 14 | ARID1B 15 | ARID2 16 | ARID5B 17 | ASXL1 18 | ASXL2 19 | ATM 20 | ATR 21 | ATRX 22 | AURKA 23 | AURKB 24 | AXIN1 25 | AXIN2 26 | AXL 27 | B2M 28 | BABAM1 29 | BAP1 30 | BARD1 31 | BBC3 32 | BCL10 33 | BCL2 34 | BCL2L1 35 | BCL2L11 36 | BCL6 37 | BCOR 38 | BIRC3 39 | BLM 40 | BMPR1A 41 | BRAF 42 | BRCA1 43 | BRCA2 44 | BRD4 45 | BRIP1 
46 | BTK 47 | CALR 48 | CARD11 49 | CARM1 50 | CASP8 51 | CBFB 52 | CBL 53 | CCND1 54 | CCND2 55 | CCND3 56 | CCNE1 57 | CD274 58 | CD276 59 | CD79A 60 | CD79B 61 | CDC42 62 | CDC73 63 | CDH1 64 | CDK12 65 | CDK4 66 | CDK6 67 | CDK8 68 | CDKN1A 69 | CDKN1B 70 | CDKN2A 71 | CDKN2A.p14arf 72 | CDKN2A.p16INK4a 73 | CDKN2B 74 | CDKN2C 75 | CEBPA 76 | CENPA 77 | CHEK1 78 | CHEK2 79 | CIC 80 | CREBBP 81 | CRKL 82 | CRLF2 83 | CSDE1 84 | CSF1R 85 | CSF3R 86 | CTCF 87 | CTLA4 88 | CTNNB1 89 | CUL3 90 | CXCR4 91 | CYLD 92 | CYSLTR2 93 | DAXX 94 | DCUN1D1 95 | DDR2 96 | DICER1 97 | DIS3 98 | DNAJB1 99 | DNMT1 100 | DNMT3A 101 | DNMT3B 102 | DOT1L 103 | DROSHA 104 | DUSP4 105 | E2F3 106 | EED 107 | EGFL7 108 | EGFR 109 | EIF1AX 110 | EIF4A2 111 | EIF4E 112 | ELF3 113 | EP300 114 | EPAS1 115 | EPCAM 116 | EPHA3 117 | EPHA5 118 | EPHA7 119 | EPHB1 120 | ERBB2 121 | ERBB3 122 | ERBB4 123 | ERCC2 124 | ERCC3 125 | ERCC4 126 | ERCC5 127 | ERF 128 | ERG 129 | ERRFI1 130 | ESR1 131 | ETV1 132 | ETV6 133 | EZH1 134 | EZH2 135 | FAM123B 136 | FAM175A 137 | FAM46C 138 | FAM58A 139 | FANCA 140 | FANCC 141 | FAT1 142 | FBXW7 143 | FGF19 144 | FGF3 145 | FGF4 146 | FGFR1 147 | FGFR2 148 | FGFR3 149 | FGFR4 150 | FH 151 | FLCN 152 | FLT1 153 | FLT3 154 | FLT4 155 | FOXA1 156 | FOXL2 157 | FOXO1 158 | FOXP1 159 | FUBP1 160 | FYN 161 | GATA1 162 | GATA2 163 | GATA3 164 | GLI1 165 | GNA11 166 | GNAQ 167 | GNAS 168 | GPS2 169 | GREM1 170 | GRIN2A 171 | GSK3B 172 | H3F3A 173 | H3F3B 174 | H3F3C 175 | HGF 176 | HIST1H1C 177 | HIST1H2BD 178 | HIST1H3A 179 | HIST1H3B 180 | HIST1H3C 181 | HIST1H3D 182 | HIST1H3E 183 | HIST1H3F 184 | HIST1H3G 185 | HIST1H3H 186 | HIST1H3I 187 | HIST1H3J 188 | HIST2H3C 189 | HIST2H3D 190 | HIST3H3 191 | HLA-A 192 | HLA-B 193 | HNF1A 194 | HOXB13 195 | HRAS 196 | ICOSLG 197 | ID3 198 | IDH1 199 | IDH2 200 | IFNGR1 201 | IGF1 202 | IGF1R 203 | IGF2 204 | IKBKE 205 | IKZF1 206 | IL10 207 | IL7R 208 | INHA 209 | INHBA 210 | INPP4A 211 | INPP4B 212 | INPPL1 213 | INSR 214 | IRF4 215 | IRS1 216 | IRS2 217 | JAK1 218 | JAK2 219 | JAK3 220 | JUN 221 | KDM5A 222 | KDM5C 223 | KDM6A 224 | KDR 225 | KEAP1 226 | KIT 227 | KLF4 228 | KMT2B 229 | KMT5A 230 | KNSTRN 231 | KRAS 232 | LATS1 233 | LATS2 234 | LMO1 235 | LYN 236 | MALT1 237 | MAP2K1 238 | MAP2K2 239 | MAP2K4 240 | MAP3K1 241 | MAP3K13 242 | MAP3K14 243 | MAPK1 244 | MAPK3 245 | MAPKAP1 246 | MAX 247 | MCL1 248 | MDC1 249 | MDM2 250 | MDM4 251 | MED12 252 | MEF2B 253 | MEN1 254 | MET 255 | MGA 256 | MITF 257 | MLH1 258 | MLL 259 | MLL2 260 | MLL3 261 | MPL 262 | MRE11A 263 | MSH2 264 | MSH3 265 | MSH6 266 | MSI1 267 | MSI2 268 | MST1 269 | MST1R 270 | MTOR 271 | MUTYH 272 | MYC 273 | MYCL1 274 | MYCN 275 | MYD88 276 | MYOD1 277 | NBN 278 | NCOA3 279 | NCOR1 280 | NEGR1 281 | NF1 282 | NF2 283 | NFE2L2 284 | NFKBIA 285 | NKX2-1 286 | NKX3-1 287 | NOTCH1 288 | NOTCH2 289 | NOTCH3 290 | NOTCH4 291 | NPM1 292 | NRAS 293 | NSD1 294 | NTHL1 295 | NTRK1 296 | NTRK2 297 | NTRK3 298 | NUF2 299 | NUP93 300 | PAK1 301 | PAK7 302 | PALB2 303 | PARK2 304 | PARP1 305 | PAX5 306 | PBRM1 307 | PDCD1 308 | PDCD1LG2 309 | PDGFRA 310 | PDGFRB 311 | PDPK1 312 | PGR 313 | PHOX2B 314 | PIK3C2G 315 | PIK3C3 316 | PIK3CA 317 | PIK3CB 318 | PIK3CD 319 | PIK3CG 320 | PIK3R1 321 | PIK3R2 322 | PIK3R3 323 | PIM1 324 | PLCG2 325 | PLK2 326 | PMAIP1 327 | PMS1 328 | PMS2 329 | PNRC1 330 | POLD1 331 | POLE 332 | PPARG 333 | PPM1D 334 | PPP2R1A 335 | PPP4R2 336 | PPP6C 337 | PRDM1 338 | PRDM14 339 | PREX2 340 | PRKAR1A 341 | PRKCI 342 | PRKD1 343 | PTCH1 344 | PTEN 345 | 
PTP4A1 346 | PTPN11 347 | PTPRD 348 | PTPRS 349 | PTPRT 350 | RAB35 351 | RAC1 352 | RAC2 353 | RAD21 354 | RAD50 355 | RAD51 356 | RAD51C 357 | RAD51L1 358 | RAD51L3 359 | RAD52 360 | RAD54L 361 | RAF1 362 | RARA 363 | RASA1 364 | RB1 365 | RBM10 366 | RECQL 367 | RECQL4 368 | REL 369 | RET 370 | RFWD2 371 | RHEB 372 | RHOA 373 | RICTOR 374 | RIT1 375 | RNF43 376 | ROS1 377 | RPS6KA4 378 | RPS6KB2 379 | RPTOR 380 | RRAGC 381 | RRAS 382 | RRAS2 383 | RTEL1 384 | RUNX1 385 | RXRA 386 | RYBP 387 | SDHA 388 | SDHAF2 389 | SDHB 390 | SDHC 391 | SDHD 392 | SESN1 393 | SESN2 394 | SESN3 395 | SETD2 396 | SF3B1 397 | SH2B3 398 | SH2D1A 399 | SHOC2 400 | SHQ1 401 | SLX4 402 | SMAD2 403 | SMAD3 404 | SMAD4 405 | SMARCA4 406 | SMARCB1 407 | SMARCD1 408 | SMO 409 | SMYD3 410 | SOCS1 411 | SOS1 412 | SOX17 413 | SOX2 414 | SOX9 415 | SPEN 416 | SPOP 417 | SPRED1 418 | SRC 419 | SRSF2 420 | STAG2 421 | STAT3 422 | STAT5A 423 | STAT5B 424 | STK11 425 | STK19 426 | STK40 427 | SUFU 428 | SUZ12 429 | SYK 430 | TAP1 431 | TAP2 432 | TBX3 433 | TCEB1 434 | TCF3 435 | TCF7L2 436 | TEK 437 | TERT 438 | TET1 439 | TET2 440 | TGFBR1 441 | TGFBR2 442 | TMEM127 443 | TMPRSS2 444 | TNFAIP3 445 | TNFRSF14 446 | TOP1 447 | TP53 448 | TP53BP1 449 | TP63 450 | TRAF2 451 | TRAF7 452 | TSC1 453 | TSC2 454 | TSHR 455 | U2AF1 456 | UPF1 457 | VEGFA 458 | VHL 459 | VTCN1 460 | WHSC1 461 | WHSC1L1 462 | WT1 463 | WWTR1 464 | XIAP 465 | XPO1 466 | XRCC2 467 | YAP1 468 | YES1 469 | ZFHX3 470 | ZRSR2 471 | CDKN2AP16INK4A 472 | CDKN2AP14ARF 473 | MYB 474 | NFIB 475 | -------------------------------------------------------------------------------- /DIGDriver/data/genes_metabric_173.txt: -------------------------------------------------------------------------------- 1 | TP53 2 | FOXO3 3 | NCOR1 4 | PIK3CA 5 | SETD2 6 | BIRC6 7 | TG 8 | GATA3 9 | ARID2 10 | NCOR2 11 | CBFB 12 | BAP1 13 | STAB2 14 | MUC16 15 | FOXP1 16 | RYR2 17 | FANCD2 18 | KMT2C 19 | CDH1 20 | NF1 21 | USH2A 22 | MTAP 23 | ERBB3 24 | MAP3K1 25 | SF3B1 26 | MLL2 27 | RB1 28 | COL6A3 29 | UTRN 30 | PTEN 31 | BRCA2 32 | CASP8 33 | AHNAK 34 | ALK 35 | KDM6A 36 | AGMO 37 | SYNE1 38 | ARID1A 39 | AKT1 40 | LIPI 41 | ASXL2 42 | TAF1 43 | APC 44 | SETD1A 45 | AKAP9 46 | UBR5 47 | LAMA2 48 | MAP2K4 49 | BRIP1 50 | PRKCE 51 | PIK3R1 52 | HERC2 53 | FBXW7 54 | AHNAK2 55 | GPS2 56 | THSD7A 57 | MYH9 58 | ZFP36L1 59 | GPR32 60 | GH1 61 | L1CAM 62 | SMAD4 63 | NOTCH1 64 | JAK1 65 | DNAH2 66 | COL22A1 67 | TBX3 68 | COL12A1 69 | DNAH5 70 | CTCF 71 | KRAS 72 | CACNA2D3 73 | TTYH1 74 | ERBB4 75 | MBL2 76 | SIK1 77 | AKT2 78 | ARID5B 79 | THADA 80 | FRMD3 81 | ATR 82 | RUNX1 83 | BRCA1 84 | EGFR 85 | PRKCQ 86 | LIFR 87 | SMARCC2 88 | MEN1 89 | ROS1 90 | LAMB3 91 | USP9X 92 | RPGR 93 | AFF2 94 | CDKN1B 95 | PRPS2 96 | PALLD 97 | SHANK2 98 | PTPRM 99 | PTPRD 100 | ASXL1 101 | GPR124 102 | CHEK2 103 | ERBB2 104 | CDKN2A 105 | MLLT4 106 | PDE4DIP 107 | SMARCC1 108 | CTNNA3 109 | MAP3K10 110 | LARGE 111 | SETDB1 112 | ARID1B 113 | DCAF4L2 114 | NCOA3 115 | DNAH11 116 | MAP3K13 117 | BCAS3 118 | PBRM1 119 | NRG3 120 | HDAC9 121 | ACVRL1 122 | NDFIP1 123 | USP28 124 | CHD1 125 | OR6A2 126 | CLK3 127 | PRKCZ 128 | MYO3A 129 | PRKG1 130 | FLT3 131 | BRAF 132 | PRR16 133 | PRKACG 134 | FAM20C 135 | KDM3A 136 | NPNT 137 | NEK1 138 | NF2 139 | FANCA 140 | MYO1A 141 | PPP2R2A 142 | STK11 143 | EP300 144 | CTNNA1 145 | FOXO1 146 | SGCD 147 | SBNO1 148 | HIST1H2BC 149 | SPACA1 150 | SIK2 151 | DTWD2 152 | GLDC 153 | NR2F1 154 | MAGEA8 155 | KLRG1 156 | TAF4B 157 | HRAS 158 | RASGEF1B 159 | 
SMAD2 160 | NR3C1 161 | LDLRAP1 162 | NT5E 163 | PTPN22 164 | CLRN2 165 | CCND3 166 | SMARCB1 167 | TBL1XR1 168 | PPP2CB 169 | SIAH1 170 | SMARCD1 171 | STMN2 172 | NRAS 173 | AGTR2 174 | CDKN2A.p14arf 175 | CDKN2A.p16INK4a 176 | -------------------------------------------------------------------------------- /DIGDriver/data/refcds_hg19.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/data/refcds_hg19.rda -------------------------------------------------------------------------------- /DIGDriver/data_tools/DIG_auto.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from pathlib import Path 7 | from multiprocessing.pool import Pool 8 | import h5py 9 | 10 | sys.path.append('../data_tools/') 11 | sys.path.append('../region_model/') 12 | sys.path.append('../sequence_model/') 13 | import DataExtractor 14 | import kfold_mutations_main 15 | import SequenceModel 16 | import GenicDriver 17 | 18 | 19 | def parse_args(text=None): 20 | parser = argparse.ArgumentParser(description="Automation tool for running DIG pipeline") 21 | subparsers = parser.add_subparsers(help='DIG sub-commands') 22 | 23 | parser_a = subparsers.add_parser('runDIG', help='Run DIG model') 24 | 25 | ## required 26 | parser_a.add_argument('--out-dir', type=str, dest='out_dir', required = True, help='Base Directory of DIG run. All intermediate files will be saved relative to this location') 27 | parser_a.add_argument('--map-ref', type=str, dest='map_ref', help='path to mappability file') 28 | parser_a.add_argument('--window-size', type=int, default=10000, dest='window', help='desired window size for DIG model regions') 29 | parser_a.add_argument('--min-map', type=float, default=0.50, dest='min_mapp', help='minimum mappability for windows') 30 | parser_a.add_argument('--ref-file', type=str, dest='ref_file', help='path to reference hg19 genome') 31 | parser_a.add_argument('--mut-file', type=str, dest='mut_file', required = True, help='path to mutations file') 32 | parser_a.add_argument('--N-procs', type = int, dest='n_procs', default = 20, help= 'number of processes to run') 33 | 34 | ## partial runs 35 | parser_a.add_argument('--map-file', type = str, dest = 'map_file', help = 'map to precomputed mappability file') 36 | parser_a.add_argument('--epi-dir', type=str, dest='epi_dir', help='path to epigenomics files') 37 | parser_a.add_argument('--split_idx', type=str, dest='split_dir', help='path to split index dir') 38 | parser_a.add_argument('--epi-matrix_dir', type=str, dest='epi_matrix_dir', help='path to constructed epigenome matrix h5 file') 39 | parser_a.add_argument('--fmodel-dir', type=str, dest='fmodel_dir', help='path to constructed genome context frequency file') 40 | parser_a.add_argument('--gp-results-base', type=str, dest='gp_res', help='path to generic file name of gp results fold') 41 | 42 | ##optional arguments 43 | parser_a.add_argument('-c', '--cancer-key', type = str, dest = 'cancer_key', help = 'key name for cancer targets') 44 | parser_a.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 45 | default='all', help='GPUs devices (all/comma separted list)') 46 | 47 | parser_a.set_defaults(func=run) 48 | 49 | if text: 50 | args = parser.parse_args(text.split()) 51 | else: 52 | args = parser.parse_args() 53 
| 54 | return args 55 | 56 | # inputs are epi-genome tracks and mutation file 57 | 58 | 59 | 60 | def run(args): 61 | if args.gp_res is None: 62 | if args.epi_matrix_dir is None: 63 | if args.epi_dir is None: 64 | print('Error: need to provide either an epi_track dir or an epi_matrix_dir') 65 | return 66 | else: 67 | map_file_name = "high_mapp_{}_{}_{}".format(args.min_mapp, args.window, 0) 68 | mapp_file_path = os.path.join(args.out_dir, map_file_name) 69 | if args.map_file is None: 70 | print('Finding mappable windows...') 71 | mapp_args = DataExtractor.parse_args('mappability {} --out-dir {} --window {} --overlap {} --min-map {}'.format(args.map_ref, args.out_dir, args.window, 0, args.min_mapp)) 72 | DataExtractor.mappability(mapp_args) 73 | print('map file saved at: ' + mapp_file_path) 74 | 75 | print('creating split index...') 76 | 77 | if args.split_dir is None: 78 | split_path = os.path.join(args.out_dir, 'splitIdx_{}'.format(args.window)) 79 | if not os.path.exists(split_path): 80 | os.mkdir(split_path) 81 | split_args = DataExtractor.parse_args('splitDataIdx --base-dir {} --out-dir {} --chunk-size {} --window {} --overlap {} --min-map {}'.format(args.out_dir, split_path, 10000, args.window, 0, args.min_mapp)) 82 | DataExtractor.split_data_idx(split_args) 83 | print('splitIdx files saved at ' + split_path) 84 | else: 85 | split_path = args.split_dir 86 | 87 | print('creating matrix chunks...') 88 | chunks_path = os.path.join(args.out_dir, 'matrix_chunks_{}'.format(args.window)) 89 | print(chunks_path) 90 | if not os.path.exists(chunks_path): 91 | os.mkdir(chunks_path) 92 | p = Pool(args.n_procs) 93 | path = Path(split_path).glob('**/*') 94 | files = [str(x) for x in path if x.is_file()] 95 | res = [] 96 | for f in files: 97 | res.append(p.apply_async(chunk_runner, (f, chunks_path, args.ref_file, args.epi_dir, args.mut_file, args.window, args.cancer_key))) 98 | p.close() 99 | p.join() 100 | _ = [r.get() for r in res] 101 | print('chunks saved') 102 | 103 | print('concatenating chunks...') 104 | concat_args = DataExtractor.parse_args('concatH5 {} --out-dir {}'.format(chunks_path, args.out_dir)) 105 | DataExtractor.concatH5(concat_args) 106 | 107 | print('adding mappability track') 108 | epi_matrix_fname = os.path.join(args.out_dir, 'data_matrices' + '_{}_0_{}'.format(args.window, args.min_mapp) + '.h5') 109 | addMap_args = DataExtractor.parse_args('addMappability {} {}'.format(epi_matrix_fname, args.map_ref)) 110 | DataExtractor.add_mappability(addMap_args) 111 | print('epi track done!') 112 | else: 113 | print('running NN model') 114 | epi_matrix_fname = args.epi_matrix_dir 115 | 116 | kfold_args = kfold_mutations_main.get_cmd_arguments('-c {} -d {} -o {} -m {} -g {}'.format(args.cancer_key, epi_matrix_fname, args.out_dir, args.min_mapp, args.gpus)) 117 | kfold_mutations_main.main(kfold_args) 118 | print('finished NN model') 119 | directory = os.path.join(args.out_dir, 'kfold/{}'.format(args.cancer_key)) 120 | date_dir = max([os.path.join(directory,d) for d in os.listdir(directory)], key=os.path.getmtime) 121 | gp_results_base = os.path.join(date_dir, 'gp_results_fold_{}.h5') 122 | else: 123 | gp_results_base = args.gp_res 124 | mapp_file_path = args.map_file 125 | # we assume that you either don't have anything, have the genome counts but not the mutation counts (or annotations) or have everything 126 | if args.fmodel_dir is None: 127 | f_model_path = os.path.join(args.out_dir, 'fmodel_{}_trinuc_192.h5'.format(args.window)) 128 | genome_context_args = 
SequenceModel.parse_args('countGenomeContext {} {} {} {} --up {} --down {} --n-procs {}'.format(mapp_file_path, args.window, args.ref_file, f_model_path, 1, 1, args.n_procs)) 129 | SequenceModel.countGenomeContext(genome_context_args) 130 | else: 131 | f_model_path = args.fmodel_dir 132 | 133 | fmodel = h5py.File(f_model_path, 'r') 134 | if args.cancer_key + '_mutation_counts' in fmodel.keys(): 135 | run_canc = False 136 | else: 137 | run_canc = True 138 | fmodel.close() 139 | 140 | if run_canc: 141 | annot_name = os.path.basename(args.mut_file).split('txt.gz')[0] + 'trinuc.txt' 142 | annot_path = os.path.join(args.out_dir, annot_name) 143 | print(annot_path) 144 | annot_args = SequenceModel.parse_args('annotateMutationFile {} {} {} {} --n-procs {}'.format(args.mut_file, f_model_path, args.ref_file, annot_path, args.n_procs)) 145 | SequenceModel.annotateMutationFile(annot_args) 146 | annot_path = annot_path + '.gz' 147 | 148 | count_contexts_args = SequenceModel.parse_args('countMutationContext {} {} {} {} {} --n-procs {} '.format(mapp_file_path, annot_path, f_model_path, args.window, args.cancer_key, args.n_procs)) 149 | SequenceModel.countMutationContext(count_contexts_args) 150 | else: 151 | annot_path = args.mut_file 152 | 153 | #run models 154 | print('running models') 155 | submap_path = gp_results_base.split('gp_results')[0] + 'sub_mapp_results_fold_{}.h5' 156 | 157 | # for fold in range(5): 158 | # apply_seq_args = SequenceModel.parse_args('applySequenceModel {} {} {} {} {} --cancer {} --key-prefix {} --key {} --n-procs {} --bins {} --run ensemble'.format(gp_results_base.format(fold), f_model_path, annot_path, args.ref_file, args.window, args.cancer_key, args.cancer_key, args.cancer_key, args.n_procs, 50)) 159 | # SequenceModel.applySequenceModel(apply_seq_args) 160 | 161 | results_path = os.path.join(args.out_dir, 'results') 162 | if not os.path.exists(results_path): 163 | os.mkdir(results_path) 164 | 165 | # concat_sequence_results(gp_results_base, args.cancer_key, os.path.join(results_path, 'hotspot_results_{}.h5'.format(args.cancer_key))) 166 | genic_out = os.path.join(results_path, 'genicDetect_{}_{}_{}.h5'.format(args.cancer_key, args.window, args.min_mapp)) 167 | 168 | genic_args = GenicDriver.parse_args('genicDetectParallel {} {} {} {} -c {} -N {} -m {} -u {}'.format(annot_path, gp_results_base, f_model_path, genic_out, args.cancer_key, args.n_procs, args.min_mapp, submap_path)) 169 | 170 | GenicDriver.genicDetectParallel(genic_args) 171 | 172 | nonc_out = os.path.join(results_path, 'noncDetect_{}_{}_{}.h5'.format(args.cancer_key, args.window, args.min_mapp)) 173 | nonc_args = GenicDriver.parse_args('noncDetectParallel {} {} {} {} -c {} -N {} -m {} -u {} -t both'.format(annot_path, gp_results_base, f_model_path, nonc_out, args.cancer_key, args.n_procs, args.min_mapp, submap_path)) 174 | GenicDriver.noncodingDetectParallel(nonc_args) 175 | 176 | def main(): 177 | args = parse_args() 178 | args.func(args) 179 | print('Done!') 180 | 181 | def chunk_runner(f, chunks_path, ref_file, epi_dir, mut_file, window, cancer_key): 182 | chunk_args = DataExtractor.parse_args('createChunk {} --out-dir {} --ref-file {} --epi-dir {} --mut-file {} --window {} --bins {} --cancer-key {}'.format(f, chunks_path, ref_file, epi_dir, mut_file, window, 100, cancer_key)) 183 | DataExtractor.create_chunk(chunk_args) 184 | 185 | def concat_sequence_results(base_results, cancer, out_path): 186 | fout = h5py.File(out_path, 'a') 187 | #keys = [k for k in f[cancer]['test'].keys() if 'nb_model' in k] 188 | 
keys = ['nb_model_up1_down1_binsize50_run_ensemble'] 189 | if len(keys) == 0: 190 | return -1 191 | for k in keys: 192 | print('working on {}'.format(k)) 193 | df_lst = [] 194 | for run in range(5): 195 | run_res = pd.read_hdf(base_results.format(run), key='{}/test/{}'.format(cancer, k)) 196 | run_res = run_res.astype({'CHROM': 'int32', 'POS': 'float64', 'OBS': 'int32', 'EXP': 'float64','PVAL': 'float64','Pi': 'float64','MU': 'float64','SIGMA': 'float64','REGION': 'object'}) 197 | df_lst.append(run_res) 198 | complete = pd.concat(df_lst) 199 | complete.to_hdf(out_path, key=k, format='fixed') 200 | fout.close() 201 | 202 | if __name__ == '__main__': 203 | main() 204 | 205 | 206 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/__init__.py: -------------------------------------------------------------------------------- 1 | ## python init file 2 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/auto_runner.py: -------------------------------------------------------------------------------- 1 | import DIG_auto 2 | import sys 3 | import traceback 4 | 5 | cancers = ['Head-SCC_SNV', 'Adenocarcinoma_tumors_SNV_msi_low','Liver-HCC_SNV', 'Biliary-AdenoCA_SNV', 6 | 'Bladder-TCC_SNV', 'Lung-SCC_SNV', 'Bone-Osteosarc_SNV', 'Lung_tumors_SNV','Breast-AdenoCa_SNV', 7 | 'Ovary-AdenoCA_SNV','Carcinoma_tumors_SNV_msi_low','Panc-AdenoCA_SNV', 8 | 'CNS-GBM_SNV', 'Pancan_SNV', 'CNS_tumors_SNV','Prost-AdenoCA_SNV', 9 | 'ColoRect-AdenoCA_SNV', 'Sarcoma_tumors_SNV','ColoRect-AdenoCA_SNV_msi_low', 'Skin-Melanoma_SNV', 10 | 'Digestive_tract_tumors_SNV','Squamous_tumors_SNV','Digestive_tract_tumors_SNV_msi_low', 'Thy-AdenoCA_SNV', 11 | 'Eso-AdenoCa_SNV', 'Uterus-AdenoCA_SNV','Female_reproductive_system_tumors_SNV_msi_low','Uterus-AdenoCA_SNV_msi_low'] 12 | 13 | for c in cancers: 14 | try: 15 | fmut_str = '/data/cb/maxas/data/projects/cancer_mutations/cancer_mutations_PCAWG/DIG_FILES/' + c + '.annot.txt.gz' 16 | gp_str = '/scratch2/dig/full_pcawg/' + c + '/gp_results_fold_{}.h5' 17 | dig_args = DIG_auto.parse_args('runDIG --out-dir {} --window-size {} --min-map {} --ref-file {} --mut-file {} --N-procs {} --map-file {} --fmodel-dir {} --gp-results-base {} -c {}'.format('/scratch1/priebeo/PCAWG_full_results/v1_final_results', 10000, 0.5, '/scratch1/maxas/ICGC_Roadmap/reference_genome/hg19.fasta', fmut_str, 30, '/scratch1/priebeo/neurIPS/10kb_map_0', '/scratch1/priebeo/PCAWG_full_results/v1_final_results/fmodel_10000_trinuc_192.h5', gp_str, c)) 18 | DIG_auto.run(dig_args) 19 | except Exception: 20 | print("Unexpected error:") 21 | traceback.print_exc() 22 | print('failed: ' + c) 23 | print('skipping...') 24 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/mappability_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import bbi 4 | import pysam 5 | ## only mappability_by_idx is called from the top level 6 | def load_chromsizes(f_bw): 7 | chroms = bbi.chromsizes(f_bw) 8 | chroms.pop('chrM') 9 | chroms.pop('chrX') 10 | chroms.pop('chrY') 11 | 12 | return chroms 13 | 14 | def mappability_by_window(f_mapp, window, overlap=0): 15 | chroms = load_chromsizes(f_mapp) 16 | 17 | mapp_lst = [] 18 | for chr_id, chr_size in chroms.items(): 19 | print(chr_id, end=' ') 20 | i = 0 21 | while i + window < chr_size: 22 | # print(i) 23 | mapp = bbi.fetch(f_mapp, chr_id, i, i + window, bins=1)[0] 24 | 
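            # Editorial note: bbi.fetch(..., bins=1) summarizes the bigWig signal
            # over [i, i + window) into a single (mean) value, so each appended
            # row holds the average mappability of one genomic window.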
mapp_lst.append([chr_id, i, i+window, mapp]) 25 | i += window - overlap 26 | 27 | return pd.DataFrame(np.array(mapp_lst), 28 | columns=['CHROM', 'START', 'END', 'MAPP']) 29 | 30 | def mappability_by_idx(f_mapp, idx): 31 | 32 | mapp_lst = [] 33 | chr_prev = '' 34 | for row in idx: 35 | chr_id = 'chr{}'.format(row[0]) 36 | start = row[1] 37 | end = row[2] 38 | 39 | if chr_id != chr_prev: 40 | print(chr_id) 41 | 42 | mapp = bbi.fetch(f_mapp, chr_id, start, end, bins=1)[0] 43 | mapp_lst.append([row[0], start, end, mapp]) 44 | chr_prev = chr_id 45 | 46 | return mapp_lst 47 | 48 | def P_bases_by_window(f_fasta, window, overlap=0): 49 | fasta = pysam.FastaFile(f_fasta) 50 | sizes = fasta.lengths 51 | chroms = fasta.references 52 | 53 | mapp_lst = [] 54 | for chr_id, chr_size in zip(chroms, sizes): 55 | print(chr_id, end=' ') 56 | i = 0 57 | while i + window < chr_size: 58 | seq = fasta.fetch(chr_id, i, i + window) 59 | mapp = seq.count('P') / window 60 | mapp_lst.append([chr_id, i, i+window, mapp]) 61 | i += window - overlap 62 | 63 | return pd.DataFrame(np.array(mapp_lst), 64 | columns=['CHROM', 'START', 'END', 'MAPP']) 65 | -------------------------------------------------------------------------------- /DIGDriver/data_tools/track_selector.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import pickle as pkl 3 | import numpy as np 4 | import pandas as pd 5 | import argparse 6 | 7 | def get_cmd_arguments(): 8 | ap = argparse.ArgumentParser() 9 | ap.add_argument('-t', '--track-list', required=True, nargs='?', action='store', type=str, dest='track_lst_path',help= 'path to list of tracks being used') 10 | ap.add_argument('-o', '--out-dir', nargs='?', default = './', action='store', type=str, dest='out_dir',help= 'path to save track selection file') 11 | ap.add_argument('-stemcells', action = 'store_true', help='Include Stem cells [ESC, ESC_derived, IPSC, Placental]') 12 | ap.add_argument('-general', action = 'store_true', help='Include general tracks [fibroblasts, stromal cells, adipose tissue]') 13 | ap.add_argument('-other', action = 'store_true', help='Include all other misc tracks') 14 | ap.add_argument('-lung', action = 'store_true', help='Include lung tracks') 15 | ap.add_argument('-breast', action = 'store_true', help='Include breast tracks') 16 | ap.add_argument('-blood', action = 'store_true', help='Include blood tracks') 17 | ap.add_argument('-skin', action = 'store_true', help='Include skin tracks') 18 | ap.add_argument('-liver', action = 'store_true', help='Include liver tracks') 19 | ap.add_argument('-stomach', action = 'store_true', help='Include stomach tracks (limited selection)') 20 | ap.add_argument('-GC', action = 'store_true', help='Include GC content track') 21 | ap.add_argument('-HiC', action = 'store_true', help='Include all HiC tracks') 22 | ap.add_argument('-repli_chip', action = 'store_true', help='Include all repli-seq tracks') 23 | ap.add_argument('-cons', action = 'store_true', help='Include conservation tracks (included in general)') 24 | ap.add_argument('-seq', action = 'store_true', help='Include sequence context tracks (included in general)') 25 | return ap.parse_args() 26 | 27 | def main(): 28 | args = get_cmd_arguments() 29 | meta = pd.read_csv(open('/scratch1/priebeo/neurIPS/new_tracks_meta.csv', 'r')) 30 | track_lst = pkl.load(open(args.track_lst_path, 'rb')) 31 | track_lst = np.array([t.split('/')[-1].split('.')[0] for t in track_lst]) 32 | 33 | meta['track_pos'] = -1 34 | for i, l in 
enumerate(track_lst): 35 | meta.loc[meta['File accession'] == l, 'track_pos'] = i 36 | meta = meta.astype({'track_pos': int}) 37 | meta = meta.set_index('track_pos') 38 | meta.sort_index(inplace=True) 39 | 40 | track_accumulator = set() 41 | if args.stemcells: 42 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'StemCells'].index))) 43 | if args.general: 44 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'General'].index))) 45 | if args.other: 46 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Other'].index))) 47 | if args.lung: 48 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Lung'].index))) 49 | if args.breast: 50 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Breast'].index))) 51 | if args.blood: 52 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Blood'].index))) 53 | if args.skin: 54 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Skin'].index))) 55 | if args.liver: 56 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Liver'].index))) 57 | if args.stomach: 58 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Anatomy'] == 'Stomach'].index))) 59 | 60 | if args.GC: 61 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'GC_content'].index))) 62 | if args.seq: 63 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'hg19'].index))) 64 | if args.HiC: 65 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['File accession'] == 'GC_content'].index))) # FIXME: appears to be a copy-paste error -- this matches the GC_content track rather than the HiC tracks 66 | if args.cons: 67 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Assay'] == 'conservation'].index))) 68 | if args.repli_chip: 69 | track_accumulator = track_accumulator.union(set(np.array(meta.loc[meta['Assay'] == 'Repli-chip'].index))) 70 | 71 | to_add = np.array(sorted(track_accumulator)) 72 | if len(to_add[to_add < 0]) > 0: 73 | print('Some requested tracks are not present in the track list') 74 | to_add = to_add[to_add >= 0] 75 | print('adding {} tracks'.format(to_add.shape[0])) 76 | out_dir = os.path.join(args.out_dir, 'track_selection.txt') 77 | np.savetxt(out_dir, to_add, fmt='%i') 78 | 79 | print('Done!') 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /DIGDriver/driver_model/__init__.py: -------------------------------------------------------------------------------- 1 | ## init file for python module 2 | -------------------------------------------------------------------------------- /DIGDriver/driver_model/onthefly_tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pysam 4 | import multiprocessing as mp 5 | import pybedtools 6 | import pkg_resources 7 | import h5py 8 | import scipy 9 | import tempfile 10 | import os 11 | 12 | from DIGDriver.sequence_model import genic_driver_tools 13 | from DIGDriver.sequence_model import sequence_tools 14 | from DIGDriver.sequence_model import nb_model 15 | from DIGDriver.driver_model import transfer_tools 16 | from DIGDriver.data_tools import mutation_tools 17 | 18 | 19 | def region_str_to_params(region_str): 20 | col_split = region_str.split(":") 21 | chrom = 
col_split[0].lstrip("chr") 22 | #chrom = col_split[0] 23 | pos_split = col_split[1].split("-") 24 | start = int(pos_split[0]) 25 | end = int(pos_split[1]) 26 | return chrom, start, end 27 | 28 | def DIG_onthefly(f_pretrained, f_mut, f_fasta, f_elts_bed=None, region_str=None, 29 | scale_factor=None, scale_factor_indel=None, scale_type="genome", scale_by_expectation=True, 30 | max_muts_per_sample=3e9, max_muts_per_elt_per_sample=3e9, skip_pvals=False): 31 | assert f_elts_bed or region_str, "ERROR: you must provide --f-bed or --region_str." 32 | 33 | if region_str: 34 | temp_file, temp_name = tempfile.mkstemp() 35 | 36 | CHROM,START,END = region_str_to_params(region_str) 37 | os.write(temp_file, "{}\t{}\t{}\tUserELT\t0\t+\t0\t0\t.\t1\t{},\t0,".format(CHROM,START,END,END-START).encode()) 38 | os.close(temp_file) 39 | f_elts_bed = temp_name 40 | 41 | print('Tabulating mutations') 42 | df_mut_tab, blacklist = mutation_tools.tabulate_mutations_in_element(f_mut, f_elts_bed, bed12=True, drop_duplicates=True, all_elements = True, 43 | max_muts_per_sample=max_muts_per_sample, max_muts_per_elt_per_sample=max_muts_per_elt_per_sample, return_blacklist=True 44 | ) 45 | if scale_by_expectation: 46 | print('scaling by expected number of mutations') 47 | df_gene = transfer_tools.load_pretrained_model(f_pretrained) 48 | df_mut = transfer_tools.read_mutations_cds(f_mut) 49 | df_mut = df_mut[~df_mut.SAMPLE.isin(blacklist)] 50 | df_syn = df_mut[(df_mut.ANNOT == 'Synonymous') & (df_mut.GENE != 'TP53')].drop_duplicates() 51 | exp_syn = (df_gene[df_gene.index != 'TP53'].MU * df_gene[df_gene.index != 'TP53'].Pi_SYN).sum() 52 | cj = len(df_syn) / exp_syn 53 | 54 | ## INDEL scaling factor 55 | f_panel = 'data/genes_CGC_ALL.txt' 56 | genes = pd.read_table(pkg_resources.resource_stream('DIGDriver', f_panel), names=['GENE']) 57 | all_cosmic = genes.GENE.to_list() + ['CDKN2A.p14arf', 'CDKN2A.p16INK4a'] 58 | df_gene_null = df_gene[~df_gene.index.isin(all_cosmic)] 59 | df_mut_null = df_mut[~df_mut.index.isin(all_cosmic)] 60 | EXP_INDEL_UNIF = (df_gene_null.Pi_INDEL * df_gene_null.ALPHA_INDEL * df_gene_null.THETA_INDEL).sum() 61 | OBS_INDEL = len(df_mut_null[df_mut_null.ANNOT == 'INDEL']) 62 | cj_indel = OBS_INDEL / EXP_INDEL_UNIF 63 | elif scale_factor: 64 | cj = scale_factor 65 | cj_indel = scale_factor_indel 66 | else: 67 | print('Calculating scale factor') 68 | cj, cj_indel = transfer_tools.calc_scale_factor_efficient(f_mut, f_pretrained, scale_type=scale_type) 69 | 70 | L_contexts = sequence_tools.precount_region_contexts_parallel( 71 | f_elts_bed, f_fasta, 10, 10000, sub_elts = True, n_up=1, n_down=1) 72 | 73 | 74 | all_windows_df = pd.read_hdf(f_pretrained, 'region_params') 75 | window = all_windows_df.iloc[0][2]-all_windows_df.iloc[0][1] 76 | window_key = 'window_{}'.format(window) 77 | 78 | df_mut = pd.read_hdf(f_pretrained, key='sequence_model_192') 79 | mut_model_idx = [r[1] + '>' + r[1][0] + r[0][2] + r[1][2] for r in zip(df_mut.MUT_TYPE, df_mut.CONTEXT)] 80 | subst_idx = sorted(mut_model_idx) 81 | revc_subst_idx = [sequence_tools.reverse_complement(sub.split('>')[0]) + '>' + sequence_tools.reverse_complement(sub.split('>')[\ 82 | -1]) for sub in subst_idx] 83 | revc_dic = dict(zip(subst_idx, revc_subst_idx)) 84 | 85 | d_pr = pd.DataFrame(df_mut.FREQ.values, mut_model_idx) 86 | d_pr = d_pr.sort_index()[0].values 87 | 88 | df_elts = mutation_tools.bed12_boundaries(f_elts_bed) 89 | 90 | 91 | elt_lst = [] 92 | mu_lst = [] 93 | sigma_lst = [] 94 | R_obs_lst = [] 95 | alpha_lst = [] 96 | theta_lst = [] 97 | p_mut_lst 
= [] 98 | flag_lst = [] 99 | 100 | mu_ind_lst = [] 101 | sigma_ind_lst = [] 102 | R_size_lst = [] 103 | elt_len_lst = [] 104 | alpha_ind_lst = [] 105 | theta_ind_lst = [] 106 | p_ind_lst = [] 107 | R_ind_lst = [] 108 | 109 | for _, row in df_elts.iterrows(): 110 | 111 | chrom = row['CHROM'] 112 | elt = row['ELT'] 113 | strand = row['STRAND'] 114 | block_starts = row['BLOCK_STARTS'] 115 | block_ends = row['BLOCK_ENDS'] 116 | elts_as_intervals = np.vstack((block_starts, block_ends)) 117 | overlaps = genic_driver_tools.get_ideal_overlaps(chrom, elts_as_intervals, window) 118 | 119 | chrom_lst, start_lst, end_lst = ['chr' + str(r[0]) for r in overlaps], [r[1] for r in overlaps], [r[2] for r in overlaps] 120 | region_df = sequence_tools.count_contexts_by_regions(f_fasta, chrom_lst, start_lst, end_lst, n_up=1, n_down=1) 121 | region_counts = np.array([np.repeat(region, 3) for region in region_df.values]).sum(axis=0) 122 | 123 | # if negative strand, take the reverse complement of the region counts 124 | if strand == '-1' or strand == '-': 125 | region_counts = np.array([r[1] for r in sorted(enumerate(region_counts), key=lambda k: revc_dic[subst_idx[k[0]]])]) 126 | 127 | L = np.zeros((192)) 128 | for start, end in zip(block_starts, block_ends): 129 | L += L_contexts.loc['chr{}:{}-{}'.format(chrom, start,end)].values 130 | 131 | prob_sum = region_counts * d_pr 132 | 133 | t_pi = d_pr / prob_sum.sum() 134 | 135 | p_mut = (t_pi * L).sum() 136 | 137 | p_mut_lst.append(p_mut) 138 | mu, sigma, R_obs, FLAG = genic_driver_tools.get_region_params_direct(all_windows_df, overlaps, window) 139 | alpha, theta = nb_model.normal_params_to_gamma(mu, sigma) 140 | theta = theta * cj 141 | 142 | flag_lst.append(FLAG) 143 | R_size_lst.append(int(region_counts.sum() / 3)) ## length of region containing gene 144 | 145 | elt_len_lst.append(int(np.sum(L) / 3)) 146 | p_ind_lst.append(elt_len_lst[-1] / R_size_lst[-1]) 147 | 148 | 149 | mu_ind, sigma_ind, R_ind = mu, sigma, R_obs 150 | alpha_ind, theta_ind = nb_model.normal_params_to_gamma(mu_ind, sigma_ind) 151 | theta_ind = theta_ind * cj_indel 152 | 153 | alpha_ind_lst.append(alpha_ind) 154 | theta_ind_lst.append(theta_ind) 155 | mu_ind_lst.append(mu_ind) 156 | sigma_ind_lst.append(sigma_ind) 157 | 158 | R_ind_lst.append(R_ind) 159 | elt_lst.append(elt) 160 | mu_lst.append(mu) 161 | sigma_lst.append(sigma) 162 | R_obs_lst.append(R_obs) 163 | alpha_lst.append(alpha) 164 | theta_lst.append(theta) 165 | 166 | 167 | pretrain_df = pd.DataFrame({'ELT_SIZE':elt_len_lst, 'FLAG': flag_lst, 'R_SIZE':R_size_lst, 'R_OBS':R_obs_lst, 'R_INDEL':R_ind_lst, 168 | 'MU':mu_lst, 'SIGMA':sigma_lst, 'ALPHA':alpha_lst, 'THETA':theta_lst, 169 | 'MU_INDEL': mu_ind_lst, 'SIGMA_INDEL':sigma_ind_lst, 'ALPHA_INDEL':alpha_ind_lst, 'THETA_INDEL':theta_ind_lst, 170 | 'Pi_SUM':p_mut_lst, 'Pi_INDEL':p_ind_lst 171 | }, index=elt_lst) 172 | 173 | df_model = df_mut_tab.merge(pretrain_df, left_on='ELT', right_index=True) 174 | df_model = transfer_tools.element_expected_muts_nb(df_model) 175 | 176 | 177 | 178 | if not skip_pvals: 179 | df_model = transfer_tools.element_pvalue_burden_nb(df_model) 180 | df_model = transfer_tools.element_pvalue_burden_nb_by_sample(df_model) 181 | df_model = transfer_tools.element_pvalue_indel(df_model, cj_indel) 182 | df_model['PVAL_MUT_BURDEN'] = [ 183 | scipy.stats.combine_pvalues([row.PVAL_SNV_BURDEN, row.PVAL_INDEL_BURDEN], 184 | method='fisher' 185 | )[1] 186 | for i, row in df_model.iterrows() 187 | ] 188 | if 
region_str: 189 | os.remove(temp_name) 190 | return df_model 191 | -------------------------------------------------------------------------------- /DIGDriver/region_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxwellsh/DIGDriver/5bb565a1fbb3924ecdaaedeffb97123febc3b4d1/DIGDriver/region_model/.DS_Store -------------------------------------------------------------------------------- /DIGDriver/region_model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/ae_nets/CNNs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, transpose 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | 6 | class ResNetEncoder(nn.Module): 7 | def __init__(self, shape): 8 | super().__init__() 9 | self.inp_len = shape[1] 10 | self.inp_size = shape[2] 11 | 12 | self.hidden_dim = 128 13 | self.fc2_dim = 128 14 | self.fc3_dim = 16 15 | 16 | 17 | self.conv11 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=5, padding=1, stride=1) 18 | self.bn11 = nn.BatchNorm1d(128) 19 | self.conv12 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 20 | self.bn12 = nn.BatchNorm1d(256) 21 | 22 | self.conv21 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 23 | self.bn21 = nn.BatchNorm1d(256) 24 | self.conv22 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 25 | self.bn22 = nn.BatchNorm1d(256) 26 | 27 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1, stride=2) 28 | self.bn3 = nn.BatchNorm1d(512) 29 | 30 | self.conv41 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 31 | self.bn41 = nn.BatchNorm1d(512) 32 | self.conv42 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 33 | self.bn42 = nn.BatchNorm1d(512) 34 | 35 | self.conv5 = nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=3, padding=1, stride=2) 36 | self.bn5 = nn.BatchNorm1d(1024) 37 | 38 | self.conv61 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 39 | self.bn61 = nn.BatchNorm1d(1024) 40 | self.conv62 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 41 | self.bn62 = nn.BatchNorm1d(1024) 42 | 43 | self.fc1 = nn.Linear(in_features=int(1024 * 13), out_features=self.fc2_dim) 44 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc3_dim) 45 | self.fc3 = nn.Linear(in_features=self.fc3_dim, out_features=16) 46 | 47 | #decoding network 48 | self.dfc3 = nn.Linear(in_features=16, out_features=self.fc3_dim) 49 | self.dfc2 = nn.Linear(in_features=self.fc3_dim, out_features=self.fc2_dim) 50 | self.dfc1 = nn.Linear(in_features=self.fc2_dim, out_features=int(1024 * 13)) 51 | 52 | def forward(self, x): 53 | x = transpose(x, 1, 2) 54 | 55 | x = F.relu(self.bn11(self.conv11(x))) 56 | x = F.relu(self.bn12(self.conv12(x))) 57 | res = x 58 | x = F.relu(self.bn21(self.conv21(x))) 59 | x = F.relu(self.bn22(self.conv22(x))) 60 | x += res 61 | x = F.relu(self.bn3(self.conv3(x))) 62 | res = x 63 | x = F.relu(self.bn41(self.conv41(x))) 64 | x = F.relu(self.bn42(self.conv42(x))) 65 | x += res 66 | x = F.relu(self.bn5(self.conv5(x))) 67 
| res = x 68 | x = F.relu(self.bn61(self.conv61(x))) 69 | x = F.relu(self.bn62(self.conv62(x))) 70 | x += res 71 | 72 | x = x.view(-1, int(1024 * 13)) 73 | 74 | x = F.relu(self.fc1(x)) 75 | x = F.relu(self.fc2(x)) 76 | x = self.fc3(x) 77 | 78 | return x 79 | 80 | class ResNet_NoBN_Encoder(nn.Module): 81 | def __init__(self, shape): 82 | super().__init__() 83 | self.inp_len = shape[1] 84 | self.inp_size = shape[2] 85 | 86 | self.hidden_dim = 128 87 | self.fc2_dim = 128 88 | self.fc3_dim = 16 89 | 90 | 91 | self.conv11 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=5, padding=1, stride=1) 92 | self.conv12 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 93 | 94 | self.conv21 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 95 | self.conv22 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 96 | 97 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1, stride=2) 98 | 99 | self.conv41 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 100 | self.conv42 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 101 | 102 | self.conv5 = nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=3, padding=1, stride=2) 103 | 104 | self.conv61 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 105 | self.conv62 = nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 106 | 107 | self.fc1 = nn.Linear(in_features=int(1024 * 13), out_features=self.fc2_dim) 108 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc3_dim) 109 | self.fc3 = nn.Linear(in_features=self.fc3_dim, out_features=16) 110 | 111 | def forward(self, x): 112 | x = transpose(x, 1, 2) 113 | 114 | x = F.relu(self.conv11(x)) 115 | x = F.relu(self.conv12(x)) 116 | res = x 117 | x = F.relu(self.conv21(x)) 118 | x = F.relu(self.conv22(x)) 119 | x += res 120 | x = F.relu(self.conv3(x)) 121 | res = x 122 | x = F.relu(self.conv41(x)) 123 | x = F.relu(self.conv42(x)) 124 | x += res 125 | x = F.relu(self.conv5(x)) 126 | res = x 127 | x = F.relu(self.conv61(x)) 128 | x = F.relu(self.conv62(x)) 129 | x += res 130 | 131 | x = x.view(-1, int(1024 * 13)) 132 | 133 | x = F.relu(self.fc1(x)) 134 | x = F.relu(self.fc2(x)) 135 | x = self.fc3(x) 136 | 137 | return x 138 | 139 | class ResNetDecoder(nn.Module): 140 | def __init__(self, shape): 141 | super().__init__() 142 | self.inp_len = shape[1] 143 | self.inp_size = shape[2] 144 | 145 | self.hidden_dim = 128 146 | self.fc2_dim = 128 147 | self.fc3_dim = 16 148 | 149 | #decoding network 150 | self.dfc3 = nn.Linear(in_features=16, out_features=self.fc3_dim) 151 | self.dfc2 = nn.Linear(in_features=self.fc3_dim, out_features=self.fc2_dim) 152 | self.dfc1 = nn.Linear(in_features=self.fc2_dim, out_features=int(1024 * 13)) 153 | 154 | #self.dbn62 = nn.BatchNorm1d(1024) 155 | self.dconv62 = nn.ConvTranspose1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 156 | #self.dbn61 = nn.BatchNorm1d(1024) 157 | self.dconv61 = nn.ConvTranspose1d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, stride=1) 158 | 159 | #self.dbn5 = nn.BatchNorm1d(1024) 160 | self.dconv5 = nn.ConvTranspose1d(in_channels=1024, out_channels=512, kernel_size=3, padding=1, stride=2) 161 | 162 | #self.dbn42 = nn.BatchNorm1d(512) 163 | self.dconv42 = nn.ConvTranspose1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, 
stride=1) 164 | #self.dbn41 = nn.BatchNorm1d(512) 165 | self.dconv41 = nn.ConvTranspose1d(in_channels=512, out_channels=512, kernel_size=3, padding=1, stride=1) 166 | 167 | #self.dbn3 = nn.BatchNorm1d(512) 168 | self.dconv3 = nn.ConvTranspose1d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=2) 169 | 170 | #self.dbn22 = nn.BatchNorm1d(256) 171 | self.dconv22 = nn.ConvTranspose1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 172 | #self.dbn21 = nn.BatchNorm1d(256) 173 | self.dconv21 = nn.ConvTranspose1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=1) 174 | 175 | #self.dbn12 = nn.BatchNorm1d(256) 176 | self.dconv12 = nn.ConvTranspose1d(in_channels=256, out_channels=128, kernel_size=3, padding=1, stride=2) 177 | #self.dbn11 = nn.BatchNorm1d(128) 178 | self.dconv11 = nn.ConvTranspose1d(in_channels=128, out_channels=self.inp_size, kernel_size=5, padding=1, stride=1) 179 | 180 | self.last = torch.nn.ConvTranspose1d(in_channels = self.inp_size, out_channels = self.inp_size, kernel_size = 4, padding = 1, stride =1) 181 | 182 | def forward(self, x): 183 | x = F.relu(self.dfc3(x)) 184 | x = F.relu(self.dfc2(x)) 185 | x = F.relu(self.dfc1(x)) 186 | 187 | x = x.view(-1, 1024, 13) 188 | 189 | x = F.relu(self.dconv62(x)) 190 | x = F.relu(self.dconv61(x)) 191 | 192 | x = F.relu(self.dconv5(x)) 193 | 194 | x = F.relu(self.dconv42(x)) 195 | x = F.relu(self.dconv41(x)) 196 | 197 | x = F.relu(self.dconv3(x)) 198 | 199 | x = F.relu(self.dconv22(x)) 200 | x = F.relu(self.dconv21(x)) 201 | 202 | x = F.relu(self.dconv12(x)) 203 | x = F.relu(self.dconv11(x)) 204 | x = F.relu(self.last(x)) 205 | x = transpose(x, 1, 2) 206 | return x 207 | 208 | class ResNetLinearDecoder(nn.Module): 209 | def __init__(self, shape): 210 | super().__init__() 211 | self.inp_len = shape[1] 212 | self.inp_size = shape[2] 213 | 214 | self.hidden_dim = 128 215 | self.fc2_dim = 128 216 | self.fc3_dim = 16 217 | 218 | #decoding network 219 | self.decoder = nn.Sequential( 220 | nn.Linear(in_features=16, out_features=64), 221 | nn.ReLU(), 222 | nn.Linear(in_features=64, out_features=128), 223 | nn.ReLU(), 224 | nn.Linear(in_features=128, out_features=256), 225 | nn.ReLU(), 226 | nn.Linear(in_features=256, out_features=2048), 227 | nn.ReLU(), 228 | nn.Linear(in_features=2048, out_features=self.inp_len * self.inp_size), 229 | ) 230 | 231 | def forward(self, x): 232 | x = self.decoder(x) 233 | x = x.view(-1, 100, 734) 234 | return x 235 | 236 | class ResNetShallowLinearDecoder(nn.Module): 237 | def __init__(self, shape): 238 | super().__init__() 239 | self.inp_len = shape[1] 240 | self.inp_size = shape[2] 241 | 242 | self.hidden_dim = 128 243 | self.fc2_dim = 128 244 | self.fc3_dim = 16 245 | 246 | #decoding network 247 | self.decoder = nn.Sequential( 248 | nn.Linear(in_features=16, out_features=256), 249 | nn.ReLU(), 250 | nn.Linear(in_features=256, out_features=self.inp_len * self.inp_size), 251 | ) 252 | 253 | def forward(self, x): 254 | x = self.decoder(x) 255 | x = x.view(-1, 100, 734) 256 | return x 257 | 258 | class ResNetAE(nn.Module): 259 | def __init__(self, shape): 260 | super().__init__() 261 | self.inp_len = shape[1] 262 | self.inp_size = shape[2] 263 | 264 | self.hidden_dim = 128 265 | self.fc2_dim = 128 266 | self.fc3_dim = 16 267 | 268 | self.encoder = ResNetEncoder(shape) 269 | self.decoder = ResNetDecoder(shape) 270 | 271 | def forward(self, x): 272 | encoded = self.encoder(x) 273 | x = self.decoder(encoded) 274 | return encoded, x 275 | 276 | def 
embeding(self, x): 277 | x = self.encoder(x) 278 | return x 279 | class ResNetAE_LD(nn.Module): 280 | def __init__(self, shape): 281 | super().__init__() 282 | self.inp_len = shape[1] 283 | self.inp_size = shape[2] 284 | 285 | self.hidden_dim = 128 286 | self.fc2_dim = 128 287 | self.fc3_dim = 16 288 | 289 | self.encoder = ResNetEncoder(shape) 290 | self.decoder = ResNetLinearDecoder(shape) 291 | 292 | def forward(self, x): 293 | x = self.encoder(x) 294 | x = self.decoder(x) 295 | return x 296 | 297 | def embeding(self, x): 298 | x = self.encoder(x) 299 | return x 300 | 301 | class ResNetAE_SLD(nn.Module): 302 | def __init__(self, shape): 303 | super().__init__() 304 | self.inp_len = shape[1] 305 | self.inp_size = shape[2] 306 | 307 | self.hidden_dim = 128 308 | self.fc2_dim = 128 309 | self.fc3_dim = 16 310 | 311 | self.encoder = ResNetEncoder(shape) 312 | self.decoder = ResNetShallowLinearDecoder(shape) 313 | 314 | def forward(self, x): 315 | x = self.encoder(x) 316 | x = self.decoder(x) 317 | return x 318 | 319 | def embeding(self, x): 320 | x = self.encoder(x) 321 | return x 322 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/ae_nets/fc_nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Autoencoder_FC(nn.Module): 5 | def __init__(self, in_shape): 6 | super().__init__() 7 | bs,w,tracks = in_shape 8 | self.encoder = nn.Sequential( 9 | nn.Linear(w * tracks, 512), 10 | nn.ReLU(), 11 | nn.Linear(512, 128), 12 | nn.ReLU(), 13 | nn.Linear(128, 32), 14 | nn.ReLU(), 15 | nn.Linear(32, 16), 16 | ) 17 | self.decoder = nn.Sequential( 18 | nn.Linear(16, 32), 19 | nn.ReLU(), 20 | nn.Linear(32, 128), 21 | nn.ReLU(), 22 | nn.Linear(128, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, w * tracks), 25 | ) 26 | 27 | def forward(self, x): 28 | x = self.encoder(x) 29 | x = self.decoder(x) 30 | return x 31 | 32 | def embeding(self, x): 33 | x = self.encoder(x) 34 | return x 35 | 36 | class Mean_Vec_Autoencoder_FC(nn.Module): 37 | def __init__(self, in_shape): 38 | super().__init__() 39 | bs,w,tracks = in_shape 40 | self.encoder = nn.Sequential( 41 | nn.Linear(tracks, 512), 42 | nn.ReLU(), 43 | nn.Linear(512, 128), 44 | nn.ReLU(), 45 | nn.Linear(128, 32), 46 | nn.ReLU(), 47 | nn.Linear(32, 16), 48 | ) 49 | self.decoder = nn.Sequential( 50 | nn.Linear(16, 32), 51 | nn.ReLU(), 52 | nn.Linear(32, 128), 53 | nn.ReLU(), 54 | nn.Linear(128, 512), 55 | nn.ReLU(), 56 | nn.Linear(512, tracks), 57 | ) 58 | 59 | def forward(self, x): 60 | x = self.encoder(x) 61 | x = self.decoder(x) 62 | return x 63 | 64 | def embeding(self, x): 65 | x = self.encoder(x) 66 | return x 67 | -------------------------------------------------------------------------------- /DIGDriver/region_model/autoencoders/autoencoder_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import h5py 5 | import copy 6 | import argparse 7 | import datetime 8 | import numpy as np 9 | import pandas as pd 10 | from torch import nn, optim 11 | from tensorboardX import SummaryWriter 12 | from torch.utils.data import DataLoader 13 | 14 | py_file_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | file_path = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | sys.path.append(os.path.join(py_file_path, 'data_aux')) 18 | sys.path.append(os.path.join(py_file_path, 'trainers')) 19 | 
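# [Editorial sketch -- not part of the original file.] The fully connected
# autoencoder defined in fc_nets.py above compresses a flattened epigenetic
# window down to a 16-dim code. A minimal smoke test, assuming the 100-bin x
# 734-track window shape that CNNs.py hard-codes elsewhere in this module
# (note the original method spelling 'embeding'):
def _example_fc_autoencoder():
    import torch
    from fc_nets import Autoencoder_FC  # resolvable once ae_nets is on sys.path
    x = torch.randn(8, 100, 734)             # (batch, window bins, tracks)
    model = Autoencoder_FC(x.shape)          # in_shape = (bs, w, tracks)
    flat = x.view(8, 1, 100 * 734)           # 'fc' nets expect flattened windows
    recon = model(flat)                      # reconstruction of the input
    code = model.embeding(flat)              # 16-dim latent embedding
    return recon.shape, code.shape           # (8, 1, 73400), (8, 1, 16)
# Editorial note: several CLI flags below (e.g. --gaussian, --save-model) use
# type=bool, and --save-training uses type=float; argparse converts any
# non-empty string to True, so action='store_true' is the conventional way to
# express these boolean switches.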
sys.path.append(os.path.join(file_path, 'ae_nets')) 20 | 21 | from dataset_generator import * 22 | from fc_nets import * 23 | from gp_trainer import * 24 | from CNNs import * 25 | 26 | 27 | def get_cmd_arguments(): 28 | ap = argparse.ArgumentParser() 29 | 30 | # Required cancer type argument 31 | ap.add_argument('-c', '--cancer-id', required=True, nargs='*', action='store', type=str, dest='label_ids', 32 | help='A list of the h5 file mutation count dataset IDs (e.g. SNV_skin_melanoma_MELAU_AU)') 33 | 34 | # Path arguments 35 | ap.add_argument('-d', "--data", required=False, nargs='?', action='store', type=str, dest='data_file', 36 | default='/storage/datasets/cancer/unzipped_data_matrices_PCAWG_10000_0_0.0.h5', help='Path to h5 data file') 37 | ap.add_argument('-o', "--out-dir", required=False, nargs='?', action='store', type=str, dest='out_dir', 38 | default='/storage/yaari/mutation-density-outputs', help='Path to output directory') 39 | ap.add_argument('-u', "--held-out", required=False, nargs='?', action='store', type=str, dest='heldout_file', 40 | default=None, help='Path to file of held-out samples file') 41 | 42 | # Run type parameters 43 | ap.add_argument('-s', "--split", required=False, nargs='?', action='store', type=str, dest='split_method', 44 | default='random', help='Dataset split method (random/chr)') 45 | ap.add_argument('-m', "--mappability", required=False, nargs='?', action='store', type=float, dest='mappability', 46 | default=0.7, help='Mappability lower bound') 47 | ap.add_argument('-gp', "--gaussian", required=False, nargs='?', action='store', type=bool, dest='run_gaussian', 48 | default=False, help='True: train gaussian process regression on the best performing model') 49 | ap.add_argument('-n', "--network", required=False, nargs='?', action='store', type=str, dest='net', 50 | default='cnn', help='The type of neural network model to use (\'fc\' or \'cnn\')') 51 | 52 | # Train parameters 53 | ap.add_argument('-r', "--train-ratio", required=False, nargs='?', action='store', type=float, dest='train_ratio', 54 | default=0.8, help='Train set split size ratio') 55 | ap.add_argument('-ho', "--heldout-ratio", required=False, nargs='?', action='store', type=float, dest='heldout_ratio', 56 | default=0.2, help='Held-out set split size ratio (will be extracted prior to train validation split)') 57 | ap.add_argument('-e', "--epochs", required=False, nargs='?', action='store', type=int, dest='epochs', 58 | default=20, help='Number of epochs') 59 | ap.add_argument('-b', "--batch", required=False, nargs='?', action='store', type=int, dest='bs', 60 | default=128, help='Batch size') 61 | ap.add_argument('-re', "--reruns", required=False, nargs='?', action='store', type=int, dest='nn_reruns', 62 | default=1, help='Number of NN reinitializations and training runs') 63 | ap.add_argument('-gr', "--gp-reruns", required=False, nargs='?', action='store', type=int, dest='gp_reruns', 64 | default=1, help='Number of GP reinitializations and training runs') 65 | 66 | ap.add_argument('-lr', "--learning-rate", required = False, nargs='?', action='store', type = float, dest = 'lr', default=1e-3, help = 'learning rate for training') 67 | 68 | # Run management parameters 69 | ap.add_argument('-sm', "--save-model", required=False, nargs='?', action='store', type=bool, dest='save_model', 70 | default=False, help='True: save best model across all reruns') 71 | ap.add_argument('-st', "--save-training", required=False, nargs='?', action='store', type=float, dest='save_training', 72 | default=False, 
help='True: save training process and results to Tensorboard file') 73 | ap.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 74 | default='all', help='GPUs devices (all/comma separted list)') 75 | 76 | return ap.parse_args() 77 | 78 | def train(model, device, epoch, train_ds, loss_func, optimizer, net_type, writer = None): 79 | model.train() 80 | batch_num = len(train_ds) 81 | loss_sum = 0 82 | for batch_idx, (X, y) in enumerate(train_ds): 83 | #flatten 84 | bs, w, tracks = X.size() 85 | if net_type == 'fc': 86 | X = X.view(bs,-1, w * tracks) 87 | X = X.to(device) 88 | decoded = model(X) 89 | 90 | loss = loss_func(decoded, X) 91 | optimizer.zero_grad() 92 | loss.backward() 93 | 94 | optimizer.step() 95 | 96 | loss_sum += loss.item() 97 | ### LOGGING 98 | if not batch_idx % 50: 99 | print ('Epoch: %d | Batch %03d/%03d | Loss: %.4f' 100 | %(epoch, batch_idx, len(train_ds), loss)) 101 | epoch_loss = loss_sum / batch_num 102 | if writer is not None: 103 | writer.add_scalar('train_loss', epoch_loss, epoch) 104 | 105 | def embed(model, device, data_ds, label_ids, net_type): 106 | model.eval() 107 | data_loader = DataLoader(data_ds, batch_size=2048, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 108 | all_features = [[] for _ in range(len(label_ids))] 109 | all_true = [[] for _ in range(len(label_ids))] 110 | for j, (X, t_lst) in enumerate(data_loader): 111 | bs, w, tracks = X.size() 112 | if net_type == 'fc': 113 | X = X.view(bs,-1, w * tracks) 114 | X = X.to(device) 115 | features_lst = model.module.embeding(X) 116 | with torch.no_grad(): 117 | for i, t in enumerate(t_lst): 118 | if net_type == 'fc': 119 | feature_vecs = features_lst[:,0,:] 120 | else: 121 | feature_vecs = features_lst 122 | all_features[i].append(feature_vecs.cpu().detach().numpy()) 123 | all_true[i].extend(t.data.cpu().numpy().tolist()) 124 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))] 125 | return all_features, all_true 126 | 127 | def eval(model, device, data_ds, loss_fn, label_ids, net_type, writer = None): 128 | model.eval() 129 | batch_num = len(data_ds) 130 | loss_sum = 0 131 | 132 | for j, (X, t_lst) in enumerate(data_ds): 133 | bs, w, tracks = X.size() 134 | if net_type == 'fc': 135 | X = X.view(bs,-1, w * tracks) 136 | X = X.to(device) 137 | decoded = model(X) 138 | with torch.no_grad(): 139 | loss_sum += loss_fn(decoded, X)# + torch.norm(attention, p=1, dim=(1,2)).mean() 140 | test_loss = loss_sum / batch_num 141 | 142 | print('====> Test set loss: {}'.format(test_loss)) 143 | if writer is not None: 144 | writer.add_scalar('test_loss', test_loss, epoch) 145 | 146 | return test_loss 147 | 148 | def main(): 149 | args = get_cmd_arguments() 150 | labels_str = '-'.join(args.label_ids) 151 | out_dir = os.path.join(args.out_dir) 152 | print('Generating prediction for cancer types: {}'.format(args.label_ids)) 153 | 154 | # Configure GPUs 155 | if args.gpus is None: 156 | print('Using CPU device.') 157 | device = torch.device('cpu') 158 | else: 159 | print('Using GPU device: \'{}\''.format(args.gpus)) 160 | device = torch.device('cuda') 161 | if args.gpus != 'all': 162 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 163 | 164 | data_generator = DatasetGenerator(args.data_file, 165 | args.label_ids, 166 | args.mappability, 167 | args.heldout_ratio, 168 | heldout_file=args.heldout_file,) 169 | 170 | bs = args.bs 171 | net_type = args.net 172 | train_ds, test_ds = data_generator.get_datasets(args.split_method, 
args.train_ratio) 173 | ho_ds = data_generator.get_heldout_dataset() 174 | train_dataloader = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False, num_workers=16) 175 | test_dataloader = DataLoader(test_ds, batch_size=bs, shuffle=False, drop_last=False, num_workers=16) 176 | 177 | if net_type == 'fc': 178 | model = Autoencoder_FC(train_ds.get_data_shape()) 179 | elif net_type == 'cnn_ld': 180 | model = ResNetAE_LD(train_ds.get_data_shape()) 181 | elif net_type == 'cnn_sld': 182 | model = ResNetAE_SLD(train_ds.get_data_shape()) 183 | else: 184 | model = ResNetAE(train_ds.get_data_shape()) 185 | 186 | if args.gpus is not None: model = nn.DataParallel(model) 187 | model.to(device) 188 | 189 | print('Running {} AE model'.format(net_type)) 190 | optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=False) 191 | loss_fn = nn.MSELoss() 192 | 193 | epochs = args.epochs 194 | # Create output directory 195 | if args.save_model or args.save_training or args.run_gaussian: 196 | print('Saving results under: \'{}\''.format(out_dir)) 197 | os.makedirs(out_dir) 198 | args_dict = vars(args) 199 | with open(os.path.join(out_dir, 'run_params.txt'), 'w') as f: 200 | [f.write('{}: {}\n'.format(k, args_dict[k])) for k in args_dict.keys()] 201 | 202 | if args.save_training: 203 | writer = SummaryWriter(logdir=out_dir, comment=labels_str) 204 | writer.add_text('configurations', str(args), 0) 205 | writer.add_text('model', str(model), 0) 206 | else: 207 | writer = None 208 | for epoch in range(1, epochs + 1): 209 | print('Running epoch {}/{}'.format(epoch, epochs)) 210 | train(model, device, epoch, train_dataloader, loss_fn, optimizer, net_type, writer) 211 | eval(model, device, test_dataloader, loss_fn, args.label_ids, net_type) 212 | 213 | print('Done Training!') 214 | 215 | if args.save_model: 216 | print('Saving model') 217 | torch.save(model.state_dict(), os.path.join(out_dir, 'saved_model_{}_e{}_{}.h5'.format(net_type, epochs, 218 | args.label_ids[i]))) 219 | train_features, train_labels = embed(model,device, train_ds, args.label_ids, net_type) 220 | test_features, test_labels = embed(model, device, test_ds, args.label_ids, net_type) 221 | ho_features, ho_labels = embed(model, device, ho_ds, args.label_ids, net_type) 222 | 223 | #run gaussian 224 | for i in range(len(args.label_ids)): 225 | print('Running gaussian process model for {}...'.format(args.label_ids[i])) 226 | train_set = (np.array(train_features[i]), np.array(train_labels[i]), train_ds.get_chromosome_locations()) 227 | test_set = (np.array(test_features[i]), np.array(test_labels[i]), test_ds.get_chromosome_locations()) 228 | ho_set = (np.array(ho_features[i]), np.array(ho_labels[i]), ho_ds.get_chromosome_locations()) 229 | best_r2 = 0 230 | for j in range(args.gp_reruns): 231 | print('GP run {}/{}...'.format(j, args.gp_reruns)) 232 | run_successeed = False 233 | n_inducing = 2000 234 | while not run_successeed and n_inducing > 0: 235 | gp_trainer = GPTrainer(device, train_set, test_set, heldout_tup=ho_set, n_inducing=n_inducing) 236 | try: 237 | print('Running GP with {} inducing points...'.format(n_inducing)) 238 | gp_test_results, gp_ho_results = gp_trainer.run() 239 | except RuntimeError as err: 240 | print('Run failed with {} inducing points. 
Encountered run-time error in training: {}' 241 | .format(n_inducing, err)) 242 | n_inducing -= 200 243 | continue 244 | run_successeed = True 245 | if gp_test_results['r2'] > best_r2: 246 | best_test_results, best_ho_results = gp_test_results, gp_ho_results 247 | best_r2 = gp_test_results['r2'] 248 | gp_out_path = os.path.join(out_dir, 'model_{}_e{}_gp_results_{}.h5'.format(net_type, epochs, args.label_ids[i])) 249 | if best_r2 > 0: 250 | gp_trainer.save_results(gp_out_path, best_test_results, best_ho_results) 251 | else: 252 | gp_trainer.save_results(gp_out_path, gp_test_results, gp_ho_results) 253 | 254 | 255 | 256 | if __name__ == '__main__': 257 | main() 258 | -------------------------------------------------------------------------------- /DIGDriver/region_model/data_aux/mut_dataset.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import numpy as np 4 | from torch.utils.data import Dataset 5 | 6 | class SimpleDataset(Dataset): 7 | 8 | def __init__(self, data, labels_lst): 9 | self.data = data 10 | self.labels_lst = [lbl for lbl in labels_lst] 11 | 12 | def __len__(self): 13 | return self.data.shape[0] 14 | 15 | def __getitem__(self, idx): 16 | X = torch.tensor(self.data[idx]).float() 17 | y_lst = [torch.tensor(l[idx]).float() for l in self.labels_lst] 18 | return X, y_lst 19 | 20 | def get_data_shape(self): 21 | return self.data.shape 22 | 23 | def get_train_set_length(self, train_ratio): 24 | return int(train_ratio * self.data.shape[0]) 25 | 26 | 27 | class BaseDatasetFromH5(Dataset): 28 | def __init__(self, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks): 29 | self.preprocessed_idxs = preprocessed_idxs 30 | self.chr_locations = chr_locations 31 | self.selected_tracks = selected_tracks 32 | self.mappability = mappability 33 | self.quantiles = quantiles 34 | 35 | def __len__(self): 36 | return len(self.preprocessed_idxs) 37 | 38 | def get_set_indices(self): 39 | return self.preprocessed_idxs 40 | 41 | def get_chromosome_locations(self): 42 | return self.chr_locations[self.preprocessed_idxs] 43 | 44 | def get_mappability_values(self): 45 | return self.mappability[self.preprocessed_idxs] 46 | 47 | def get_quantile_values(self): 48 | return self.quantiles[self.preprocessed_idxs] 49 | 50 | 51 | class SimpleDatasetFromH5(BaseDatasetFromH5): 52 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id): 53 | super(SimpleDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks) 54 | print('Loading data and labels from file {}...'.format(h5_file)) 55 | with h5py.File(h5_file, 'r') as h5f: 56 | self.data = torch.tensor(h5f[data_id][np.sort(self.preprocessed_idxs)]).float() 57 | self.labels_lst = [torch.tensor(h5f[l][np.sort(self.preprocessed_idxs)]).float() for l in label_ids] 58 | print('Loaded input data of size: {}'.format(self.data.shape)) 59 | 60 | def __getitem__(self, idx): 61 | X = self.data[idx, :, self.selected_tracks] 62 | y_lst = [l[idx] for l in self.labels_lst] 63 | return X, y_lst 64 | 65 | def get_data_shape(self): 66 | return self.data.shape 67 | 68 | 69 | class LazyLoadDatasetFromH5(BaseDatasetFromH5): 70 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id, auto_context=None): 71 | super(LazyLoadDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, 
selected_tracks) 72 | self.h5_file = h5_file 73 | self.label_ids = label_ids 74 | self.data_id = data_id 75 | 76 | def __getitem__(self, idx): 77 | data_idx = self.preprocessed_idxs[idx] 78 | with h5py.File(self.h5_file,'r') as db: 79 | X = torch.tensor(db[self.data_id][data_idx, :, self.selected_tracks]).float() 80 | y_lst = [torch.tensor(db[l][data_idx]).float() for l in self.label_ids] 81 | return X, y_lst 82 | 83 | def get_data_shape(self): 84 | with h5py.File(self.h5_file,'r') as db: 85 | return (len(self.preprocessed_idxs), db[self.data_id].shape[1], len(self.selected_tracks)) 86 | 87 | 88 | class AutoregressiveDatasetFromH5(BaseDatasetFromH5): 89 | def __init__(self, h5_file, label_ids, preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks, data_id, auto_context=1): 90 | super(AutoregressiveDatasetFromH5, self).__init__(preprocessed_idxs, chr_locations, mappability, quantiles, selected_tracks) 91 | self.h5_file = h5_file 92 | self.label_ids = label_ids 93 | self.data_id = data_id 94 | self.auto_context = auto_context 95 | 96 | def get_context(self, c_idx, s_idx, e_idx): 97 | s = s_idx if s_idx >= 0 else 0 98 | e = e_idx if e_idx < len(self.chr_locations) else len(self.chr_locations) - 1 99 | return np.arange(s, e)[np.where(self.chr_locations[np.arange(s, e), 0] == self.chr_locations[c_idx, 0])[0]] 100 | 101 | def __getitem__(self, idx): 102 | data_idx = self.preprocessed_idxs[idx] 103 | pre_context = self.get_context(data_idx, data_idx-self.auto_context, data_idx) 104 | post_context = self.get_context(data_idx, data_idx+1, data_idx+self.auto_context+1) 105 | with h5py.File(self.h5_file,'r') as db: 106 | X = torch.tensor(db[self.data_id][data_idx, :, self.selected_tracks]).float() 107 | X_auto = [torch.tensor([db[l][pre_context].sum(), db[l][post_context].sum()]).float() for l in self.label_ids] 108 | y_lst = [torch.tensor(db[l][data_idx]).float() for l in self.label_ids] 109 | return X, X_auto, y_lst 110 | 111 | def get_data_shape(self): 112 | with h5py.File(self.h5_file,'r') as db: 113 | return (len(self.preprocessed_idxs), db[self.data_id].shape[1], len(self.selected_tracks)) 114 | -------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/gaussian_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import gpytorch 7 | from sklearn.metrics import r2_score 8 | from sklearn.preprocessing import StandardScaler 9 | # import matplotlib.pyplot as plt 10 | # import seaborn as sns 11 | import h5py 12 | import scipy.stats 13 | import tqdm 14 | import argparse 15 | 16 | class SparseGP(gpytorch.models.ExactGP): 17 | def __init__(self, train_x, train_y, likelihood, n_inducing=2000): 18 | super(SparseGP, self).__init__(train_x, train_y, likelihood) 19 | 20 | self.mean_module = gpytorch.means.ConstantMean() 21 | base_cov_module = gpytorch.kernels.ScaleKernel( 22 | gpytorch.kernels.RBFKernel() 23 | ) 24 | 25 | self.covar_module = gpytorch.kernels.InducingPointKernel( 26 | base_cov_module, 27 | inducing_points = train_x[:n_inducing, :], 28 | likelihood=likelihood 29 | ) 30 | 31 | def forward(self, x): 32 | mean_x = self.mean_module(x) 33 | covar_x = self.covar_module(x) 34 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 35 | 36 | def fit_params(self, train_x, train_y, likelihood, n_iter=100): 37 | pass 38 | 39 | def predict(self, test_x): 40 | pass 41 | 42 | 
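# [Editorial sketch -- not part of the original file.] load() and run() below
# expect an HDF5 file with 'train', 'test', and optionally 'held-out' groups,
# each holding an 'X' feature matrix and a 'y' target vector. One minimal way
# to build such an input (the shapes here are illustrative assumptions only):
def _example_make_gp_input(fname='gp_data.h5', n_feat=16):
    import h5py
    import numpy as np
    with h5py.File(fname, 'w') as f:
        for name, n in [('train', 1000), ('test', 200), ('held-out', 100)]:
            grp = f.create_group(name)
            grp.create_dataset('X', data=np.random.randn(n, n_feat))  # NN feature vectors
            grp.create_dataset('y', data=np.random.randn(n))          # regression targets
    return fname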
def load(fname, dataset, idx_feat=np.array([])): 43 | f = h5py.File(fname, 'r') 44 | 45 | if dataset not in f.keys(): 46 | f.close() 47 | return np.array([]), np.array([]), idx_feat 48 | 49 | X = f[dataset]['X'][:] 50 | Y = f[dataset]['y'][:] 51 | # X = f[dataset]['features'][0, :, :] 52 | # Y = f[dataset]['true'][0, :] 53 | 54 | if not idx_feat.any(): 55 | idx_feat = np.where(np.abs(X).mean(axis=0) > 0)[0] 56 | 57 | X = X[:, idx_feat] 58 | 59 | f.close() 60 | 61 | return X, Y, idx_feat 62 | 63 | def standardize(X, Y, scaler=None, y_mean=None, y_std=None): 64 | 65 | if not scaler: 66 | scaler = StandardScaler() 67 | scaler.fit(X) 68 | 69 | if not y_mean: 70 | y_mean = Y.mean() 71 | y_std = Y.std() 72 | 73 | x = scaler.transform(X) 74 | y = (Y - y_mean) / y_std 75 | 76 | return x, y, scaler, y_mean, y_std 77 | 78 | def train_model(train_x, train_y, n_iter=100, n_inducing=2000): 79 | # train_x = torch.FloatTensor(train_x).contiguous().cuda() 80 | # train_y = torch.FloatTensor(train_y).contiguous().cuda() 81 | 82 | # if torch.cuda.is_available(): 83 | # train_x, train_y = train_x.cuda(), train_y.cuda(); 84 | 85 | likelihood = to_gpu( 86 | gpytorch.likelihoods.GaussianLikelihood() 87 | ) 88 | model = to_gpu( 89 | SparseGP(train_x, train_y, likelihood, n_inducing=n_inducing) 90 | ) 91 | 92 | # if torch.cuda.is_available(): 93 | # model, likelihood = model.cuda(), likelihood.cuda() 94 | 95 | model.train() 96 | likelihood.train() 97 | 98 | print(f'Training model with {n_iter} iterations.') 99 | # model.fit_params(train_x, train_y, likelihood, n_iter=n_iter) 100 | optimizer = torch.optim.Adam([ 101 | {'params': model.parameters()}, 102 | ], lr=0.8) 103 | 104 | # "Loss" for GPs - the marginal log likelihood 105 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) 106 | 107 | # with gpytorch.settings.max_cg_iterations(10000): 108 | # with gpytorch.settings.fast_computations(covar_root_decomposition=False, log_prob=False, solves=False): 109 | # with gpytorch.settings.max_preconditioner_size(80): 110 | iterator = tqdm.tqdm(range(n_iter), desc='GP training') 111 | for i in iterator: 112 | optimizer.zero_grad() 113 | output = model(train_x) 114 | loss = -mll(output, train_y) 115 | loss.backward() 116 | iterator.set_postfix(loss=loss.item()) 117 | optimizer.step() 118 | 119 | print(f"Finished training on {train_x.size(0)} samples.") 120 | print("Final values - Loss: %.3f lengthscale: %.3f outputscale: %.3f noise: %.3f" % ( 121 | loss.item(), 122 | model.covar_module.base_kernel.base_kernel.lengthscale.item(), 123 | model.covar_module.base_kernel.outputscale.item(), 124 | likelihood.noise_covar.noise.item() 125 | )) 126 | 127 | return model, likelihood, loss.item() 128 | 129 | def predict(model, likelihood, test_x): 130 | model.eval() 131 | likelihood.eval() 132 | 133 | # test_x = torch.FloatTensor(test_x).contiguous() 134 | # if torch.cuda.is_available(): 135 | # print('cuda') 136 | # test_x = test_x.cuda() 137 | 138 | print(f'Predicting over {test_x.size(0)} test samples.') 139 | with torch.no_grad(), gpytorch.settings.fast_pred_var(): 140 | # with gpytorch.settings.max_preconditioner_size(10), torch.no_grad(): 141 | # with gpytorch.settings.max_root_decomposition_size(30), gpytorch.settings.fast_pred_var(): 142 | with gpytorch.settings.max_cg_iterations(10000): 143 | y_pred = model(test_x) 144 | 145 | y_hat = y_pred.mean.cpu().numpy() 146 | y_std = y_pred.stddev.cpu().numpy() 147 | 148 | return y_hat, y_std 149 | 150 | def save(fname, dataset, y_hat, y_std, loss, r2, params): 151 | f = 
h5py.File(fname, 'r+') 152 | data = f[dataset] 153 | keys = [key for key in data.keys()] 154 | 155 | keys_mean = [key for key in keys if key.startswith('gp_mean')] 156 | if keys_mean: 157 | suffix_lst = [int(key.split('_')[-1]) for key in keys_mean] 158 | sfx = max(suffix_lst) + 1 159 | 160 | else: 161 | sfx = 1 162 | 163 | print('Saving GP results into {} gp_*_{:02d}'.format(dataset, sfx)) 164 | data.create_dataset('gp_mean_{:02d}'.format(sfx), data=y_hat) 165 | data.create_dataset('gp_std_{:02d}'.format(sfx), data=y_std) 166 | data.create_dataset('gp_params_{:02d}'.format(sfx), data=params) 167 | data.attrs['gp_loss_{:02d}'.format(sfx)] = loss 168 | data.attrs['gp_R2_{:02d}'.format(sfx)] = r2 169 | 170 | def to_torch(data): 171 | return torch.FloatTensor(data).contiguous() 172 | 173 | def to_gpu(data): 174 | if torch.cuda.is_available(): 175 | return data.cuda() 176 | 177 | def parse_args(): 178 | parser = argparse.ArgumentParser(description='Fit a sparse Gaussian Process') 179 | 180 | parser.add_argument('data', help='h5 file containing train and test data') 181 | parser.add_argument('--n_iter', type=int, default=100, help='number of training iterations') 182 | parser.add_argument('--n_inducing', type=int, default=2000, help='number of inducing points') 183 | parser.add_argument('--n_runs', type=int, default=5, help='number of runs to train the model') 184 | parser.add_argument('--save-train', action='store_true', default=False, help='save training data') 185 | 186 | return parser.parse_args() 187 | 188 | def run(): 189 | args = parse_args() 190 | 191 | ## Load data 192 | train_X, train_Y, idx_feat = load(args.data, 'train') 193 | test_X, test_Y, _ = load(args.data, 'test', idx_feat) 194 | held_X, held_Y, _ = load(args.data, 'held-out', idx_feat) 195 | # print(held_Y[0:5]) 196 | 197 | ## Standardize data 198 | train_X, train_Y, scaler, y_mean, y_std = standardize(train_X, train_Y) 199 | test_X, test_Y, _, _, _ = standardize(test_X, test_Y, scaler, y_mean, y_std) 200 | 201 | train_x, train_y, test_x = to_torch(train_X), to_torch(train_Y), to_torch(test_X) 202 | train_x, train_y, test_x = to_gpu(train_x), to_gpu(train_y), to_gpu(test_x) 203 | 204 | ## Train model 205 | model, likelihood, loss = train_model(train_x, train_y, 206 | n_iter=args.n_iter, n_inducing=args.n_inducing 207 | ) 208 | 209 | ## Validate model 210 | gp_mean, gp_std = predict(model, likelihood, test_x) 211 | r2 = r2_score(test_Y, gp_mean) 212 | print(f'R^2 of model: {r2}') 213 | 214 | params = np.array([model.covar_module.base_kernel.base_kernel.lengthscale.item(), 215 | model.covar_module.base_kernel.outputscale.item(), 216 | likelihood.noise_covar.noise.item() 217 | ]) 218 | 219 | save(args.data, 'test', gp_mean*y_std + y_mean, gp_std * y_std, loss, r2, params) 220 | 221 | if args.save_train: 222 | print('Saving training data') 223 | train_mean, train_std = predict(model, likelihood, train_x) 224 | r2 = r2_score(train_Y, train_mean) 225 | print(r2) 226 | save(args.data, 'train', train_mean*y_std + y_mean, train_std * y_std, loss, r2, params) 227 | 228 | if held_X.any(): 229 | print('Applying GP to heldout data') 230 | held_X, held_Y, _, _, _ = standardize(held_X, held_Y, scaler, y_mean, y_std) 231 | held_x = to_gpu(to_torch(held_X)) 232 | 233 | hld_mean, hld_std = predict(model, likelihood, held_x) 234 | r2 = r2_score(held_Y, hld_mean) 235 | print(r2) 236 | save(args.data, 'held-out', hld_mean*y_std + y_mean, hld_std * y_std, loss, r2, params) 237 | 238 | if __name__ == "__main__": 239 | run() 240 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/get_feature_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | import sys 5 | import h5py 6 | import numpy as np 7 | import torch 8 | from torch import nn 9 | from torch.utils.data import DataLoader 10 | from sklearn.metrics import r2_score 11 | 12 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 13 | sys.path.append('/storage/yaari/mutation_density/pytorch/trainers/') 14 | sys.path.append('/storage/yaari/mutation_density/pytorch/data_aux/') 15 | 16 | from cnn_predictors import * 17 | from mut_dataset import * 18 | 19 | def tokens_match(strg, search=re.compile(r'[^:0-9]').search): 20 | return not bool(search(strg)) 21 | 22 | def load_track_selection_file(file_path): 23 | with open(file_path, 'r') as f: 24 | lines = f.readlines() 25 | track_lst = [] 26 | for i, l in enumerate(lines): 27 | if l.startswith(('\n', '#')): continue 28 | l = l.rstrip() # remove trailing '\n' 29 | assert tokens_match(l), \ 30 | 'Expected track selection lines to contain only digits and colons. Found: {} in line #{}.'.format(l, i) 31 | 32 | split_l = l.split(':') 33 | assert len(split_l) <= 2, \ 34 | 'Expected track selection lines to contain only one colon. Found: {} in line #{}.'.format(l, i) 35 | assert np.all([split_l[j].isdigit() for j in range(len(split_l))]), \ 36 | 'Expected to have a number in both sides of the colon. Found: {} in line #{}.'.format(l, i) 37 | 38 | if len(split_l) == 1: 39 | track_lst.append(int(split_l[0])) 40 | elif len(split_l) == 2: 41 | assert int(split_l[0]) < int(split_l[1]), 'Expected x < y in pair x:y. Found: {} in line #{}.'.format(l, i) 42 | track_lst.extend(np.arange(int(split_l[0]), int(split_l[1])).tolist()) 43 | 44 | print('Selected {} tracks: \n{}'.format(len(track_lst), track_lst)) 45 | return track_lst 46 | 47 | def predict(model, data_loader, label_ids): 48 | corr_coef_sums = np.zeros(len(label_ids)) 49 | all_preds = [[] for _ in range(len(label_ids))] 50 | all_features = [[] for _ in range(len(label_ids))] 51 | all_true = [[] for _ in range(len(label_ids))] 52 | for j, (X, t_lst) in enumerate(data_loader): 53 | y_lst, features_lst, _ = model(X.cuda()) 54 | with torch.no_grad(): 55 | for i, t in enumerate(t_lst): 56 | y = y_lst[i] 57 | feature_vecs = features_lst[i] 58 | all_features[i].append(feature_vecs.cpu().detach().numpy()) 59 | all_preds[i].extend(y.data.cpu().numpy().tolist()) 60 | all_true[i].extend(t.data.cpu().numpy().tolist()) 61 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))] 62 | return all_preds, all_true, all_features, [r2_score(all_true[i], all_preds[i]) for i in range(len(label_ids))] 63 | 64 | def main(): 65 | assert len(sys.argv) == 3, 'Usage: get_feature_vectors.py <models_dir> <run_id>' 66 | models_dir = sys.argv[1] 67 | run_id = sys.argv[2] 68 | 69 | with open(os.path.join(models_dir, 'run_params.txt'), 'r') as f: 70 | config_lst = [(l.split(':')) for l in f.read().split('\n')] 71 | config_dict = {x[0].strip(): x[1].strip() for x in config_lst if len(x) > 1} 72 | 73 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices_fold_{}.npy'.format(run_id)))) 74 | label_ids = config_dict['label_ids'].replace('[\'', '').replace('\']', '').split(', ') 75 | 76 | file_path = config_dict['data_file'] 77 | with h5py.File(file_path, 'r') as h5f: 78 | chr_idxs = h5f['idx'][:] 79 | pred_h = 
h5f['x_data'].shape[2] 80 | 81 | track_file = config_dict['track_file'] 82 | if track_file != 'None': 83 | selected_tracks = load_track_selection_file(os.path.join(os.path.dirname(__file__), track_file)) 84 | else: 85 | selected_tracks = np.arange(pred_h) 86 | 87 | test_chr_idxs = chr_idxs[test_idxs] 88 | test_ds = LazyLoadDatasetFromH5(file_path, label_ids, test_idxs, test_chr_idxs, selected_tracks, 'x_data') 89 | test_dl = DataLoader(test_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 90 | train_idxs = np.delete(np.arange(len(chr_idxs)), test_idxs) 91 | train_chr_idxs = chr_idxs[train_idxs] 92 | train_ds = LazyLoadDatasetFromH5(file_path, label_ids, train_idxs, train_chr_idxs, selected_tracks, 'x_data') 93 | train_dl = DataLoader(train_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4) 94 | samp_num = len(test_ds) 95 | 96 | print('Loading model...') 97 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids))).cuda() 98 | state_dict = torch.load(os.path.join(models_dir, 'best_model_fold_{}.pt'.format(run_id))) 99 | model.load_state_dict(state_dict) 100 | model.eval() 101 | 102 | print('Computing {} train set features...'.format(train_ds.get_data_shape()[0])) 103 | train_preds, train_labels, train_features, acc = predict(model, train_dl, label_ids) 104 | print('Model train accuracy: {}'.format(acc)) 105 | 106 | print('Computing {} test set features...'.format(test_ds.get_data_shape()[0])) 107 | test_preds, test_labels, test_features, acc = predict(model, test_dl, label_ids) 108 | print('Model test accuracy: {}'.format(acc)) 109 | 110 | print('Saving features, predictions and true labels...') 111 | with h5py.File(os.path.join(models_dir, 'gaussian_process_data_{}.h5'.format(run_id)), 'w') as h5f: 112 | train_group = h5f.create_group('train') 113 | train_group.create_dataset('true', data=np.array(train_labels)) 114 | train_group.create_dataset('predicted', data=np.array(train_preds)) 115 | train_group.create_dataset('idxs', data=np.array(train_chr_idxs)) 116 | train_group.create_dataset('features', data=np.array(train_features)) 117 | test_group = h5f.create_group('test') 118 | test_group.create_dataset('true', data=np.array(test_labels)) 119 | test_group.create_dataset('predicted', data=np.array(test_preds)) 120 | test_group.create_dataset('idxs', data=np.array(test_chr_idxs)) 121 | test_group.create_dataset('features', data=np.array(test_features)) 122 | 123 | print('Done!') 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /DIGDriver/region_model/feature_vectors/get_heldout_feature_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | import sys 5 | import json 6 | import copy 7 | import h5py 8 | import numpy as np 9 | import pandas as pd 10 | from types import SimpleNamespace 11 | import torch 12 | from torch import nn 13 | from torch.utils.data import DataLoader 14 | from sklearn.metrics import r2_score 15 | 16 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 17 | sys.path.append('/storage/yaari/mutation_density/pytorch/trainers/') 18 | sys.path.append('/storage/yaari/mutation_density/pytorch/data_aux/') 19 | 20 | from cnn_predictors import * 21 | from mut_dataset import * 22 | 23 | 24 | def tokens_match(strg, search=re.compile(r'[^:0-9]').search): 25 | return not bool(search(strg)) 
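# The track-selection files parsed below contain one entry per line: either a single
# 0-based track index, or a half-open range 'x:y' expanded with np.arange (so track y
# itself is excluded). Blank lines and lines starting with '#' are ignored.
# A hypothetical selection file picking tracks 0-4 and 12:
#
#   0:5
#   12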
 26 | 
 27 | def load_track_selection_file(file_path):
 28 | with open(file_path, 'r') as f:
 29 | lines = f.readlines()
 30 | track_lst = []
 31 | for i, l in enumerate(lines):
 32 | if l.startswith(('\n', '#')): continue
 33 | l = l.rstrip() # remove trailing '\n'
 34 | assert tokens_match(l), \
 35 | 'Expected track selection lines to contain only digits and colons. Found: {} in line #{}.'.format(l, i)
 36 | 
 37 | split_l = l.split(':')
 38 | assert len(split_l) <= 2, \
 39 | 'Expected track selection lines to contain only one colon. Found: {} in line #{}.'.format(l, i)
 40 | assert np.all([split_l[j].isdigit() for j in range(len(split_l))]), \
 41 | 'Expected to have a number in both sides of the colon. Found: {} in line #{}.'.format(l, i)
 42 | 
 43 | if len(split_l) == 1:
 44 | track_lst.append(int(split_l[0]))
 45 | elif len(split_l) == 2:
 46 | assert int(split_l[0]) < int(split_l[1]), 'Expected x < y in pair x:y. Found: {} in line #{}.'.format(l, i)
 47 | track_lst.extend(np.arange(int(split_l[0]), int(split_l[1])).tolist())
 48 | 
 49 | print('Selected {} tracks: \n{}'.format(len(track_lst), track_lst))
 50 | return track_lst
 51 | 
 52 | def predict(model, data_loader, label_ids):
 53 | corr_coef_sums = np.zeros(len(label_ids))
 54 | all_preds = [[] for _ in range(len(label_ids))]
 55 | all_features = [[] for _ in range(len(label_ids))]
 56 | all_true = [[] for _ in range(len(label_ids))]
 57 | for j, (X, t_lst) in enumerate(data_loader):
 58 | y_lst, features_lst, _ = model(X.cuda())
 59 | with torch.no_grad():
 60 | for i, t in enumerate(t_lst):
 61 | y = y_lst[i]
 62 | feature_vecs = features_lst[i]
 63 | all_features[i].append(feature_vecs.cpu().detach().numpy())
 64 | all_preds[i].extend(y.data.cpu().numpy().tolist())
 65 | all_true[i].extend(t.data.cpu().numpy().tolist())
 66 | all_features = [np.concatenate(all_features[j], axis=0) for j in range(len(all_features))]
 67 | return all_preds, all_true, all_features, [r2_score(all_true[i], all_preds[i]) for i in range(len(label_ids))]
 68 | 
 69 | def main():
 70 | assert len(sys.argv) >= 3, 'Usage: get_heldout_feature_vectors.py <models_dir> <label_id> [<label_id> ...]'
 71 | 
 72 | models_dir = sys.argv[1]
 73 | label_ids = sys.argv[2:]
 74 | 
 75 | with open(os.path.join(models_dir, 'run_params.txt'), 'r') as f:
 76 | config_lst = [(l.split(':')) for l in f.read().split('\n')]
 77 | config_dict = {x[0].strip(): x[1].strip() for x in config_lst if len(x) > 1}
 78 | 
 79 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices.npy')))
 80 | heldout_idxs = np.sort(np.load(os.path.join(models_dir, 'heldout_indices.npy')))
 81 | 
 82 | file_path = config_dict['data_file']
 83 | with h5py.File(file_path, 'r') as h5f:
 84 | chr_idxs = h5f['idx'][:]
 85 | pred_h = h5f['x_data'].shape[2]
 86 | 
 87 | track_file = config_dict['track_file']
 88 | if track_file != 'None':
 89 | selected_tracks = load_track_selection_file(os.path.join(os.path.dirname(__file__), track_file))
 90 | else:
 91 | selected_tracks = np.arange(pred_h)
 92 | 
 93 | test_chr_idxs = chr_idxs[test_idxs]
 94 | test_ds = LazyLoadDatasetFromH5(file_path, label_ids, test_idxs, test_chr_idxs, selected_tracks, 'x_data')
 95 | test_dl = DataLoader(test_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
 96 | train_idxs = np.delete(np.arange(len(chr_idxs)), test_idxs)
 97 | train_chr_idxs = chr_idxs[train_idxs]
 98 | train_ds = LazyLoadDatasetFromH5(file_path, label_ids, train_idxs, train_chr_idxs, selected_tracks, 'x_data')
 99 | train_dl = DataLoader(train_ds, batch_size=4096, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
 100 | 
heldout_chr_idxs = chr_idxs[heldout_idxs]
101 | heldout_ds = LazyLoadDatasetFromH5(file_path, label_ids, heldout_idxs, heldout_chr_idxs, selected_tracks, 'x_data')
102 | heldout_dl = DataLoader(heldout_ds, batch_size=len(heldout_idxs), shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
103 | 
104 | print('Loading model...')
105 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids), get_feature_vecs=True)).cuda()
106 | state_dict = torch.load(os.path.join(models_dir, 'best_model.pt'))
107 | model.load_state_dict(state_dict)
108 | model.eval()
109 | 
110 | print('Predicting train set features...')
111 | train_preds, train_labels, train_features, acc = predict(model, train_dl, label_ids)
112 | print('Model train accuracy: {}'.format(acc))
113 | 
114 | print('Predicting test set features...')
115 | test_preds, test_labels, test_features, acc = predict(model, test_dl, label_ids)
116 | print('Model test accuracy: {}'.format(acc))
117 | 
118 | print('Predicting heldout set features...')
119 | heldout_preds, heldout_labels, heldout_features, acc = predict(model, heldout_dl, label_ids)
120 | print('Model held-out accuracy: {}'.format(acc))
121 | 
122 | 
123 | print('Saving features, predictions and true labels...')
124 | with h5py.File(os.path.join(models_dir, 'heldout_gaussian_process_data.h5'), 'w') as h5f:
125 | train_group = h5f.create_group('train')
126 | train_group.create_dataset('true', data=np.array(train_labels))
127 | train_group.create_dataset('predicted', data=np.array(train_preds))
128 | train_group.create_dataset('idxs', data=np.array(train_chr_idxs))
129 | train_group.create_dataset('features', data=np.array(train_features))
130 | test_group = h5f.create_group('test')
131 | test_group.create_dataset('true', data=np.array(test_labels))
132 | test_group.create_dataset('predicted', data=np.array(test_preds))
133 | test_group.create_dataset('idxs', data=np.array(test_chr_idxs))
134 | test_group.create_dataset('features', data=np.array(test_features))
135 | heldout_group = h5f.create_group('heldout')
136 | heldout_group.create_dataset('true', data=np.array(heldout_labels))
137 | heldout_group.create_dataset('predicted', data=np.array(heldout_preds))
138 | heldout_group.create_dataset('idxs', data=np.array(heldout_chr_idxs))
139 | heldout_group.create_dataset('features', data=np.array(heldout_features))
140 | 
141 | print('Done!')
142 | 
143 | if __name__ == '__main__':
144 | main()
145 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/kfold_mutations_main.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import h5py
5 | import copy
6 | import argparse
7 | import numpy as np
8 | import pandas as pd
9 | from torch import nn, optim
10 | from tensorboardX import SummaryWriter
11 | from datetime import datetime
12 | 
13 | file_path = os.path.dirname(os.path.abspath(__file__))
14 | sys.path.append(os.path.join(file_path, 'nets'))
15 | sys.path.append(os.path.join(file_path, 'trainers'))
16 | sys.path.append(os.path.join(file_path, 'data_aux'))
17 | sys.path.append(os.path.join(file_path, '../sequence_model'))
18 | 
19 | #from rnn_predictors import *
20 | from cnn_predictors import *
21 | 
22 | from nn_trainer import *
23 | from gp_trainer import *
24 | from dataset_generator import *
25 | from mutations_main import OutputGenerator
26 | from gp_tools import *
27 | 
28 | def 
get_cmd_arguments(text=None):
29 | ap = argparse.ArgumentParser()
30 | 
31 | # Required cancer type argument
32 | ap.add_argument('-c', '--cancer-id', required=True, nargs='*', action='store', type=str, dest='label_ids',
33 | help='A list of the h5 file mutation count dataset IDs (e.g. SNV_skin_melanoma_MELAU_AU)')
34 | 
35 | # Path arguments
36 | ap.add_argument('-d', "--data", required=False, nargs='?', action='store', type=str, dest='data_file',
37 | default='/storage/datasets/cancer/unzipped_data_matrices_pcawg_10k.h5', help='Path to h5 data file')
38 | ap.add_argument('-o', "--out-dir", required=False, nargs='?', action='store', type=str, dest='out_dir',
39 | default='/storage/yaari/mutation-density-outputs', help='Path to output directory')
40 | ap.add_argument('-t', "--tracks", required=False, nargs='?', action='store', type=str, dest='track_file',
41 | default=None, help='Path to predictor tracks selection file')
42 | 
43 | # Run type parameters
44 | ap.add_argument('-s', "--split", required=False, nargs='?', action='store', type=str, dest='split_method',
45 | default='random', help='Dataset split method (random/chr)')
46 | ap.add_argument('-m', "--mappability", required=False, nargs='?', action='store', type=float, dest='mappability',
47 | default=0.5, help='Mappability lower bound')
48 | ap.add_argument('-cq', "--count-quantile", required=False, nargs='?', action='store', type=float, dest='count_quantile',
49 | default=0.999, help='Region mutation count quantile threshold.')
50 | ap.add_argument('-a', "--attention", required=False, action='store_true', dest='get_attention',
51 | help='True: train with attention maps and save them')
52 | ap.add_argument('-gp', "--gaussian", required=False, nargs='?', action='store', type=int, dest='run_gaussian',
53 | default=5, help='Number of GP training runs on the best performing model (0 skips the GP)')
54 | ap.add_argument('-as', "--autoregressive-size", required=False, nargs='?', action='store', type=int,
55 | dest='autoregressive_size', default=0, help='number of neighbouring regions for autoregressive features')
56 | # Train parameters
57 | ap.add_argument('-k', required=False, nargs='?', action='store', type=int, dest='k',
58 | default=5, help='Number of folds')
59 | ap.add_argument('-gr', "--gp-reruns", required=False, nargs='?', action='store', type=int, dest='gp_reruns',
60 | default=3, help='GP maximum reinitializations for convergence')
61 | ap.add_argument('-gd', "--gp-delta", required=False, nargs='?', action='store', type=float, dest='gp_delta',
62 | default=0.03, help='Maximum difference between a fold NN and GP scores')
63 | ap.add_argument('-re', "--nn-reruns", required=False, nargs='?', action='store', type=int, dest='nn_reruns',
64 | default=1, help='Number of model reinitializations and training runs')
65 | ap.add_argument('-mr', "--max-nn-reruns", required=False, nargs='?', action='store', type=int, dest='max_nn_reruns',
66 | default=3, help='NN maximum reinitializations for GP to succeed')
67 | ap.add_argument('-vr', "--val-ratio", required=False, nargs='?', action='store', type=float, dest='val_ratio',
68 | default=0.2, help='Validation set split size ratio')
69 | ap.add_argument('-e', "--epochs", required=False, nargs='?', action='store', type=int, dest='epochs',
70 | default=20, help='Number of epochs')
71 | ap.add_argument('-b', "--batch", required=False, nargs='?', action='store', type=int, dest='bs',
72 | default=128, help='Batch size')
73 | ap.add_argument('-nd', "--n-inducing", required=False, 
nargs='?', action='store', type=int, dest='n_inducing', 74 | default=400, help='Number of GP inducing points') 75 | ap.add_argument('-nt', "--n-iter", required=False, nargs='?', action='store', type=int, dest='n_iter', 76 | default=50, help='Number of GP iterations') 77 | 78 | # Run management parameters 79 | ap.add_argument('-sm', "--save-model", required=False, action='store_true', dest='save_model', 80 | help='True: save best model across all reruns') 81 | ap.add_argument('-st', "--save-training", required=False, action='store_true', dest='save_training', 82 | help='True: save training process and results to Tensorboard file') 83 | ap.add_argument('-g', "--gpus", required=False, nargs='?', action='store', type=str, dest='gpus', 84 | default='all', help='GPUs devices (all/comma separted list)') 85 | ap.add_argument('-u', "--sub_mapp", required=False, action='store_true', dest='sub_mapp', 86 | help='True: run model on regions below mappability threshold') 87 | 88 | if text: 89 | args = ap.parse_args(text.split()) 90 | else: 91 | args = ap.parse_args() 92 | 93 | return args 94 | 95 | 96 | def main(input_args=None): 97 | if input_args is None: 98 | args = get_cmd_arguments() 99 | else: 100 | args = input_args 101 | 102 | labels_str = '-'.join(args.label_ids) 103 | out_dir = os.path.join(args.out_dir, 'kfold', labels_str, str(datetime.now())) 104 | print('Generating prediction for cancer types: {}'.format(args.label_ids)) 105 | 106 | if args.gpus is None: 107 | print('Using CPU device.') 108 | device = torch.device('cpu') 109 | else: 110 | print('Using GPU device: \'{}\''.format(args.gpus)) 111 | device = torch.device('cuda') 112 | if args.gpus != 'all': 113 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 114 | 115 | out_pred = OutputGenerator(args, device, out_dir) 116 | 117 | os.makedirs(out_dir) 118 | args_dict = vars(args) 119 | with open(os.path.join(out_dir, 'run_params.txt'), 'w') as f: 120 | [f.write('{}: {}\n'.format(k, args_dict[k])) for k in args_dict.keys()] 121 | 122 | best_model_file = os.path.join(out_dir, 'best_model_fold_{}.pt') 123 | val_set_file = os.path.join(out_dir, 'val_indices_fold_{}') 124 | 125 | if args.save_model or args.save_training: 126 | print('Saving results under: \'{}\''.format(out_dir)) 127 | 128 | data_generator = KFoldDatasetGenerator(args) 129 | is_autoreg = args.autoregressive_size > 0 130 | model_func = AutoregressiveMultiTaskResNet if is_autoreg else SimpleMultiTaskResNet 131 | print('Running {}-fold prediction...'.format(args.k)) 132 | 133 | k, re = 0, 0 134 | gp_succeed = False 135 | while k < args.k and re < args.max_nn_reruns: 136 | train_ds, val_ds, ho_ds = data_generator.get_datasets(k) 137 | best_overall_acc = -np.inf 138 | for r in range(args.nn_reruns): 139 | print('Setting model and optimizers for run {}/{} and fold {}/{}...'.format(r + 1, args.nn_reruns, k + 1, args.k)) 140 | model = model_func(train_ds.get_data_shape(), len(args.label_ids), get_attention_maps=args.get_attention) 141 | optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=False) 142 | loss_fn = nn.MSELoss() 143 | if args.gpus is not None: model = nn.DataParallel(model) 144 | 145 | if args.save_training: 146 | writer = SummaryWriter(logdir=out_dir, comment=labels_str) 147 | writer.add_text('configurations', str(args), 0) 148 | writer.add_text('model', str(model), 0) 149 | else: 150 | writer = None 151 | trainer = NNTrainer(model, 152 | optimizer, 153 | loss_fn, 154 | args.bs, 155 | args.label_ids, 156 | train_ds, 157 | val_ds, 158 | device, 159 | writer, 160 | 
get_attention_maps=args.get_attention) 161 | 162 | best_run_acc = -np.inf 163 | for epoch in range(1, args.epochs + 1): 164 | print('Running epoch {}/{}'.format(epoch, args.epochs)) 165 | train_losses, train_accs, train_features_lst, train_pred_lst, train_true_lst = \ 166 | trainer.train(epoch, r, autoreg=is_autoreg) 167 | val_losses, val_accs, val_features_lst, val_pred_lst, val_true_lst, val_attention = \ 168 | trainer.test(epoch, r, autoreg=is_autoreg) 169 | 170 | # Keep only the best model with > 2 non-zero features according to test performance 171 | non_zero_features = np.where(np.abs(train_features_lst[0]).mean(axis=0) > 0)[0] 172 | print('#non-zero features: {}'.format(len(non_zero_features))) 173 | if val_accs[0] > best_run_acc and len(non_zero_features) > 1: 174 | print('Changing run model since best R2 was {} compared to previous {}'.format(val_accs[0], best_run_acc)) 175 | best_run_acc = val_accs[0] 176 | best_run_model, best_run_att = copy.deepcopy(model), val_attention 177 | train_dict = {'feat': train_features_lst, 'lbls': train_true_lst, 'ds': train_ds} 178 | val_dict = {'feat': val_features_lst, 'lbls': val_true_lst, 'ds': val_ds} 179 | 180 | if best_run_acc > best_overall_acc: 181 | best_overall_acc = best_run_acc 182 | best_overall_model = best_run_model 183 | best_train_dict, best_val_dict = train_dict, val_dict 184 | 185 | print(bcolors.OKCYAN + 'Best epoch validation accuracy for run {}/{} was: {}.'.format(r + 1, args.nn_reruns, best_run_acc) + bcolors.ENDC) 186 | print(bcolors.OKCYAN + 'Best overall validation accuracy over {} reruns was: {}.'.format(args.nn_reruns, best_overall_acc) + bcolors.ENDC) 187 | 188 | # Save attention maps from best overall model 189 | if args.get_attention: 190 | out_pred.save_attetnion_maps('attention_maps_{}.h5'.format(k), best_run_att, val_ds, val_pred_lst, val_true_lst) 191 | 192 | # Save best run model 193 | if args.save_model: 194 | print('Saving model and validation indices for future evaluations to {}...'.format(val_set_file)) 195 | np.save(val_set_file.format(k), val_ds.get_set_indices()) 196 | torch.save(best_overall_model.state_dict(), best_model_file.format(k)) 197 | 198 | # Run GP on best overall model 199 | if args.run_gaussian > 0: 200 | print('Computing {} validation set features...'.format(ho_ds.get_data_shape()[0])) 201 | ho_preds, ho_labels, ho_features, ho_acc, ho_att = out_pred.predict(best_overall_model, ho_ds) 202 | ho_dict = {'feat': ho_features, 'lbls': ho_labels, 'ds': ho_ds} 203 | print(bcolors.OKCYAN + 'Model held-out accuracy: {}'.format(ho_acc) + bcolors.ENDC) 204 | 205 | gp_succeed = out_pred.run_gp('gp_results_fold_{}.h5'.format(k), train_dict, val_dict, ho_dict, best_overall_acc, k) 206 | 207 | if args.sub_mapp: 208 | sub_ds = data_generator.get_below_mapp() 209 | print('Computing {} sub-theshold features...'.format(sub_ds.get_data_shape()[0])) 210 | sub_preds, sub_labels, sub_features, sub_acc, sub_att = out_pred.predict(best_overall_model, sub_ds) 211 | sub_dict = {'feat': sub_features, 'lbls': sub_labels, 'ds': sub_ds} 212 | print(bcolors.OKCYAN + 'Model sub-mappable accuracy: {}'.format(sub_acc) + bcolors.ENDC) 213 | 214 | # Save attention maps from unmappable regions 215 | sub_att_path = os.path.join(out_dir, 'attention_maps_submapp.h5') 216 | if args.get_attention and not os.path.exists(sub_att_path): 217 | out_pred.save_attetnion_maps('attention_maps_submapp.h5', sub_att, sub_ds, sub_preds, sub_labels) 218 | 219 | out_pred.run_gp('sub_mapp_results_fold_{}.h5'.format(k), train_dict, val_dict, 
sub_dict, best_overall_acc, k, prefix='sub')
220 | 
221 | if args.run_gaussian > 0 and not gp_succeed:
222 | re += 1
223 | print(bcolors.FAIL + 'GP run failed! Rerunning NN, attempt {}/{}'.format(re + 1, args.max_nn_reruns) + bcolors.ENDC)
224 | else:
225 | k += 1
226 | re = 0
227 | 
228 | assert gp_succeed, 'GP failed at fold {} after {} NN reruns'.format(k, re)
229 | print('Done!')
230 | 
231 | 
232 | if __name__ == '__main__':
233 | startTime = datetime.now()
234 | main()
235 | print('Time elapsed: {}'.format(datetime.now() - startTime))
236 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/nets/__init__.py: --------------------------------------------------------------------------------
1 | 
2 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/nets/densenet.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | 
5 | 
6 | class Dense_Block(nn.Module):
7 | def __init__(self, in_channels):
8 | super(Dense_Block, self).__init__()
9 | self.relu = nn.ReLU(inplace = True)
10 | self.bn = nn.BatchNorm1d(num_features = in_channels)
11 | 
12 | self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=32, kernel_size=3, stride=1, padding=1)
13 | self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1)
14 | self.conv3 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1)
15 | self.conv4 = nn.Conv1d(in_channels=96, out_channels=32, kernel_size=3, stride=1, padding=1)
16 | self.conv5 = nn.Conv1d(in_channels=128, out_channels=32, kernel_size=3, stride=1, padding=1)
17 | 
18 | def forward(self, x):
19 | bn = self.bn(x)
20 | conv1 = self.relu(self.conv1(bn))
21 | conv2 = self.relu(self.conv2(conv1))
22 | # Concatenate in channel dimension
23 | c2_dense = self.relu(torch.cat([conv1, conv2], 1))
24 | conv3 = self.relu(self.conv3(c2_dense))
25 | c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))
26 | 
27 | conv4 = self.relu(self.conv4(c3_dense))
28 | c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))
29 | 
30 | conv5 = self.relu(self.conv5(c4_dense))
31 | c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))
32 | 
33 | return c5_dense
34 | 
35 | 
36 | class Transition_Layer(nn.Module):
37 | def __init__(self, in_channels, out_channels):
38 | super(Transition_Layer, self).__init__()
39 | 
40 | self.relu = nn.ReLU(inplace=True)
41 | self.bn = nn.BatchNorm1d(num_features=out_channels)
42 | self.conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False)
43 | self.avg_pool = nn.AvgPool1d(kernel_size=2, stride=2, padding=0)
44 | 
45 | def forward(self, x):
46 | bn = self.bn(self.relu(self.conv(x)))
47 | out = self.avg_pool(bn)
48 | return out
49 | 
50 | 
51 | class SingleTaskDenseNet(nn.Module):
52 | def __init__(self, nr_classes):
53 | super(SingleTaskDenseNet, self).__init__()
54 | 
55 | self.lowconv = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=7, padding=3, bias=False)
56 | self.relu = nn.ReLU()
57 | 
58 | # Make Dense Blocks
59 | self.denseblock1 = self._make_dense_block(Dense_Block, 64)
60 | self.denseblock2 = self._make_dense_block(Dense_Block, 128)
61 | self.denseblock3 = self._make_dense_block(Dense_Block, 128) # Make transition Layers
62 | self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=128)
63 | 
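# Each Dense_Block concatenates five 32-channel convolutions, so it emits 5 * 32 = 160 channels; hence in_channels=160 for every transition layer here.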
self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=128) 64 | self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels=160, out_channels=64) # Classifier 65 | self.bn = nn.BatchNorm1d(num_features=64) 66 | self.pre_classifier = nn.Linear(64*4*4, 512) 67 | self.classifier = nn.Linear(512, nr_classes) 68 | 69 | def _make_dense_block(self, block, in_channels): 70 | layers = [] 71 | layers.append(block(in_channels)) 72 | return nn.Sequential(*layers) 73 | 74 | def _make_transition_layer(self, layer, in_channels, out_channels): 75 | modules = [] 76 | modules.append(layer(in_channels, out_channels)) 77 | return nn.Sequential(*modules) 78 | 79 | def forward(self, x): 80 | out = self.relu(self.lowconv(x)) 81 | out = self.denseblock1(out) 82 | out = self.transitionLayer1(out) 83 | out = self.denseblock2(out) 84 | out = self.transitionLayer2(out) 85 | out = self.denseblock3(out) 86 | out = self.transitionLayer3(out) 87 | 88 | out = self.bn(out) 89 | out = out.view(-1, 64*4*4) 90 | 91 | out = self.pre_classifier(out) 92 | out = self.classifier(out) 93 | return out 94 | -------------------------------------------------------------------------------- /DIGDriver/region_model/nets/resnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, in_planes, planes, stride=1): 11 | super(BasicBlock, self).__init__() 12 | self.conv1 = nn.Conv1d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) 13 | self.bn1 = nn.BatchNorm1d(planes) 14 | self.conv2 = nn.Conv1d(planes, planes, kernel_size=3, stride=1, padding=1, bias=True) 15 | self.bn2 = nn.BatchNorm1d(planes) 16 | 17 | self.shortcut = nn.Sequential() 18 | if stride != 1 or in_planes != self.expansion*planes: 19 | self.shortcut = nn.Sequential( 20 | nn.Conv1d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=True), 21 | nn.BatchNorm1d(self.expansion*planes)) 22 | 23 | def forward(self, x): 24 | out = F.relu(self.bn1(self.conv1(x))) 25 | out = self.bn2(self.conv2(out)) 26 | out += self.shortcut(x) 27 | out = F.relu(out) 28 | return out 29 | 30 | 31 | class Bottleneck(nn.Module): 32 | expansion = 2 33 | 34 | def __init__(self, in_planes, planes, stride=1): 35 | super(Bottleneck, self).__init__() 36 | self.conv1 = nn.Conv1d(in_planes, planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm1d(planes) 38 | self.conv2 = nn.Conv1d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm1d(planes) 40 | self.conv3 = nn.Conv1d(planes, self.expansion*planes, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm1d(self.expansion*planes) 42 | 43 | self.shortcut = nn.Sequential() 44 | if stride != 1 or in_planes != self.expansion*planes: 45 | self.shortcut = nn.Sequential( 46 | nn.Conv1d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 47 | nn.BatchNorm1d(self.expansion*planes)) 48 | 49 | def forward(self, x): 50 | out = F.relu(self.bn1(self.conv1(x))) 51 | out = F.relu(self.bn2(self.conv2(out))) 52 | out = self.bn3(self.conv3(out)) 53 | out += self.shortcut(x) 54 | out = F.relu(out) 55 | return out 56 | 57 | 58 | class SingleTaskResNet(nn.Module): 59 | def __init__(self, in_shape, task_num, num_blocks=[3,3,3], strides=[2,2,2], block=Bottleneck): 60 | 
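# in_shape is (batch, length, tracks): forward() transposes to (batch, tracks, length), so conv1 reads in_shape[2] input channels, and net_out_len below accounts for the downsampling of in_shape[1] by the strides.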
super(SingleTaskResNet, self).__init__() 61 | assert len(num_blocks) == len(strides), \ 62 | 'Expected number of blocks and strides lists to be of equal length but found {} and {}'.format(len(num_blocks), len(strides)) 63 | in_len = in_shape[1] 64 | in_width = in_shape[2] 65 | self.in_planes = 64 66 | 67 | self.conv1 = nn.Conv1d(in_width, 64, kernel_size=5, stride=1, padding=1, bias=False) 68 | self.bn1 = nn.BatchNorm1d(64) 69 | 70 | conv_blocks = [self._make_layer(block, 64 * 2**i, layer_num, stride=stride) for i, (layer_num, stride) in enumerate(zip(num_blocks, strides))] 71 | self.net = nn.Sequential(*conv_blocks) 72 | 73 | net_out_len = int(block.expansion * 64 * 2**(len(strides)-1) * np.ceil(in_len / np.prod(strides))) 74 | self.linear1 = nn.Linear(net_out_len, 128) 75 | self.linear2 = nn.Linear(128, 1) 76 | 77 | def _make_layer(self, block, planes, num_blocks, stride): 78 | strides = [stride] + [1]*(num_blocks-1) 79 | layers = [] 80 | for s in strides: 81 | layers.append(block(self.in_planes, planes, s)) 82 | self.in_planes = planes * block.expansion 83 | return nn.Sequential(*layers) 84 | 85 | def forward(self, x): 86 | out = F.relu(self.bn1(self.conv1(torch.transpose(x, 1, 2)))) 87 | out = self.net(out) 88 | #out = F.avg_pool1d(out, 4) 89 | out = out.view(out.size(0), -1) 90 | out = self.linear1(out) 91 | out = self.linear2(out) 92 | return [out.reshape(-1)] 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /DIGDriver/region_model/nets/rnn_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn, transpose 2 | from torch.autograd import Variable 3 | from torch.nn import functional as F 4 | 5 | 6 | class MultiTaskLinear(nn.Module): 7 | def __init__(self, shape, task_num): 8 | super(MultiTaskLinear, self).__init__() 9 | self.inp_len = shape[1] 10 | self.inp_size = shape[2] 11 | self.task_num = task_num 12 | 13 | self.hidden_dim = 128 14 | self.fc2_dim = 128 15 | 16 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 17 | self.bn1 = nn.BatchNorm1d(128) 18 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 19 | self.bn2 = nn.BatchNorm1d(256) 20 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 21 | self.bn3 = nn.BatchNorm1d(256) 22 | 23 | self.birnn = nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True) 24 | 25 | self.fc1_lst = nn.ModuleList() 26 | self.fc2_lst = nn.ModuleList() 27 | for _ in range(self.task_num): 28 | self.fc1_lst.append(nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim)) 29 | self.fc2_lst.append(nn.Linear(in_features=self.fc2_dim, out_features=1)) 30 | 31 | def forward(self, x: Variable) -> (Variable): 32 | self.birnn.flatten_parameters() 33 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 34 | x = F.relu(self.bn2(self.conv2(x))) 35 | x = F.relu(self.bn3(self.conv3(x))) 36 | x = self.birnn(transpose(x, 1, 2)) 37 | 38 | outputs = [] 39 | for i in range(self.task_num): 40 | task_x = F.relu(self.fc1_lst[i](x[0][:, -1, :])) 41 | outputs.append(self.fc2_lst[i](task_x).reshape(-1)) 42 | 43 | return outputs 44 | 45 | class MultiTaskRNN(nn.Module): 46 | def __init__(self, shape, task_num): 47 | super(MultiTaskRNN, self).__init__() 48 | self.inp_len = shape[1] 49 | self.inp_size = shape[2] 50 | self.task_num = task_num 51 | 52 | 
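# Unlike MultiTaskLinear above, which shares a single bi-LSTM across tasks, this variant gives every task its own bi-LSTM head on top of the shared convolutional trunk.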
self.hidden_dim = 128 53 | self.fc2_dim = 128 54 | 55 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 56 | self.bn1 = nn.BatchNorm1d(128) 57 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 58 | self.bn2 = nn.BatchNorm1d(256) 59 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 60 | self.bn3 = nn.BatchNorm1d(256) 61 | 62 | self.rnn_lst = nn.ModuleList() 63 | self.fc1_lst = nn.ModuleList() 64 | self.fc2_lst = nn.ModuleList() 65 | for _ in range(self.task_num): 66 | self.rnn_lst.append(nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True)) 67 | self.fc1_lst.append(nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim)) 68 | self.fc2_lst.append(nn.Linear(in_features=self.fc2_dim, out_features=1)) 69 | 70 | def forward(self, x: Variable) -> (Variable): 71 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 72 | x = F.relu(self.bn2(self.conv2(x))) 73 | x = F.relu(self.bn3(self.conv3(x))) 74 | 75 | outputs = [] 76 | for i in range(self.task_num): 77 | self.rnn_lst[i].flatten_parameters() 78 | task_x = self.rnn_lst[i](transpose(x, 1, 2)) 79 | task_x = F.relu(self.fc1_lst[i](task_x[0][:, -1, :])) 80 | outputs.append(self.fc2_lst[i](task_x).reshape(-1)) 81 | 82 | return outputs 83 | 84 | class MultiTaskHierarchicalLinear(nn.Module): 85 | def __init__(self, shape, task_num): 86 | super(MultiTaskHierarchicalLinear, self).__init__() 87 | self.inp_len = shape[1] 88 | self.inp_size = shape[2] 89 | self.task_num = task_num 90 | 91 | self.hidden_dim = 128 92 | self.fc2_dim = 128 93 | 94 | self.conv1 = nn.Conv1d(in_channels=self.inp_size, out_channels=128, kernel_size=3, padding=1, stride=1) 95 | self.bn1 = nn.BatchNorm1d(128) 96 | self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2) 97 | self.bn2 = nn.BatchNorm1d(256) 98 | self.conv3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3, padding=1, stride=2) 99 | self.bn3 = nn.BatchNorm1d(256) 100 | 101 | self.birnn = nn.LSTM(input_size=256, hidden_size=self.hidden_dim, num_layers=3, batch_first=True, bidirectional=True) 102 | 103 | self.fc1 = nn.Linear(in_features=int(self.hidden_dim * 2), out_features=self.fc2_dim) 104 | self.t1_out = nn.Linear(in_features=self.fc2_dim, out_features=1) 105 | 106 | self.fc2 = nn.Linear(in_features=self.fc2_dim, out_features=self.fc2_dim) 107 | self.t2_out = nn.Linear(in_features=self.fc2_dim, out_features=1) 108 | 109 | def forward(self, x: Variable) -> (Variable): 110 | self.birnn.flatten_parameters() 111 | x = self.bn1(F.relu(self.conv1(transpose(x, 1, 2)))) 112 | x = F.relu(self.bn2(self.conv2(x))) 113 | x = F.relu(self.bn3(self.conv3(x))) 114 | x = self.birnn(transpose(x, 1, 2)) 115 | x = F.relu(self.fc1(x[0][:, -1, :])) 116 | out1 = self.t1_out(x).reshape(-1) 117 | 118 | x = F.relu(self.fc2(x)) 119 | out2 = self.t2_out(x).reshape(-1) 120 | 121 | return [out1, out2] 122 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/confidance_perturbations_estimate.py: -------------------------------------------------------------------------------- 1 | 
import os
2 | import sys
3 | import json
4 | import copy
5 | import h5py
6 | import numpy as np
7 | import torch
8 | from types import SimpleNamespace
9 | from torch import nn
10 | from sklearn.metrics import r2_score
11 | 
12 | #from nets.nets import *
13 | #from nets.trainer import *
14 | 
15 | def add_noise_to_model(model, noise):
16 | tmp_model = copy.deepcopy(model).cuda()
17 | with torch.no_grad():
18 | for param in tmp_model.parameters():
19 | # perturb each parameter in place with zero-mean Gaussian noise of std `noise`
20 | param.add_(torch.normal(0, noise, param.size()).cuda())
21 | return tmp_model
22 | 
23 | 
24 | def compute_confidance(preds, labels):
25 | confs = np.empty((preds.shape[0], preds.shape[2]))
26 | means = np.empty((preds.shape[0], preds.shape[2]))
27 | accs = np.empty(preds.shape[0])
28 | for i in range(preds.shape[0]):
29 | for j in range(preds.shape[2]):
30 | confs[i, j] = np.std(preds[i, :, j])
31 | means[i, j] = np.mean(preds[i, :, j])
32 | accs[i] = r2_score(labels, means[i])
33 | return means, confs, accs
34 | 
35 | 
36 | def test_confidance(model, data, labels, loss_fn, params, verbose=False):
37 | # toggle model to test / inference mode
38 | model.eval()
39 | 
40 | # round sample num to full batches
41 | samp_num = len(labels) - len(labels) % params.bs
42 | 
43 | preds = np.empty((len(params.alphas), params.reps, samp_num))
44 | for i, alpha in enumerate(params.alphas):
45 | for rep in range(params.reps):
46 | loss_sum = 0
47 | acc_sum = 0
48 | tmp_model = add_noise_to_model(model, alpha)
49 | for b_samp in range(0, samp_num, params.bs):
50 | x = torch.tensor(data[b_samp:b_samp + params.bs]).float().cuda()
51 | with torch.no_grad():
52 | y = tmp_model(x)
53 | t = torch.tensor(labels[b_samp:b_samp + params.bs]).float().cuda()
54 | 
55 | loss_sum += loss_fn(y, t).item()
56 | acc_sum += r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
57 | preds[i, rep, b_samp:b_samp + params.bs] = y.data.cpu().numpy()
58 | 
59 | if verbose:
60 | print('Repetition {} alpha: {}, loss: {:.4f}, accuracy: {:.4f}'.format(rep, alpha, loss_sum / (samp_num / params.bs), acc_sum / (samp_num / params.bs)))
61 | 
62 | print('Accuracy for alpha: {} over {} repetitions is: {}'.format(alpha, params.reps, r2_score(labels[:samp_num], np.mean(preds[i], axis=0))))
63 | 
64 | return compute_confidance(preds, labels[:samp_num])
65 | 
66 | 
67 | def main():
68 | cur_dir = os.path.dirname(os.path.realpath(__file__))
69 | if len(sys.argv) < 2:
70 | config_file = os.path.join(cur_dir, "configs/config_confidance.json")
71 | print('No input was given, using {} as configuration file.'.format(config_file))
72 | else:
73 | config_file = sys.argv[1]
74 | 
75 | with open(config_file, 'r') as f:
76 | config = json.load(f)
77 | 
78 | params = SimpleNamespace()
79 | params.reps = config['repetitions']
80 | params.alphas = config['alphas']
81 | params.bs = config['bs']
82 | 
83 | 
84 | data_file = os.path.join('models', 'test_data_' + config['model_file'] + '.h5')
85 | print('Loading data and labels from file {}...'.format(data_file))
86 | h5f = h5py.File(data_file, 'r')
87 | labels = h5f['labels'][:]
88 | data = h5f['data'][:]
89 | 
90 | print('Loading model...')
91 | model = torch.load(os.path.join('models', 'best_model_' + config['model_file'] + '.pt')).cuda()
92 | loss_fn = nn.MSELoss()
93 | 
94 | with torch.no_grad():
95 | for name, param in model.named_parameters():
96 | print(name, np.mean(param.detach().cpu().numpy()), np.std(param.detach().cpu().numpy()))
97 | 
98 | print('Computing prediction and confidence...')
99 | preds, confidance, accs = test_confidance(model, data, labels, loss_fn, 
params) 100 | 101 | #TODO: add downstream task logic 102 | 103 | print('Done!') 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/config_confidance.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_file": "MELA_AU_100_ALL_TRACKS", 3 | "repetitions": 10, 4 | "alphas": [0.01], 5 | "bs": 128 6 | } 7 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/configs/config_confidance_kfold.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_file": "/storage/datasets/cancer/unziped_data_matrices_10000_0_0.8.h5", 3 | "base_path": "/storage/yaari/mutation-density-outputs/kfold", 4 | "repetitions": 1000, 5 | "alpha": 0.01, 6 | "bs": 8192, 7 | "k": 5 8 | } 9 | -------------------------------------------------------------------------------- /DIGDriver/region_model/perturbations_confidance/kfold_test_model_confidance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import json 5 | import copy 6 | import h5py 7 | import numpy as np 8 | import pandas as pd 9 | from types import SimpleNamespace 10 | import torch 11 | from torch import nn 12 | from torch.utils.data import DataLoader 13 | from sklearn.metrics import r2_score 14 | 15 | sys.path.append('/storage/yaari/mutation_density/pytorch/nets/') 16 | sys.path.append('/storage/yaari/mutation_density/pytorch/') 17 | 18 | from cnn_predictors import * 19 | from mut_dataset import * 20 | 21 | def add_noise_to_model(model, noise): 22 | tmp_model = copy.deepcopy(model).cuda() 23 | with torch.no_grad(): 24 | for param in tmp_model.parameters(): 25 | param.add_(torch.normal(0, noise, param.size()).cuda()) 26 | return tmp_model 27 | 28 | def predict(model, data_loader, label_ids): 29 | corr_coef_sums = np.zeros(len(label_ids)) 30 | all_preds = [[] for _ in range(len(label_ids))] 31 | all_true = [[] for _ in range(len(label_ids))] 32 | for j, (X, t_lst) in enumerate(data_loader): 33 | y_lst = model(X.cuda()) 34 | with torch.no_grad(): 35 | for i, t in enumerate(t_lst): 36 | y = y_lst[i] 37 | all_preds[i].extend(y.data.cpu().numpy().tolist()) 38 | all_true[i].extend(t.data.cpu().numpy().tolist()) 39 | return all_preds, all_true, [r2_score(all_preds[i], all_true[i]) for i in range(len(label_ids))] 40 | 41 | 42 | def test_with_perturbations(model, data_loader, label_ids, samp_num, params, fold, verbose=True): 43 | preds = np.empty((samp_num, params.reps)) 44 | for rep in range(params.reps): 45 | tmp_model = add_noise_to_model(model, params.alpha) 46 | tmp_preds, _, acc = predict(tmp_model, data_loader, label_ids) 47 | preds[:, rep] = tmp_preds[0] 48 | 49 | if verbose and rep % 10 == 0: 50 | print('Fold {}, repetition {}, accuracy: {}'.format(fold, rep, acc)) 51 | return preds 52 | 53 | 54 | def main(): 55 | assert len(sys.argv) >= 4, 'Usage: kfold_test_model_confidance.py ' 56 | 57 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 58 | config_path = os.path.join(cur_dir, "../configs/config_confidance_kfold.json") 
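# Config fields (see configs/config_confidance_kfold.json above): 'data_file' is the h5 dataset,
# 'base_path' the root of the kfold model outputs, 'repetitions' the number of weight-noise draws
# per fold, 'alpha' the std of the Gaussian perturbation, 'bs' the batch size, 'k' the fold count.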
59 | with open(config_path, 'r') as f: config = json.load(f)
60 | 
61 | run_id = sys.argv[1]
62 | label_ids = sys.argv[3:]
63 | labels_str = '-'.join(label_ids)
64 | models_dir = os.path.join(config['base_path'], labels_str, sys.argv[2])
65 | 
66 | file_path = config['data_file']
67 | with h5py.File(file_path, 'r') as h5f:
68 | chr_idxs = h5f['idx'][:]
69 | 
70 | k = config['k']
71 | params = SimpleNamespace()
72 | params.reps = config['repetitions']
73 | params.alpha = config['alpha']
74 | params.bs = config['bs']
75 | 
76 | pred_df = pd.DataFrame()
77 | idx = 0
78 | for i in range(k):
79 | print('Running iteration {} out of {} folds...'.format(i + 1, k))
80 | test_idxs = np.sort(np.load(os.path.join(models_dir, 'test_indices_fold_{}.npy'.format(i))))
81 | 
82 | test_ds = SimpleDatasetFromH5(file_path, label_ids, test_idxs, chr_idxs[test_idxs], 'x_data')
83 | test_dl = DataLoader(test_ds, batch_size=params.bs, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
84 | samp_num = len(test_ds)
85 | test_chr_idxs = chr_idxs[test_idxs]
86 | 
87 | print('Loading model...')
88 | model = nn.DataParallel(SimpleMultiTaskResNet(test_ds.get_data_shape(), len(label_ids))).cuda()
89 | state_dict = torch.load(os.path.join(models_dir, 'best_model_fold_{}.pt'.format(i)))
90 | model.load_state_dict(state_dict)
91 | model.eval()
92 | 
93 | print('Computing prediction and confidence...')
94 | preds, labels, acc = predict(model, test_dl, label_ids)
95 | perturp_preds = test_with_perturbations(model, test_dl, label_ids, samp_num, params, i)
96 | 
97 | print('Model accuracy: {}'.format(acc))
98 | print('Storing predictions...')
99 | 
100 | fold_pred_df = pd.DataFrame(data=perturp_preds)
101 | fold_pred_df['chr'] = test_chr_idxs[:,0]
102 | fold_pred_df['s_idx'] = test_chr_idxs[:,1]
103 | fold_pred_df['e_idx'] = test_chr_idxs[:,2]
104 | fold_pred_df['obs_mut'] = labels[0]
105 | fold_pred_df['pred_mut'] = preds[0]
106 | pred_df = pd.concat([pred_df, fold_pred_df], ignore_index=True)
107 | 
108 | out_dir = os.path.join(models_dir, run_id)
109 | out_path = os.path.join(out_dir, 'perturb_predictions.csv')
110 | if not os.path.exists(out_dir):
111 | os.makedirs(out_dir)
112 | print('Saving predictions to {}...'.format(out_path))
113 | pred_df.to_csv(out_path)
114 | 
115 | print('Done!')
116 | 
117 | if __name__ == '__main__':
118 | main()
119 | 
-------------------------------------------------------------------------------- /DIGDriver/region_model/region_model_tools.py: --------------------------------------------------------------------------------
1 | import h5py
2 | import pandas as pd
3 | import numpy as np
4 | import scipy
5 | import scipy.stats
6 | 
7 | def _load_fold_avg_DEPRECATED(f, cancer, test_idx=[], key='held-out'):
8 | """ Low-level loading of a single fold, removing outlier runs
9 | """
10 | hf = h5py.File(f, 'r')
11 | dset = hf[cancer]
12 | 
13 | if key not in dset.keys():
14 | #print("WARNING: {} is not a key in the dataset. 
Defaulting to 'test'".format(key)) 15 | key = 'test' 16 | 17 | runs = [int(key) for key in dset[key].keys() if key.isdigit()] 18 | test_Y = dset[key]['y_true'][:].reshape(-1, 1) 19 | if not len(test_idx): 20 | test_idx = dset[key]['chr_locs'][:] 21 | 22 | # print(test_idx.shape) 23 | test_Yhat_lst = [] 24 | test_std_lst = [] 25 | r2_lst = [] 26 | run_lst = [] 27 | for run in runs: 28 | y_hat = dset[key]['{}'.format(run)]['mean'][:].reshape(-1, 1) 29 | #gets rid of runs with all means predicted the same (casuses nan pearsonr) 30 | # if (y_hat-y_hat.mean()).sum() == 0: 31 | # continue 32 | r2 = scipy.stats.pearsonr(test_Y.squeeze(), y_hat.squeeze())[0]**2 33 | 34 | if np.isnan(r2): 35 | continue 36 | 37 | r2_lst.append(r2) 38 | test_Yhat_lst.append(y_hat) 39 | test_std_lst.append(dset[key]['{}'.format(run)]['std'][:].reshape(-1, 1)) 40 | run_lst.append(run) 41 | # print(r2_lst[-1]) 42 | 43 | hf.close() 44 | r2s = np.array(r2_lst) 45 | # print(r2s) 46 | med = np.median(r2s) 47 | mad = np.median(np.abs(r2s - med)) 48 | 49 | # idx = np.array(run_lst)[r2s > (med - 2*mad)] 50 | idx = np.where(r2s > (np.max(r2s) - 2*mad)) 51 | if not len(idx[0]): 52 | idx = np.arange(len(test_Yhat_lst)) 53 | 54 | test_Yhat = np.array(test_Yhat_lst)[idx].mean(axis = 0) 55 | test_std = np.array(test_std_lst)[idx].mean(axis = 0) 56 | vals = np.hstack([test_idx, test_Y, test_Yhat, test_std]) 57 | df = pd.DataFrame(vals, 58 | columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'] 59 | ) 60 | # print(df[0:5]) 61 | 62 | return df 63 | 64 | def _load_fold_avg(f, cancer, key='held-out', fold=None): 65 | """ Low-level loading of a single fold 66 | """ 67 | h5 = h5py.File(f, 'r') 68 | out_h5 = h5[cancer] 69 | # if not fold: 70 | # fold = int(f.split('.h5')[0].split('_')[-1]) 71 | 72 | assert key in out_h5, 'Cannot compute pretrained model with no saved held-out set. 
Existing fields are: {}'.format(out_h5.keys())
73 | ds = out_h5[key]
74 | 
75 | runs = [key for key in ds.keys() if key.isdigit()]
76 | # test_Y = dset[key]['y_true'][:].reshape(-1, 1)
77 | # test_idx = dset[key]['chr_locs'][:]
78 | 
79 | chr_locs = ds['chr_locs'][:]
80 | mapps = ds['mappability'][:].reshape(-1, 1)
81 | quants = ds['quantiles'][:].reshape(-1, 1)
82 | y_true = ds['y_true'][:].reshape(-1, 1)
83 | mean_lst = []
84 | std_lst = []
85 | 
86 | for i in runs:
87 | mean_lst.append(ds[i]['mean'][:])
88 | std_lst.append(ds[i]['std'][:])
89 | 
90 | means = np.array(mean_lst).mean(axis=0).reshape(-1, 1)
91 | stds = np.array(std_lst).mean(axis=0).reshape(-1, 1)
92 | 
93 | vals = np.hstack([chr_locs, y_true, means, stds, mapps, quants])
94 | df = pd.DataFrame(vals,
95 | columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD', 'MAPP', 'QUANT']
96 | )
97 | # df['FOLD'] = fold
98 | # print(df[0:5])
99 | 
100 | return df
101 | 
102 | def kfold_supmap_results(kfold_path, cancer_str, key='held-out', drop_pos_cols=False, sort=True):
103 | """ Load kfold results for regions above the user-defined mappability threshold
104 | """
105 | fold_files = sorted(kfold_path.glob("gp_results_fold*.h5"))
106 | df_lst = [_load_fold_avg(str(fold), cancer=cancer_str, key=key) for fold in fold_files]
107 | df = pd.concat(df_lst).astype({'CHROM':int,
108 | 'START':int,
109 | 'END':int,
110 | 'Y_TRUE':int,
111 | 'Y_PRED':float,
112 | 'STD':float,
113 | 'MAPP': float,
114 | 'QUANT': float})
115 | # window = int(df.iloc[0]['END'] - df.iloc[0]['START'])
116 | df['FLAG'] = False
117 | df['Region'] = ['chr{}:{}-{}'.format(row[0], row[1], row[2]) \
118 | for row in zip(df.CHROM, df.START, df.END)]
119 | 
120 | if sort:
121 | df = df.sort_values(by=['CHROM', 'START'])
122 | 
123 | if drop_pos_cols:
124 | df = df.drop(['CHROM', 'START', 'END'], axis = 1)
125 | 
126 | df.set_index('Region', inplace=True)
127 | 
128 | return df
129 | 
130 | def kfold_submap_results(kfold_path, cancer_str, key='held-out', drop_pos_cols=False, sort=True):
131 | """ Load kfold results for regions below the mappability threshold
132 | """
133 | fold_files = sorted(kfold_path.glob("sub_mapp_results_fold*.h5"))
134 | df_lst = [_load_fold_avg(str(fold), cancer=cancer_str, key=key) for fold in fold_files]
135 | 
136 | a_mean = np.array([df.Y_PRED.values for df in df_lst])
137 | mean = np.mean(a_mean, axis=0)
138 | 
139 | a_std = np.array([df.STD.values for df in df_lst])
140 | std = np.mean(a_std, axis=0)
141 | 
142 | df = pd.DataFrame({'CHROM': df_lst[0].CHROM.values,
143 | 'START': df_lst[0].START.values,
144 | 'END': df_lst[0].END.values,
145 | 'Y_TRUE': df_lst[0].Y_TRUE.values,
146 | 'Y_PRED': mean,
147 | 'STD': std,
148 | 'MAPP': df_lst[0].MAPP.values,
149 | 'QUANT': df_lst[0].QUANT.values,
150 | }
151 | ).astype({'CHROM':int, 'START':int, 'END':int, 'Y_TRUE':int,
152 | 'Y_PRED':float, 'STD':float, 'MAPP':float, 'QUANT':float})
153 | 
154 | # window = int(df.iloc[0]['END'] - df.iloc[0]['START'])
155 | df['FLAG'] = True
156 | df['Region'] = ['chr{}:{}-{}'.format(row[0], row[1], row[2]) \
157 | for row in zip(df.CHROM, df.START, df.END)]
158 | 
159 | if sort:
160 | df = df.sort_values(by=['CHROM', 'START'])
161 | 
162 | if drop_pos_cols:
163 | df = df.drop(['CHROM', 'START', 'END'], axis = 1)
164 | 
165 | df.set_index('Region', inplace=True)
166 | 
167 | return df #, window
168 | 
169 | def kfold_results(kfold_path, cohort_name, key='held-out'):
170 | """ Load kfold results and remove outlier runs
171 | """
172 | try:
173 | df_sup = 
kfold_supmap_results(kfold_path, cohort_name, key=key) 174 | df_sub = kfold_submap_results(kfold_path, cohort_name, key=key) 175 | except: 176 | # except KeyError as e: 177 | raise Exception('ERROR: failed to load kfold {}. You should rerun the CNN+GP kfold.'.format(kfold_path)) 178 | # print('FAIL: {}'.format(kfold_path)) 179 | # print('\nERROR: uh oh there was an error loading the kfold results.') 180 | # print('This probably means a CNN+GP run crashed (it happens).') 181 | # print('Rerunning the CNN+GP kfold should fix the problem') 182 | 183 | # print(scipy.stats.pearsonr(df_sup.Y_TRUE, df_sup.Y_PRED)[0]**2) 184 | # print(scipy.stats.pearsonr(df_sub.Y_TRUE, df_sub.Y_PRED)[0]**2) 185 | 186 | df = pd.concat([df_sup, df_sub]).sort_values(by=['CHROM', 'START']) 187 | df_dedup = df.drop_duplicates(['CHROM', 'START', 'END']) 188 | assert len(df) == len(df_dedup), \ 189 | "Oh snap! There are duplicate entries in the folds. You should rerun this kfold." 190 | 191 | print(scipy.stats.pearsonr(df[~df.FLAG].Y_TRUE, df[~df.FLAG].Y_PRED)[0]**2) 192 | 193 | return df 194 | -------------------------------------------------------------------------------- /DIGDriver/region_model/train_nn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## This is an **example** command to fit neural network predictions for 10kb regions of the PCAWG pan-cancer cohort. 4 | ## 5 | ## NOTE: THE PROCESS REQUIRES INPUT DATA TOO LARGE TO BE INCLUDED IN THIS GITHUB REPO. 6 | ## CONTACT THE AUTHORS TO ENSURE YOU HAVE THE NECESSARY INPUT FILES AND COMPUTE RESROUCES 7 | ## IF YOU WANT TO CREATE MUTATION RATE MAPS FROM YOUR OWN WGS DATASETS. 8 | 9 | python mutations_main.py -c Pancan_SNV 10 | -------------------------------------------------------------------------------- /DIGDriver/region_model/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DIGDriver/region_model/trainers/gp_trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import torch 4 | import gpytorch 5 | from sklearn.preprocessing import StandardScaler 6 | import warnings 7 | 8 | warnings.filterwarnings("ignore", category=RuntimeWarning) 9 | 10 | 11 | class bcolors: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKCYAN = '\033[96m' 15 | OKGREEN = '\033[92m' 16 | WARNING = '\033[93m' 17 | FAIL = '\033[91m' 18 | ENDC = '\033[0m' 19 | BOLD = '\033[1m' 20 | UNDERLINE = '\033[4m' 21 | 22 | 23 | def r2_score(y_true, y_pred): 24 | r2 = scipy.stats.pearsonr(y_true, y_pred)[0]**2 25 | return r2 if not np.isnan(r2) else 0 26 | 27 | 28 | class SparseGP(gpytorch.models.ExactGP): 29 | def __init__(self, train_x, train_y, likelihood, n_inducing=2000): 30 | super(SparseGP, self).__init__(train_x, train_y, likelihood) 31 | 32 | self.mean_module = gpytorch.means.ConstantMean() 33 | 34 | base_cov_module = gpytorch.kernels.ScaleKernel( 35 | gpytorch.kernels.RBFKernel()) 36 | 37 | self.covar_module = gpytorch.kernels.InducingPointKernel( 38 | base_cov_module, 39 | inducing_points=train_x[:n_inducing, :], 40 | likelihood=likelihood) 41 | 42 | def forward(self, x): 43 | mean_x = self.mean_module(x) 44 | covar_x = self.covar_module(x) 45 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 46 | 47 | def fit_params(self, train_x, train_y, likelihood, n_iter=100): 48 | pass 49 | 50 | def 
predict(self, val_x):
51 | pass
52 | 
53 | 
54 | class GPTrainer:
55 | samp_bound = int(1.5e5)
56 | 
57 | def __init__(self, device, train_tup, val_tup, heldout_tup=None, n_iter=50, n_inducing=500):
58 | self.device = device
59 | self.n_iter = n_iter
60 | self.n_inducing = n_inducing
61 | self.org_train_x = train_tup[0]
62 | self.org_train_y = train_tup[1]
63 | self.train_chr_locations = train_tup[2]
64 | self.train_mappability = train_tup[3]
65 | self.train_quantiles = train_tup[4]
66 | self.org_val_x = val_tup[0]
67 | self.org_val_y = val_tup[1]
68 | self.val_chr_locations = val_tup[2]
69 | self.val_mappability = val_tup[3]
70 | self.val_quantiles = val_tup[4]
71 | 
72 | self.train_x, self.train_y, scaler, self.y_mean, self.y_std = self.standardize(train_tup[0], train_tup[1])
73 | self.val_x, self.val_y, _, _, _ = self.standardize(val_tup[0],
74 | val_tup[1],
75 | scaler,
76 | self.y_mean,
77 | self.y_std)
78 | 
79 | self.idx_feat = np.where(np.abs(self.train_x).mean(axis=0) > 0)[0]
80 | train_size = self.train_x.shape[0]
81 | if train_size > self.samp_bound: # upper bound number of samples to fit on GPU memory
82 | samp_idxs = np.random.choice(self.train_x.shape[0], size=self.samp_bound, replace=False)
83 | assert len(np.unique(samp_idxs)) == len(samp_idxs)
84 | self.train_x = self.train_x[samp_idxs]
85 | self.train_y = self.train_y[samp_idxs]
86 | print('Reduced train set size from {} to {}, to stay within memory limits'.format(train_size, self.samp_bound))
87 | 
88 | self.train_x = self.train_x[:, self.idx_feat]
89 | self.val_x = self.val_x[:, self.idx_feat]
90 | print('After zero features reduction feature vectors are now of size: {}'.format(self.train_x.shape[1]))
91 | 
92 | if heldout_tup is not None:
93 | self.org_ho_x = heldout_tup[0]
94 | self.org_ho_y = heldout_tup[1]
95 | self.ho_chr_locations = heldout_tup[2]
96 | self.ho_mappability = heldout_tup[3]
97 | self.ho_quantiles = heldout_tup[4]
98 | self.held_x, self.held_y, _, _, _ = self.standardize(heldout_tup[0],
99 | heldout_tup[1],
100 | scaler,
101 | self.y_mean,
102 | self.y_std)
103 | self.held_x = self.held_x[:, self.idx_feat]
104 | else:
105 | self.held_x, self.held_y = None, None
106 | 
107 | def standardize(self, X, Y, scaler=None, y_mean=None, y_std=None):
108 | 
109 | if scaler is None:
110 | scaler = StandardScaler()
111 | scaler.fit(X)
112 | 
113 | if y_mean is None:
114 | y_mean = Y.mean()
115 | y_std = Y.std()
116 | 
117 | x = scaler.transform(X)
118 | y = (Y - y_mean) / y_std
119 | 
120 | return x, y, scaler, y_mean, y_std
121 | 
122 | def train_model(self):
123 | X = torch.tensor(self.train_x).float().contiguous().to(self.device)
124 | y = torch.tensor(self.train_y).float().contiguous().to(self.device)
125 | likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device)
126 | model = SparseGP(X, y, likelihood, n_inducing=self.n_inducing).to(self.device)
127 | model.train()
128 | likelihood.train()
129 | 
130 | optimizer = torch.optim.Adam([{'params': model.parameters()}], lr=0.8)
131 | 
132 | # "Loss" for GPs - the marginal log likelihood
133 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
134 | 
135 | for i in range(self.n_iter):
136 | optimizer.zero_grad()
137 | y_pred = model(X)
138 | loss = -mll(y_pred, y)
139 | loss.backward()
140 | optimizer.step()
141 | 
142 | # delete variables to clear memory
143 | del X
144 | del y
145 | del loss
146 | del optimizer
147 | del mll
148 | return model, likelihood
149 | 
150 | def predict(self, model, likelihood, x, y):
151 | model.eval()
152 | likelihood.eval()
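# In eval mode the forward pass returns the posterior predictive distribution; the
# gpytorch.settings.fast_pred_var() context used below speeds up the variance computation.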
153 | 
154 |         # "Loss" for GPs - the marginal log likelihood
155 |         mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
156 | 
157 |         X = torch.tensor(x).float().contiguous().to(self.device)
158 |         y_true = torch.tensor(y).float().contiguous().to(self.device)
159 |         print('Predicting over {} samples.'.format(X.size(0)))
160 |         with torch.no_grad(), gpytorch.settings.fast_pred_var():
161 |             y_pred = model(X)
162 |             loss = -mll(y_pred, y_true)
163 |             y_hat = y_pred.mean.cpu().numpy()
164 |             y_std = y_pred.stddev.cpu().numpy()
165 | 
166 |         # delete variables to clear memory
167 |         del X
168 |         return y_hat, y_std, loss.item()
169 | 
170 |     @staticmethod
171 |     def get_results_dict(mean, std, r2, loss, params):
172 |         return {'gp_mean': mean, 'gp_std': std, 'r2': r2, 'loss': loss, 'params': params}
173 | 
174 |     def run(self):
175 |         torch.cuda.empty_cache()
176 | 
177 |         # Train model
178 |         #with gpytorch.settings.cg_tolerance(1e9), gpytorch.settings.debug(False):
179 |         model, likelihood = self.train_model()
180 | 
181 |         # Validate model
182 |         #with gpytorch.settings.eval_cg_tolerance(1e6):
183 |         val_mean, val_std, val_loss = self.predict(model, likelihood, self.val_x, self.val_y)
184 |         val_r2 = r2_score(self.val_y, val_mean)
185 |         print(bcolors.OKCYAN + 'Validation set R2: {}'.format(val_r2) + bcolors.ENDC)
186 | 
187 |         params = np.array([model.covar_module.base_kernel.base_kernel.lengthscale.item(),
188 |                            model.covar_module.base_kernel.outputscale.item(),
189 |                            likelihood.noise_covar.noise.item()])
190 | 
191 |         val_res = self.get_results_dict(val_mean * self.y_std + self.y_mean,
192 |                                         val_std * self.y_std,
193 |                                         val_r2, val_loss, params)
194 | 
195 |         if self.held_x is not None:
196 |             #with gpytorch.settings.eval_cg_tolerance(1e6):
197 |             hld_mean, hld_std, hld_loss = self.predict(model, likelihood, self.held_x, self.held_y)
198 |             hld_r2 = r2_score(self.held_y, hld_mean)
199 |             print(bcolors.OKCYAN + 'Held-out set R2: {}'.format(hld_r2) + bcolors.ENDC)
200 |             hld_res = self.get_results_dict(hld_mean * self.y_std + self.y_mean,
201 |                                             hld_std * self.y_std,
202 |                                             hld_r2, hld_loss,
203 |                                             params)
204 |             return val_res, hld_res
205 |         return val_res, None
206 | 
207 |     def save_results(self, val_res_dict, held_res_dict, h5_file, run_id):
208 |         print('Saving GP {} results'.format(int(run_id) + 1))
209 |         if 'train' not in h5_file:
210 |             train_grp = h5_file.create_group('train')
211 |             train_grp.create_dataset('nn_features', data=self.org_train_x)
212 |             train_grp.create_dataset('y_true', data=self.org_train_y)
213 |             train_grp.create_dataset('chr_locs', data=np.array(self.train_chr_locations))
214 |             train_grp.create_dataset('mappability', data=np.array(self.train_mappability))
215 |             train_grp.create_dataset('quantiles', data=np.array(self.train_quantiles))
216 |         if 'val' not in h5_file:
217 |             val_grp = h5_file.create_group('val')
218 |             val_grp.create_dataset('nn_features', data=self.org_val_x)
219 |             val_grp.create_dataset('y_true', data=self.org_val_y)
220 |             val_grp.create_dataset('chr_locs', data=np.array(self.val_chr_locations))
221 |             val_grp.create_dataset('mappability', data=np.array(self.val_mappability))
222 |             val_grp.create_dataset('quantiles', data=np.array(self.val_quantiles))
223 | 
224 |         val_run_grp = h5_file['val'].create_group(run_id)
225 |         val_run_grp.create_dataset('mean', data=val_res_dict['gp_mean'])
226 |         val_run_grp.create_dataset('std', data=val_res_dict['gp_std'])
227 |         val_run_grp.create_dataset('params', data=val_res_dict['params'])
228 |         val_run_grp.attrs['R2'] = val_res_dict['r2']
229 |         val_run_grp.attrs['loss'] = val_res_dict['loss']
230 | 
231 |         if held_res_dict is not None:
232 |             if 'held-out' not in h5_file:
233 |                 ho_grp = h5_file.create_group('held-out')
234 |                 ho_grp.create_dataset('nn_features', data=self.org_ho_x)
235 |                 ho_grp.create_dataset('y_true', data=self.org_ho_y)
236 |                 ho_grp.create_dataset('chr_locs', data=np.array(self.ho_chr_locations))
237 |                 ho_grp.create_dataset('mappability', data=np.array(self.ho_mappability))
238 |                 ho_grp.create_dataset('quantiles', data=np.array(self.ho_quantiles))
239 | 
240 |             ho_run_grp = h5_file['held-out'].create_group(run_id)
241 |             ho_run_grp.create_dataset('mean', data=held_res_dict['gp_mean'])
242 |             ho_run_grp.create_dataset('std', data=held_res_dict['gp_std'])
243 |             ho_run_grp.create_dataset('params', data=held_res_dict['params'])
244 |             ho_run_grp.attrs['R2'] = held_res_dict['r2']
245 |             ho_run_grp.attrs['loss'] = held_res_dict['loss']
246 |         return val_res_dict['r2'], held_res_dict['r2']
247 | 
248 |     def compute_pretrained(self, out_h5, runs_num):
249 |         assert 'held-out' in out_h5, 'Cannot compute pretrained model with no saved held-out set. Existing fields are: {}'.format(out_h5.keys())
250 |         ds = out_h5['held-out']
251 |         chr_locs = ds['chr_locs'][:]
252 |         mapps = ds['mappability'][:]
253 |         quants = ds['quantiles'][:]
254 |         y_true = ds['y_true'][:]
255 |         mean_lst = []
256 |         std_lst = []
257 |         for i in np.arange(runs_num).astype(str):
258 |             mean_lst.append(ds[i]['mean'][:])
259 |             std_lst.append(ds[i]['std'][:])
260 |         means = np.array(mean_lst).mean(axis=0)
261 |         stds = np.array(std_lst).mean(axis=0)
262 |         return chr_locs, mapps, quants, y_true, means, stds
263 | 
--------------------------------------------------------------------------------
/DIGDriver/region_model/trainers/nn_trainer.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import scipy.stats
  4 | import torch.utils.data
  5 | # from sklearn.metrics import r2_score
  6 | from torch.utils.data import DataLoader
  7 | import matplotlib.pyplot as plt
  8 | plt.switch_backend('agg')
  9 | 
 10 | 
 11 | def r2_score(y_true, y_pred):
 12 |     r2 = scipy.stats.pearsonr(y_true, y_pred)[0]**2
 13 |     return r2 if not np.isnan(r2) else 0
 14 | 
 15 | 
 16 | class NNTrainer:
 17 |     def __init__(self, model, optimizer, loss_fn, bs, label_ids, train_ds, test_ds, device, writer=None, get_attention_maps=False):
 18 |         self.device = device
 19 |         self.model = model.to(self.device)
 20 |         self.optimizer = optimizer
 21 |         self.loss_fn = loss_fn
 22 |         self.bs = bs
 23 | 
 24 |         self.train_dataloader = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False, num_workers=16)
 25 |         self.test_dataloader = DataLoader(test_ds, batch_size=bs, shuffle=False, drop_last=False, num_workers=16)
 26 | 
 27 |         self.label_ids = label_ids
 28 |         self.get_attention_maps = get_attention_maps
 29 | 
 30 |         self.writer = writer
 31 | 
 32 |         '''
 33 |         if writer is not None:
 34 |             shape = train_ds.get_data_shape()
 35 |             dummy_input = (torch.zeros(1, shape[1], shape[2]),)
 36 |             print(dummy_input[0].size())
 37 |             self.writer.add_graph(model(), dummy_input, True)
 38 |         '''
 39 | 
 40 |     def train(self, epoch, run, print_interval=10, autoreg=False):
 41 |         # toggle model to train mode
 42 |         self.model.train()
 43 | 
 44 |         samp_ctr = 0
 45 |         batch_num = len(self.train_dataloader)
 46 |         loss_sums = np.zeros(len(self.label_ids))
 47 |         corr_coef_sums = np.zeros(len(self.label_ids))
 48 |         all_preds = [[] for _ in range(len(self.label_ids))]
 49 |         all_true = [[] for _ in range(len(self.label_ids))]
 50 |         all_features_lst = [[] for _ in range(len(self.label_ids))]
 51 |         print('Training epoch {}'.format(epoch))
 52 |         for j, batch in enumerate(self.train_dataloader):
 53 |             t_lst = batch[-1]
 54 |             if autoreg:
 55 |                 y_lst, fv_lst, _ = self.model(batch[0].to(self.device), torch.cat(batch[1], dim=1).to(self.device))
 56 |             else:
 57 |                 y_lst, fv_lst, _ = self.model(batch[0].to(self.device))
 58 |             samp_ctr += batch[0].size()[0]
 59 |             loss_lst = []
 60 |             for i, t in enumerate(t_lst):
 61 |                 y = y_lst[i]
 62 |                 all_preds[i].extend(y.data.cpu().numpy().tolist())
 63 |                 all_true[i].extend(t.data.cpu().numpy().tolist())
 64 |                 all_features_lst[i].extend(fv_lst[i].data.cpu().numpy())
 65 |                 task_loss = self.loss_fn(y, t.to(self.device))  # + torch.norm(attention, p=1, dim=(1,2)).mean()
 66 |                 loss_lst.append(task_loss)
 67 |                 loss_sums[i] += task_loss.item()
 68 |                 corr_coef = r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
 69 |                 corr_coef_sums[i] += corr_coef
 70 | 
 71 |             loss = torch.sum(torch.stack(loss_lst))
 72 |             self.optimizer.zero_grad()
 73 |             loss.backward()
 74 |             self.optimizer.step()
 75 | 
 76 |             if j % max(1, int(batch_num * print_interval / 100)) == 0 and j > 0:  # print progress every print_interval%
 77 |                 print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {}\tAccuracy: {}'.format(
 78 |                     epoch, j, batch_num, 100. * j / batch_num,
 79 |                     loss_sums / (samp_ctr / self.bs), corr_coef_sums / (samp_ctr / self.bs)))
 80 | 
 81 |         train_accs = corr_coef_sums / batch_num
 82 |         train_losses = loss_sums / batch_num
 83 | 
 84 |         print('====> Epoch: {}, Average loss: {}, Average accuracy: {}'.format(epoch, train_losses, train_accs))
 85 | 
 86 |         if self.writer is not None:
 87 |             for i in range(len(self.label_ids)):
 88 |                 self.writer.add_scalar('Train_{}/Loss_{}'.format(run, self.label_ids[i]), train_losses[i], epoch)
 89 |                 self.writer.add_scalar('Train_{}/R^2_{}'.format(run, self.label_ids[i]), train_accs[i], epoch)
 90 | 
 91 |         return train_losses, train_accs, all_features_lst, all_preds, all_true
 92 | 
 93 |     def predict(self, model, dataloader, epoch, run, set_id='Test', autoreg=False):
 94 |         # toggle model to test / inference mode
 95 |         model.eval()
 96 | 
 97 |         batch_num = len(dataloader)
 98 |         loss_sums = np.zeros(len(self.label_ids))
 99 |         corr_coef_sums = np.zeros(len(self.label_ids))
100 |         all_preds = [[] for _ in range(len(self.label_ids))]
101 |         all_true = [[] for _ in range(len(self.label_ids))]
102 |         all_features_lst = [[] for _ in range(len(self.label_ids))]
103 |         all_att = []
104 |         for j, batch in enumerate(dataloader):
105 |             t_lst = batch[-1]
106 |             if autoreg:
107 |                 y_lst, fv_lst, attention = model(batch[0].to(self.device), torch.cat(batch[1], dim=1).to(self.device))
108 |             else:
109 |                 y_lst, fv_lst, attention = model(batch[0].to(self.device))
110 | 
111 |             if self.get_attention_maps: all_att.append(attention.cpu().detach().numpy())
112 |             with torch.no_grad():
113 |                 for i, t in enumerate(t_lst):
114 |                     y = y_lst[i]
115 |                     all_features_lst[i].append(fv_lst[i].cpu().detach().numpy())
116 |                     all_preds[i].extend(y.data.cpu().numpy().tolist())
117 |                     all_true[i].extend(t.data.cpu().numpy().tolist())
118 |                     corr_coef_sums[i] += r2_score(t.data.cpu().numpy(), y.data.cpu().numpy())
119 |                     loss_sums[i] += self.loss_fn(y, t.to(self.device)).item()  # + torch.norm(attention, p=1, dim=(1,2)).mean()
120 |         all_features = [np.concatenate(all_features_lst[j], axis=0) for j in range(len(all_features_lst))]
121 |         test_accs = corr_coef_sums / batch_num
122 |         test_losses = loss_sums / batch_num
123 | 
124 |         print('====> Test set loss: {}, accuracy: {}'.format(test_losses, test_accs))
125 | 
126 |         if self.writer is not None:
127 |             for i in range(len(self.label_ids)):
128 |                 self.writer.add_scalar('{}_{}/Loss_{}'.format(set_id, run, self.label_ids[i]), test_losses[i], epoch)
129 |                 self.writer.add_scalar('{}_{}/R^2_{}'.format(set_id, run, self.label_ids[i]), test_accs[i], epoch)
130 | 
131 |             for name, param in self.model.named_parameters():
132 |                 if 'bn' not in name:
133 |                     self.writer.add_histogram(name, param, epoch)
134 | 
135 |             self.plot_prediction_scatter(dataloader, all_preds, '{}/run_{}/epoch_{}'.format(set_id, run, epoch), test_accs)
136 |             self.plot_prediction_histogram(dataloader, all_preds, '{}/run_{}/epoch_{}'.format(set_id, run, epoch), test_accs)
137 | 
138 |         if self.get_attention_maps:
139 |             return test_losses, test_accs, all_features, all_preds, all_true, np.concatenate(all_att, axis=0)
140 |         else:
141 |             return test_losses, test_accs, all_features, all_preds, all_true, None
142 | 
143 |     def test(self, epoch, run, autoreg=False):
144 |         return self.predict(self.model, self.test_dataloader, epoch, run, autoreg=autoreg)
145 | 
146 |     def plot_prediction_scatter(self, dataloader, preds, writer_id, accs, confidence=99.9):
147 |         for i in range(len(preds)):
148 |             t = np.concatenate([l[i].data.cpu().numpy() for (_, l) in dataloader])
149 |             fig = plt.figure()
150 |             ax = plt.gca()
151 |             y = np.array(preds[i])
152 |             ax.scatter(t, y, alpha=0.3)
153 |             x = np.linspace(*ax.get_xlim())
154 |             ax.plot(x, x)
155 |             ax.set_ylim(0, np.percentile(y, confidence) + 1)
156 |             ax.set_xlim(0, np.percentile(t, confidence) + 1)
157 |             ax.set_xlabel('True')
158 |             ax.set_ylabel('Predicted')
159 |             ax.set_title('Accuracy: {}'.format(np.round(accs[i], 3)))
160 |             self.writer.add_figure('{}/{}/Scatter'
161 |                                    .format(self.label_ids[i], writer_id), fig)
162 | 
163 |     def plot_prediction_histogram(self, dataloader, preds, writer_id, accs, confidence=99.85):
164 |         for i in range(len(preds)):
165 |             t = np.concatenate([l[i].data.cpu().numpy() for (_, l) in dataloader])
166 |             fig = plt.figure()
167 |             ax = plt.gca()
168 |             y = np.array(preds[i])
169 |             y_max_bin = int(np.percentile(y, confidence) + 1)
170 |             t_max_bin = int(np.percentile(t, confidence) + 1)
171 |             max_bin = max(y_max_bin, t_max_bin)
172 |             ax.hist(t, max_bin, (0, max_bin), alpha=0.5)
173 |             ax.hist(y, max_bin, (0, max_bin), alpha=0.5)
174 |             ax.set_xlabel('Mutation Count')
175 |             ax.set_ylabel('Window #')
176 |             ax.set_title('Accuracy: {}'.format(np.round(accs[i], 3)))
177 |             ax.legend(['True', 'Predicted'])
178 |             self.writer.add_figure('{}/{}/Histogram'
179 |                                    .format(self.label_ids[i], writer_id), fig)
180 | 
181 | 
--------------------------------------------------------------------------------
/DIGDriver/sequence_model/__init__.py:
--------------------------------------------------------------------------------
 1 | ## init file for python module
 2 | 
--------------------------------------------------------------------------------
/DIGDriver/sequence_model/gp_tools.py:
--------------------------------------------------------------------------------
  1 | ## Entire module seems to be deprecated; superseded by region_model_tools.
  2 | 
  3 | import pandas as pd
  4 | import numpy as np
  5 | import scipy.stats
  6 | import h5py
  7 | import seaborn as sns
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
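## A minimal notebook-style sketch of how the deprecated loader below was used (the file path and
## cohort key here are hypothetical, not files shipped with the repo):
##     train_idx, y_true, idx, gp_mean, gp_std = load_ensemble('kfold_results.h5', cancer='Pancan_SNV', split='test')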
 11 | ##TODELETE
 12 | def load_ensemble(f, cancer=None, split='test'):
 13 |     ## Load data
 14 |     data_pred = h5py.File(f, 'r')
 15 |     if cancer:
 16 |         dset = data_pred[cancer]
 17 |     else:
 18 |         dset = data_pred
 19 | 
 20 |     try:
 21 |         runs = [key for key in dset[split].keys() if key.isdigit()]  ## NOTE: bad way to find integers used as keys
 22 |         train_idx = dset['train']['chr_locs'][:]
 23 |         y_true = dset[split]['y_true'][:].reshape(-1, 1)
 24 |         idx = dset[split]['chr_locs'][:]
 25 |         gp_mean_lst = [dset[split][str(i)]['mean'][:] for i in runs]
 26 |         gp_std_lst = [dset[split][str(i)]['std'][:] for i in runs]
 27 | 
 28 |     except KeyError:  # fall back to the older results layout
 29 |         reruns = len([key for key in dset[split].keys() if key.startswith('gp_mean')])
 30 |         train_idx = dset['train']['idxs'][:]
 31 |         y_true = dset[split]['true'][0, :].reshape(-1, 1)
 32 |         idx = dset[split]['idxs'][:]
 33 |         gp_mean_lst = [dset[split]['gp_mean_{:02d}'.format(run)][:] for run in range(1, reruns-1)]
 34 |         gp_std_lst = [dset[split]['gp_std_{:02d}'.format(run)][:] for run in range(1, reruns-1)]
 35 | 
 36 |     gp_mean_nd = np.vstack(gp_mean_lst)
 37 |     gp_mean = np.median(gp_mean_nd, axis=0).reshape(-1, 1)
 38 | 
 39 |     gp_std_nd = np.vstack(gp_std_lst)
 40 |     gp_std = np.median(gp_std_nd, axis=0).reshape(-1, 1)
 41 | 
 42 |     data_pred.close()
 43 | 
 44 |     return train_idx, y_true, idx, gp_mean, gp_std
 45 | 
 46 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
 47 | ##TODELETE
 48 | def load_run(f, run, cancer=None, split='test'):
 49 |     hf = h5py.File(f, 'r')
 50 |     if cancer:
 51 |         dset = hf[cancer]
 52 |     else:
 53 |         dset = hf
 54 | 
 55 |     try:
 56 |         train_idx = dset['train']['chr_locs'][:]
 57 |         test_Y = dset[split]['y_true'][:].reshape(-1, 1)
 58 |         test_idx = dset[split]['chr_locs'][:]
 59 |         test_Yhat = dset[split]['{}'.format(run)]['mean'][:].reshape(-1, 1)
 60 |         test_std = dset[split]['{}'.format(run)]['std'][:].reshape(-1, 1)
 61 |     except KeyError:  # fall back to the older results layout
 62 |         train_idx = dset['train']['idxs'][:]
 63 |         test_Y = dset[split]['true'][0, :].reshape(-1, 1)
 64 |         test_idx = dset[split]['idxs'][:]
 65 |         test_Yhat = dset[split]['gp_mean_{:02d}'.format(run)][:].reshape(-1, 1)
 66 |         test_std = dset[split]['gp_std_{:02d}'.format(run)][:].reshape(-1, 1)
 67 | 
 68 |     hf.close()
 69 |     return train_idx, test_Y, test_idx, test_Yhat, test_std
 70 | 
 71 | ##deprecated. gp loading now done in region_model_tools. may be useful in notebooks?
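## A notebook-style sketch of the main entry point below (hypothetical file path); load_fold returns one
## window-level DataFrame with columns CHROM, START, END, Y_TRUE, Y_PRED, STD:
##     df = load_fold('kfold_results.h5', cancer='Pancan_SNV', run='ensemble', split='test')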
 72 | ##TODELETE
 73 | def load_fold(f, cancer=None, run=None, split='test', reruns=10):
 74 |     if run is None:
 75 |         run = pick_gp_by_calibration(f, cancer=cancer, dataset=split)
 76 | 
 77 |     if run == 'ensemble':
 78 |         train_idx, test_Y, test_idx, test_Yhat, test_std = load_ensemble(f, cancer=cancer, split=split)
 79 | 
 80 |     else:
 81 |         train_idx, test_Y, test_idx, test_Yhat, test_std = load_run(f, run, cancer=cancer, split=split)
 82 | 
 83 |     vals = np.hstack([test_idx, test_Y, test_Yhat, test_std])
 84 |     df = pd.DataFrame(vals, columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
 85 | 
 86 |     return df
 87 | 
 88 | def plot_qq_log(pvals, label='', ax=None, rasterized=False, color=None):
 89 |     if not ax:
 90 |         f, ax = plt.subplots(1, 1)
 91 |     exp = -np.log10(np.arange(1, len(pvals) + 1) / len(pvals))
 92 |     pvals_log10_sort = -np.log10(np.sort(pvals))
 93 | 
 94 |     if not color:
 95 |         color = sns.color_palette()[0]
 96 | 
 97 |     ax.plot(exp, pvals_log10_sort, '.', label=label, rasterized=rasterized, color=color)
 98 |     ax.plot(exp, exp, 'k-')
 99 |     # ax.plot(exp, exp, 'r-')
100 | 
101 |     if label:
102 |         ax.legend()
103 | 
104 | def plot_qq(pvals, label='', ax=None, rasterized=False):
105 |     if not ax:
106 |         f, ax = plt.subplots(1, 1)
107 |     exp = (np.arange(1, len(pvals) + 1) / len(pvals))
108 |     pvals_sort = np.sort(pvals)
109 | 
110 |     ax.plot(exp, pvals_sort, '.', label=label, rasterized=rasterized)
111 |     ax.plot(exp, exp, 'r-')
112 | 
113 |     if label:
114 |         ax.legend()
115 | 
116 | 
117 | def calibration_score_by_pvals(pvals):
118 |     alpha = [0.05, 0.01, 0.001, 0.0001]
119 |     alpha_emp = [len(pvals[pvals < a]) / len(pvals) for a in alpha]
120 | 
121 |     return sum([(a-ap)**2 for a, ap in zip(alpha, alpha_emp)])
122 | 
123 | 
124 | # def merge_windows(df, start, end, new_size):
125 | def merge_windows(df, idx_new):
126 |     # bins = np.concatenate([np.arange(start, end, new_size), [end]])
127 | 
128 |     Y_merge = np.array([df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].Y_TRUE.sum() \
129 |                         for row in idx_new])
130 |     Yhat_merge = np.array([df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].Y_PRED.sum() \
131 |                            for row in idx_new])
132 |     std_merge = np.array([np.sqrt((df[(df.CHROM==row[0]) & (df.START >= row[1]) & (df.START < row[2])].STD**2).sum()) \
133 |                           for row in idx_new])
134 | 
135 |     # Y_merge = np.array([df[(df.START >= v1) & (df.START < v2)].Y_TRUE.sum() \
136 |     #                     for v1, v2 in zip(bins[:-1], bins[1:])])
137 |     # Yhat_merge = np.array([df[(df.START >= v1) & (df.START < v2)].Y_PRED.sum() \
138 |     #                        for v1, v2 in zip(bins[:-1], bins[1:])])
139 |     # std_merge = np.array([np.sqrt((df[(df.START >= v1) & (df.START < v2)].STD**2).sum()) \
140 |     #                       for v1, v2 in zip(bins[:-1], bins[1:])])
141 | 
142 |     a_merge = np.hstack([idx_new,
143 |                          Y_merge.reshape(-1, 1),
144 |                          Yhat_merge.reshape(-1, 1),
145 |                          std_merge.reshape(-1, 1)
146 |                          ]
147 |                         )
148 |     # a_merge = np.hstack([bins[:-1].reshape(-1, 1),
149 |     #                      bins[1:].reshape(-1, 1),
150 |     #                      Y_merge.reshape(-1, 1),
151 |     #                      Yhat_merge.reshape(-1, 1),
152 |     #                      std_merge.reshape(-1, 1)
153 |     #                      ]
154 |     #                     )
155 | 
156 |     df_merge = pd.DataFrame(a_merge, columns=['CHROM', 'START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
157 |     # df_merge = pd.DataFrame(a_merge, columns=['START', 'END', 'Y_TRUE', 'Y_PRED', 'STD'])
158 |     # df_merge.insert(0, 'CHROM', df.CHROM.iloc[0])
159 | 
160 |     return df_merge
161 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2021, Adam Yaari, Maxwell Sherman, Oliver Priebe, Bonnie Berger
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![conda badge](https://anaconda.org/mutation_density/digdriver/badges/installer/conda.svg)
 2 | 
 3 | # Welcome to Dig
 4 | Dig builds genome-wide maps of somatic mutation rates in cancer genomes and allows any set of candidate mutations to be tested for an excess of observed mutations compared to the number expected based on the neutral mutation rate.
 5 | 
 6 | ## Web-browseable mutation maps
 7 | Want to visually explore somatic mutation rates across the genome? Check out our [genome browser](https://resgen.io/maxsh/Cancer_Mutation_Maps/views) with maps of predicted and observed mutation counts for 37 types of cancer.
 8 | 
 9 | ## Getting started
10 | See our [wiki](https://github.com/maxwellsh/DIGDriver/wiki) for installation instructions and tutorials.
11 | 
12 | ## Data files
13 | All necessary data files are available from our [data portal](http://cb.csail.mit.edu/cb/DIG/downloads/).
14 | 
15 | ## Citation
16 | Want to learn more about Dig and its biological applications? Check out our preprint [Sherman et al. 2021](https://www.biorxiv.org/content/10.1101/2021.08.03.454669v1).
17 | 
18 | Really want to get into the weeds of the deep-learning model? Check out our [ICLR paper](https://openreview.net/forum?id=KtH8W3S_RE).
19 | 
20 | Please cite both papers if you make use of our resources.
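
## Quick example
The scripts in the `examples/` directory download all required files and run Dig end to end. As a minimal sketch, once the pretrained map and annotated mutation file are downloaded, gene-level driver detection on the PCAWG pan-cancer cohort is a single command (this is the command run by `examples/gene_driver.sh`):

```bash
DigDriver.py geneDriver \
    Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
    Pancan_SNV_MNV_INDEL.Pretrained.h5 \
    --outdir . \
    --outpfx Pancan_SNV_MNV_INDEL.genes
```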
21 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
--------------------------------------------------------------------------------
/conda-recipe/meta.yaml:
--------------------------------------------------------------------------------
  1 | package:
  2 |   name: digdriver
  3 |   version: 0.1
  4 | 
  5 | source:
  6 |   # simply where it sits in git
  7 |   #git_url: https://github.com/AdamYaari/mutation_density.git
  8 |   path: ../
  9 | 
 10 | build: # from HPC support guy
 11 |   number: 0
 12 |   preserve_egg_dir: True
 13 |   script: $PYTHON setup.py install --single-version-externally-managed --record=record.txt
 14 | 
 15 | 
 16 | requirements:
 17 |   # My understanding is that requirements needs to include the package
 18 |   # dependencies (aka other packages called in the package I am uploading)
 19 |   #
 20 |   # package can be built by calling conda build . from within the package folder
 21 |   # given that the folder contains a meta.yaml file like this one and the build.sh file
 22 | 
 23 |   # then I can run
 24 |   # conda convert --platform all /opt/anaconda3/conda-bld/osx-64/r-hdatds-0.1.0-r35_0.tar.bz2
 25 |   # to make the package usable across different platforms
 26 |   build:
 27 |     - r-base=3.5
 28 |     - python=3.7.1
 29 |   host:
 30 |     - python=3.7.1
 31 |     - r-base=3.5
 32 |     - pip=21.1.3
 33 |     - setuptools=49.6.0
 34 |     - bedtools=2.30.0
 35 | 
 36 |     - r-seqinr=3.6_1
 37 |     - r-MASS=7.3_51.6
 38 |     - bioconductor-genomicranges=1.34.0
 39 |     - bioconductor-biostrings=2.50.2
 40 |     - bioconductor-iranges=2.16.0
 41 |     - bioconductor-rsamtools=1.34.0
 42 |     - r-poilog=0.4
 43 |     - r-plyr=1.8.6
 44 |     - numpy=1.21.0
 45 |     - scipy=1.5.3
 46 |     - statsmodels=0.12.2
 47 |     - pandas=1.3.0
 48 |     - h5py=3.1.0
 49 |     - pysam=0.15.3
 50 |     - pybedtools=0.8.1
 51 |     - pybbi=0.3.0
 52 |     - seaborn=0.11.1
 53 |     - pytables=3.6.1
 54 | 
 55 |   run:
 56 |     - python=3.7.1
 57 |     - r-base=3.5
 58 |     - pip=21.1.3
 59 |     - setuptools=49.6.0
 60 |     - bedtools=2.30.0
 61 | 
 62 |     - r-seqinr=3.6_1
 63 |     - r-MASS=7.3_51.6
 64 |     - bioconductor-genomicranges=1.34.0
 65 |     - bioconductor-biostrings=2.50.2
 66 |     - bioconductor-iranges=2.16.0
 67 |     - bioconductor-rsamtools=1.34.0
 68 |     - r-poilog=0.4
 69 |     - r-plyr=1.8.6
 70 |     - numpy=1.21.0
 71 |     - scipy=1.5.3
 72 |     - statsmodels=0.12.2
 73 |     - pandas=1.3.0
 74 |     - h5py=3.1.0
 75 |     - pysam=0.15.3
 76 |     - pybedtools=0.8.1
 77 |     - pybbi=0.3.0
 78 |     - seaborn=0.11.1
 79 |     - pytables=3.6.1
 80 | 
 81 | test:
 82 |   commands:
 83 |     # You can put additional test commands to be run here.
 84 | 
 85 |     # You can also put a file called run_test.py, run_test.sh, or run_test.bat
 86 |     # in the recipe that will be run at test time.
 87 | 
 88 |   # requires:
 89 |     # Put any additional test requirements here.
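    # As a hypothetical illustration (not part of the recipe as shipped), a minimal
    # smoke test here could simply check the CLI entry point:
    #   - DigDriver.py --help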
 90 | 
 91 | 
 92 | 
 93 | about:
 94 |   # user-oriented info to be displayed in anaconda.org
 95 |   home: https://github.com/AdamYaari/mutation_density
 96 |   license: BSD-3-Clause
 97 |   summary: Dig is a computational method that leverages transfer-learning to test for positive selection across arbitrary genomic elements in arbitrary cohorts while requiring only the resources of a personal computer
 98 |   Reference: http://cb.csail.mit.edu/cb/DIG/
 99 |   license_family: BSD
100 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | These scripts automatically run examples from https://github.com/maxwellsh/DIGDriver/wiki/05:-Analyzing-existing-annotations.
 2 | 
 3 | They can be executed in any environment with Bash. They require:
 4 | 
 5 | * Dig to be installed.
 6 | * 4-6 GB of free memory.
 7 | 
 8 | Both `noncoding_driver.sh` and `mutation_driver.sh` can analyze different annotations by commenting in the relevant lines in the script.
 9 | 
--------------------------------------------------------------------------------
/examples/gene_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Check that DigDriver.py is in PATH
 7 | [[ $(type -P "DigDriver.py") ]] ||
 8 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
 9 | 
10 | ## Download files as necessary
11 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
12 | 
13 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
14 | 
15 | ## Run DigDriver
16 | echo -e "Running DigDriver.py...\n"
17 | DigDriver.py geneDriver \
18 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
19 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
20 |     --outdir . \
21 |     --outpfx Pancan_SNV_MNV_INDEL.genes
22 | 
--------------------------------------------------------------------------------
/examples/mutation_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Annotation to be analyzed. Comment-in the desired annotation.
 7 | DRIVERS="grch37.spliceAI_CRYPTIC.noncoding.txt.gz"
 8 | NAME="spliceAI_cryptic_noncoding"
 9 | 
10 | # DRIVERS="grch37.spliceAI_CRYPTIC.txt.gz"
11 | # NAME="spliceAI_cryptic_all"
12 | 
13 | # DRIVERS="grch37.spliceAI_CANONICAL.txt.gz"
14 | # NAME="spliceAI_canonical"
15 | 
16 | # DRIVERS="grch37.spliceAI_CRYPTIC.coding.txt.gz"
17 | # NAME="spliceAI_cryptic_coding"
18 | 
19 | ## Check that DigDriver.py is in PATH
20 | [[ $(type -P "DigDriver.py") ]] ||
21 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
22 | 
23 | ## Download files as necessary
24 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
25 | 
26 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
27 | 
28 | [[ ! -f "$DRIVERS" ]] && { echo -e "Downloading $DRIVERS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/annotions/splicing/$DRIVERS"; echo -e "\n"; }
29 | 
30 | ## Run DigDriver
31 | echo -e "Running DigDriver.py...\n"
32 | DigDriver.py elementDriver \
33 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
34 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
35 |     $NAME \
36 |     --f-sites $DRIVERS \
37 |     --outpfx Pancan_SNV_MNV_INDEL.$NAME \
38 |     --outdir .
39 | 
--------------------------------------------------------------------------------
/examples/noncoding_driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL="Pancan_SNV_MNV_INDEL.Pretrained.h5"
 4 | MUTS="Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz"
 5 | 
 6 | ## Annotation to be analyzed. Comment-in the desired annotation.
 7 | REGION="grch37.PCAWG_noncoding.bed"
 8 | NAME="PCAWG_all_elts"
 9 | 
10 | # REGION="grch37.canonical_5utr_with_splice.bed"
11 | # NAME="utr5_w_splice"
12 | 
13 | # REGION="grch37.TP53_5UTR_exon1.bed"
14 | # NAME="TP53_5UTR"
15 | 
16 | ## Check that DigDriver.py is in PATH
17 | [[ $(type -P "DigDriver.py") ]] ||
18 |     { echo "DigDriver.py is NOT in PATH. Please ensure Dig is installed." 1>&2; exit 1; }
19 | 
20 | ## Download files as necessary
21 | [[ ! -f "$MODEL" ]] && { echo -e "Downloading $MODEL\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_maps/$MODEL"; echo -e "\n"; }
22 | 
23 | [[ ! -f "$MUTS" ]] && { echo -e "Downloading $MUTS\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/mutation_files/PCAWG/ICGC_only/$MUTS"; echo -e "\n"; }
24 | 
25 | [[ ! -f "$REGION" ]] && { echo -e "Downloading $REGION\n"; wget -nv --show-progress "http://cb.csail.mit.edu/cb/DIG/downloads/annotions/noncoding/$REGION"; echo -e "\n"; }
26 | 
27 | ## Run DigDriver
28 | echo -e "Running DigDriver.py...\n"
29 | DigDriver.py elementDriver \
30 |     Pancan_SNV_MNV_INDEL.ICGC.annot.txt.gz \
31 |     Pancan_SNV_MNV_INDEL.Pretrained.h5 \
32 |     $NAME \
33 |     --f-bed $REGION \
34 |     --outpfx Pancan_SNV_MNV_INDEL.$NAME \
35 |     --outdir .
36 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | scipy
 3 | statsmodels
 4 | pandas
 5 | h5py
 6 | pysam
 7 | pybedtools
 8 | pybbi>=0.2.0
 9 | seaborn
10 | tables
11 | 
--------------------------------------------------------------------------------
/scripts/filter_hypermut.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import pkg_resources
 5 | import pathlib
 6 | import os
 7 | import argparse
 8 | 
 9 | from DIGDriver.data_tools import mutation_tools
10 | 
11 | if __name__ == "__main__":
12 | 
13 |     parser = argparse.ArgumentParser(description='Filter hypermutated samples.')
14 |     parser.add_argument('--suffix', default='annot.txt', help='suffix of Dig mutation files to filter')
15 |     parser.add_argument('--max-muts-per-sample', default=3000, type=int, help='Maximum number of coding mutations allowed per sample. Samples with more coding mutations will be filtered.')
16 |     args = parser.parse_args()
17 | 
18 |     if not os.path.isdir("filter_hypermut"):
19 |         os.mkdir("filter_hypermut")
20 | 
21 |     paths = sorted(pathlib.Path('.').glob('*'+args.suffix))
22 | 
23 |     for f in paths:
24 |         df = mutation_tools.read_mutation_file(str(f), drop_duplicates=True)
25 |         df_mut = df[df.GENE != '.']
26 |         _, sample_blacklist = mutation_tools.filter_hypermut_samples(df_mut,
27 |                                                                      max_muts_per_sample=args.max_muts_per_sample,
28 |                                                                      return_blacklist=True
29 |                                                                      )
30 |         df_out = df[~df.SAMPLE.isin(sample_blacklist)]
31 |         print(f.name, df.shape, df_out.shape)
32 |         f_out = os.path.join("filter_hypermut", f.name.split('.annot.txt')[0] + ".no_hypermut.annot.txt")
33 |         df_out.to_csv(f_out, header=False, index=False, sep="\t")
34 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | import os
 3 | import io
 4 | 
 5 | 
 6 | HERE = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | 
 9 | def read(*parts, **kwargs):
10 |     filepath = os.path.join(HERE, *parts)
11 |     encoding = kwargs.pop("encoding", "utf-8")
12 |     with io.open(filepath, encoding=encoding) as fh:
13 |         text = fh.read()
14 |     return text
15 | 
16 | 
17 | def get_requirements(path):
18 |     content = read(path)
19 |     return [req for req in content.split("\n") if req != "" and not req.startswith("#")]
20 | 
21 | 
22 | # setup_requires = ["numpy"]
23 | 
24 | install_requires = get_requirements("requirements.txt")
25 | 
26 | setup(
27 |     name="DIGDriver",
28 |     version="0.2.0",
29 |     description="Flexible cancer driver element detection",
30 |     author="Maxwell Sherman",
31 |     author_email="msherman997@gmail.com",
32 |     url="",
33 |     packages=find_packages(),
34 |     # packages=["DIGDriver"],
35 |     # setup_requires=setup_requires,
36 |     install_requires=install_requires,
37 |     scripts=["scripts/DataExtractor.py",
38 |              "scripts/DigPretrain.py",
39 |              "scripts/DigPreprocess.py",
40 |              "scripts/mutationFunction.R",
41 |              "scripts/DigDriver.py"
42 |              ],
43 |     include_package_data=True,
44 |     package_data={'': ['data/*']},
45 |     # entry_points={"console_scripts": ["clodius = clodius.cli.aggregate:cli"]},
46 | )
47 | 
--------------------------------------------------------------------------------