├── .gitignore ├── COSMIC └── cosmic_67_for_HotSpot3D_missense_only.tsv.bz2 ├── Demo ├── Expected │ ├── cmd_list_submit_file │ ├── demo.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters │ ├── demo.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters.summary │ ├── demo.l0.ad20.pairwise │ ├── demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters │ ├── demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters.summary │ ├── demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.weight.clusters │ ├── demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.weight.clusters.summary │ └── demo.weighted.l0.ad20.pairwise ├── README_demo ├── demo.maf └── demo.weighted.maf ├── HotSpot3D-1.8.2.tar.gz ├── LICENSE.md ├── README.md ├── bin └── hotspot3d ├── dist.ini ├── lib └── TGI │ ├── Data │ ├── CleanNumber.pm │ └── StringTemplate.pm │ ├── Files │ ├── Clusters.pm │ ├── DrugPairs.pm │ ├── File.pm │ ├── List.pm │ ├── MAF.pm │ ├── Pairwise.pm │ └── PyMOL.pm │ ├── Mutpro │ ├── Distance.pm │ ├── Main │ │ ├── AllMain.pm │ │ ├── BruteForceClustersLines.R │ │ ├── Cluster.pm │ │ ├── ColorScore.R │ │ ├── Density.pm │ │ ├── HorizClustersLines.R │ │ ├── MafSimulator.pm │ │ ├── MembershipProbability.R │ │ ├── Network.pm │ │ ├── Post.pm │ │ ├── Proximity.pm │ │ ├── Significance.pm │ │ ├── Summary.pm │ │ └── Visual.pm │ ├── Pair.pm │ └── Preprocess │ │ ├── AllPreprocess.pm │ │ ├── AminoAcid.pm │ │ ├── Anno.pm │ │ ├── Calpro.pm │ │ ├── Calroi.pm │ │ ├── Complicated.pm │ │ ├── Cosmic.pm │ │ ├── Drugport.pm │ │ ├── Homolog.pm │ │ ├── HugoGene.pm │ │ ├── HugoGeneMethods.pm │ │ ├── PdbStructure.pm │ │ ├── Peptide.pm │ │ ├── Point.pm │ │ ├── Prep.pm │ │ ├── Prior.pm │ │ ├── Statis.pm │ │ ├── Trans.pm │ │ ├── Uniprot.pm │ │ └── Uppro.pm │ ├── ProteinVariant.pm │ └── Variant.pm ├── scripts ├── DensityScripts │ ├── ClusterProbability.pl │ ├── ClustersLines.R │ ├── DensityAll.pl │ ├── DensityVisual.pl │ ├── EasyClustersLines.R │ ├── HorizClustersLines.R │ ├── MembershipProbability.R 
│ ├── OpticsWithR.pl │ ├── PlotR.R │ ├── README.md │ └── SuperClustersID.pl ├── MafSimulatorScripts │ └── simulate.pl ├── README.annotations ├── addRandomWeight.pl ├── annotate_clusters_HGNC_Kinase.pl ├── annotate_clusters_MAF.pl ├── annotate_clusters_PDB.pl ├── annotate_clusters_domains.pl ├── annotate_clusters_drug_class.pl ├── annotate_clusters_families.pl ├── clusterPDBPresence.drug.pl ├── clusterPDBPresence.pl ├── determine_transcript_lengths.pl ├── filter_PDB.pl ├── genePDBPresence.pl ├── hotspot3d.main.sh └── nStructures.pl └── t └── foo.t /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | *.swo 4 | -------------------------------------------------------------------------------- /COSMIC/cosmic_67_for_HotSpot3D_missense_only.tsv.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ding-lab/hotspot3d/479aa2a97cd67d3b3036d14a11b7c55d8a412117/COSMIC/cosmic_67_for_HotSpot3D_missense_only.tsv.bz2 -------------------------------------------------------------------------------- /Demo/Expected/cmd_list_submit_file: -------------------------------------------------------------------------------- 1 | bsub -oo prep//Logs/Q13485.err.log -R 'select[type==LINUX64 && mem>16000] rusage[mem=16000]' -M 16000000 'hotspot3d calpro --output-dir=prep/ --pdb-file-dir=pdb/ --uniprot-id=Q13485 --3d-distance-cutoff=20 --linear-cutoff=0' 2 | bsub -oo prep//Logs/Q15796.err.log -R 'select[type==LINUX64 && mem>16000] rusage[mem=16000]' -M 16000000 'hotspot3d calpro --output-dir=prep/ --pdb-file-dir=pdb/ --uniprot-id=Q15796 --3d-distance-cutoff=20 --linear-cutoff=0' 3 | bsub -oo prep//Logs/P84022.err.log -R 'select[type==LINUX64 && mem>16000] rusage[mem=16000]' -M 16000000 'hotspot3d calpro --output-dir=prep/ --pdb-file-dir=pdb/ --uniprot-id=P84022 --3d-distance-cutoff=20 --linear-cutoff=0' 4 | 
-------------------------------------------------------------------------------- /Demo/Expected/demo.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters: -------------------------------------------------------------------------------- 1 | Cluster Gene/Drug Mutation/Gene Degree_Connectivity Closeness_Centrality Geodesic_From_Centroid Weight Chromosome Start Stop Reference Alternate Transcript Alternative_Transcripts 2 | 0.0 SMAD2 p.D450N 11 0.154637816475446 7.748 1 18 45368254 45368254 C T ENST00000262160 ENST00000356825:p.D420N|ENST00000402690:p.D450N|ENST00000586040:p.D420N 3 | 0.0 SMAD3 p.Q405L 9 0.0723495051092592 9.887 1 15 67482810 67482810 A T ENST00000327367 ENST00000439724:p.Q361L|ENST00000537194:p.Q210L|ENST00000540846:p.Q300L|ENST00000560424:p.S136C 4 | 0.0 SMAD4 p.D355G 14 0.636981210492366 5.6274 1 18 48591901 48591901 A G ENST00000342988 ENST00000398417:p.D355G|ENST00000588745:p.D259G 5 | 0.0 SMAD4 p.D537E 17 3.12450111385116 8.514 1 18 48604789 48604789 C A ENST00000342988 ENST00000398417:p.D537E|ENST00000588745:p.D441E 6 | 0.0 SMAD4 p.D537G 17 3.12450111385116 8.514 1 18 48604788 48604788 A G ENST00000342988 ENST00000398417:p.D537G|ENST00000588745:p.D441G 7 | 0.0 SMAD4 p.D537V 17 3.12450111385116 8.514 1 18 48604788 48604788 A T ENST00000342988 ENST00000398417:p.D537V|ENST00000588745:p.D441V 8 | 0.0 SMAD4 p.D537Y 17 3.12450111385116 8.514 1 18 48604787 48604787 G T ENST00000342988 ENST00000398417:p.D537Y|ENST00000588745:p.D441Y 9 | 0.0 SMAD4 p.P356L 17 0.252038708531035 8.254 1 18 48591904 48591904 C T ENST00000342988 ENST00000398417:p.P356L|ENST00000588745:p.P260L 10 | 0.0 SMAD4 p.R361C 11 17.0489311408714 0 5 18 48591918 48591918 C T ENST00000342988 ENST00000398417:p.R361C|ENST00000588745:p.R265C 11 | 0.0 SMAD4 p.R361H 11 17.0489311408714 0 10 18 48591919 48591919 G A ENST00000342988 ENST00000398417:p.R361H|ENST00000588745:p.R265H 12 | 0.0 SMAD4 p.R361P 11 17.0489311408714 0 1 18 48591919 48591919 G C ENST00000342988 
ENST00000398417:p.R361P|ENST00000588745:p.R265P 13 | 0.0 SMAD4 p.R361S 11 17.0489311408714 0 1 18 48591918 48591918 C A ENST00000342988 ENST00000398417:p.R361S|ENST00000588745:p.R265S 14 | 0.0 SMAD4 p.RFCLG361in_frame_del 11 17.0489311408714 0 1 18 48591917 48591931 TCGCTTTTGTTTGGG - ENST00000342988 ENST00000398417:p.RFCLG361in_frame_del|ENST00000588745:p.RFCLG265in_frame_del 15 | 0.0 SMAD4 p.S357P 13 0.269895315912113 6.9605 1 18 48591906 48591906 T C ENST00000342988 ENST00000398417:p.S357P|ENST00000588745:p.S261P 16 | 0.1 SMAD3 p.M255I 5 0.0268327218099855 8.197 1 15 67473685 67473685 G A ENST00000327367 ENST00000439724:p.M211I|ENST00000537194:p.M60I|ENST00000540846:p.M150I|ENST00000558428:p.M60I|ENST00000558827:p.M60I|ENST00000558894:p.M150I 17 | 0.1 SMAD3 p.R268C 7 2.02105036600258 0 1 15 67473722 67473722 C T ENST00000327367 ENST00000439724:p.R224C|ENST00000537194:p.R73C|ENST00000540846:p.R163C|ENST00000558428:p.R73C|ENST00000558827:p.R73C|ENST00000558894:p.R163C 18 | 0.1 SMAD3 p.R268H 7 2.02105036600257 0 2 15 67473723 67473723 G A ENST00000327367 ENST00000439724:p.R224H|ENST00000537194:p.R73H|ENST00000540846:p.R163H|ENST00000558428:p.R73H|ENST00000558827:p.R73H|ENST00000558894:p.R163H 19 | 0.2 SMAD2 p.D300N 10 0.106913746235539 9.951 1 18 45374945 45374945 C T ENST00000262160 ENST00000356825:p.D270N|ENST00000402690:p.D300N|ENST00000586040:p.D270N|ENST00000591214:p.D270N 20 | 0.2 SMAD2 p.D304G 12 0.276541081770023 3.906 1 18 45374932 45374932 T C ENST00000262160 ENST00000356825:p.D274G|ENST00000402690:p.D304G|ENST00000586040:p.D274G|ENST00000591214:p.D274G 21 | 0.2 SMAD2 p.F311L 7 0.0203828060099014 9.696 1 18 45374910 45374910 G C ENST00000262160 ENST00000356825:p.F281L|ENST00000402690:p.F311L|ENST00000586040:p.F281L|ENST00000591214:p.F281L 22 | 0.2 SMAD2 p.L442V 5 0.027918079222433 9.803 1 18 45368278 45368278 G C ENST00000262160 ENST00000356825:p.L412V|ENST00000402690:p.L442V|ENST00000586040:p.L412V 23 | 0.2 SMAD2 p.N307H 6 0.0300070179556278 
7.03833333333333 1 18 45374924 45374924 T G ENST00000262160 ENST00000356825:p.N277H|ENST00000402690:p.N307H|ENST00000586040:p.N277H|ENST00000591214:p.N277H 24 | 0.2 SMAD2 p.P305L 13 1.10848273927962 0 1 18 45374929 45374929 G A ENST00000262160 ENST00000356825:p.P275L|ENST00000402690:p.P305L|ENST00000586040:p.P275L|ENST00000591214:p.P275L 25 | 0.2 SMAD2 p.P305Q 13 1.10848273927962 0 1 18 45374929 45374929 G T ENST00000262160 ENST00000356825:p.P275Q|ENST00000402690:p.P305Q|ENST00000586040:p.P275Q|ENST00000591214:p.P275Q 26 | 0.2 SMAD4 p.C324R 4 0.00547905622818637 9.626 1 18 48591807 48591807 T C ENST00000342988 ENST00000398417:p.C324R|ENST00000588745:p.C228R 27 | 0.2 SMAD4 p.L533R 13 0.153356803127318 6.17 1 18 48604776 48604776 T G ENST00000342988 ENST00000398417:p.L533R|ENST00000588745:p.L437R 28 | 0.2 SMAD4 p.R531Q 9 0.018237392468185 9.525 1 18 48604770 48604770 G A ENST00000342988 ENST00000398417:p.R531Q|ENST00000588745:p.R435Q 29 | 0.3 SMAD4 p.C499Y 4 1.03760965842248 0 2 18 48604674 48604674 G A ENST00000342988 ENST00000398417:p.C499Y|ENST00000588745:p.C403Y 30 | 0.3 SMAD4 p.R496H 5 0.0671885358555349 6.3805 1 18 48604665 48604665 G A ENST00000342988 ENST00000398417:p.R496H|ENST00000588745:p.R400H 31 | 0.4 SMAD2 p.A323V 5 0.0350390737278377 7.457 1 18 45374875 45374875 G A ENST00000262160 ENST00000356825:p.A293V|ENST00000402690:p.A323V|ENST00000586040:p.A293V|ENST00000591214:p.A293V 32 | 0.4 SMAD2 p.M327I 5 0.046603860695402 9.888 1 18 45374862 45374862 C G ENST00000262160 ENST00000356825:p.M297I|ENST00000402690:p.M327I|ENST00000586040:p.M297I|ENST00000591214:p.M297I 33 | 0.4 SMAD2 p.N320H 3 0.0302784844026229 6.36766666666667 1 18 45374885 45374885 T G ENST00000262160 ENST00000356825:p.N290H|ENST00000402690:p.N320H|ENST00000586040:p.N290H|ENST00000591214:p.N290H 34 | 0.4 SMAD2 p.R321Q 6 1.02865466741947 0 2 18 45374881 45374881 C T ENST00000262160 ENST00000356825:p.R291Q|ENST00000402690:p.R321Q|ENST00000586040:p.R291Q|ENST00000591214:p.R291Q 35 | 0.4 SMAD4 
p.D493N 7 0.0593787196817366 6.94 1 18 48604655 48604655 G A ENST00000342988 ENST00000398417:p.D493N|ENST00000588745:p.D397N 36 | 0.5 SMAD2 p.S276L 4 1.00965296440224 0 2 18 45375016 45375016 G A ENST00000262160 ENST00000356825:p.S246L|ENST00000402690:p.S276L|ENST00000586040:p.S246L|ENST00000591214:p.S246L 37 | 1.0 SMAD2 p.F356C 2 0.030434987773602 5.04433333333333 1 18 45372102 45372102 A C ENST00000262160 ENST00000356825:p.F326C|ENST00000402690:p.F356C|ENST00000586040:p.F326C|ENST00000591214:p.F326C 38 | 1.0 SMAD2 p.G421W 3 0.0346165873908751 0 1 18 45371730 45371730 C A ENST00000262160 ENST00000356825:p.G391W|ENST00000402690:p.G421W|ENST00000586040:p.G391W|ENST00000591214:p.G391W 39 | 1.0 SMAD2 p.R427Q 2 0.00444296094935197 7.85733333333333 1 18 45371711 45371711 C T ENST00000262160 ENST00000356825:p.R397Q|ENST00000402690:p.R427Q|ENST00000586040:p.R397Q|ENST00000591214:p.R397Q 40 | 2.0 SMAD3 p.Q23R 2 0.00232911889127077 8.746 1 15 67358560 67358560 A G ENST00000327367 - 41 | 2.0 SMAD3 p.Q26H 2 0.00232911889127077 0 1 15 67358570 67358570 G T ENST00000327367 42 | 3.0 SMAD3 p.L91P 2 0.00297890947901618 8.391 1 15 67457298 67457298 T C ENST00000327367 ENST00000439724:p.L47P|ENST00000559092:p.C73R 43 | 3.0 SMAD3 p.R93Q 2 0.00297890947901618 0 1 15 67457304 67457304 G A ENST00000327367 ENST00000439724:p.R49Q|ENST00000559092:p.D75N 44 | 5.0 SMAD3 p.R420H 3 0.0161101004961813 7.11 1 15 67482855 67482855 G A ENST00000327367 ENST00000439724:p.R376H|ENST00000537194:p.R225H|ENST00000540846:p.R315H|ENST00000560424:p.A151T 45 | 5.0 SMAD3 p.S416F 3 0.0266482498621537 6.321 1 15 67482843 67482843 C T ENST00000327367 ENST00000439724:p.S372F|ENST00000537194:p.S221F|ENST00000540846:p.S311F|ENST00000560424:p.P147S 46 | 5.0 SMAD3 p.S418I 3 1.01974701315002 0 1 15 67482849 67482849 G T ENST00000327367 ENST00000439724:p.S374I|ENST00000537194:p.S223I|ENST00000540846:p.S313I|ENST00000560424:p.A149S 47 | 5.0 SMAD3 p.S418N 3 1.01974701315002 0 1 15 67482849 67482849 G A ENST00000327367 
ENST00000439724:p.S374N|ENST00000537194:p.S223N|ENST00000540846:p.S313N|ENST00000560424:p.A149T 48 | -------------------------------------------------------------------------------- /Demo/Expected/demo.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters.summary: -------------------------------------------------------------------------------- 1 | Cluster_ID Centroid Cluster_Closeness Recurrence_Mass Avg_Centrality Avg_Recurrence Avg_Degree Avg_Geodesic N_Genes N_Vertices N_Genomic_Mutations N_Protein_Mutations N_Protein_Sites N_Protein_Positions N_Drugs Genes Transcripts Genomic_Mutations Protein_Mutations Protein_Sites Protein_Positions Drugs 2 | 0.0 SMAD4:p.RFCLG361in_frame_del 99.129 14 7.081 1.000 13.357 5.181 3 14 14 14 NA 7 NA SMAD2(1),SMAD3(1),SMAD4(12) ENST00000262160(1),ENST00000327367(1),ENST00000342988(12) SMAD2:18:45368254:45368254:C:T,SMAD3:15:67482810:67482810:A:T,SMAD4:18:48591901:48591901:A:G,SMAD4:18:48591904:48591904:C:T,SMAD4:18:48591906:48591906:T:C,SMAD4:18:48591917:48591931:TCGCTTTTGTTTGGG:-,SMAD4:18:48591918:48591918:C:A,SMAD4:18:48591918:48591918:C:T,SMAD4:18:48591919:48591919:G:A,SMAD4:18:48591919:48591919:G:C,SMAD4:18:48604787:48604787:G:T,SMAD4:18:48604788:48604788:A:G,SMAD4:18:48604788:48604788:A:T,SMAD4:18:48604789:48604789:C:A SMAD2:p.D450N,SMAD3:p.Q405L,SMAD4:p.D355G,SMAD4:p.D537E,SMAD4:p.D537G,SMAD4:p.D537V,SMAD4:p.D537Y,SMAD4:p.P356L,SMAD4:p.R361C,SMAD4:p.R361H,SMAD4:p.R361P,SMAD4:p.R361S,SMAD4:p.RFCLG361in_frame_del,SMAD4:p.S357P NA 355,356,357,361,405,450,537 NA 3 | 0.1 SMAD3:p.R268H 4.069 3 1.356 1.000 6.333 2.732 1 3 3 3 NA 2 NA SMAD3(3) ENST00000327367(3) SMAD3:15:67473685:67473685:G:A,SMAD3:15:67473722:67473722:C:T,SMAD3:15:67473723:67473723:G:A SMAD3:p.M255I,SMAD3:p.R268C,SMAD3:p.R268H NA 255,268 NA 4 | 0.2 SMAD2:p.P305Q 2.856 10 0.286 1.000 9.200 6.572 2 10 10 10 NA 9 NA SMAD2(7),SMAD4(3) ENST00000262160(7),ENST00000342988(3) 
SMAD2:18:45368278:45368278:G:C,SMAD2:18:45374910:45374910:G:C,SMAD2:18:45374924:45374924:T:G,SMAD2:18:45374929:45374929:G:A,SMAD2:18:45374929:45374929:G:T,SMAD2:18:45374932:45374932:T:C,SMAD2:18:45374945:45374945:C:T,SMAD4:18:48591807:48591807:T:C,SMAD4:18:48604770:48604770:G:A,SMAD4:18:48604776:48604776:T:G SMAD2:p.D300N,SMAD2:p.D304G,SMAD2:p.F311L,SMAD2:p.L442V,SMAD2:p.N307H,SMAD2:p.P305L,SMAD2:p.P305Q,SMAD4:p.C324R,SMAD4:p.L533R,SMAD4:p.R531Q NA 300,304,305,307,311,324,442,531,533 NA 5 | 0.3 SMAD4:p.C499Y 1.105 2 0.552 1.000 4.500 3.190 1 2 2 2 NA 2 NA SMAD4(2) ENST00000342988(2) SMAD4:18:48604665:48604665:G:A,SMAD4:18:48604674:48604674:G:A SMAD4:p.C499Y,SMAD4:p.R496H NA 496,499 NA 6 | 0.4 SMAD2:p.R321Q 1.200 5 0.240 1.000 5.200 6.131 2 5 5 5 NA 5 NA SMAD2(4),SMAD4(1) ENST00000262160(4),ENST00000342988(1) SMAD2:18:45374862:45374862:C:G,SMAD2:18:45374875:45374875:G:A,SMAD2:18:45374881:45374881:C:T,SMAD2:18:45374885:45374885:T:G,SMAD4:18:48604655:48604655:G:A SMAD2:p.A323V,SMAD2:p.M327I,SMAD2:p.N320H,SMAD2:p.R321Q,SMAD4:p.D493N NA 320,321,323,327,493 NA 7 | 0.5 SMAD2:p.S276L 1.010 1 1.010 1.000 4.000 0.000 1 1 1 1 NA 1 NA SMAD2(1) ENST00000262160(1) SMAD2:18:45375016:45375016:G:A SMAD2:p.S276L NA 276 NA 8 | 1.0 SMAD2:p.G421W 0.069 3 0.023 1.000 2.333 4.301 1 3 3 3 NA 3 NA SMAD2(3) ENST00000262160(3) SMAD2:18:45371711:45371711:C:T,SMAD2:18:45371730:45371730:C:A,SMAD2:18:45372102:45372102:A:C SMAD2:p.F356C,SMAD2:p.G421W,SMAD2:p.R427Q NA 356,421,427 NA 9 | 2.0 SMAD3:p.Q26H 0.005 2 0.002 1.000 2.000 4.373 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67358560:67358560:A:G,SMAD3:15:67358570:67358570:G:T SMAD3:p.Q23R,SMAD3:p.Q26H NA 23,26 NA 10 | 3.0 SMAD3:p.R93Q 0.006 2 0.003 1.000 2.000 4.196 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67457298:67457298:T:C,SMAD3:15:67457304:67457304:G:A SMAD3:p.L91P,SMAD3:p.R93Q NA 91,93 NA 11 | 5.0 SMAD3:p.S418N 2.082 4 0.521 1.000 3.000 3.358 1 4 4 4 NA 3 NA SMAD3(4) ENST00000327367(4) 
SMAD3:15:67482843:67482843:C:T,SMAD3:15:67482849:67482849:G:A,SMAD3:15:67482849:67482849:G:T,SMAD3:15:67482855:67482855:G:A SMAD3:p.R420H,SMAD3:p.S416F,SMAD3:p.S418I,SMAD3:p.S418N NA 416,418,420 NA 12 | -------------------------------------------------------------------------------- /Demo/Expected/demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters: -------------------------------------------------------------------------------- 1 | Cluster Gene/Drug Mutation/Gene Degree_Connectivity Closeness_Centrality Geodesic_From_Centroid Weight Chromosome Start Stop Reference Alternate Transcript Alternative_Transcripts 2 | 0.0 SMAD2 p.D450N 11 0.154637816475446 7.748 1 18 45368254 45368254 C T ENST00000262160 ENST00000356825:p.D420N|ENST00000402690:p.D450N|ENST00000586040:p.D420N 3 | 0.0 SMAD3 p.Q405L 9 0.0723495051092592 9.887 1 15 67482810 67482810 A T ENST00000327367 ENST00000439724:p.Q361L|ENST00000537194:p.Q210L|ENST00000540846:p.Q300L|ENST00000560424:p.S136C 4 | 0.0 SMAD4 p.D355G 14 0.636981210492366 5.6274 1 18 48591901 48591901 A G ENST00000342988 ENST00000398417:p.D355G|ENST00000588745:p.D259G 5 | 0.0 SMAD4 p.D537E 17 3.12450111385116 8.514 1 18 48604789 48604789 C A ENST00000342988 ENST00000398417:p.D537E|ENST00000588745:p.D441E 6 | 0.0 SMAD4 p.D537G 17 3.12450111385116 8.514 1 18 48604788 48604788 A G ENST00000342988 ENST00000398417:p.D537G|ENST00000588745:p.D441G 7 | 0.0 SMAD4 p.D537V 17 3.12450111385116 8.514 1 18 48604788 48604788 A T ENST00000342988 ENST00000398417:p.D537V|ENST00000588745:p.D441V 8 | 0.0 SMAD4 p.D537Y 17 3.12450111385116 8.514 1 18 48604787 48604787 G T ENST00000342988 ENST00000398417:p.D537Y|ENST00000588745:p.D441Y 9 | 0.0 SMAD4 p.P356L 17 0.252038708531035 8.254 1 18 48591904 48591904 C T ENST00000342988 ENST00000398417:p.P356L|ENST00000588745:p.P260L 10 | 0.0 SMAD4 p.R361C 11 17.0489311408714 0 5 18 48591918 48591918 C T ENST00000342988 ENST00000398417:p.R361C|ENST00000588745:p.R265C 11 | 0.0 SMAD4 p.R361H 11 
17.0489311408714 0 10 18 48591919 48591919 G A ENST00000342988 ENST00000398417:p.R361H|ENST00000588745:p.R265H 12 | 0.0 SMAD4 p.R361P 11 17.0489311408714 0 1 18 48591919 48591919 G C ENST00000342988 ENST00000398417:p.R361P|ENST00000588745:p.R265P 13 | 0.0 SMAD4 p.R361S 11 17.0489311408714 0 1 18 48591918 48591918 C A ENST00000342988 ENST00000398417:p.R361S|ENST00000588745:p.R265S 14 | 0.0 SMAD4 p.RFCLG361in_frame_del 11 17.0489311408714 0 1 18 48591917 48591931 TCGCTTTTGTTTGGG - ENST00000342988 ENST00000398417:p.RFCLG361in_frame_del|ENST00000588745:p.RFCLG265in_frame_del 15 | 0.0 SMAD4 p.S357P 13 0.269895315912113 6.9605 1 18 48591906 48591906 T C ENST00000342988 ENST00000398417:p.S357P|ENST00000588745:p.S261P 16 | 0.1 SMAD3 p.M255I 5 0.0268327218099855 8.197 1 15 67473685 67473685 G A ENST00000327367 ENST00000439724:p.M211I|ENST00000537194:p.M60I|ENST00000540846:p.M150I|ENST00000558428:p.M60I|ENST00000558827:p.M60I|ENST00000558894:p.M150I 17 | 0.1 SMAD3 p.R268C 7 2.02105036600258 0 1 15 67473722 67473722 C T ENST00000327367 ENST00000439724:p.R224C|ENST00000537194:p.R73C|ENST00000540846:p.R163C|ENST00000558428:p.R73C|ENST00000558827:p.R73C|ENST00000558894:p.R163C 18 | 0.1 SMAD3 p.R268H 7 2.02105036600257 0 2 15 67473723 67473723 G A ENST00000327367 ENST00000439724:p.R224H|ENST00000537194:p.R73H|ENST00000540846:p.R163H|ENST00000558428:p.R73H|ENST00000558827:p.R73H|ENST00000558894:p.R163H 19 | 0.2 SMAD2 p.D300N 10 0.106913746235539 9.951 1 18 45374945 45374945 C T ENST00000262160 ENST00000356825:p.D270N|ENST00000402690:p.D300N|ENST00000586040:p.D270N|ENST00000591214:p.D270N 20 | 0.2 SMAD2 p.D304G 12 0.276541081770023 3.906 1 18 45374932 45374932 T C ENST00000262160 ENST00000356825:p.D274G|ENST00000402690:p.D304G|ENST00000586040:p.D274G|ENST00000591214:p.D274G 21 | 0.2 SMAD2 p.F311L 7 0.0203828060099013 9.696 1 18 45374910 45374910 G C ENST00000262160 ENST00000356825:p.F281L|ENST00000402690:p.F311L|ENST00000586040:p.F281L|ENST00000591214:p.F281L 22 | 0.2 SMAD2 p.L442V 
5 0.027918079222433 9.803 1 18 45368278 45368278 G C ENST00000262160 ENST00000356825:p.L412V|ENST00000402690:p.L442V|ENST00000586040:p.L412V 23 | 0.2 SMAD2 p.N307H 6 0.0300070179556278 7.03833333333333 1 18 45374924 45374924 T G ENST00000262160 ENST00000356825:p.N277H|ENST00000402690:p.N307H|ENST00000586040:p.N277H|ENST00000591214:p.N277H 24 | 0.2 SMAD2 p.P305L 13 1.10848273927962 0 1 18 45374929 45374929 G A ENST00000262160 ENST00000356825:p.P275L|ENST00000402690:p.P305L|ENST00000586040:p.P275L|ENST00000591214:p.P275L 25 | 0.2 SMAD2 p.P305Q 13 1.10848273927962 0 1 18 45374929 45374929 G T ENST00000262160 ENST00000356825:p.P275Q|ENST00000402690:p.P305Q|ENST00000586040:p.P275Q|ENST00000591214:p.P275Q 26 | 0.2 SMAD4 p.C324R 4 0.00547905622818637 9.626 1 18 48591807 48591807 T C ENST00000342988 ENST00000398417:p.C324R|ENST00000588745:p.C228R 27 | 0.2 SMAD4 p.L533R 13 0.153356803127318 6.17 1 18 48604776 48604776 T G ENST00000342988 ENST00000398417:p.L533R|ENST00000588745:p.L437R 28 | 0.2 SMAD4 p.R531Q 9 0.018237392468185 9.525 1 18 48604770 48604770 G A ENST00000342988 ENST00000398417:p.R531Q|ENST00000588745:p.R435Q 29 | 0.3 SMAD4 p.C499Y 4 1.03760965842248 0 2 18 48604674 48604674 G A ENST00000342988 ENST00000398417:p.C499Y|ENST00000588745:p.C403Y 30 | 0.3 SMAD4 p.R496H 5 0.0671885358555349 6.3805 1 18 48604665 48604665 G A ENST00000342988 ENST00000398417:p.R496H|ENST00000588745:p.R400H 31 | 0.4 SMAD2 p.A323V 5 0.0350390737278377 7.457 1 18 45374875 45374875 G A ENST00000262160 ENST00000356825:p.A293V|ENST00000402690:p.A323V|ENST00000586040:p.A293V|ENST00000591214:p.A293V 32 | 0.4 SMAD2 p.M327I 5 0.046603860695402 9.888 1 18 45374862 45374862 C G ENST00000262160 ENST00000356825:p.M297I|ENST00000402690:p.M327I|ENST00000586040:p.M297I|ENST00000591214:p.M297I 33 | 0.4 SMAD2 p.N320H 3 0.0302784844026229 6.36766666666667 1 18 45374885 45374885 T G ENST00000262160 ENST00000356825:p.N290H|ENST00000402690:p.N320H|ENST00000586040:p.N290H|ENST00000591214:p.N290H 34 | 0.4 SMAD2 
p.R321Q 6 1.02865466741947 0 2 18 45374881 45374881 C T ENST00000262160 ENST00000356825:p.R291Q|ENST00000402690:p.R321Q|ENST00000586040:p.R291Q|ENST00000591214:p.R291Q 35 | 0.4 SMAD4 p.D493N 7 0.0593787196817366 6.94 1 18 48604655 48604655 G A ENST00000342988 ENST00000398417:p.D493N|ENST00000588745:p.D397N 36 | 0.5 SMAD2 p.S276L 4 1.00965296440224 0 2 18 45375016 45375016 G A ENST00000262160 ENST00000356825:p.S246L|ENST00000402690:p.S276L|ENST00000586040:p.S246L|ENST00000591214:p.S246L 37 | 1.0 SMAD2 p.F356C 2 0.030434987773602 5.04433333333333 1 18 45372102 45372102 A C ENST00000262160 ENST00000356825:p.F326C|ENST00000402690:p.F356C|ENST00000586040:p.F326C|ENST00000591214:p.F326C 38 | 1.0 SMAD2 p.G421W 3 0.0346165873908751 0 1 18 45371730 45371730 C A ENST00000262160 ENST00000356825:p.G391W|ENST00000402690:p.G421W|ENST00000586040:p.G391W|ENST00000591214:p.G391W 39 | 1.0 SMAD2 p.R427Q 2 0.00444296094935197 7.85733333333333 1 18 45371711 45371711 C T ENST00000262160 ENST00000356825:p.R397Q|ENST00000402690:p.R427Q|ENST00000586040:p.R397Q|ENST00000591214:p.R397Q 40 | 2.0 SMAD3 p.Q23R 2 0.00232911889127077 8.746 1 15 67358560 67358560 A G ENST00000327367 - 41 | 2.0 SMAD3 p.Q26H 2 0.00232911889127077 0 1 15 67358570 67358570 G T ENST00000327367 42 | 3.0 SMAD3 p.L91P 2 0.00297890947901618 8.391 1 15 67457298 67457298 T C ENST00000327367 ENST00000439724:p.L47P|ENST00000559092:p.C73R 43 | 3.0 SMAD3 p.R93Q 2 0.00297890947901618 0 1 15 67457304 67457304 G A ENST00000327367 ENST00000439724:p.R49Q|ENST00000559092:p.D75N 44 | 5.0 SMAD3 p.R420H 3 0.0161101004961813 7.11 1 15 67482855 67482855 G A ENST00000327367 ENST00000439724:p.R376H|ENST00000537194:p.R225H|ENST00000540846:p.R315H|ENST00000560424:p.A151T 45 | 5.0 SMAD3 p.S416F 3 0.0266482498621537 6.321 1 15 67482843 67482843 C T ENST00000327367 ENST00000439724:p.S372F|ENST00000537194:p.S221F|ENST00000540846:p.S311F|ENST00000560424:p.P147S 46 | 5.0 SMAD3 p.S418I 3 1.01974701315002 0 1 15 67482849 67482849 G T ENST00000327367 
ENST00000439724:p.S374I|ENST00000537194:p.S223I|ENST00000540846:p.S313I|ENST00000560424:p.A149S 47 | 5.0 SMAD3 p.S418N 3 1.01974701315002 0 1 15 67482849 67482849 G A ENST00000327367 ENST00000439724:p.S374N|ENST00000537194:p.S223N|ENST00000540846:p.S313N|ENST00000560424:p.A149T 48 | -------------------------------------------------------------------------------- /Demo/Expected/demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.recurrence.clusters.summary: -------------------------------------------------------------------------------- 1 | Cluster_ID Centroid Cluster_Closeness Recurrence_Mass Avg_Centrality Avg_Recurrence Avg_Degree Avg_Geodesic N_Genes N_Vertices N_Genomic_Mutations N_Protein_Mutations N_Protein_Sites N_Protein_Positions N_Drugs Genes Transcripts Genomic_Mutations Protein_Mutations Protein_Sites Protein_Positions Drugs 2 | 0.0 SMAD4:p.RFCLG361in_frame_del 99.129 14 7.081 1.000 13.357 5.181 3 14 14 14 NA 7 NA SMAD2(1),SMAD3(1),SMAD4(12) ENST00000262160(1),ENST00000327367(1),ENST00000342988(12) SMAD2:18:45368254:45368254:C:T,SMAD3:15:67482810:67482810:A:T,SMAD4:18:48591901:48591901:A:G,SMAD4:18:48591904:48591904:C:T,SMAD4:18:48591906:48591906:T:C,SMAD4:18:48591917:48591931:TCGCTTTTGTTTGGG:-,SMAD4:18:48591918:48591918:C:A,SMAD4:18:48591918:48591918:C:T,SMAD4:18:48591919:48591919:G:A,SMAD4:18:48591919:48591919:G:C,SMAD4:18:48604787:48604787:G:T,SMAD4:18:48604788:48604788:A:G,SMAD4:18:48604788:48604788:A:T,SMAD4:18:48604789:48604789:C:A SMAD2:p.D450N,SMAD3:p.Q405L,SMAD4:p.D355G,SMAD4:p.D537E,SMAD4:p.D537G,SMAD4:p.D537V,SMAD4:p.D537Y,SMAD4:p.P356L,SMAD4:p.R361C,SMAD4:p.R361H,SMAD4:p.R361P,SMAD4:p.R361S,SMAD4:p.RFCLG361in_frame_del,SMAD4:p.S357P NA 355,356,357,361,405,450,537 NA 3 | 0.1 SMAD3:p.R268H 4.069 3 1.356 1.000 6.333 2.732 1 3 3 3 NA 2 NA SMAD3(3) ENST00000327367(3) SMAD3:15:67473685:67473685:G:A,SMAD3:15:67473722:67473722:C:T,SMAD3:15:67473723:67473723:G:A SMAD3:p.M255I,SMAD3:p.R268C,SMAD3:p.R268H NA 255,268 NA 4 | 0.2 SMAD2:p.P305Q 2.856 10 0.286 
1.000 9.200 6.572 2 10 10 10 NA 9 NA SMAD2(7),SMAD4(3) ENST00000262160(7),ENST00000342988(3) SMAD2:18:45368278:45368278:G:C,SMAD2:18:45374910:45374910:G:C,SMAD2:18:45374924:45374924:T:G,SMAD2:18:45374929:45374929:G:A,SMAD2:18:45374929:45374929:G:T,SMAD2:18:45374932:45374932:T:C,SMAD2:18:45374945:45374945:C:T,SMAD4:18:48591807:48591807:T:C,SMAD4:18:48604770:48604770:G:A,SMAD4:18:48604776:48604776:T:G SMAD2:p.D300N,SMAD2:p.D304G,SMAD2:p.F311L,SMAD2:p.L442V,SMAD2:p.N307H,SMAD2:p.P305L,SMAD2:p.P305Q,SMAD4:p.C324R,SMAD4:p.L533R,SMAD4:p.R531Q NA 300,304,305,307,311,324,442,531,533 NA 5 | 0.3 SMAD4:p.C499Y 1.105 2 0.552 1.000 4.500 3.190 1 2 2 2 NA 2 NA SMAD4(2) ENST00000342988(2) SMAD4:18:48604665:48604665:G:A,SMAD4:18:48604674:48604674:G:A SMAD4:p.C499Y,SMAD4:p.R496H NA 496,499 NA 6 | 0.4 SMAD2:p.R321Q 1.200 5 0.240 1.000 5.200 6.131 2 5 5 5 NA 5 NA SMAD2(4),SMAD4(1) ENST00000262160(4),ENST00000342988(1) SMAD2:18:45374862:45374862:C:G,SMAD2:18:45374875:45374875:G:A,SMAD2:18:45374881:45374881:C:T,SMAD2:18:45374885:45374885:T:G,SMAD4:18:48604655:48604655:G:A SMAD2:p.A323V,SMAD2:p.M327I,SMAD2:p.N320H,SMAD2:p.R321Q,SMAD4:p.D493N NA 320,321,323,327,493 NA 7 | 0.5 SMAD2:p.S276L 1.010 1 1.010 1.000 4.000 0.000 1 1 1 1 NA 1 NA SMAD2(1) ENST00000262160(1) SMAD2:18:45375016:45375016:G:A SMAD2:p.S276L NA 276 NA 8 | 1.0 SMAD2:p.G421W 0.069 3 0.023 1.000 2.333 4.301 1 3 3 3 NA 3 NA SMAD2(3) ENST00000262160(3) SMAD2:18:45371711:45371711:C:T,SMAD2:18:45371730:45371730:C:A,SMAD2:18:45372102:45372102:A:C SMAD2:p.F356C,SMAD2:p.G421W,SMAD2:p.R427Q NA 356,421,427 NA 9 | 2.0 SMAD3:p.Q26H 0.005 2 0.002 1.000 2.000 4.373 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67358560:67358560:A:G,SMAD3:15:67358570:67358570:G:T SMAD3:p.Q23R,SMAD3:p.Q26H NA 23,26 NA 10 | 3.0 SMAD3:p.R93Q 0.006 2 0.003 1.000 2.000 4.196 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67457298:67457298:T:C,SMAD3:15:67457304:67457304:G:A SMAD3:p.L91P,SMAD3:p.R93Q NA 91,93 NA 11 | 5.0 SMAD3:p.S418N 2.082 4 0.521 
1.000 3.000 3.358 1 4 4 4 NA 3 NA SMAD3(4) ENST00000327367(4) SMAD3:15:67482843:67482843:C:T,SMAD3:15:67482849:67482849:G:A,SMAD3:15:67482849:67482849:G:T,SMAD3:15:67482855:67482855:G:A SMAD3:p.R420H,SMAD3:p.S416F,SMAD3:p.S418I,SMAD3:p.S418N NA 416,418,420 NA 12 | -------------------------------------------------------------------------------- /Demo/Expected/demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.weight.clusters: -------------------------------------------------------------------------------- 1 | Cluster Gene/Drug Mutation/Gene Degree_Connectivity Closeness_Centrality Geodesic_From_Centroid Weight Chromosome Start Stop Reference Alternate Transcript Alternative_Transcripts 2 | 0.0 SMAD3 p.G379A 5 -0.093449203974911 9.105 0 15 67479829 67479829 G C ENST00000327367 ENST00000439724:p.G335A|ENST00000537194:p.G184A|ENST00000540846:p.G274A|ENST00000560424:p.A110P 3 | 0.0 SMAD3 p.P393L 4 -29.006837038473 0 -17 15 67482774 67482774 C T ENST00000327367 ENST00000439724:p.P349L|ENST00000537194:p.P198L|ENST00000540846:p.P288L|ENST00000560424:p.P124S 4 | 0.0 SMAD3 p.P393T 4 -29.006837038473 0 -12 15 67482773 67482773 C A ENST00000327367 ENST00000439724:p.P349T|ENST00000537194:p.P198T|ENST00000540846:p.P288T 5 | 0.0 SMAD3 p.Y237C 5 -4.69424586625339 9.426 -4 15 67473630 67473630 A G ENST00000327367 ENST00000439724:p.Y193C|ENST00000537194:p.Y42C|ENST00000540846:p.Y132C|ENST00000558428:p.Y42C|ENST00000558827:p.Y42C|ENST00000558894:p.Y132C 6 | 0.1 SMAD2 p.D300N 10 -1.4584423263982 8.973 -2 18 45374945 45374945 C T ENST00000262160 ENST00000356825:p.D270N|ENST00000402690:p.D300N|ENST00000586040:p.D270N|ENST00000591214:p.D270N 7 | 0.1 SMAD2 p.D304G 12 -19.9594030663692 5.579 -19 18 45374932 45374932 T C ENST00000262160 ENST00000356825:p.D274G|ENST00000402690:p.D304G|ENST00000586040:p.D274G|ENST00000591214:p.D274G 8 | 0.1 SMAD2 p.G301V 10 4.01422232082202 8.554 4 18 45374941 45374941 C A ENST00000262160 
ENST00000356825:p.G271V|ENST00000402690:p.G301V|ENST00000586040:p.G271V|ENST00000591214:p.G271V 9 | 0.1 SMAD2 p.P305L 13 -24.2438592968616 8.709 -10 18 45374929 45374929 G A ENST00000262160 ENST00000356825:p.P275L|ENST00000402690:p.P305L|ENST00000586040:p.P275L|ENST00000591214:p.P275L 10 | 0.1 SMAD2 p.P305Q 13 -24.2438592968616 8.709 -13 18 45374929 45374929 G T ENST00000262160 ENST00000356825:p.P275Q|ENST00000402690:p.P305Q|ENST00000586040:p.P275Q|ENST00000591214:p.P275Q 11 | 0.1 SMAD3 p.R268C 7 0.076512514574411 7.954 -5 15 67473722 67473722 C T ENST00000327367 ENST00000439724:p.R224C|ENST00000537194:p.R73C|ENST00000540846:p.R163C|ENST00000558428:p.R73C|ENST00000558827:p.R73C|ENST00000558894:p.R163C 12 | 0.1 SMAD3 p.R268H 7 0.0765125145744113 7.954 5 15 67473723 67473723 G A ENST00000327367 ENST00000439724:p.R224H|ENST00000537194:p.R73H|ENST00000540846:p.R163H|ENST00000558428:p.R73H|ENST00000558827:p.R73H|ENST00000558894:p.R163H 13 | 0.1 SMAD4 p.D355G 14 -4.32615668566781 5.665 -4 18 48591901 48591901 A G ENST00000342988 ENST00000398417:p.D355G|ENST00000588745:p.D259G 14 | 0.1 SMAD4 p.D537E 17 27.4810989460798 0 10 18 48604789 48604789 C A ENST00000342988 ENST00000398417:p.D537E|ENST00000588745:p.D441E 15 | 0.1 SMAD4 p.D537G 17 27.4810989460798 0 10 18 48604788 48604788 A G ENST00000342988 ENST00000398417:p.D537G|ENST00000588745:p.D441G 16 | 0.1 SMAD4 p.D537V 17 27.4810989460798 0 -6 18 48604788 48604788 A T ENST00000342988 ENST00000398417:p.D537V|ENST00000588745:p.D441V 17 | 0.1 SMAD4 p.D537Y 17 27.4810989460798 0 14 18 48604787 48604787 G T ENST00000342988 ENST00000398417:p.D537Y|ENST00000588745:p.D441Y 18 | 0.1 SMAD4 p.L533R 13 2.57453416754925 7.155 3 18 48604776 48604776 T G ENST00000342988 ENST00000398417:p.L533R|ENST00000588745:p.L437R 19 | 0.1 SMAD4 p.P356L 17 -17.6767348288632 9.057 -18 18 48591904 48591904 C T ENST00000342988 ENST00000398417:p.P356L|ENST00000588745:p.P260L 20 | 0.1 SMAD4 p.R361C 11 3.96150400959899 8.514 0 18 48591918 48591918 C T 
ENST00000342988 ENST00000398417:p.R361C|ENST00000588745:p.R265C 21 | 0.1 SMAD4 p.R361H 11 3.96150400959899 8.514 8 18 48591919 48591919 G A ENST00000342988 ENST00000398417:p.R361H|ENST00000588745:p.R265H 22 | 0.1 SMAD4 p.R361P 11 3.96150400959899 8.514 -1 18 48591919 48591919 G C ENST00000342988 ENST00000398417:p.R361P|ENST00000588745:p.R265P 23 | 0.1 SMAD4 p.R361S 11 3.96150400959899 8.514 -7 18 48591918 48591918 C A ENST00000342988 ENST00000398417:p.R361S|ENST00000588745:p.R265S 24 | 0.1 SMAD4 p.R531Q 9 1.99952388759441 9.66 2 18 48604770 48604770 G A ENST00000342988 ENST00000398417:p.R531Q|ENST00000588745:p.R435Q 25 | 0.1 SMAD4 p.RFCLG361in_frame_del 11 3.96150400959899 8.514 4 18 48591917 48591931 TCGCTTTTGTTTGGG - ENST00000342988 ENST00000398417:p.RFCLG361in_frame_del|ENST00000588745:p.RFCLG265in_frame_del 26 | 0.1 SMAD4 p.S357P 13 3.74871607737679 9.883 5 18 48591906 48591906 T C ENST00000342988 ENST00000398417:p.S357P|ENST00000588745:p.S261P 27 | 0.2 SMAD4 p.A327V 3 -2.86883805968699 7.319 -3 18 48591817 48591817 C T ENST00000342988 ENST00000398417:p.A327V|ENST00000588745:p.A231V 28 | 0.2 SMAD4 p.K519N 3 -4.81897324516468 7.3605 -5 18 48604735 48604735 A C ENST00000342988 ENST00000398417:p.K519N|ENST00000588745:p.K423N 29 | 0.2 SMAD4 p.S504R 4 14.262472584392 6.4724 14 18 48604690 48604690 T A ENST00000342988 ENST00000398417:p.S504R|ENST00000588745:p.S408R 30 | 0.2 SMAD4 p.W524C 4 18.1096730359827 0 18 18 48604750 48604750 G T ENST00000342988 ENST00000398417:p.W524C|ENST00000588745:p.W428C 31 | 0.3 SMAD2 p.A323V 5 -18.0859675784086 0 -18 18 45374875 45374875 G A ENST00000262160 ENST00000356825:p.A293V|ENST00000402690:p.A323V|ENST00000586040:p.A293V|ENST00000591214:p.A293V 32 | 0.3 SMAD2 p.M327I 5 -8.05655085357411 5.57633333333333 -8 18 45374862 45374862 C G ENST00000262160 ENST00000356825:p.M297I|ENST00000402690:p.M327I|ENST00000586040:p.M297I|ENST00000591214:p.M297I 33 | 0.3 SMAD2 p.R321Q 6 11.0324537054882 7.457 11 18 45374881 45374881 C T ENST00000262160 
ENST00000356825:p.R291Q|ENST00000402690:p.R321Q|ENST00000586040:p.R291Q|ENST00000591214:p.R291Q 34 | 0.3 SMAD2 p.R330M 6 13.7573388712351 9.49933333333333 14 18 45374854 45374854 C A ENST00000262160 ENST00000356825:p.R300M|ENST00000402690:p.R330M|ENST00000586040:p.R300M|ENST00000591214:p.R300M 35 | 0.3 SMAD4 p.D493N 7 -0.755603193706578 9.707 -1 18 48604655 48604655 G A ENST00000342988 ENST00000398417:p.D493N|ENST00000588745:p.D397N 36 | 0.4 SMAD2 p.Q407E 3 -17.9810954982243 0 -18 18 45371772 45371772 G C ENST00000262160 ENST00000356825:p.Q377E|ENST00000402690:p.Q407E|ENST00000586040:p.Q377E|ENST00000591214:p.Q377E 37 | 0.4 SMAD2 p.S397F 3 -8.36731789453709 8.963 -8 18 45371801 45371801 G A ENST00000262160 ENST00000356825:p.S367F|ENST00000402690:p.S397F|ENST00000586040:p.S367F|ENST00000591214:p.S367F 38 | 0.5 SMAD3 p.R243C 4 -15.0186324888757 8.761 -15 15 67473647 67473647 C T ENST00000327367 ENST00000439724:p.R199C|ENST00000537194:p.R48C|ENST00000540846:p.R138C|ENST00000558428:p.R48C|ENST00000558827:p.R48C|ENST00000558894:p.R138C 39 | 0.5 SMAD4 p.E337K 3 17.9624534834047 0 18 18 48591846 48591846 G A ENST00000342988 ENST00000398417:p.E337K|ENST00000588745:p.E241K 40 | 0.6 SMAD3 p.A329S 4 17.9024483857515 0 18 15 67477178 67477178 G T ENST00000327367 ENST00000439724:p.A285S|ENST00000537194:p.A134S|ENST00000540846:p.A224S 41 | 0.6 SMAD3 p.A382V 4 16.9783824511904 8.538 17 15 67479838 67479838 C T ENST00000327367 ENST00000439724:p.A338V|ENST00000537194:p.A187V|ENST00000540846:p.A277V|ENST00000560424:p.R113W 42 | 0.6 SMAD3 p.C320Y 6 11.5719155087514 9.647 11 15 67477152 67477152 G A ENST00000327367 ENST00000439724:p.C276Y|ENST00000537194:p.C125Y|ENST00000540846:p.C215Y|ENST00000560424:p.V14I 43 | 0.6 SMAD3 p.V331I 5 -13.6983863253275 6.472 -14 15 67477184 67477184 G A ENST00000327367 ENST00000439724:p.V287I|ENST00000537194:p.V136I|ENST00000540846:p.V226I 44 | 0.7 SMAD2 p.L442V 5 -17.8996163158435 0 -18 18 45368278 45368278 G C ENST00000262160 
ENST00000356825:p.L412V|ENST00000402690:p.L442V|ENST00000586040:p.L412V 45 | 0.7 SMAD2 p.L446V 7 13.2613332563646 6.46166666666667 14 18 45368266 45368266 G C ENST00000262160 ENST00000356825:p.L416V|ENST00000402690:p.L446V|ENST00000586040:p.L416V 46 | 0.7 SMAD2 p.S276L 4 -1.1357728088482 7.329 -1 18 45375016 45375016 G A ENST00000262160 ENST00000356825:p.S246L|ENST00000402690:p.S276L|ENST00000586040:p.S246L|ENST00000591214:p.S246L 47 | 0.8 SMAD4 p.A406T 3 17.8646990701873 0 18 18 48593465 48593465 G A ENST00000342988 ENST00000398417:p.A406T|ENST00000588745:p.A310T 48 | 0.8 SMAD4 p.K428T 3 -12.7145703421721 6.0625 -13 18 48593532 48593532 A C ENST00000342988 ENST00000398417:p.K428T|ENST00000588745:p.K332T|ENST00000593223:p.K17T 49 | 0.9 SMAD2 p.G335E 3 13.9856371439189 0 14 18 45372165 45372165 C T ENST00000262160 ENST00000356825:p.G305E|ENST00000402690:p.G335E|ENST00000586040:p.G305E|ENST00000591214:p.G305E 50 | 1.0 SMAD2 p.F356C 2 -6.48365065317708 5.04433333333333 -7 18 45372102 45372102 A C ENST00000262160 ENST00000356825:p.F326C|ENST00000402690:p.F356C|ENST00000586040:p.F326C|ENST00000591214:p.F326C 51 | 1.0 SMAD2 p.G421W 3 16.8266803727969 0 17 18 45371730 45371730 C A ENST00000262160 ENST00000356825:p.G391W|ENST00000402690:p.G421W|ENST00000586040:p.G391W|ENST00000591214:p.G391W 52 | 1.0 SMAD2 p.R427Q 2 9.07239400015404 7.85733333333333 9 18 45371711 45371711 C T ENST00000262160 ENST00000356825:p.R397Q|ENST00000402690:p.R427Q|ENST00000586040:p.R397Q|ENST00000591214:p.R397Q 53 | 2.0 SMAD3 p.Q23R 2 -8 0 -8 15 67358560 67358560 A G ENST00000327367 54 | 2.0 SMAD3 p.Q26H 2 -0.0186329511301661 8.746 0 15 67358570 67358570 G T ENST00000327367 - 55 | 3.0 SMAD3 p.L91P 2 -4.97021090520984 8.391 -5 15 67457298 67457298 T C ENST00000327367 ENST00000439724:p.L47P|ENST00000559092:p.C73R 56 | 3.0 SMAD3 p.R93Q 2 9.98510545260492 0 10 15 67457304 67457304 G A ENST00000327367 ENST00000439724:p.R49Q|ENST00000559092:p.D75N 57 | 5.0 SMAD3 p.R420H 3 -15.1562045188744 7.11 -15 15 
67482855 67482855 G A ENST00000327367 ENST00000439724:p.R376H|ENST00000537194:p.R225H|ENST00000540846:p.R315H|ENST00000560424:p.A151T 58 | 5.0 SMAD3 p.S416F 3 -7.27464330876726 6.321 -7 15 67482843 67482843 C T ENST00000327367 ENST00000439724:p.S372F|ENST00000537194:p.S221F|ENST00000540846:p.S311F|ENST00000560424:p.P147S 59 | 5.0 SMAD3 p.S418I 3 -20.1961408459183 0 -1 15 67482849 67482849 G T ENST00000327367 ENST00000439724:p.S374I|ENST00000537194:p.S223I|ENST00000540846:p.S313I|ENST00000560424:p.A149S 60 | 5.0 SMAD3 p.S418N 3 -20.1961408459183 0 -19 15 67482849 67482849 G A ENST00000327367 ENST00000439724:p.S374N|ENST00000537194:p.S223N|ENST00000540846:p.S313N|ENST00000560424:p.A149T 61 | -------------------------------------------------------------------------------- /Demo/Expected/demo.weighted.l0.ad10.r10.mericUns.strInd.subInd.weight.clusters.summary: -------------------------------------------------------------------------------- 1 | Cluster_ID Centroid Cluster_Closeness Recurrence_Mass Avg_Centrality Avg_Recurrence Avg_Degree Avg_Geodesic N_Genes N_Vertices N_Genomic_Mutations N_Protein_Mutations N_Protein_Sites N_Protein_Positions N_Drugs Genes Transcripts Genomic_Mutations Protein_Mutations Protein_Sites Protein_Positions Drugs 2 | 0.0 SMAD3:p.P393T -62.801 4 -15.700 1.000 4.500 4.633 1 4 4 4 NA 3 NA SMAD3(4) ENST00000327367(4) SMAD3:15:67473630:67473630:A:G,SMAD3:15:67479829:67479829:G:C,SMAD3:15:67482773:67482773:C:A,SMAD3:15:67482774:67482774:C:T SMAD3:p.G379A,SMAD3:p.P393L,SMAD3:p.P393T,SMAD3:p.Y237C NA 237,379,393 NA 3 | 0.1 SMAD4:p.D537Y 50.313 21 2.396 1.000 12.429 6.687 3 21 21 21 NA 12 NA SMAD2(5),SMAD3(2),SMAD4(14) ENST00000262160(5),ENST00000327367(2),ENST00000342988(14) 
SMAD2:18:45374929:45374929:G:A,SMAD2:18:45374929:45374929:G:T,SMAD2:18:45374932:45374932:T:C,SMAD2:18:45374941:45374941:C:A,SMAD2:18:45374945:45374945:C:T,SMAD3:15:67473722:67473722:C:T,SMAD3:15:67473723:67473723:G:A,SMAD4:18:48591901:48591901:A:G,SMAD4:18:48591904:48591904:C:T,SMAD4:18:48591906:48591906:T:C,SMAD4:18:48591917:48591931:TCGCTTTTGTTTGGG:-,SMAD4:18:48591918:48591918:C:A,SMAD4:18:48591918:48591918:C:T,SMAD4:18:48591919:48591919:G:A,SMAD4:18:48591919:48591919:G:C,SMAD4:18:48604770:48604770:G:A,SMAD4:18:48604776:48604776:T:G,SMAD4:18:48604787:48604787:G:T,SMAD4:18:48604788:48604788:A:G,SMAD4:18:48604788:48604788:A:T,SMAD4:18:48604789:48604789:C:A SMAD2:p.D300N,SMAD2:p.D304G,SMAD2:p.G301V,SMAD2:p.P305L,SMAD2:p.P305Q,SMAD3:p.R268C,SMAD3:p.R268H,SMAD4:p.D355G,SMAD4:p.D537E,SMAD4:p.D537G,SMAD4:p.D537V,SMAD4:p.D537Y,SMAD4:p.L533R,SMAD4:p.P356L,SMAD4:p.R361C,SMAD4:p.R361H,SMAD4:p.R361P,SMAD4:p.R361S,SMAD4:p.R531Q,SMAD4:p.RFCLG361in_frame_del,SMAD4:p.S357P NA 268,300,301,304,305,355,356,357,361,531,533,537 NA 4 | 0.2 SMAD4:p.W524C 24.684 4 6.171 1.000 3.500 5.288 1 4 4 4 NA 4 NA SMAD4(4) ENST00000342988(4) SMAD4:18:48591817:48591817:C:T,SMAD4:18:48604690:48604690:T:A,SMAD4:18:48604735:48604735:A:C,SMAD4:18:48604750:48604750:G:T SMAD4:p.A327V,SMAD4:p.K519N,SMAD4:p.S504R,SMAD4:p.W524C NA 327,504,519,524 NA 5 | 0.3 SMAD2:p.A323V -2.108 5 -0.422 1.000 5.800 6.448 2 5 5 5 NA 5 NA SMAD2(4),SMAD4(1) ENST00000262160(4),ENST00000342988(1) SMAD2:18:45374854:45374854:C:A,SMAD2:18:45374862:45374862:C:G,SMAD2:18:45374875:45374875:G:A,SMAD2:18:45374881:45374881:C:T,SMAD4:18:48604655:48604655:G:A SMAD2:p.A323V,SMAD2:p.M327I,SMAD2:p.R321Q,SMAD2:p.R330M,SMAD4:p.D493N NA 321,323,327,330,493 NA 6 | 0.4 SMAD2:p.Q407E -26.348 2 -13.174 1.000 3.000 4.481 1 2 2 2 NA 2 NA SMAD2(2) ENST00000262160(2) SMAD2:18:45371772:45371772:G:C,SMAD2:18:45371801:45371801:G:A SMAD2:p.Q407E,SMAD2:p.S397F NA 397,407 NA 7 | 0.5 SMAD4:p.E337K 2.944 2 1.472 1.000 3.500 4.380 2 2 2 2 NA 2 NA 
SMAD3(1),SMAD4(1) ENST00000327367(1),ENST00000342988(1) SMAD3:15:67473647:67473647:C:T,SMAD4:18:48591846:48591846:G:A SMAD3:p.R243C,SMAD4:p.E337K NA 243,337 NA 8 | 0.6 SMAD3:p.A329S 32.754 4 8.189 1.000 4.750 6.164 1 4 4 4 NA 4 NA SMAD3(4) ENST00000327367(4) SMAD3:15:67477152:67477152:G:A,SMAD3:15:67477178:67477178:G:T,SMAD3:15:67477184:67477184:G:A,SMAD3:15:67479838:67479838:C:T SMAD3:p.A329S,SMAD3:p.A382V,SMAD3:p.C320Y,SMAD3:p.V331I NA 320,329,331,382 NA 9 | 0.7 SMAD2:p.L442V -5.774 3 -1.925 1.000 5.333 4.597 1 3 3 3 NA 3 NA SMAD2(3) ENST00000262160(3) SMAD2:18:45368266:45368266:G:C,SMAD2:18:45368278:45368278:G:C,SMAD2:18:45375016:45375016:G:A SMAD2:p.L442V,SMAD2:p.L446V,SMAD2:p.S276L NA 276,442,446 NA 10 | 0.8 SMAD4:p.A406T 5.150 2 2.575 1.000 3.000 3.031 1 2 2 2 NA 2 NA SMAD4(2) ENST00000342988(2) SMAD4:18:48593465:48593465:G:A,SMAD4:18:48593532:48593532:A:C SMAD4:p.A406T,SMAD4:p.K428T NA 406,428 NA 11 | 0.9 SMAD2:p.G335E 13.986 1 13.986 1.000 3.000 0.000 1 1 1 1 NA 1 NA SMAD2(1) ENST00000262160(1) SMAD2:18:45372165:45372165:C:T SMAD2:p.G335E NA 335 NA 12 | 1.0 SMAD2:p.G421W 19.415 3 6.472 1.000 2.333 4.301 1 3 3 3 NA 3 NA SMAD2(3) ENST00000262160(3) SMAD2:18:45371711:45371711:C:T,SMAD2:18:45371730:45371730:C:A,SMAD2:18:45372102:45372102:A:C SMAD2:p.F356C,SMAD2:p.G421W,SMAD2:p.R427Q NA 356,421,427 NA 13 | 2.0 SMAD3:p.Q23R -8.019 2 -4.009 1.000 2.000 4.373 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67358560:67358560:A:G,SMAD3:15:67358570:67358570:G:T SMAD3:p.Q23R,SMAD3:p.Q26H NA 23,26 NA 14 | 3.0 SMAD3:p.R93Q 5.015 2 2.507 1.000 2.000 4.196 1 2 2 2 NA 2 NA SMAD3(2) ENST00000327367(2) SMAD3:15:67457298:67457298:T:C,SMAD3:15:67457304:67457304:G:A SMAD3:p.L91P,SMAD3:p.R93Q NA 91,93 NA 15 | 5.0 SMAD3:p.S418N -62.823 4 -15.706 1.000 3.000 3.358 1 4 4 4 NA 3 NA SMAD3(4) ENST00000327367(4) SMAD3:15:67482843:67482843:C:T,SMAD3:15:67482849:67482849:G:A,SMAD3:15:67482849:67482849:G:T,SMAD3:15:67482855:67482855:G:A 
SMAD3:p.R420H,SMAD3:p.S416F,SMAD3:p.S418I,SMAD3:p.S418N NA 416,418,420 NA 16 | -------------------------------------------------------------------------------- /Demo/README_demo: -------------------------------------------------------------------------------- 1 | ##### HotSpot3D Demo ############################## 2 | # created by: Adam D. Scott (adamscott@wustl.edu) # 3 | # date: 2016*08*11 # 4 | # last-update: 2017*01*24 # 5 | ################################################### 6 | 7 | # The goal of this demo is to familiarize the user with HotSpot3D. 8 | # There are two main parts to HotSpot3D, preprocessing and analysis. 9 | # There are many options not highlighted here, because I want to show how HotSpot3D can be run with defaults. 10 | # I have included somatic mutations from SMAD2, SMAD3, and SMAD4 derived from TCGA. 11 | # The included .maf file is really a subset of a full .maf file. 12 | # This is because the columns in the demo.maf are the minimum necessary columns needed to work with HotSpot3D. 13 | # For each step I have included standard out (1) and error (2) print outs, but these are optional. 14 | # In total, this demo can be run within an hour for most systems with LSF server capability. 15 | # Without the LSF server, you may try working with mutations from just one gene: 16 | # head -n1 demo.maf > small.demo.maf; grep SMAD2 demo.maf >> small.demo.maf 17 | # In addition without the LSF server, you will need to run calpro instead of uppro (see below). 18 | # Not included in preprocessing is the drugport step, but this can be run separately in the preprocessing stage. 19 | # Not included in analysis is the sigclus step due to very lengthy runtime. 20 | # HotSpot3D sigclus is under active development and should be efficient and available soon. 21 | # For each step, specific help information can be found by entering, 'hotspot3d '. 
22 | 23 | ################ 24 | # BEFORE RUNNING 25 | ##### 26 | mkdir -p prep/cosmic pdb 27 | # The pdb directory can hold .pdb files, but downloading them is not necessary for HotSpot3D. 28 | # When necessary, HotSpot3D will retrieve structures from RCSB.org without downloading them to your disk. 29 | 30 | # See the HotSpot3D README.md for details on the COSMIC download. 31 | # Put COSMIC annotation download into prep/cosmic/ 32 | # Make sure COSMIC download has 6 columns: 33 | #1 gene_name 34 | #2 transcript_name 35 | #3 trv_type 36 | #4 amino_acid_change 37 | #5 domain 38 | #6 tissue_type 39 | 40 | ################# 41 | # EXPECTED OUTPUT 42 | ##### 43 | # If at any point you are unsure about the output, please inspect files in Demo/Expected/. 44 | # Files in Demo/Expected/ were generated by running each step as outlined below from this Demo/ directory. 45 | # Those files were then moved to Demo/Expected/ to provide a clean directory for your own test. 46 | # Preprocessing files generated under prep/ are not included, because they are rather large (>>20MB). 47 | # The .out files state the files being generated in preprocessing steps. 48 | # The residue measure used for preprocessing for the Demo/Expected/ output is average (see uppro). 49 | # In the output files, to differentiate between preprocessing with the average & shortest measures (uppro's --measure option), 50 | # average residue distance measure is referred to as ARD, 51 | # whereas shortest residue distance measure is referred to as SRD. 52 | # Similarly in the output files, to differentiate between clustering with the average & shortest measures (cluster's --distance-measure option), 53 | # average structure distance measure is referred to as ASD, 54 | # whereas shortest structure distance measure is referred to as SSD. 
55 | 56 | ####################### 57 | # RUNNING PREPROCESSING 58 | ##### 59 | # The uppro step used to launch jobs by default to an LSF server where those jobs run calpro (now called with the option --parallel bsub). 60 | # If you do not have an LSF server, you have the option to parallelize on your CPU cores using --parallel local. 61 | # For this demo, the difference between --parallel none & --parallel local with --max-processes 6 reduced the uppro/calpro runtime from 30-60min to 1-2min on a 2014 macbook pro. 62 | # CAUTION: be sure to know your max cores, and be sure not to set --max-processes >= max cores. 63 | # This can slow down your system. 64 | # On unix-based OS, use 'nproc' to check. 65 | # On Mac-based OS use 'sysctl -n hw.ncpu' to check. 66 | # Otherwise, you can skip uppro, and run calpro 'manually'. 67 | # To get help with calpro, simply enter the command, 'hotspot3d calpro'. 68 | # 69 | # local CPU parallelization 70 | hotspot3d uppro --measure average --output-dir prep/ --pdb pdb/ --gene-file demo.maf --parallel local --max-processes 6 1>demo.uppro.out 2>demo.uppro.err 71 | # bsub parallelization 72 | #hotspot3d uppro --measure average --output-dir prep/ --pdb pdb/ --gene-file demo.maf --parallel bsub 1>demo.uppro.out 2>demo.uppro.err 73 | # no parallelization 74 | #hotspot3d uppro --measure average --output-dir prep/ --pdb pdb/ --gene-file demo.maf --parallel none 1>demo.uppro.out 2>demo.uppro.err 75 | # run with shortest residue distance measure option 76 | #hotspot3d uppro --measure shortest --output-dir prep/ --pdb pdb/ --gene-file demo.maf 1>demo.uppro.out 2>demo.uppro.err 77 | 78 | ###### 79 | # PREP 80 | ##### 81 | # The 'prep' command should be able to run all remaining preprocessing steps (calroi-prior). 82 | # If you wish to run each preprocessing step individually, then run the below steps instead of this prep command.
83 | # If you use 'prep', you should not need to run the remaining preprocessing steps and can skip to the ANALYSIS steps beginning with search. 84 | hotspot3d prep --output-dir prep 1>demo.prep.out 2>demo.prep.err 85 | 86 | ################## 87 | # ALTERNATIVE PREP 88 | ##### 89 | # If you wish to run each preprocessing step individually, then run the following steps instead of the above prep command. 90 | # Before running any additional steps, make sure that the uppro/calpro runs are finished. 91 | #hotspot3d calroi --output-dir prep 1>demo.calroi.out 2>demo.calroi.err 92 | 93 | #hotspot3d statis --output-dir prep 1>demo.statis.out 2>demo.statis.err 94 | 95 | #hotspot3d anno --output-dir prep 1>demo.anno.out 2>demo.anno.err 96 | 97 | #hotspot3d trans --output-dir prep 1>demo.trans.out 2>demo.trans.err 98 | 99 | #hotspot3d cosmic --output-dir prep 1>demo.cosmic.out 2>demo.cosmic.err 100 | 101 | #hotspot3d prior --output-dir prep 1>demo.prior.out 2>demo.prior.err 102 | 103 | ################## 104 | # RUNNING ANALYSIS 105 | ##### 106 | # There are several new options for clustering: meric-type, structure-dependence, subunit-dependence, & parallel. 107 | # meric-type deals with quaternary structures such as monomer, multimer, homomer, heteromer. 108 | # To keep some legacy capability, intra and inter can also be used as a meric-type. 109 | # structure-dependence will partition pairs by what structure they come from and then cluster pairs only within each structure. 110 | # subunit-dependence will partition pairs by specific subunit/chain pairs within each structure and then cluster only those pairs from specific subunits. 111 | # Examples for when subunit-dependence is set to dependent: 112 | # If meric-type is monomer, then clustering is partitioned to pairs within the same structure AND within the same subunit. 113 | # If meric-type is multimer, then clustering is partitioned to pairs within the same structure AND between different subunits.
114 | # If meric-type is not specified, then clustering is partitioned to any pairs within the same structure AND between each pair of subunits where a pair exists. 115 | # intra used to be called singleprotein and means any homodimer and monomer. 116 | # inter used to be called complex and means any heterodimer. 117 | # The post step used to provide a .collapsed file using the singleprotein and complex naming scheme, but the post step is deprecated. 118 | # parallel will parallelize with the same method as used in the uppro step above. 119 | # This is currently only useful if you are doing structure or subunit dependent clustering, since it parallelizes clustering over the structures. 120 | # For best performance, if you expect to have many structures involved (>100) then using parallel with max-processes of at least 4 should have similar runtime (estimating). 121 | # If you are not doing structure or subunit dependent clustering, then use the parallel default (none, don't set this option) or set --parallel none. 122 | # 123 | # The visual step generates a PyMol script and will attempt to launch pymol if the 'script-only' flag is given.
124 | 125 | hotspot3d search --maf-file demo.maf --prep-dir prep --3d-distance-cutoff 10 1>demo.search.out 2>demo.search.err 126 | 127 | hotspot3d cluster --pairwise-file 3D_Proximity.pairwise --maf-file demo.maf --meric-type intra --3d-distance-cutoff 10 1>demo.cluster.singleprotein.out 2>demo.cluster.singleprotein.err 128 | #this used to be: hotspot3d cluster --pairwise-file 3D_Proximity.pairwise --maf-file demo.maf --collapsed-file 3D_Proximity.pairwise.singleprotein.collapsed --3d-distance-cutoff 10 1>demo.cluster.singleprotein.out 2>demo.cluster.singleprotein.err 129 | # 130 | #hotspot3d cluster --pairwise-file 3D_Proximity.pairwise --maf-file demo.maf --meric-type inter --3d-distance-cutoff 10 1>demo.cluster.complex.out 2>demo.cluster.complex.err 131 | #this used to be: hotspot3d cluster --pairwise-file 3D_Proximity.pairwise --maf-file demo.maf --collapsed-file 3D_Proximity.pairwise.complex.collapsed --3d-distance-cutoff 10 1>demo.cluster.complex.out 2>demo.cluster.complex.err 132 | 133 | hotspot3d summary --clusters-file demo.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.ad10.r10.clusters 1>demo.summary.out 2>demo.summary.err 134 | 135 | hotspot3d visual --clusters-file demo.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.ad10.r10.clusters --pdb 1U7V --pairwise-file 3D_Proximity.pairwise 136 | -------------------------------------------------------------------------------- /HotSpot3D-1.8.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ding-lab/hotspot3d/479aa2a97cd67d3b3036d14a11b7c55d8a412117/HotSpot3D-1.8.2.tar.gz -------------------------------------------------------------------------------- /bin/hotspot3d: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #---------------------------------- 3 | # $Authors: Beifang Niu & Adam D Scott 4 | # $Date: 2013-08-08 13:22:08 -0500 (Thu Aug 8 13:22:08 CDT 2013) $ 5 | # 
$Revision: 1.8.2 $ 6 | # $URL: $ 7 | #---------------------------------- 8 | use strict; 9 | use warnings; 10 | 11 | our $VERSION = 'V1.8.2'; 12 | 13 | use Carp; 14 | use FileHandle; 15 | use IO::File; 16 | use Getopt::Long; 17 | 18 | ## Add packages here 19 | use TGI::Mutpro::Main::Proximity; 20 | use TGI::Mutpro::Main::Post; 21 | use TGI::Mutpro::Main::Cluster; 22 | use TGI::Mutpro::Main::Significance; 23 | use TGI::Mutpro::Main::Summary; 24 | use TGI::Mutpro::Main::Visual; 25 | use TGI::Mutpro::Main::AllMain; 26 | use TGI::Mutpro::Preprocess::Drugport; 27 | use TGI::Mutpro::Preprocess::Uppro; 28 | use TGI::Mutpro::Preprocess::Calpro; 29 | use TGI::Mutpro::Preprocess::Statis; 30 | use TGI::Mutpro::Preprocess::Trans; 31 | use TGI::Mutpro::Preprocess::Calroi; 32 | use TGI::Mutpro::Preprocess::Anno; 33 | use TGI::Mutpro::Preprocess::Cosmic; 34 | use TGI::Mutpro::Preprocess::Prior; 35 | use TGI::Mutpro::Preprocess::Homolog; 36 | use TGI::Mutpro::Preprocess::AllPreprocess; 37 | 38 | my $subCmd = shift; 39 | ## Add module option here 40 | my %cmds = map{ ($_, 1) } qw( search post visual cluster sigclus summary drugport uppro calpro calroi statis anno trans homo cosmic prior prep main density help ); 41 | unless (defined $subCmd) { die help_text(); }; 42 | unless (exists $cmds{$subCmd}) { 43 | warn ' Please give valid sub command ! 
', "\n"; 44 | die help_text(); 45 | } 46 | SWITCH:{ 47 | ## Add module action here 48 | $subCmd eq 'search' && do { TGI::Mutpro::Main::Proximity->new(); last SWITCH; }; 49 | $subCmd eq 'post' && do { TGI::Mutpro::Main::Post->new(); last SWITCH; }; 50 | $subCmd eq 'visual' && do { TGI::Mutpro::Main::Visual->new(); last SWITCH; }; 51 | $subCmd eq 'cluster' && do { TGI::Mutpro::Main::Cluster->new(); last SWITCH; }; 52 | $subCmd eq 'sigclus' && do { TGI::Mutpro::Main::Significance->new(); last SWITCH; }; 53 | $subCmd eq 'summary' && do { TGI::Mutpro::Main::Summary->new(); last SWITCH; }; 54 | $subCmd eq 'drugport' && do { TGI::Mutpro::Preprocess::Drugport->new(); last SWITCH; }; 55 | $subCmd eq 'uppro' && do { TGI::Mutpro::Preprocess::Uppro->new(); last SWITCH; }; 56 | $subCmd eq 'calpro' && do { TGI::Mutpro::Preprocess::Calpro->new(); last SWITCH; }; 57 | $subCmd eq 'calroi' && do { TGI::Mutpro::Preprocess::Calroi->new(); last SWITCH; }; 58 | $subCmd eq 'statis' && do { TGI::Mutpro::Preprocess::Statis->new(); last SWITCH; }; 59 | $subCmd eq 'anno' && do { TGI::Mutpro::Preprocess::Anno->new(); last SWITCH; }; 60 | $subCmd eq 'trans' && do { TGI::Mutpro::Preprocess::Trans->new(); last SWITCH; }; 61 | $subCmd eq 'homo' && do { TGI::Mutpro::Preprocess::Homolog->new(); last SWITCH; }; 62 | $subCmd eq 'cosmic' && do { TGI::Mutpro::Preprocess::Cosmic->new(); last SWITCH; }; 63 | $subCmd eq 'prior' && do { TGI::Mutpro::Preprocess::Prior->new(); last SWITCH; }; 64 | $subCmd eq 'prep' && do { TGI::Mutpro::Preprocess::AllPreprocess->new(); last SWITCH; }; 65 | $subCmd eq 'main' && do { TGI::Mutpro::Main::AllMain->new(); last SWITCH; }; 66 | $subCmd eq 'help' && do { die help_text(); last SWITCH; }; 67 | } 68 | sub help_text { 69 | ## Add module help here 70 | return < [options] 76 | 77 | Preprocessing 78 | drugport -- 0) Parse drugport database (OPTIONAL) 79 | uppro -- 1) Update proximity files 80 | prep -- 2) Run steps 2a-2f of preprocessing 81 | calroi -- 2a) Generate region 
of interest (ROI) information 82 | statis -- 2b) Calculate p_values for pairs of mutations 83 | anno -- 2c) Add region of interest (ROI) annotation 84 | trans -- 2d) Add transcript annotation 85 | cosmic -- 2e) Add COSMIC annotation to proximity file 86 | prior -- 2f) Prioritization 87 | 88 | Analysis 89 | 90 | main -- 0) Run steps a-f of analysis (BETA) 91 | search -- a) 3D mutation proximity searching 92 | cluster -- b) Determine mutation-mutation and mutation-drug clusters 93 | sigclus -- c) Determine significance of clusters (BETA) 94 | summary -- d) Summarize clusters 95 | visual -- e) Visulization of 3D proximity 96 | 97 | help -- this message 98 | 99 | SUPPORT 100 | For user support please email adamscott\@wustl.edu 101 | 102 | HELP 103 | } 104 | 105 | 1; 106 | 107 | __END__ 108 | 109 | =head1 NAME 110 | 111 | hotspot3d - 3D mutation proximity analysis program. 112 | 113 | =head1 SYNOPSIS 114 | 115 | hotspot3d --help; 116 | 117 | =head1 DESCRIPTION 118 | 119 | hotspot3d - 3D mutation proximity analysis program. 120 | 121 | =head1 AUTHOR 122 | 123 | Beifang Niu Ebeifang.cn@gmail.comE 124 | 125 | =head1 SEE ALSO 126 | 127 | https://github.com/ding-lab/hotspot3d 128 | 129 | =head1 LICENSE 130 | 131 | This library is free software with MIT licence; you can redistribute it and/or modify 132 | it under the same terms as Perl itself. 133 | 134 | =cut 135 | 136 | -------------------------------------------------------------------------------- /dist.ini: -------------------------------------------------------------------------------- 1 | name = HotSpot3D 2 | author = Beifang Niu, John Wallis, Adam D Scott, Sohini Sengupta, Amila Weerasinghe, & Matthew H Bailey from McDonnell Genome Institute of Washington University at St. 
Louis 3 | version = 1.8.2 4 | license = Perl_5 5 | copyright_holder = McDonnell Genome Institute at Washington University 6 | copyright_year = 2017 7 | abstract = 3D mutation proximity & cluster analysis 8 | 9 | [@Basic] 10 | 11 | -------------------------------------------------------------------------------- /lib/TGI/Data/CleanNumber.pm: -------------------------------------------------------------------------------- 1 | package TGI::Data::CleanNumber; 2 | # 3 | #---------------------------------- 4 | # $Authors: Adam Scott 5 | # $Date: 2016-11-07 6 | # $Revision: v0.1 $ 7 | # $URL: $ 8 | # $Doc: $ assures clean numbers 9 | # 10 | #---------------------------------- 11 | # 12 | use strict; 13 | use warnings; 14 | 15 | #sub new { 16 | # my $proto = shift; 17 | # my $class = ref( $proto ) || $proto; 18 | # my $this = {}; 19 | # $this->{'number'} = shift; 20 | # bless $this, $class; 21 | # return $this; 22 | #} 23 | # 24 | #sub nullIsZero { 25 | # my $this = shift; 26 | # if ( $this->{'number'} =~ /N\/A/ ) { return 0; } 27 | # return &numOnly( $this->{'number'} ); 28 | #} 29 | # 30 | #sub numOnly { 31 | # my $this = shift; 32 | # $this->{'number'} =~ s/\D*(\d+)\D*/$1/g; 33 | # return $this->{'number'}; 34 | #} 35 | # 36 | #1; 37 | # 38 | sub new { 39 | my $proto = shift; 40 | my $class = ref( $proto ) || $proto; 41 | my $this = {}; 42 | bless $this, $class; 43 | return $this; 44 | } 45 | 46 | sub nullIsZero { 47 | my $num = shift; 48 | if ( $num =~ /N\/A/ ) { return 0; } 49 | return &numOnly( $num ); 50 | } 51 | 52 | sub numOnly { 53 | my $num = shift; 54 | $num =~ s/\D*(-[\d\.]+)\D*/$1/g; 55 | return $num; 56 | } 57 | 58 | 1; 59 | -------------------------------------------------------------------------------- /lib/TGI/Data/StringTemplate.pm: -------------------------------------------------------------------------------- 1 | package TGI::Data::StringTemplate; 2 | # 3 | #---------------------------------- 4 | # $Authors: Adam Scott 5 | # $Date: 2016-10-27 6 | # 
$Revision: v0.0 $ 7 | # $URL: $ 8 | # $Doc: $ key/value template for perl hashes 9 | # 10 | #---------------------------------- 11 | # 12 | use strict; 13 | use warnings; 14 | 15 | use List::MoreUtils qw( first_index ); 16 | 17 | sub new { 18 | my $proto = shift; 19 | my $class = ref( $proto ) || $proto; 20 | my $this = {}; 21 | $this->{'parts'} = (); 22 | if ( @_ ) { 23 | my $part = shift; 24 | if ( ref( $part ) eq 'ARRAY' ) { 25 | push @{$this->{'parts'}} , @{$part}; 26 | } else { 27 | push @{$this->{'parts'}} , $part; 28 | } 29 | } 30 | bless $this, $class; 31 | return $this; 32 | } 33 | 34 | sub addToTemplate { 35 | my $this = shift; 36 | if ( @_ ) { 37 | my $part = shift; 38 | push @{$this->{'parts'}} , @{$part}; 39 | } 40 | return $this->{'parts'}; 41 | } 42 | 43 | sub construct { 44 | my $this = shift; 45 | if ( @_ ) { 46 | return join( shift , @{$this->{'parts'}} ); 47 | } else { 48 | return join( "" , @{$this->{'parts'}} ); 49 | } 50 | } 51 | 52 | sub getTemplate { 53 | my $this = shift; 54 | return $this->{'parts'}; 55 | } 56 | 57 | sub constructFromColumns { 58 | my $this = shift; 59 | my $template = ""; 60 | if ( @_ ) { 61 | my $line = shift; 62 | my $required = shift; 63 | #print join( " | " , @{$line} )."\n"; 64 | foreach my $part ( @{$this->getTemplate()} ) { 65 | #print "-----".$part; 66 | if ( $part =~ /^-?\d+$/ ) { 67 | #print "-----"; 68 | #print $line->[$part]."\n"; 69 | $template .= $line->[$part]; 70 | } else { 71 | if ( exists $required->{$part} ) { 72 | #print "=====".$line->[$required->{$part}]."\n"; 73 | $template .= $line->[$required->{$part}]; 74 | } else { 75 | #print "+++++".$part."\n"; 76 | $template .= $part; 77 | } 78 | } 79 | } 80 | } 81 | return $template; 82 | } 83 | 84 | 1; 85 | -------------------------------------------------------------------------------- /lib/TGI/Files/List.pm: -------------------------------------------------------------------------------- 1 | package TGI::Files::List; 2 | # 3 | 
#----------------------------------
# $Authors: Adam Scott
# $Date: 2016-10-26
# $Revision: v0.0 $
# $URL: $
# $Doc: $ file handling for .mafs
#
#----------------------------------
#
use strict;
use warnings;

use Carp;

use Scalar::Util qw( openhandle );
use TGI::Files::File;
our @ISA = qw( TGI::Files::File );

# Construct a List on top of the parent File object and add an 'items' counter.
sub new {
    my $class = shift;
    my $this = $class->SUPER::new( shift );
    $this->{'items'} = 0;
    bless $this, $class;
    return $this;
}

# Count how often each value occurs in one tab-separated column.
#   $column - zero-based column index (default 0)
# Returns a hash reference: value => number of lines carrying it.
# The handle is rewound before reading and closed afterwards.
sub getList {
    my $this = shift;
    my $column = 0;
    if ( @_ ) { $column = shift; }
    print STDOUT "\nReading in ".$this->{'file_name'}."...\n";
    if ( not $this->isOpen() ) {
        $this->open();
    }
    seek( $this->{'handle'} , 0 , 0 );
    my %items;
    foreach my $line ( $this->getlines() ) {
        chomp( $line );
        my $item = ( split /\t/ , $line )[$column];
        # Lines too short to have this column would otherwise be counted
        # under an undefined key (with a warning).
        next unless ( defined $item );
        $items{$item} += 1;
    }
    $this->close();

    return \%items;
}

1;
--------------------------------------------------------------------------------
/lib/TGI/Files/MAF.pm:
--------------------------------------------------------------------------------
package TGI::Files::MAF;
#
#----------------------------------
# $Authors: Adam Scott
# $Date: 2016-10-26
# $Revision: v0.0 $
# $URL: $
# $Doc: $ file handling for .mafs
#
#----------------------------------
#
use strict;
use warnings;

use Carp;
use Scalar::Util qw( openhandle );
use List::MoreUtils qw( first_index );

use TGI::Files::File;
our @ISA = qw( TGI::Files::File );
use TGI::Files::List;
use TGI::Data::StringTemplate;

my $HUGOSYMBOL = "Hugo_Symbol";
#my $ENTREZGENEID = "Entrez_Gene_Id";
#my $CENTER = "Center";
#my $NCBIBUILD = "NCBI_Build";
my $CHROMOSOME = "Chromosome";
my $STARTPOSITION = "Start_Position";
my $ENDPOSITION = "End_Position";
#my $STRAND = "Strand";
my $REFERENCEALLELE = "Reference_Allele";
my $TUMORSEQALLELE2 = "Tumor_Seq_Allele2";
my $TUMORSAMPLEBARCODE = "Tumor_Sample_Barcode";

# Construct a MAF on top of the parent File object with zeroed counters.
sub new {
    my $class = shift;
    my $this = $class->SUPER::new( shift );
    $this->{'samples'} = 0;
    $this->{'entries'} = 0;
    $this->{'variants'} = 0;
    bless $this, $class;
    return $this;
}

# The accessors below each take one line of the file and return the value of
# the named column, as resolved by the inherited getField().
sub getHugo {
    my ( $this , $line ) = @_;
    return $this->getField( $HUGOSYMBOL , $line );
}

sub getChromosome {
    my ( $this , $line ) = @_;
    return $this->getField( $CHROMOSOME , $line );
}

sub getStart {
    my ( $this , $line ) = @_;
    return $this->getField( $STARTPOSITION , $line );
}

sub getStop {
    my ( $this , $line ) = @_;
    return $this->getField( $ENDPOSITION , $line );
}

sub getReference {
    my ( $this , $line ) = @_;
    return $this->getField( $REFERENCEALLELE , $line );
}

sub getVariant {
    my ( $this , $line ) = @_;
    return $this->getField( $TUMORSEQALLELE2 , $line );
}

sub getTumorSample {
    my ( $this , $line ) = @_;
    return $this->getField( $TUMORSAMPLEBARCODE , $line );
}

# Count the values of one named column by reading the file through a
# temporary TGI::Files::List. If this object's own handle was open it is
# closed for the duration of the read and re-opened afterwards.
sub _countColumn {
    my ( $this , $header ) = @_;
    my $wasOpen = defined openhandle( $this->{'handle'} );
    if ( $wasOpen ) {
        $this->close();
    }
    my $list = TGI::Files::List->new( $this->{'file_name'} );
    my $counts = $list->getList( $this->getColumnIndex( $header ) );
    # Restore this object's handle (not the temporary list's, which getList
    # has already closed) unless something above re-opened it already.
    if ( $wasOpen and not defined openhandle( $this->{'handle'} ) ) {
        $this->open();
    }
    return $counts;
}

# Returns a hash reference: gene symbol => number of lines.
sub getGenes {
    my $this = shift;
    print STDOUT "\nReading in ".$this->{'file_name'}." to get genes...\n";
    return $this->_countColumn( $HUGOSYMBOL );
}

# Returns a hash reference: tumor sample barcode => number of lines.
sub getSamples {
    my $this = shift;
    print STDOUT "\nReading in ".$this->{'file_name'}." to get samples...\n";
    return $this->_countColumn( $TUMORSAMPLEBARCODE );
}

1;
--------------------------------------------------------------------------------
/lib/TGI/Files/Pairwise.pm:
--------------------------------------------------------------------------------
package TGI::Files::Pairwise;
#
#----------------------------------
# $Authors: Adam Scott
# $Date: 2016-10-26
# $Revision: v0.0 $
# $URL: $
# $Doc: $ file handling for .pairwise
#
#----------------------------------
#
use strict;
use warnings;

use Carp;

# The parent class lives in lib/TGI/Files/File.pm, so it is TGI::Files::File.
use TGI::Files::File;
our @ISA = qw( TGI::Files::File );

my $TRANSCRIPT = "transcript_name";
my $AMINOACIDCHANGE = "amino_acid_change";
my $WEIGHT = "weight";
my @REQUIRED = ( "Hugo_Symbol" , "Chromosome" , "Start_Position" ,
                 "End_Position" , "Reference_Allele" ,
                 "Tumor_Seq_Allele2" , "Tumor_Sample_Barcode" );

# Construct a Pairwise file object with zeroed counters, the default header
# names and its own list of required columns.
sub new {
    my $class = shift;
    my $this = $class->SUPER::new( shift );
    $this->{'samples'} = 0;
    $this->{'entries'} = 0;
    $this->{'variants'} = 0;
    $this->{'amino_acid_changes'} = 0;
    # Per-object copy: the require*() methods push onto this list and must
    # not alter the defaults seen by other objects.
    $this->{'required'} = [ @REQUIRED ];
    $this->{'transcript_id_header'} = $TRANSCRIPT;
    $this->{'amino_acid_header'} = $AMINOACIDCHANGE;
    $this->{'weight_header'} = $WEIGHT;
    bless $this, $class;
    return $this;
}

# Open the file for reading. Returns 1 when a file name is set, 0 otherwise.
sub openPairwise {
    my $this = shift;
    if ( defined $this->{'file_name'} ) {
        $this->{'handle'}->open( $this->{'file_name'} , "r" );
        return 1;
    }
    return 0;
}

sub closePairwise {
    my $this = shift;
    $this->{'handle'}->close();
    return;
}

# Header setters: each replaces the stored column name when one is given.
sub setTranscriptHeader {
    my $this = shift;
    if ( @_ ) {
        $this->{'transcript_id_header'} = shift;
    }
    return;
}

sub setAminoAcidChangeHeader {
    my $this = shift;
    if ( @_ ) {
        $this->{'amino_acid_header'} = shift;
    }
    return;
}

sub setWeightHeader {
    my $this = shift;
    if ( @_ ) {
        $this->{'weight_header'} = shift;
    }
    return;
}

# require*(): optionally rename the column, then append it to the list of
# required columns. Each returns the result of the push, as before.
sub requireTranscript {
    my $this = shift;
    if ( @_ ) {
        $this->setTranscriptHeader( shift );
    }
    push @{$this->{'required'}} , $this->{'transcript_id_header'};
}

sub requireAminoAcidChange {
    my $this = shift;
    if ( @_ ) {
        $this->setAminoAcidChangeHeader( shift );
    }
    push @{$this->{'required'}} , $this->{'amino_acid_header'};
}

sub requireWeight {
    my $this = shift;
    if ( @_ ) {
        $this->setWeightHeader( shift );
    }
    push @{$this->{'required'}} , $this->{'weight_header'};
}

# Read the header line from the current handle position and map the required
# column names to indices with the inherited mapColumns(), which is handed
# the error text to use when columns are missing.
sub setColumnIndices {
    my $this = shift;
    my $exception = "HotSpot3D::Pairwise error: required columns not present ";
    $exception .= "in this .maf (".$this->{'file_name'}."). Need the ";
    $exception .= "following columns:\n\t";
    $exception .= join( "\n\t" , @{$this->{'required'}} )."\n";
    my $header = $this->{'handle'}->getline(); chomp( $header );
    my $mafcols = $this->mapColumns( $header , $this->{'required'} , $exception );
    return $mafcols;
}

sub getlines {
    my $this = shift;
    return $this->{'handle'}->getlines;
}

# Walk every data line and split out the required columns.
# NOTE(review): nothing is stored or returned here (%mutations is never
# filled and $variant is discarded) - this looks unfinished; confirm intent.
sub readPairwise {
    my $this = shift;
    print STDOUT "\nReading in ".$this->{'file_name'}."...\n";
    seek( $this->{'handle'} , 0 , 0 );
    my @mafcols = @{$this->setColumnIndices()};
    my %mutations;
    foreach my $row ( $this->getlines() ) {
        chomp( $row );
        my @line = split /\t/ , $row;
        my $variant = "";
        if ( $#line >= $mafcols[-1] && $#line >= $mafcols[-2] ) { #makes sure custom maf cols are in range
            my ( $gene , $chr , $start , $stop , $reference , $tumorAllele , $barID );
            my ( $transcript_name , $aachange , $weight );
            $weight = 1;
            if ( grep{ $_ eq $this->{'weight_header'} } @{$this->{'required'}} ) {
                ( $gene , $chr , $start , $stop , $reference , $tumorAllele , $barID , $transcript_name , $aachange , $weight ) = @line[@mafcols];
            } elsif ( grep{ $_ eq $this->{'amino_acid_header'} } @{$this->{'required'}} ) {
                ( $gene , $chr , $start , $stop , $reference , $tumorAllele , $barID , $transcript_name , $aachange ) = @line[@mafcols];
                $variant = join( "_" , ( $gene , $aachange , $chr , $start , $stop ) );
            } else {
                ( $gene , $chr , $start , $stop , $reference , $tumorAllele , $barID ) = @line[@mafcols];
                $variant = join( "_" , ( $gene , $chr , $start , $stop ) );
            }
        }
    }

    return;
}

# Returns a hash reference: value of the first required column (Hugo_Symbol
# by default) => number of data lines.
sub getGenes {
    my $this = shift;
    print STDOUT "\nReading in ".$this->{'file_name'}." to get genes...\n";
    seek( $this->{'handle'} , 0 , 0 );
    my @mafcols = @{$this->setColumnIndices()};
    my %genes;
    foreach my $row ( $this->getlines() ) {
        chomp( $row );
        my $gene = ( split /\t/ , $row )[$mafcols[0]];
        $genes{$gene} += 1;
    }

    return \%genes;
}

# Returns a hash reference: value of the seventh required column
# (Tumor_Sample_Barcode by default) => number of data lines.
sub getSamples {
    my $this = shift;
    print STDOUT "\nReading in ".$this->{'file_name'}." to get samples...\n";
    seek( $this->{'handle'} , 0 , 0 );
    my @mafcols = @{$this->setColumnIndices()};
    my %samples;
    foreach my $row ( $this->getlines() ) {
        chomp( $row );
        my $sample = ( split /\t/ , $row )[$mafcols[6]];
        $samples{$sample} += 1;
    }

    return \%samples;
}

1;

__DATA__
0 SMAD2
1 18
2 45368254
3 45368254
4 p.D450N
5 [A]
6 450
7 MH2. {ECO:0000255|PROSITE
8 p.D420N|lung,p.D450N|lung
9 SMAD4
10 18
11 48591918
12 48591918
13 p.R361C
14 [B]
15 361
16 MH2.
{ECO:0000255|PROSITE
17 N/A
18 N/A
19 2.561 1U7V 0.004409|
--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Distance.pm:
--------------------------------------------------------------------------------
package TGI::Mutpro::Distance;
#
#----------------------------------
# $Authors: Adam Scott
# $Date: 2016-11-01
# $Revision: $
# $URL: $
# $Doc: $ distance class
#
#----------------------------------
#
use strict;
use warnings;

use Carp;
use Getopt::Long;

use IO::File;
use FileHandle;

use Data::Dumper;

use TGI::Mutpro::Pair;

# Placeholder distance used until a real one is set.
my $ABSURD = 10000;

# Build an empty distance record: no pair, blank pdb/chain/position fields
# and the placeholder distance. Constructor arguments are ignored.
sub new {
    my $class = shift;
    my $this = {
        'pair'      => undef ,
        'pdb'       => "" ,
        'chain1'    => "" ,
        'chain2'    => "" ,
        'position1' => "" ,
        'position2' => "" ,
        'distance'  => $ABSURD ,
    };
    bless $this, $class;
    return $this;
}

# Blank the pdb/chain/position fields and restore the placeholder distance.
# The 'pair' field is left as it is. Returns the object.
sub reset {
    my $this = shift;
    foreach my $field ( qw( pdb chain1 chain2 position1 position2 ) ) {
        $this->$field( "" );
    }
    $this->distance( $ABSURD );
    return $this;
}

# Print "chain1:position1", "chain2:position2", pdb and distance joined by
# the given delimiter (tab when none is passed). No newline is added.
sub print {
    my $this = shift;
    my $delim = @_ ? shift : "\t";
    my @columns = (
        $this->chain1().":".$this->position1() ,
        $this->chain2().":".$this->position2() ,
        $this->pdb() ,
        $this->distance() ,
    );
    print join( $delim , @columns );
}

# Positional setter: pair, pdb, chain1, chain2, position1, position2,
# distance. Only as many fields as arguments supplied are touched.
# Returns the object.
sub set {
    my $this = shift;
    foreach my $field ( qw( pair pdb chain1 chain2 position1 position2 distance ) ) {
        last unless ( @_ );
        $this->$field( shift );
    }
    return $this;
}

# Combined getter/setters: store the argument when one is given, then
# return the current value.
sub pair {
    my $this = shift;
    $this->{'pair'} = shift if ( @_ );
    return $this->{'pair'};
}

sub pdb {
    my $this = shift;
    $this->{'pdb'} = shift if ( @_ );
    return $this->{'pdb'};
}

sub chain1 {
    my $this = shift;
    $this->{'chain1'} = shift if ( @_ );
    return $this->{'chain1'};
}

sub chain2 {
    my $this = shift;
    $this->{'chain2'} = shift if ( @_ );
    return $this->{'chain2'};
}

sub position1 {
    my $this = shift;
    $this->{'position1'} = shift if ( @_ );
    return $this->{'position1'};
}

sub position2 {
    my $this = shift;
    $this->{'position2'} = shift if ( @_ );
    return $this->{'position2'};
}

sub distance {
    my $this = shift;
    $this->{'distance'} = shift if ( @_ );
    return $this->{'distance'};
}

1;
--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Main/BruteForceClustersLines.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
y = read.table(args[1])
z = read.table(args[2])

RD<-y[[2]]
ID<-y[[1]]

x0<-z[[1]]
y0<-z[[3]]
x1<-z[[2]]+1
y1<-z[[3]]

pdf(args[3],width=23.6,height=13.3)
par(mar=c(8,5,5,1))
barplot(RD,names.arg=ID,ylab="Reachabilty Distance (A)",main=paste("Reachability Plot: Epsilon=",args[4],"MinPts=",args[5]),col="Red", border=NA, space=0, las=2, cex.names=0.4)
segments (x0,y0,x1,y1)
dev.off()

# args: 1-RD.out, 2-clusters.out, 3-pdf_file_name, 4-epsilon, 5-MinPts, 6-cutoff
--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Main/ColorScore.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Authors : Amila Weerasinghe
# Generates
a reachability plot colored by weight (*.Horiz.weights.pdf) 5 | 6 | args = commandArgs(trailingOnly=TRUE) 7 | d <- read.table(args[1], header=FALSE, sep = "\t") # data table with variant,RD,genomic_annotations,weight 8 | 9 | segData = read.table(args[2], sep = "\t") # data table with start,stop,epsilon',clusterID 10 | 11 | ################################################################# 12 | ### process the segData so that cluster labels do not overlap ### 13 | ################################################################# 14 | 15 | # make a column to show process status 16 | segData$Processed = 0 17 | 18 | # display singleton clusters at RD=0.1 19 | segData[segData$V1==segData$V2,"V3"] = 0.1 20 | segData[segData$V1==segData$V2,"Processed"] = 1 21 | 22 | segData$textOffset = 0 23 | 24 | # now start from the first un-processed row and see if there are nearby levels 25 | unprocessed = segData[segData$Processed==0,] 26 | 27 | while ( length(unprocessed$V1) != 0 ) { 28 | # take the first un-processed one and the nearby stuff 29 | tab = segData[segData$V3=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],] 30 | offset = 0.1 31 | # go through each row in tab and add offset 32 | for (i in c(1:length(tab$V1))) { 33 | segData[segData$V3=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],][i,"textOffset"] = offset 34 | offset = offset + 0.1 35 | } 36 | segData[segData$V3=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],]$Processed = 1 37 | unprocessed = segData[segData$Processed==0,] # update the unprocessed table 38 | } 39 | ################################################################# 40 | 41 | y0<-segData[[3]] 42 | x0<-segData[[1]]+1 43 | y1<-segData[[3]] 44 | x1<-segData[[2]]+1 45 | Cluster<-segData[[5]] 46 | labelOffset <- segData$textOffset 47 | 48 | names(d)[1] <- "variant" 49 | names(d)[2] <- "RD" 50 | names(d)[10] <- "weight" 51 | 52 | # adjust plot height according to the number of variants 53 | newH = (24/300)*(length(d$variant)) # 300 is the number from 
MET where I used 23.6" 54 | newH = round( newH, 1) 55 | if (newH < 24){ 56 | newH = 24 57 | } 58 | # replace RD=0 by 0.1 (so that we get a bar to color) 59 | d$RD[d$RD==0] <- 0.1 60 | d$weight <- log(d$weight) 61 | 62 | library(ggplot2) 63 | 64 | d$variant <- factor(d$variant, levels = d$variant) # avoid automatic sort 65 | 66 | p <- ggplot(data=d, aes(x=variant, y=RD, fill=weight)) + geom_bar(stat="identity") + scale_fill_gradient(low="grey",high="red") 67 | 68 | p <- p + coord_flip() + theme_bw() + theme(axis.text.y=element_text(size=6)) #+ theme_bw() 69 | 70 | values <- c(1:nrow(segData)) 71 | for (i in values) { 72 | p <- p + geom_segment(x=x0[i], y=y0[i], xend=x1[i], yend=y1[i]) 73 | } 74 | 75 | p <- p + annotate("text", x = x1+labelOffset, y = y0, label = Cluster, size = 2) 76 | 77 | p <- p + ggtitle(paste("Reachability Plot with weights: Epsilon=",args[4],"MinPts=",args[5])) 78 | 79 | ggsave(args[3], width = 13.3, height = newH, limitsize = FALSE) 80 | 81 | # args: 1-RD.out, 2-clusters.plot, 3-pdf_file_name, 4-epsilon, 5-MinPts -------------------------------------------------------------------------------- /lib/TGI/Mutpro/Main/HorizClustersLines.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | y = read.table(args[1], sep = "\t") 4 | z = read.table(args[2], sep = "\t") 5 | 6 | RD<-y[[2]] 7 | ID<-y[[1]] 8 | 9 | z[z$V1==z$V2,"V3"] = 0.1 # show singletons at RD=0.1 10 | 11 | y0<-z[[1]] 12 | x0<-z[[3]] 13 | y1<-z[[2]]+1 14 | x1<-z[[3]] 15 | 16 | Cluster<-z[[5]] 17 | 18 | # adjust plot height according to the number of variants 19 | newH = (24/300)*(length(ID)) # 300 is the number from MET where I used 23.6" 20 | newH = round( newH, 1) 21 | if (newH < 24){ 22 | newH = 24 23 | } 24 | 25 | pdf(args[3],width=13.3,height=newH) 26 | par(mar=c(8,5,5,1)) 27 | barplot(RD,names.arg=ID,main=paste("Reachability Plot: Epsilon=",args[4],"MinPts=",args[5]),col="Red", 
cex.names=0.4, horiz=TRUE,border=NA, space=0, las=2, xlab="Reachabilty Distance (A)") # ylab="Reachabilty Distance (A)" 28 | segments (x0,y0,x1,y1) 29 | text(x0,y1+0.5,Cluster, cex=0.4) 30 | dev.off() 31 | 32 | # args: 1-RD.out, 2-clusters.plot, 3-pdf_file_name, 4-epsilon, 5-MinPts -------------------------------------------------------------------------------- /lib/TGI/Mutpro/Main/MembershipProbability.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | d <- read.table(args[1], header=TRUE) 5 | 6 | library(ggplot2) 7 | 8 | d$Variant <- factor(d$Variant, levels = d$Variant) 9 | 10 | sp <- ggplot(d, aes(x=Variant,y=ClusterID)) + geom_point(aes(colour = Probability), size=3) + scale_colour_gradient(low = "red", high = "blue") 11 | sp <- sp+ facet_grid(SuperClusterID ~ .,as.table = FALSE,scales="free_y",space="free_y") 12 | #sp <- ggplot(d, aes(x=Variant,y=ClusterID)) + geom_point(aes(size=Probability)) 13 | 14 | sp+theme_bw() + theme(axis.text.x=element_text(angle=90, size=6)) + ggtitle(paste("Cluster Membership Probabilities for",args[2],"runs (",args[3],")")) 15 | ggsave(paste(args[3],"ProbabilityPlot.pdf", sep = "."), width = 23.6, height = 13.3) 16 | 17 | #args 1=ProbabilityData, 2=Number of runs, 3=Gene -------------------------------------------------------------------------------- /lib/TGI/Mutpro/Pair.pm: -------------------------------------------------------------------------------- 1 | package TGI::Mutpro::Pair; 2 | # 3 | #---------------------------------- 4 | # $Authors: Adam Scott 5 | # $Date: 2016-11-01 6 | # $Revision: $ 7 | # $URL: $ 8 | # $Doc: $ pair class 9 | # 10 | #---------------------------------- 11 | # 12 | use strict; 13 | use warnings; 14 | 15 | use Carp; 16 | use Getopt::Long; 17 | 18 | use List::MoreUtils qw( uniq ); 19 | use List::Util qw( min max ); 20 | 21 | use IO::File; 22 | use FileHandle; 23 | 24 | use Data::Dumper; 25 | 26 
use TGI::Variant;

# Build an empty pair: both variants blank and no distances yet.
# Constructor arguments are ignored.
sub new {
    my $class = shift;
    my $this = {
        'variant1'  => "" ,
        'variant2'  => "" ,
        'distances' => [] ,
    };
    bless $this, $class;
    return $this;
}

# Returns a brand-new, empty pair.
# NOTE(review): the object this is called on is not modified - callers must
# use the return value. Confirm that is what call sites expect.
sub reset {
    my $this = shift;
    my $fresh = TGI::Mutpro::Pair->new();
    $fresh->set();
    return $fresh;
}

# For every stored distance, print variant1, the delimiter, variant2 and the
# delimiter again (tab when no delimiter is passed).
# NOTE(review): the transcript:change strings are assembled but never
# printed, and the distance itself is commented out - looks unfinished.
sub print {
    my $this = shift;
    my $delim = @_ ? shift : "\t";
    foreach my $distance ( @{$this->distances()} ) {
        my $first = $this->variant1();
        $first->print();
        my $p1s = "";
        foreach my $proteinVariant ( @{ $this->variant1()->proteinVariants() } ) {
            $p1s .= $proteinVariant->transcript().":".$proteinVariant->aminoAcidChange();
        }
        print $delim;
        my $second = $this->variant2();
        $second->print();
        my $p2s = "";
        foreach my $proteinVariant ( @{ $this->variant2()->proteinVariants() } ) {
            $p2s .= $proteinVariant->transcript().":".$proteinVariant->aminoAcidChange();
        }
        print $delim;
        #print $distance;
        #print $delim;
    }
}

# Positional setter: variant1, variant2, distances (an array reference).
# Only as many fields as arguments supplied are touched. Returns the object.
sub set {
    my $this = shift;
    foreach my $field ( qw( variant1 variant2 distances ) ) {
        last unless ( @_ );
        $this->$field( shift );
    }
    return $this;
}

# Append one distance to the list. Returns the object.
sub addDistance {
    my $this = shift;
    my $distance = shift;
    push @{$this->{'distances'}} , $distance;
    return $this;
}

# Combined getter/setters: store the argument when one is given, then
# return the current value.
sub variant1 {
    my $this = shift;
    $this->{'variant1'} = shift if ( @_ );
    return $this->{'variant1'};
}

sub variant2 {
    my $this = shift;
    $this->{'variant2'} = shift if ( @_ );
    return $this->{'variant2'};
}

sub distances {
    my $this = shift;
    $this->{'distances'} = shift if ( @_ );
    return $this->{'distances'};
}

1;
--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Preprocess/AllPreprocess.pm:
--------------------------------------------------------------------------------
package TGI::Mutpro::Preprocess::AllPreprocess;
#
#----------------------------------
# $Authors: Adam D Scott
# $Date: 2016-09-29 $
# $Revision: 2016-09-29 $
# $URL: $
# $Doc: Run all preprocessing steps (assuming bsub capable) $
#----------------------------------
#
use strict;
use warnings;
our $VERSION = '0.0';

use Carp;
use Cwd;
use Getopt::Long;
use IO::File;
use FileHandle;
use Parallel::ForkManager;

#use TGI::Mutpro::Preprocess::Uppro;
use TGI::Mutpro::Preprocess::Calroi;
use TGI::Mutpro::Preprocess::Statis;
use TGI::Mutpro::Preprocess::Anno;
use TGI::Mutpro::Preprocess::Trans;
use TGI::Mutpro::Preprocess::Cosmic;
use TGI::Mutpro::Preprocess::Prior;

my $MINDISTANCE = "minDistance";
my $AVGDISTANCE = "averageDistance";
my $CALROI = "calroi";
my $STATIS = "statis";
my $ANNO = "anno";
my $TRANS = "trans";
my $COSMIC = "cosmic";
my $PRIOR = "prior";
my $CPU = "cpu";
my $BSUB = "bsub";
my $NONE = "none";

# Pipeline order. Each name is both a valid --start value and the name of
# the method that runs that step.
my @STEP_ORDER = ( $CALROI , $STATIS , $ANNO , $TRANS , $COSMIC , $PRIOR );

# Build the object with its defaults, then immediately parse @ARGV and run
# the pipeline (see process()).
sub new {
    my $class = shift;
    my $this = {};
    $this->{'command'} = "";
    $this->{'_OUTPUT_DIR'} = getcwd;
    $this->{'_BLAT'} = "blat";
    $this->{'GRCh'} = undef;
    $this->{'release'} = undef;
    $this->{'max_3d_dis'} = 100;
    $this->{'p_value_cutoff'} = 1;
    $this->{'min_seq_dis'} = 0;
    $this->{'start'} = $CALROI;
    $this->{'parallel'} = $CPU;
    $this->{'status'} = 0;
    bless $this, $class;
    $this->process();
    return $this;
}

# Parse the command line and run the steps.
# Dies with the help text when @ARGV is empty or an option is invalid;
# prints the help text to STDERR and exits 0 on --help.
sub process {
    my $this = shift;
    my ( $help , $options );
    unless ( @ARGV ) { die $this->help_text(); }
    # Keep a copy: GetOptions removes the options it recognises from @ARGV.
    $this->{'command'} = [ @ARGV ];
    $options = GetOptions (
        'output-dir=s' => \$this->{'_OUTPUT_DIR'},
        'blat=s' => \$this->{'_BLAT'},
        'grch=i' => \$this->{'GRCh'},
        'release=i' => \$this->{'release'},
        '3d-distance-cutoff=i' => \$this->{'max_3d_dis'},
        # p-values are fractional, so this has to accept real numbers
        'p-value-cutoff=f' => \$this->{'p_value_cutoff'},
        'linear-cutoff=i' => \$this->{'min_seq_dis'},
        'start=s' => \$this->{'start'} ,
        'help' => \$help,
    );
    if ( $help ) { print STDERR $this->help_text(); exit 0; }
    unless ( $options ) { die $this->help_text(); }

    $this->steps();
    return;
}

# Abort the run when the previous step left a non-zero status.
#   $step - name of the previous step, used in the warning only
# Returns the (zero) status otherwise.
sub checkStatus {
    my ( $this , $step ) = @_;
    my $status = $this->{'status'};
    if ( $status ) {
        warn "HotSpot3D::AllPreprocess::checkStatus error: prior step (".$step.") failed!\n";
        # The status comes from system(): the child's exit code is in the
        # high byte, and -1 means the command could not be started. Passing
        # the raw value to exit would truncate e.g. 256 to 0 (success).
        my $code = 1;
        if ( $status != -1 and ( $status >> 8 ) ) {
            $code = $status >> 8;
        }
        exit $code;
    }
    return $status;
}

# Get or set the recorded command line (an array reference once set).
sub command {
    my $this = shift;
    if ( @_ ) {
        $this->{'command'} = [ @_ ];
    }
    return $this->{'command'};
}

# Combined getter/setters: store the argument when one is given, then
# return the current value.
sub outputDir {
    my $this = shift;
    if ( @_ ) { $this->{'_OUTPUT_DIR'} = shift; }
    return $this->{'_OUTPUT_DIR'};
}

sub blat {
    my $this = shift;
    if ( @_ ) { $this->{'_BLAT'} = shift; }
    return $this->{'_BLAT'};
}

sub grch {
    my $this = shift;
    if ( @_ ) { $this->{'GRCh'} = shift; }
    return $this->{'GRCh'};
}

sub release {
    my $this = shift;
    if ( @_ ) { $this->{'release'} = shift; }
    return $this->{'release'};
}

sub pvaluecutoff {
    my $this = shift;
    if ( @_ ) { $this->{'p_value_cutoff'} = shift; }
    return $this->{'p_value_cutoff'};
}

sub max3ddis {
    my $this = shift;
    if ( @_ ) { $this->{'max_3d_dis'} = shift; }
    return $this->{'max_3d_dis'};
}

sub minseqdis {
    my $this = shift;
    if ( @_ ) { $this->{'min_seq_dis'} = shift; }
    return $this->{'min_seq_dis'};
}

sub start {
    my $this = shift;
    if ( @_ ) { $this->{'start'} = shift; }
    return $this->{'start'};
}

sub status {
    my $this = shift;
    if ( @_ ) { $this->{'status'} = shift; }
    return $this->{'status'};
}

# Check the previous step, announce and run one hotspot3d sub-command, and
# record what system() returned.
#   $previous - name of the step that must have succeeded
#   @cmd      - program and arguments, run without a shell so values with
#               spaces or shell metacharacters are passed through intact
# Returns the recorded status.
sub _runStep {
    my ( $this , $previous , @cmd ) = @_;
    $this->checkStatus( $previous );
    print STDOUT "running: ".join( " " , @cmd )."\n";
    return $this->status( system( @cmd ) );
}

# One method per pipeline step. Each returns the status of its command.
sub calroi {
    my $this = shift;
    return $this->_runStep( "uppro/calpro" ,
        "hotspot3d" , "calroi" , "--output-dir" , $this->outputDir() );
}

sub statis {
    my $this = shift;
    return $this->_runStep( $CALROI ,
        "hotspot3d" , "statis" , "--output-dir" , $this->outputDir() );
}

sub anno {
    my $this = shift;
    return $this->_runStep( $STATIS ,
        "hotspot3d" , "anno" , "--output-dir" , $this->outputDir() );
}

sub trans {
    my $this = shift;
    my @cmd = ( "hotspot3d" , "trans" , "--output-dir" , $this->outputDir() ,
                "--blat" , $this->blat() );
    # --grch and --release are only forwarded when they were given
    if ( $this->grch() ) {
        push @cmd , "--grch" , $this->grch();
    }
    if ( $this->release() ) {
        push @cmd , "--release" , $this->release();
    }
    return $this->_runStep( $ANNO , @cmd );
}

sub cosmic {
    my $this = shift;
    return $this->_runStep( $TRANS ,
        "hotspot3d" , "cosmic" , "--output-dir" , $this->outputDir() );
}

sub prior {
    my $this = shift;
    return $this->_runStep( $COSMIC ,
        "hotspot3d" , "prior" , "--output-dir" , $this->outputDir() ,
        "--p-value-cutoff" , $this->pvaluecutoff() ,
        "--3d-distance-cutoff" , $this->max3ddis() ,
        "--linear-cutoff" , $this->minseqdis() );
}

# Run every step from the configured starting step to the end of the
# pipeline. Dies with the help text when the starting step is not known.
sub steps {
    my $this = shift;
    my $start = $this->{'start'};
    #if ( $this->{'start'} eq $UPPRO ) {
    #	fork
    #}
    my ( $first ) = grep { $STEP_ORDER[$_] eq $start } 0 .. $#STEP_ORDER;
    unless ( defined $first ) {
        die "HotSpot3D::AllPreprocess error: desired starting step unclear.\n".$this->help_text();
    }
    foreach my $step ( @STEP_ORDER[ $first .. $#STEP_ORDER ] ) {
        $this->$step();
    }

    return;
}

# Usage text.
# NOTE(review): everything above the --help line was missing from the copy
# of the source this was edited from; it has been rebuilt from the options
# and defaults defined in process() and new(). Compare with upstream.
sub help_text{
    my $this = shift;
    return <<HELP

HotSpot3D AllPreprocess: run the preprocessing steps in order
( calroi , statis , anno , trans , cosmic , prior )

--output-dir                 Output directory handed to every step, default is the current working directory
--start                      Step to start from ( calroi , statis , anno , trans , cosmic , prior ), default is calroi
--blat                       blat to use for trans, default is blat
--grch                       GRCh build number, passed on to trans when given
--release                    Release number, passed on to trans when given
--p-value-cutoff             p-value cutoff for prior, default is 1
--3d-distance-cutoff         3D distance cutoff for prior, default is 100
--linear-cutoff              Linear distance cutoff (in peptides) for prior, default is 0


--help                       this message

HELP

}

1;

__END__

=head1 NAME

TGI::Mutpro::Preprocess::AllPreprocess - Run all preprocessing steps.

=head1 SYNOPSIS

 use TGI::Mutpro::Preprocess::AllPreprocess;

=head1 DESCRIPTION

TGI::Mutpro::Preprocess::AllPreprocess runs the preprocessing steps
calroi, statis, anno, trans, cosmic and prior in that order, starting from
the step given with --start. Constructing the object parses @ARGV and runs
the steps; the run stops as soon as a step fails.


=head1 AUTHOR

Beifang Niu E<lt>beifang.cn@gmail.comE<gt>

=head1 SEE ALSO

https://github.com/ding-lab/hotspot3d

=head1 LICENSE

This library is free software with MIT licence; you can redistribute it and/or modify
it under the same terms as Perl itself.
315 | 316 | =cut 317 | 318 | -------------------------------------------------------------------------------- /lib/TGI/Mutpro/Preprocess/AminoAcid.pm: -------------------------------------------------------------------------------- 1 | package TGI::Mutpro::Preprocess::AminoAcid; 2 | # 3 | #---------------------------------- 4 | # $Authors: Beifang Niu & Adam D Scott 5 | # $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $ 6 | # $Revision: $ 7 | # $URL: $ 8 | # $Doc: $ amino acid processing class 9 | #---------------------------------- 10 | # 11 | use strict; 12 | use warnings; 13 | use Carp; 14 | use TGI::Mutpro::Preprocess::Point; 15 | #use PostData; 16 | my $Debug = 0; 17 | # Collection of points (atoms) in crystal structure 18 | sub new { 19 | my $proto = shift; 20 | my $class = ref($proto) || $proto; 21 | my $self = {}; 22 | 23 | $self->{CHAIN} = undef; # ID of peptide chain 24 | $self->{POSITION} = undef; # Residue number in peptide chain 25 | $self->{AA} = undef; # Name of amino acid (three letter code -- ARG, ASN, PRO, etc.) 26 | $self->{AVGPOINT} = undef; # Ref to point with average of X, Y, Z coordinates of all points 27 | $self->{POINTS} = (); # Array of refs to Point objects (unordered Points -- i.e. atoms) 28 | $self->{AMBIGUOUS} = 0; # Some PDB peptides have more than one residue at a given position. 
29 | # Don't use these 30 | $self->{ACCEPTED} = { "ALA" => "A" , 31 | "ARG" => "R" , 32 | "ASN" => "N" , 33 | "ASP" => "D" , 34 | "CYS" => "C" , 35 | "GLN" => "Q" , 36 | "GLU" => "E" , 37 | "GLY" => "G" , 38 | "HIS" => "H" , 39 | "ILE" => "I" , 40 | "LEU" => "L" , 41 | "LYS" => "K" , 42 | "MET" => "M" , 43 | "PHE" => "F" , 44 | "PRO" => "P" , 45 | "SER" => "S" , 46 | "THR" => "T" , 47 | "TRP" => "W" , 48 | "TYR" => "Y" , 49 | "VAL" => "V" 50 | }; 51 | bless ($self, $class); 52 | 53 | return $self; 54 | } 55 | 56 | sub convertNameToSingle { 57 | my $this = shift; 58 | if ( @_ ) { 59 | my $aa = shift; 60 | if ( $this->isAA( $aa ) ) { 61 | return $this->{'ACCEPTED'}->{$aa}; 62 | } 63 | warn "AminoAcid::convertNameToSingle - warnging: ".$aa." is not a three letter amino acid name\n"; 64 | return $aa; 65 | } 66 | if ( $this->isAA( $this->name() ) ) { 67 | return $this->{'ACCEPTED'}->{$this->name()}; 68 | } 69 | warn "AminoAcid::convertNameToSingle - warnging: ".$this->name()." is not a three letter amino acid name\n"; 70 | return $this->name(); 71 | } 72 | 73 | sub ambiguous { 74 | my $self = shift; 75 | if (@_) { $self->{AMBIGUOUS} = shift; } 76 | return $self->{AMBIGUOUS}; 77 | } 78 | 79 | sub chain { 80 | my $self = shift; 81 | if (@_) { $self->{CHAIN} = shift; } 82 | return $self->{CHAIN}; 83 | } 84 | 85 | sub position { 86 | # Residue number in peptide chain 87 | my $self = shift; 88 | if (@_) { $self->{POSITION} = shift; } 89 | return $self->{POSITION}; 90 | } 91 | 92 | sub name { 93 | # Name of amino acid (three letter code) 94 | my $self = shift; 95 | if (@_) { 96 | #TODO: check if name is a real AA 97 | $self->{AA} = shift; 98 | } 99 | return $self->{AA}; 100 | } 101 | 102 | sub addPoint { 103 | my $self = shift; 104 | my ($x, $y, $z) = @_; 105 | my $point = new TGI::Mutpro::Preprocess::Point; 106 | $point->xyz($x, $y, $z); 107 | push @{$self->{POINTS}}, \$point; 108 | } 109 | 110 | sub getPoints { 111 | # Return: array of refs to Point objects 112 | my $self = 
shift; 113 | return @{$self->{POINTS}}; 114 | } 115 | 116 | sub averagePoint { 117 | # Return: ref to Point object with average of X coordinates, 118 | # average of Y coordinates, 119 | # average of Z coordinates of all points (atoms) 120 | # belonging to this amino acid 121 | my $self = shift; 122 | if ( !defined $self->{AVGPOINT} ) { 123 | my ( $xTotal, $yTotal, $zTotal, $totalAtoms, $pointRef, $avgPoint, ); 124 | foreach $pointRef ( $self->getPoints() ) { 125 | #PostData($pointRef); print "\n"; 126 | my ($x,$y,$z) = $$pointRef->xyz(); 127 | $xTotal += $x; 128 | $yTotal += $y; 129 | $zTotal += $z; 130 | $totalAtoms++; 131 | } 132 | $avgPoint = new TGI::Mutpro::Preprocess::Point; 133 | $avgPoint->xyz($xTotal/$totalAtoms, $yTotal/$totalAtoms, $zTotal/$totalAtoms); 134 | $self->{AVGPOINT} = \$avgPoint; 135 | } 136 | return $self->{AVGPOINT}; 137 | } 138 | 139 | sub averageDistance { 140 | # Input: ref to AminoAcid object 141 | # Return: distance between this amino acid and 142 | # the input amino acid 143 | # based on the average position of 144 | # all atoms in each amino acid 145 | my $self = shift; 146 | my $aaRef = shift; 147 | my ( $thisAvgPointRef, $thatAvgPointRef, $avgDistance, ); 148 | $thisAvgPointRef = $self->averagePoint(); 149 | $thatAvgPointRef = $$aaRef->averagePoint(); 150 | $avgDistance = $$thisAvgPointRef->distance($thatAvgPointRef); 151 | return $avgDistance; 152 | } 153 | 154 | sub shortestDistance { 155 | # Input: ref to AminoAcid object 156 | # Return: shortest distance between any 157 | # two points in this amino acid 158 | # and the input amino acid 159 | my $self = shift; 160 | my $aaRef = shift; 161 | my ( $distance, $shortestDistance, $thisPointRef, $thatPointRef ); 162 | $shortestDistance = 1e10; 163 | foreach $thisPointRef ( $self->getPoints() ) { 164 | foreach $thatPointRef ( $$aaRef->getPoints() ) { 165 | $distance = $$thisPointRef->distance($thatPointRef); 166 | if ( $distance < $shortestDistance ) { $shortestDistance = $distance; } 
167 | } 168 | } 169 | ($shortestDistance < 1e10) || confess "Did not get any distance values"; 170 | return $shortestDistance; 171 | } 172 | 173 | sub isHOH { 174 | my $this = shift; 175 | my $residue; 176 | if ( @_ ) { $residue = shift; } else { $residue = $this->name(); } 177 | if ( $residue eq "HOH" ) { 178 | return 1; 179 | } 180 | return 0; 181 | } 182 | 183 | sub isAA { 184 | my $this = shift; 185 | my $residue; 186 | if ( @_ ) { $residue = shift; } else { $residue = $this->name(); } 187 | if ( not exists $this->{ACCEPTED}->{$residue} ) { 188 | return 0; 189 | } 190 | return 1; 191 | } 192 | 193 | 1; 194 | -------------------------------------------------------------------------------- /lib/TGI/Mutpro/Preprocess/Anno.pm: -------------------------------------------------------------------------------- 1 | package TGI::Mutpro::Preprocess::Anno; 2 | # 3 | #---------------------------------- 4 | # $Authors: Beifang Niu & Adam D Scott 5 | # $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $ 6 | # $Revision: 1 $ 7 | # $URL: $ 8 | # $Doc: $ add annotation information for pairs 9 | #---------------------------------- 10 | # 11 | 12 | use strict; 13 | use warnings; 14 | 15 | use Carp; 16 | use Cwd; 17 | use FileHandle; 18 | use Getopt::Long; 19 | 20 | use TGI::Data::CleanNumber; 21 | 22 | sub new { 23 | my $class = shift; 24 | my $this = {}; 25 | $this->{_OUTPUT_DIR} = getcwd; 26 | $this->{_STAT} = undef; 27 | bless $this, $class; 28 | $this->process(); 29 | return $this; 30 | } 31 | 32 | sub process { 33 | my $this = shift; 34 | $this->setOptions( ); 35 | #### processing #### 36 | # add ROI annotations after get pvalues 37 | ## do that after get pvalues 38 | my $entireFile = $this->getInputFile( ); 39 | my $proximityDir = $this->getProximityDir( ); 40 | my $pvaluesDir = $this->getPValuesDir( $proximityDir ); 41 | my ( $annotationFileDir , $annotationDir ) = $this->getAnnotationDirs( $proximityDir ); 42 | $this->makeAnnotations( $entireFile , 
sub new {
    # Construct the Anno step and run it immediately via process().
    my $class = shift;
    my $this = {
        _OUTPUT_DIR => getcwd(),
        _STAT       => undef,
    };
    bless $this, $class;
    $this->process();
    return $this;
}

sub process {
    # Add ROI annotations to the proximity files once p-values exist.
    # Reads options from @ARGV (see setOptions). Returns 0.
    my $this = shift;
    $this->setOptions();
    my $entireFile   = $this->getInputFile();
    my $proximityDir = $this->getProximityDir();
    my $pvaluesDir   = $this->getPValuesDir($proximityDir);
    my ( $annotationFileDir, $annotationDir ) = $this->getAnnotationDirs($proximityDir);
    $this->makeAnnotations( $entireFile, $annotationFileDir, $annotationDir, $pvaluesDir, $proximityDir );
    return 0;
}

sub makeAnnotations {
    # Input: ref to array of lines from hugo.uniprot.pdb.csv plus the
    #        directories used by getAnnotation() and addAnnotation()
    # First pass loads the annotations of every UniProt ID that has a PDB
    # structure; second pass writes the annotated proximity files.
    my ( $this, $entireFile, $annotationFileDir, $annotationDir, $pvaluesDir, $proximityDir ) = @_;
    my $annotations = {};
    foreach my $line ( @{$entireFile} ) {
        chomp $line;
        my ( undef, $uniprotId, $pdb ) = split /\t/, $line;
        # Lines without a UniProt or PDB column cannot be annotated
        next if ( !defined $uniprotId || !defined $pdb );
        # Only use Uniprot IDs with PDB structures
        next if ( $pdb eq "N/A" || $uniprotId !~ /\w+/ );
        $this->getAnnotation( $uniprotId, $annotationFileDir, $annotations );
    }
    foreach my $line ( @{$entireFile} ) {
        chomp($line);
        my ( undef, $uniprotId, $pdb ) = split /\t/, $line;
        # Every line is echoed, even the ones that are skipped afterwards
        print STDOUT ( defined $uniprotId ? $uniprotId : "" )."\n";
        next if ( !defined $uniprotId || !defined $pdb );
        next if ( $pdb eq "N/A" || $uniprotId !~ /\w+/ );
        $this->addAnnotation( $uniprotId, $pvaluesDir, $annotationDir, $annotations );
    }
    return;
}

sub setOptions {
    # Parse --output-dir / --help from @ARGV and validate the directory.
    # Dies with the help text on any problem.
    my $this = shift;
    my ( $help, $options );
    unless ( @ARGV ) { die $this->help_text(); }
    $options = GetOptions(
        'output-dir=s' => \$this->{_OUTPUT_DIR},
        'help'         => \$help,
    );
    if ( $help ) { warn $this->help_text(); exit 0; }
    unless ( $options ) { die $this->help_text(); }
    unless ( $this->{_OUTPUT_DIR} ) {
        warn 'HotSpot3D::Anno::setOptions error: You must provide a output directory!', "\n";
        die $this->help_text();
    }
    unless ( -e $this->{_OUTPUT_DIR} ) {
        warn 'HotSpot3D::Anno::setOptions error: output directory does not exist!', "\n";
        die $this->help_text();
    }
    return;
}

sub getPValuesDir {
    # Return: "<proximityDir>/pvalues"; dies if it is not a directory
    my ( $this, $proximityDir ) = @_;
    my $pvaluesDir = "$proximityDir\/pvalues";
    unless ( -d $pvaluesDir ) {
        warn "HotSpot3D::Anno::getPValuesDir error: You must provide a valid p_values annotation directory!\n";
        die $this->help_text();
    }
    return $pvaluesDir;
}

sub getProximityDir {
    # Return: "<output dir>/proximityFiles" (existence is not checked here)
    my $this = shift;
    return "$this->{_OUTPUT_DIR}\/proximityFiles";
}

sub getAnnotationDirs {
    # Return: ( "<proximityDir>/annotationFiles", "<proximityDir>/annotations" )
    # The first must already exist; the second is created when missing.
    my ( $this, $proximityDir ) = @_;
    my $annotationFileDir = "$proximityDir\/annotationFiles";
    my $annotationDir     = "$proximityDir\/annotations";
    unless ( -d $annotationFileDir ) {
        warn "HotSpot3D::Anno::getAnnotationDirs error: You must provide a valid annotation file directory!\n";
        die $this->help_text();
    }
    unless ( -e $annotationDir ) {
        mkdir( $annotationDir ) || die "HotSpot3D::Anno::getAnnotationDirs error: can not make annotations directory!\n";
    }
    return ( $annotationFileDir, $annotationDir );
}

sub getInputFile {
    # Return: ref to array with every line of <output dir>/hugo.uniprot.pdb.csv
    my $this = shift;
    my $fhuid = FileHandle->new();
    my $hugoUniprotf = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.csv";
    # Explicit read mode keeps the file name from being parsed for a mode
    unless ( $fhuid->open( $hugoUniprotf, "r" ) ) {
        die "HotSpot3D::Anno::getInputFile error: Could not open hugo uniprot id file!\n";
    }
    my @entireFile = $fhuid->getlines;
    $fhuid->close();
    return \@entireFile;
}
# get annotation information
sub getAnnotation {
    # Input: UniProt ID, directory holding "<id>.annotation.txt", and the
    #        hash ref that collects annotations
    # Fills $annotations->{<id>}->{<position>} with the feature description.
    # DISULFID features mark only their two end positions; every other
    # feature type marks each position from start to end.
    my ( $this, $uniprotId, $annotationFileDir, $annotations ) = @_;
    my $annotationFile = "$annotationFileDir\/$uniprotId\.annotation\.txt";
    my $fhano = FileHandle->new();
    unless ( $fhano->open( $annotationFile, "r" ) ) {
        die "HotSpot3D::Anno::getAnnotation error: Could not open annotation file for ".$uniprotId."!\n";
    }
    print STDOUT $uniprotId." HotSpot3D::Anno::getAnnotation - collecting annotations from ".$annotationFile."\n";
    while ( my $row = $fhano->getline ) {
        chomp($row);
        my ( $start, $end, $type, $anno ) = split /\t/, $row;
        # Skip the column header and any row without numeric coordinates;
        # feeding those to the range operator below would store junk keys
        next unless ( defined $start && defined $end );
        next unless ( $start =~ /^\d+$/ && $end =~ /^\d+$/ );
        $type = "" unless defined $type;
        $type =~ s/'//g;
        if ( defined $anno ) {
            $anno =~ s/^'//;
            $anno =~ s/'$//;
            # NOTE(review): drops the last character whatever it is;
            # presumably meant for a trailing period - confirm
            $anno =~ s/.$//;
        }
        if ( $type eq "DISULFID" ) {
            $annotations->{$uniprotId}->{$start} = $anno;
            $annotations->{$uniprotId}->{$end}   = $anno;
        } else {
            foreach my $position ( $start .. $end ) {
                $annotations->{$uniprotId}->{$position} = $anno;
            }
        }
    }
    $fhano->close();
    return;
}

# Add ROI annotation
sub addAnnotation {
    # Input: UniProt ID, p-values directory, output directory, and the
    #        annotations hash ref filled by getAnnotation()
    # Copies "<pvaluesDir>/<id>.ProximityFile.csv" to the output directory,
    # inserting a Feature column after each residue name. Rows that fail
    # the ID, chain or distance format checks are dropped.
    my ( $this, $uniprotId, $pvaluesDir, $annotationDir, $annotations ) = @_;
    my $proximityFile = "$pvaluesDir\/$uniprotId\.ProximityFile\.csv";
    if ( not -e $proximityFile ) {
        warn $uniprotId." HotSpot3D::Anno::addAnnotation warning: skipping because the annotation file does not exist: ".$proximityFile."\n";
        return;
    }
    my $outputFile = "$annotationDir\/$uniprotId\.ProximityFile\.csv";
    my $fhin = FileHandle->new();
    unless ( $fhin->open( $proximityFile, "r" ) ) {
        die "HotSpot3D::Anno::addAnnotation error: Could not open proximity file: ".$proximityFile."\n";
    }
    my $fhout = FileHandle->new();
    unless ( $fhout->open( $outputFile, "w" ) ) {
        die "HotSpot3D::Anno::addAnnotation error: Could not open the file for annotation: ".$outputFile."\n";
    }
    print STDOUT $uniprotId." HotSpot3D::Anno::addAnnotation - writing feature annotated file: ".$outputFile."\n";
    $fhout->print( "UniProt_ID1\tChain1\tPosition1\tOffset1\t" );
    $fhout->print( "Residue_Name1\tFeature1\t" );
    $fhout->print( "UniProt_ID2\tChain2\tPosition2\tOffset2\t" );
    $fhout->print( "Residue_Name2\tFeature2\t" );
    $fhout->print( "Distance\tPDB_ID\tP_Value\n" );
    while ( my $row = $fhin->getline ) {
        next if ( $row =~ /^WARNING:/ );
        next if ( $row =~ /UniProt_ID1/ );
        chomp($row);
        my @t = split /\t/, $row;
        # Pad short rows so the checks and the output below never touch an
        # undefined column (split also drops trailing empty fields)
        push @t, ("") x ( 13 - @t ) if ( @t < 13 );
        next if ( $t[0] !~ /^\w+$/ );
        next if ( $t[5] !~ /^\w+$/ );
        next if ( $t[1] !~ /^\[[A-Z]\]$/ );
        next if ( $t[6] !~ /^\[[A-Z]\]$/ );
        my $distance = $t[10];
        if ( $distance !~ /^-?\d+\.?\d*$/ ) {
            warn "HotSpot3D::Anno::addAnnotation warning: Wrong distance : $distance \n";
            next;
        }
        foreach my $column ( 2, 3, 7, 8 ) {
            $t[$column] = TGI::Data::CleanNumber::nullIsZero( $t[$column] );
        }
        # UniProt coordinate = position + offset
        my $uniprotCoorOneEnd = $t[2] + $t[3];
        my $uniprotCoorTwoEnd = $t[7] + $t[8];
        my ( $annoOneEnd, $annoTwoEnd ) = ( "N\/A", "N\/A" );
        # NOTE(review): both ends are looked up under $uniprotId, not under
        # the row's own UniProt_ID1/UniProt_ID2 columns - confirm intended
        if ( defined $annotations->{$uniprotId}->{$uniprotCoorOneEnd} ) {
            $annoOneEnd = $annotations->{$uniprotId}->{$uniprotCoorOneEnd};
        }
        if ( defined $annotations->{$uniprotId}->{$uniprotCoorTwoEnd} ) {
            $annoTwoEnd = $annotations->{$uniprotId}->{$uniprotCoorTwoEnd};
        }
        $fhout->print(
            join( "\t", @t[ 0 .. 4 ], $annoOneEnd, @t[ 5 .. 9 ], $annoTwoEnd, @t[ 10 .. 12 ] )."\n"
        );
    }
    $fhin->close();
    $fhout->close();
    return;
}
sub process {
    # Generate region-of-interest (ROI) annotation files for every UniProt
    # ID that has a PDB structure. Reads options from @ARGV. Returns 0.
    my $this = shift;
    $this->setOptions();
    my $annoDir       = $this->getOutputDir();
    my $fhuid         = $this->getInputFile();
    my $allUniprotIds = $this->getUniprotIds($fhuid);
    $this->makeROIannotations( $allUniprotIds, $annoDir );
    return 0;
}

sub makeROIannotations {
    # Input: hash ref keyed by UniProt ID, annotation output directory
    # Writes one annotation file per ID (sorted for a repeatable order).
    my ( $this, $allUniprotIds, $annoDir ) = @_;
    foreach my $uniprotId ( sort keys %{$allUniprotIds} ) {
        $this->makeROIannotationFile( $uniprotId, $annoDir );
    }
    return;
}

sub setOptions {
    # Parse --output-dir / --help from @ARGV and validate the directory.
    # Dies with the help text on any problem.
    my $this = shift;
    my ( $help, $options );
    unless ( @ARGV ) { die $this->help_text(); }
    $options = GetOptions(
        'output-dir=s' => \$this->{_OUTPUT_DIR},
        'help'         => \$help,
    );
    if ( $help ) { warn $this->help_text(); exit 0; }
    unless ( $options ) { die $this->help_text(); }
    unless ( $this->{_OUTPUT_DIR} ) {
        warn 'HotSpot3D::Calroi::setOptions error: You must provide a output directory!', "\n";
        die $this->help_text();
    }
    unless ( -e $this->{_OUTPUT_DIR} ) {
        warn 'HotSpot3D::Calroi::setOptions error: The output directory does not exist!', "\n";
        die $this->help_text();
    }
    return;
}

sub getInputFile {
    # Return: FileHandle opened for reading on
    #         <output dir>/hugo.uniprot.pdb.csv (the caller closes it)
    # The second parameter is accepted for compatibility but is not used.
    my ( $this, $annoDir ) = @_;
    my $fhuid = FileHandle->new();
    my $hugoUniprotFile = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.csv";
    unless ( $fhuid->open( $hugoUniprotFile, "r" ) ) {
        die "HotSpot3D::Calroi::getInputFile error: Could not open uniprot id file ".$hugoUniprotFile."!\n";
    }
    return $fhuid;
}

sub getOutputDir {
    # Return: "<output dir>/proximityFiles/annotationFiles", created if it
    #         does not exist yet (the proximityFiles directory must exist)
    my $this = shift;
    my $proDir  = "$this->{_OUTPUT_DIR}\/proximityFiles";
    my $annoDir = "$proDir\/annotationFiles";
    unless ( -e $annoDir ) {
        mkdir( $annoDir ) || die "HotSpot3D::Calroi::getOutputDir error: can not make ROI annotation directory!\n";
    }
    return $annoDir;
}

sub getUniprotIds {
    # Input:  open filehandle on hugo.uniprot.pdb.csv (closed here)
    # Return: hash ref { <UniProt ID> => 1 } for every line whose third
    #         whitespace-separated field is present and not "N/A".
    #         Always a hash ref, even when no line qualifies.
    my ( $this, $fhuid ) = @_;
    my $allUniprotIds = {};
    my @entireFile = <$fhuid>;
    $fhuid->close();
    foreach my $line (@entireFile) {
        chomp $line;
        my ( undef, $uniprotId, $pdb ) = split /\s+/, $line;
        # A line without a PDB column has no structure to annotate
        next if ( !defined $uniprotId || !defined $pdb );
        # Only use Uniprot IDs with PDB structures
        next if ( $pdb eq "N/A" || $uniprotId !~ /\w+/ );
        $allUniprotIds->{$uniprotId} = 1;
    }
    return $allUniprotIds;
}

sub makeROIannotationFile {
    # Input: UniProt ID, annotation output directory
    # Writes "<annoDir>/<id>.annotation.txt" with one tab-separated row
    # (start, end, feature type, description) per parsable domain entry.
    my ( $this, $uniprotId, $annoDir ) = @_;
    my $uniprotRef = TGI::Mutpro::Preprocess::Uniprot->new($uniprotId);
    defined($uniprotRef) || die "HotSpot3D::Calroi::makeROIannotationFile error: no object for '$uniprotId'";
    # Each entry is a string of the form "<key>\t(<start>, <stop>)\t<desc>"
    my $annotationRef = $uniprotRef->domainsAfterPosition(1);
    my $file = $annoDir."/".$uniprotId.".annotation.txt";
    my $fhoneuid = FileHandle->new();
    unless ( $fhoneuid->open( $file, "w" ) ) {
        die "HotSpot3D::Calroi::makeROIannotationFile error: Could not open annotation file to write for ".$uniprotId." at ".$file."!\n";
    }
    print STDOUT $uniprotId." HotSpot3D::Calroi - Making annotation file ".$file."\n";
    $fhoneuid->print( "Feature_Start\tFeature_End\tFeature_Type\tFeature_Description\n" );
    foreach my $annotation ( @{$annotationRef} ) {
        my ( $key, $start, $stop, $desc );
        # The greedy (.*) keeps any trailing period in the description.
        # Anno::getAnnotation removes the final character when it reads
        # these files back, so the pattern is left exactly as it was.
        if ( $annotation =~ /(\w+)\s+\((\d+)\,\s+(\d+)\)\s+(.*)\.?$/ ) {
            ( $key, $start, $stop, $desc ) = ( $1, $2, $3, $4 );
        } else {
            warn "HotSpot3D::Calroi::makeROIannotationFile warning: Could not parse domain description for '$uniprotId'\n";
            next;
        }
        if ( $start > $stop ) {
            warn "HotSpot3D::Calroi::makeROIannotationFile warning: Start ($start) > Stop ($stop) in '$uniprotId'\n";
            next;
        }
        $fhoneuid->print( join( "\t", ( $start, $stop, $key, $desc ) )."\n" );
    }
    $fhoneuid->close();
    return;
}
sub new {
    # Construct the DrugPort step and run it immediately via process().
    my $class = shift;
    my $this = {
        'output_file'  => 'drugport_results',
        'pdb_file_dir' => undef,
        'stat'         => undef,
        'page'         => "",
    };
    bless $this, $class;
    $this->process();
    return $this;
}

sub process {
    # Download the DrugPort drug list, then for each drug download its
    # record, collect the PDB chains listed as target / non-target /
    # unassigned, locate the first matching HETATM residue in each PDB
    # structure and write one tab-separated row per drug:
    #   name, id, het group, then for each of the three chain lists the
    #   "pdb|chain" list and the "pdb|chain|position|het" list ("NULL"
    #   when empty).
    # Reads --pdb-file-dir / --output-file / --help from @ARGV.
    my $this = shift;
    my ( $help, $options );
    unless ( @ARGV ) { die $this->help_text(); }
    $options = GetOptions(
        'pdb-file-dir=s' => \$this->{'pdb_file_dir'},
        'output-file=s'  => \$this->{'output_file'},
        'help'           => \$help,
    );
    if ( $help ) { print STDERR $this->help_text(); exit 0; }
    unless ( $options ) { die $this->help_text(); }
    if ( $this->{'output_file'} eq 'drugport_results' ) {
        warn "No output file given. Writing DrugPort results to $this->{'output_file'} !\n";
    }
    unless ( $this->{'pdb_file_dir'} and ( -e $this->{'pdb_file_dir'} ) ) {
        # Name the option explicitly; $_ is not set at this point
        warn " pdb_file_dir does not exist ! \n";
        die $this->help_text();
    }

    # Drug names and ids
    my %drug_hash = ();
    my $appdrugs_url = "http://www.ebi.ac.uk/thornton-srv/databases/drugport/data/appdrugs_pdb.dat";
    $this->{'page'} = get( $appdrugs_url );
    unless ( $this->{'page'} ) { die "can not access drugport database file $! \n"; }
    my ( $drug, $did );
    foreach my $record ( split /\n/, $this->{'page'} ) {
        my @fields = split / /, $record;
        if ( $record =~ /^GENERIC_NAME/ ) { $drug = $fields[1]; }
        if ( $record =~ /^DRUGNAME_ID/ ) {
            $did = $fields[1];
            $drug_hash{$did}{'name'} = $drug;
        }
    }

    my $fhout = FileHandle->new();
    unless ( $fhout->open( $this->{'output_file'}, "w" ) ) { die "Could not open output file to write !\n"; }

    foreach my $id ( sort keys %drug_hash ) {
        my $drug_name = defined $drug_hash{$id}{'name'} ? $drug_hash{$id}{'name'} : "";
        print STDOUT $drug_name."\t".$id."\n";
        my ($t2n) = $id =~ /(\d\d)$/;
        my $drugdata_url = "http://www.ebi.ac.uk/thornton-srv/databases/drugport/drugs/$t2n/$id/database.dat";
        $this->{'page'} = get( $drugdata_url );
        unless ( $this->{'page'} ) { die "can not access drug data file $! \n"; }

        # Collect the het group and the indexed target / unassigned fields
        my $het = "";
        my ( %ss, %unified );
        my @filter = grep /^HET_GROUP|^TARGET_PDB_ID|^TARGET_CHAIN_ID|^TARGET_DRUG_IN_PDB|^UNASSIGNED_PDB_ID|^UNASSIGNED_CHAIN_ID/, split /\n/, $this->{'page'};
        foreach my $entry (@filter) {
            my @t = split / /, $entry;
            if ( $entry =~ /^HET_GROUP/ ) {
                $het = defined $t[1] ? $t[1] : "";
            } elsif ( $entry =~ /^UNASSIGNED/ ) {
                my ( $name, $index ) = $t[0] =~ /(\w+_\w+_\w+)\[(\d+)\]/;
                next unless defined $index;
                $unified{$index}{$name} = $t[1];
            } else {
                next unless ( $t[0] =~ /(.*?)\[(\d+)\]\[(\d+)\]/ );
                my ( $name, $index ) = ( $1, $2."_".$3 );
                # Default only: must not overwrite a flag already read for
                # this index, whatever order the fields arrive in
                $ss{$index}{"TARGET_DRUG_IN_PDB"} = "NA"
                    unless exists $ss{$index}{"TARGET_DRUG_IN_PDB"};
                $ss{$index}{$name} = $t[1];
            }
        }

        my ( @target, @nottarget, @unsigned );
        foreach my $index ( sort keys %ss ) {
            my $site = $ss{$index};
            my $in_pdb = defined $site->{"TARGET_DRUG_IN_PDB"} ? $site->{"TARGET_DRUG_IN_PDB"} : "";
            my $has_chain = ( $site->{"TARGET_PDB_ID"} and $site->{"TARGET_CHAIN_ID"} );
            if ( $in_pdb eq "TRUE" ) {
                push @target, join( "\|",
                    map { defined $_ ? $_ : "" } $site->{"TARGET_PDB_ID"}, $site->{"TARGET_CHAIN_ID"} );
            } elsif ( $has_chain ) {
                push @nottarget, join( "\|", $site->{"TARGET_PDB_ID"}, $site->{"TARGET_CHAIN_ID"} );
            }
        }
        foreach my $index ( sort keys %unified ) {
            my $site = $unified{$index};
            if ( $site->{"UNASSIGNED_PDB_ID"} and $site->{"UNASSIGNED_CHAIN_ID"} ) {
                push @unsigned, join( "\|", $site->{"UNASSIGNED_PDB_ID"}, $site->{"UNASSIGNED_CHAIN_ID"} );
            }
        }

        my $het_code = $het;
        $het_code =~ s/ //g;
        my @row = ( $drug_name, $id, $het );
        foreach my $chains ( \@target, \@nottarget, \@unsigned ) {
            my @sites;
            foreach my $pair ( @{$chains} ) {
                my ( $pdb, $chain ) = $pair =~ /(\w+)\|(\w+)/;
                next unless ( defined $pdb && defined $chain );
                my $site = $this->_first_het_site( $pdb, $chain, $het_code );
                push @sites, $site if defined $site;
            }
            push @row, ( @{$chains} ? join( ",", @{$chains} ) : "NULL" );
            push @row, ( @sites ? join( ",", @sites ) : "NULL" );
        }
        $fhout->print( join( "\t", @row )."\n" );
    }
    $fhout->close();
    return 1;
}

# Return "pdb|chain|position|het" for the first HETATM record of the given
# structure whose chain and residue name match and whose position is not
# negative; undef (empty list) when there is none.
sub _first_het_site {
    my ( $this, $pdb, $chain, $het ) = @_;
    foreach my $record ( $this->_pdb_records($pdb) ) {
        next if ( $record !~ /^HETATM/ );
        next if ( length($record) < 26 );    # too short to hold the fields
        # Fixed-width fields read by 0-based character offset
        my $t_het   = substr( $record, 17, 3 );
        my $t_chain = substr( $record, 21, 1 );
        my $t_loc   = substr( $record, 22, 4 );
        $t_het =~ s/ //g;
        $t_loc =~ s/ //g;
        if ( ( $t_chain eq $chain ) and ( $t_het eq $het ) and ( $t_loc !~ /-/ ) ) {
            return join( "\|", $pdb, $chain, $t_loc, $t_het );
        }
    }
    return;
}

# Return the lines of a PDB entry: read from "<pdb_file_dir>/<ID>.pdb" when
# that file exists, otherwise downloaded. Empty list (with a warning) when
# neither source works.
sub _pdb_records {
    my ( $this, $pdb ) = @_;
    my $pdb_file_name = uc( $pdb ).".pdb";
    # Explicit separator: the directory option need not end in "/"
    my $t_pdb_f = $this->{'pdb_file_dir'}."/".$pdb_file_name;
    if ( -e $t_pdb_f ) {
        my $fhpdb = FileHandle->new();
        if ( $fhpdb->open( $t_pdb_f, "r" ) ) {
            my @records = $fhpdb->getlines;
            $fhpdb->close();
            chomp @records;
            return @records;
        }
        warn "can not access pdb file $! \n";
        return;
    }
    my $pdb_url = "http://www.rcsb.org/pdb/files/$pdb_file_name";
    $this->{'page'} = get( $pdb_url );
    if ( $this->{'page'} ) {
        return split /\n/, $this->{'page'};
    }
    warn "can not access pdb file $! \n";
    return;
}
sub process {
    # Find homologous PDB chains for UniProt entries that have no PDB
    # structure: download the PDB SEQRES file, BLAST each such UniProt
    # sequence against it and print the homologous regions found by
    # parse_blastp_output().
    # Reads --output-dir / --identity / --help from @ARGV.
    my $this = shift;
    my ( $help, $options );
    unless ( @ARGV ) { die $this->help_text(); }
    $options = GetOptions(
        'output-dir=s' => \$this->{_OUTPUT_DIR},
        'identity=f'   => \$this->{_IDENTITY},
        'help'         => \$help,
    );
    if ( $help ) { print STDERR $this->help_text(); exit 0; }
    unless ( $options ) { die $this->help_text(); }
    unless ( $this->{_OUTPUT_DIR} ) {
        warn 'You must provide a output directory ! ', "\n";
        die $this->help_text();
    }
    unless ( -e $this->{_OUTPUT_DIR} ) {
        warn 'output directory is not exist ! ', "\n";
        die $this->help_text();
    }
    my $pdbseqsDir    = "$this->{_OUTPUT_DIR}\/pdbsequences";
    my $UniprotIdFile = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.csv";
    my $pdbseqsFile   = "$pdbseqsDir\/pdb_seqres.txt";
    unless ( -e $pdbseqsDir ) { mkdir( $pdbseqsDir ) || die "can not make pdb sequences directory !\n"; }
    ## get pdbseqs file
    my $url = 'ftp://ftp.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt';
    getstore( $url, $pdbseqsFile );
    # Only BLAST reads the sequences, so check the file is usable instead
    # of loading all of it into memory
    unless ( -r $pdbseqsFile ) { die "Could not open pdb sequences file !\n"; }

    my $fhuid = FileHandle->new();
    unless ( $fhuid->open( $UniprotIdFile, "r" ) ) { die "Could not open uniprot id file !\n"; }
    my @entireFile = <$fhuid>;
    $fhuid->close();

    foreach my $line (@entireFile) {
        chomp $line;
        my ( undef, $uniprotId, $pdb ) = split /\s+/, $line;
        next if ( !defined $uniprotId || $uniprotId !~ /\w+/ );
        # "N/A" in the UniProt column means there is no sequence to look up
        next if ( $uniprotId eq "N/A" );
        # Only UniProt IDs WITHOUT a PDB structure need a homolog
        next unless ( defined $pdb && $pdb eq "N/A" );
        print STDOUT "*** UNIPROT ID: $uniprotId\n";
        my $uniprotRef = TGI::Mutpro::Preprocess::Uniprot->new($uniprotId);
        defined($uniprotRef) || die "no object for '$uniprotId'";
        my $uniprotSequence = $uniprotRef->sequence();

        my $tfh = FileHandle->new();
        unless ( $tfh->open( ".temp_one_pseq", "w" ) ) { die "Could not generate temp file !\n"; }
        print $tfh ">$uniprotId\n$uniprotSequence\n";
        $tfh->close();

        # List-form pipe: no shell, so the database path is passed verbatim
        my $t_blast_output = '';
        if ( open( my $blast, '-|', 'blastall', '-p', 'blastp', '-d', $pdbseqsFile, '-i', '.temp_one_pseq', '-e', '0.05' ) ) {
            local $/;
            my $captured = <$blast>;
            close($blast);
            $t_blast_output = $captured if defined $captured;
        } else {
            warn "Could not run blastall for '$uniprotId': $!\n";
            next;
        }
        $this->parse_blastp_output( $t_blast_output, $uniprotId, $this->{_IDENTITY} );
    }
    return;
}
#
# blastp parsing
sub parse_blastp_output {
    # Input:  blastp text output, UniProt ID of the query, identity cutoff
    #         as a fraction (0.3 = 30%)
    # Return: ref to array of "DBREF ..." lines, one per ungapped aligned
    #         stretch of the FIRST hit in the output, for every alignment
    #         of that hit whose identity reaches the cutoff.
    #         Each line is also printed to STDOUT.
    my ( $this, $blastp_output, $uniprotid, $iden_cutoff ) = @_;
    my ( @top, %homos, %header, @homoregions );

    # Keep the lines of the first hit only: from the first '>' line up to
    # (not including) the next one
    my $in_hit = 0;
    foreach my $line ( split /\n/, $blastp_output ) {
        next unless ( $line =~ /^>/ || $in_hit );
        last if ( $in_hit && $line =~ /^>/ );
        push @top, $line;
        $in_hit = 1;
    }
    return \@homoregions unless ( @top );

    my $index = 0;    # counts the alignments (HSPs) of the hit
    foreach my $line (@top) {
        if ( $line =~ /^>/ ) {
            my ( $pdb, $chain ) = $line =~ /^>(\w+)\_(\w) /;
            $header{'PDB'}   = uc( defined $pdb   ? $pdb   : "" );
            $header{'CHAIN'} = uc( defined $chain ? $chain : "" );
        }
        if ( $line =~ /Identities/ ) {
            $index++;
            my ($iden) = $line =~ /\((\d+)\%\)/;
            $homos{$index}{'IDEN'} = $iden;
        }
        if ( $line =~ /Query:/ ) {
            my ( $qstart, $qcont, $qend ) = $line =~ /Query:\s+(\d+)\s+(\S+)\s+(\d+)/;
            unless ( defined $homos{$index}{'QUESTART'} ) { $homos{$index}{'QUESTART'} = $qstart; }
            $homos{$index}{'QUECONT'} .= $qcont;
            $homos{$index}{'QUEEND'} = $qend;
        }
        if ( $line =~ /Sbjct:/ ) {
            my ( $sstart, $scont, $send ) = $line =~ /Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)/;
            unless ( defined $homos{$index}{'SUBSTART'} ) { $homos{$index}{'SUBSTART'} = $sstart; }
            $homos{$index}{'SUBCONT'} .= $scont;
            $homos{$index}{'SUBEND'} = $send;
        }
    }

    print STDOUT $header{'PDB'}."_".$header{'CHAIN'}."\n";
    foreach my $d ( sort { $a <=> $b } keys %homos ) {
        next if ( $homos{$d}{'IDEN'} < $iden_cutoff * 100 );
        print STDOUT "==".$d."\n";
        print STDOUT $homos{$d}{'IDEN'}."\n";
        print STDOUT $homos{$d}{'QUESTART'}." ".$homos{$d}{'QUECONT'}." ".$homos{$d}{'QUEEND'}."\n";
        print STDOUT $homos{$d}{'SUBSTART'}." ".$homos{$d}{'SUBCONT'}." ".$homos{$d}{'SUBEND'}."\n";

        my @taq = split //, $homos{$d}{'QUECONT'};
        my @tas = split //, $homos{$d}{'SUBCONT'};
        # $i / $j: sequence position of the current column in the query /
        # subject; $tqstart / $tsstart: where the current ungapped stretch
        # began; $open: true while inside a gap
        my ( $i, $j ) = ( $homos{$d}{'QUESTART'}, $homos{$d}{'SUBSTART'} );
        my ( $tqstart, $tsstart ) = ( $i, $j );
        my $open = 0;
        my $emit = sub {
            my $region = "DBREF\t$header{'PDB'}\t$header{'CHAIN'}\t$tsstart\t".( $j - 1 )."\tUNP\t$uniprotid\tB2MG_HUMAN\t$tqstart\t".( $i - 1 )."\n";
            push( @homoregions, $region );
            print STDOUT $region;
        };
        foreach my $k ( 0 .. $#taq ) {
            my $q = $taq[$k];
            my $s = defined $tas[$k] ? $tas[$k] : '';
            if ( $q ne '-' && $s ne '-' ) {
                # Aligned column: a preceding gap ends here
                if ($open) {
                    $tqstart = $i;
                    $tsstart = $j;
                    $open    = 0;
                }
                $i++;
                $j++;
            } else {
                # Gap column: close the stretch that ran up to here, then
                # advance only the sequence that has a residue. Exactly one
                # column is consumed per iteration.
                unless ($open) {
                    $emit->();
                    $open = 1;
                }
                if ( $q eq '-' ) { $j++; } else { $i++; }
            }
        }
        # Alignment ended inside an ungapped stretch: report it too
        $emit->() unless ($open);
    }
    return \@homoregions;
}
sub name {
    # Get/Set the chain name (single letter)
    my $self = shift;
    if (@_) { $self->{NAME} = shift; }
    return $self->{NAME};
}

sub addPointToAminoAcid {
    # Add a point to the AminoAcid object at the given position of this
    # peptide, creating the AminoAcid object first if there is none.
    # Input: 3-letter code for amino acid,
    #        position in this peptide,
    #        (x,y,z) coordinates
    my ( $self, $name, $position, $x, $y, $z ) = @_;
    if ( !defined $self->{AA}{$position} ) {
        my $aa = TGI::Mutpro::Preprocess::AminoAcid->new();
        $aa->name($name);
        $aa->position($position);
        $aa->chain( $self->{NAME} );
        # Stored as a ref to the object, as every reader here expects
        $self->{AA}{$position} = \$aa;
    }
    my $aaRef = $self->getAminoAcidObject($position);
    # Two different residue names at one position: mark it ambiguous so
    # that removeAmbiguousAminoAcids() can drop it later
    if ( defined $aaRef && $$aaRef->name() ne $name ) { $$aaRef->ambiguous(1); }
    return $$aaRef->addPoint( $x, $y, $z );
}

sub removeAmbiguousAminoAcids {
    # Delete every position that was marked ambiguous because more than one
    # amino acid name was seen for it, e.g. position 12 here:
    # ATOM 1 N APRO A 12 3.278 21.202 20.087 0.83 56.23 N
    # ATOM 8 N BSER A 12 3.302 21.148 20.087 0.17 56.57 N
    my $self = shift;
    foreach my $position ( keys %{ $self->{AA} } ) {
        my $aaRef = $self->getAminoAcidObject($position);
        if ( $$aaRef->ambiguous() ) { delete $self->{AA}{$position}; }
    }
    return;
}

sub addAminoAcid {
    # Input: amino acid residue number,
    #        ref to AminoAcid object (the object itself is accepted too)
    # The object is stored only if its isAA() accepts its own name.
    my ( $self, $position, $aaRef ) = @_;
    # ref() yields 'REF' for a reference to another reference, which is
    # how AminoAcid objects are passed around in this class
    my $aa = ( ref($aaRef) eq 'REF' ) ? $$aaRef : $aaRef;
    if ( $aa->isAA( $aa->name() ) ) {
        $self->{AA}{$position} = \$aa;
    }
    return;
}

sub getAminoAcidObject {
    # Input:  position number
    # Return: ref to AminoAcid object at that position, or undef
    my ( $self, $position ) = @_;
    if ( defined $self->{AA}{$position} ) {
        return $self->{AA}{$position};
    }
    return undef;    # explicit undef kept: callers may rely on one value
}

sub getAllAminoAcidObjects {
    # Return: ref to hash with key = residue number,
    #         value = ref to AminoAcid object
    my $self = shift;
    return \%{ $self->{AA} };
}

sub aminoAcidPositionNumbers {
    # Return: ref to array with the position numbers of all AminoAcid
    #         objects, sorted numerically
    my $self = shift;
    my @positions = sort { $a <=> $b } keys %{ $self->{AA} };
    return \@positions;
}
package TGI::Mutpro::Preprocess::Point;
#
#----------------------------------
# $Authors: Beifang Niu
# $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $
# $Revision: $
# $URL: $
# $Doc: $ do prioritization
#----------------------------------
#
use strict;
use Carp;
# Point with X, Y, Z coordinates
# Used to calculate distance between atoms in PDB structure
sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = {};
    $self->{XYZ} = [];    # array of values in order x,y,z
                          # Values rounded to nearest thousandth
    bless( $self, $class );
    return $self;
}

sub xyz {
    # Get/Set array that holds X,Y,Z values
    # With arguments: stores them (each rounded to the nearest thousandth),
    # replacing any coordinates set before. The caller's variables are not
    # modified.
    # Returns an array (empty if nothing has been set yet)
    my ( $self, @coords ) = @_;
    if (@coords) {
        $self->{XYZ} = [ map { $self->round($_) } @coords ];
    }
    return @{ $self->{XYZ} || [] };
}

sub distance {
    # Input: ref to a Point object
    # Return: distance between this point and input point rounded to nearest thousandths
    my ( $self, $pointRef ) = @_;
    my ( $aX, $aY, $aZ ) = $$pointRef->xyz();
    my ( $bX, $bY, $bZ ) = $self->xyz();
    my $distance = sqrt( ($aX-$bX)*($aX-$bX) + ($aY-$bY)*($aY-$bY) + ($aZ-$bZ)*($aZ-$bZ) );
    return $self->round($distance);
}

sub samePoint {
    # Input: ref to a Point object
    # Return: true if the point has same coordinates as this one, false if not
    # Coordinates are compared with 'eq' on their rounded three-decimal
    # text form rather than with '==', which sidesteps floating-point
    # representation differences between seemingly equal numbers.
    my ( $self, $pointRef ) = @_;
    my ( $aX, $aY, $aZ ) = $$pointRef->xyz();
    my ( $bX, $bY, $bZ ) = $self->xyz();
    return ( $aX eq $bX && $aY eq $bY && $aZ eq $bZ );
}

sub round {
    # Round off number to nearest 0.001 (halves round away from zero)
    # Return: the number as text with exactly three decimals when it can be
    #         written that way, otherwise the number itself
    my ( $self, $num ) = @_;
    # Work on the magnitude so negative values round symmetrically:
    # adding 0.0005 to a negative number and cutting after three decimals
    # would move it toward zero (-3.278 would become -3.277)
    my $magnitude = abs($num) + 0.0005;
    if ( $magnitude =~ /(\d+\.\d{3})/ ) { $magnitude = $1; }
    # No sign on a result that rounded to zero
    return ( $num < 0 && $magnitude != 0 ) ? "-".$magnitude : $magnitude;
}

1;
sub new {
    # Construct the Prep step with its default settings and run it
    # immediately via process().
    my $class = shift;
    my $this = {
        'output_dir'           => getcwd(),
        'max_3d_dis'           => 100,
        'min_seq_dis'          => 0,
        'hold'                 => 0,
        'status'               => undef,
        'pdb_file_dir'         => undef,
        'genes'                => undef,
        'cmd_list_submit_file' => "cmd_list_submit_file",
    };
    bless $this, $class;
    $this->process();
    return $this;
}

sub process {
    # Rebuild <output dir>/hugo.uniprot.pdb.csv (HUGO gene, UniProt ID,
    # PDB IDs, aliases) and write one "bsub ... hotspot3d calpro" command
    # per UniProt ID that has no proximity file yet or has a PDB structure
    # not listed in the previous csv. Commands are written to the command
    # list file and, unless --hold is given, also executed.
    # Reads its options from @ARGV. Returns 1.
    my $this = shift;
    my ( $help, $options );
    unless (@ARGV) { die $this->help_text(); }
    $options = GetOptions(
        '3d-distance-cutoff=i'     => \$this->{'max_3d_dis'},
        'linear-distance-cutoff=i' => \$this->{'min_seq_dis'},
        'output-dir=s'             => \$this->{'output_dir'},
        'pdb-file-dir=s'           => \$this->{'pdb_file_dir'},
        'gene-file=s'              => \$this->{'genes'},
        'cmd-list-submit-file=s'   => \$this->{'cmd_list_submit_file'},
        'hold'                     => \$this->{'hold'},
        'help'                     => \$help,
    );
    if ($help) { print STDERR $this->help_text(); exit 0; }
    unless ($options) { die $this->help_text(); }
    foreach my $required ( qw( output_dir pdb_file_dir ) ) {
        unless ( $this->{$required} and ( -e $this->{$required} ) ) {
            warn " $required does not exist ! \n";
            die $this->help_text();
        }
    }

    my $pro_dir    = "$this->{'output_dir'}\/proximityFiles";
    my $inpro_dir  = "$pro_dir\/inProgress";
    my $pdbcor_dir = "$pro_dir\/pdbCoordinateFiles";
    my $log_file   = "$this->{'output_dir'}\/hugo.uniprot.pdb.csv";
    my $log_dir    = "$this->{'output_dir'}\/Logs\/";
    unless ( -e $pro_dir ) { mkdir($pro_dir) || die "HotSpot3D Prep Error: can not make $pro_dir !\n"; }
    unless ( -e $log_file ) {
        $this->_touch($log_file) || die "HotSpot3D Prep Error: can not make hugo uniprot file !\n";
    }
    unless ( -e $inpro_dir )  { mkdir($inpro_dir)  || die "HotSpot3D Prep Error: can not make $inpro_dir !\n"; }
    unless ( -e $pdbcor_dir ) { mkdir($pdbcor_dir) || die "HotSpot3D Prep Error: can not make $pdbcor_dir !\n"; }
    unless ( -e $log_dir )    { mkdir($log_dir)    || die "HotSpot3D Prep Error: can not make $log_dir !\n"; }

    # Snapshot what is already known BEFORE the csv is truncated below
    my %uniprotid_toupdate;
    my $uniprot_to_structureref = $this->current_structures($log_file);
    my $uniprot_fileref         = $this->currentuniprot_files($pro_dir);
    my $fh = FileHandle->new();
    unless ( $fh->open( $log_file, "w" ) ) { die "Could not open hugo uniprot file !\n"; }
    print STDOUT "Creating ".$log_file."\n";

    # Optional gene filter: first tab-separated column of each line
    my %list;
    if ( $this->{'genes'} ) {
        my $genesFH = FileHandle->new();
        unless ( $genesFH->open( $this->{'genes'}, "r" ) ) {
            die "HotSpot3D Prep Error: Could not open file with genes (".$this->{'genes'}.")";
        }
        foreach my $gene_line ( $genesFH->getlines ) {
            chomp $gene_line;
            my @fields = split( "\t", $gene_line );
            next unless ( defined $fields[0] );    # blank line
            $list{ $fields[0] } = 1;
        }
        $genesFH->close();
    }

    my $hugogene_ref = TGI::Mutpro::Preprocess::HugoGeneMethods::makeHugoGeneObjects();
    foreach my $hugo_id ( sort keys %{$hugogene_ref} ) {
        if ( scalar keys %list > 0 ) {
            next unless ( exists $list{$hugo_id} );
        }
        print STDOUT 'HUGO: ', "$hugo_id\n";
        my $gene         = $hugogene_ref->{$hugo_id};
        my $alias_ref    = $gene->getAllAliases();
        my $previous_ref = $gene->getAllPreviousSymbols();
        my $alias_list   = join( "", map { "$_ " } ( keys %{$alias_ref}, keys %{$previous_ref} ) );
        if ( $alias_list !~ /\w+/ ) { $alias_list = "N/A"; }
        my $uniprot_id = $gene->uniprot();
        if ( !defined $uniprot_id ) {
            $fh->print("$hugo_id\tN/A\tN/A\t$alias_list\n");
            next;
        }
        my $uniprot_ref = TGI::Mutpro::Preprocess::Uniprot->new($uniprot_id);
        my $pdb_ref     = $uniprot_ref->annotations("PDB");
        if ( !defined $pdb_ref || scalar( @{$pdb_ref} ) == 0 ) {
            $fh->print("$hugo_id\t$uniprot_id\tN/A\t$alias_list\n");
            next;
        }
        # No proximity file on disk yet for this UniProt ID
        if ( !defined $uniprot_fileref->{$uniprot_id} ) {
            $uniprotid_toupdate{$uniprot_id} = 1;
        }
        $fh->print("$hugo_id\t$uniprot_id\t");
        foreach my $pdb_entry ( @{$pdb_ref} ) {
            my ($pdb_id) = $pdb_entry =~ /^(\w+)\;/;
            next unless ( defined $pdb_id );
            # A structure that the previous csv did not list
            if ( !defined $uniprot_to_structureref->{$uniprot_id}{$pdb_id} ) {
                $uniprotid_toupdate{$uniprot_id} = 1;
            }
            $fh->print("$pdb_id ");
        }
        $fh->print("\t$alias_list\n");
    }
    $fh->close();

    my $cmd_list_submit_file_fh;
    unless ( open( $cmd_list_submit_file_fh, ">", $this->{'cmd_list_submit_file'} ) ) {
        die "HotSpot3D Prep Error: Could not open cmd file (".$this->{'cmd_list_submit_file'}.")";
    }
    print STDOUT "Creating ".$this->{'cmd_list_submit_file'}."\n";
    foreach my $uniprot_id ( sort keys %uniprotid_toupdate ) {
        # Marker file; failure to create it does not stop the submission
        $this->_touch("$inpro_dir/$uniprot_id.ProximityFile.csv");
        my $bsub = "bsub -oo ".$log_dir.$uniprot_id.".err.log -R 'select[type==LINUX64 && mem>16000] rusage[mem=16000]' -M 16000000";
        my $update_program = " 'hotspot3d calpro";
        my $programOptions = " --output-dir=".$this->{'output_dir'}." --pdb-file-dir=".$this->{'pdb_file_dir'}." --uniprot-id=".$uniprot_id." --3d-distance-cutoff=".$this->{'max_3d_dis'}." --linear-cutoff=".$this->{'min_seq_dis'}."'";
        my $submit_cmd = $bsub.$update_program.$programOptions;
        print STDOUT $submit_cmd."\n";
        $cmd_list_submit_file_fh->print( $submit_cmd."\n" );
        if ( not $this->{'hold'} ) {
            # Deliberately a shell command line: it carries quoted arguments
            system($submit_cmd);
        }
    }
    $cmd_list_submit_file_fh->close();

    return 1;
}

# Create the file if it is missing and set its timestamps to now, without
# going through a shell. Returns 1 on success, 0 if it cannot be opened.
sub _touch {
    my ( $this, $path ) = @_;
    my $fh = FileHandle->new();
    $fh->open( $path, "a" ) or return 0;
    $fh->close();
    utime( undef, undef, $path );
    return 1;
}

# Check which Uniprot files are on disk
# Return ref to hash with
# key = Uniprot Id
sub currentuniprot_files {
    my ( $this, $dir ) = @_;
    my %uniprot_ids;
    opendir( my $dh, $dir ) || die "Could not open '$dir': $!";
    foreach my $entry ( readdir $dh ) {
        if ( $entry =~ /(\w+)\.ProximityFile\.csv/ ) {
            $uniprot_ids{$1} = 1;
        }
    }
    closedir $dh;

    return \%uniprot_ids;
}
$uniprot_tostructure{$uniprot_id}{$_} = 1; 190 | } split /s+/, $pdb_list; 191 | } 192 | } $fh->getlines; 193 | $fh->close(); 194 | 195 | return \%uniprot_tostructure; 196 | } 197 | 198 | sub help_text{ 199 | my $this = shift; 200 | return < peptides), default: 0 212 | --cmd-list-submit-file Batch jobs file to run calpro step in parallel, default: cmd_list_submit_file 213 | --hold Do not submit batch jobs, just write cmd_list_submit_file, default: submits (takes no input) 214 | 215 | --help this message 216 | 217 | HELP 218 | 219 | } 220 | 221 | 1; 222 | 223 | __END__ 224 | 225 | =head1 NAME 226 | 227 | TGI::Mutpro::Preprocess::Prep - Create & update proximity files. 228 | 229 | =head1 SYNOPSIS 230 | 231 | use TGI::Mutpro::Preprocess::Prep; 232 | 233 | =head1 DESCRIPTION 234 | 235 | TGI::Mutpro::Preprocess::Prep is to be used to create & update proximity files. 236 | It is the first step of preprocessing procedure. 237 | 238 | 239 | =head1 AUTHOR 240 | 241 | Beifang Niu Ebeifang.cn@gmail.comE 242 | 243 | =head1 SEE ALSO 244 | 245 | https://github.com/ding-lab/hotspot3d 246 | 247 | =head1 LICENSE 248 | 249 | This library is free software with MIT licence; you can redistribute it and/or modify 250 | it under the same terms as Perl itself. 

=cut

--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Preprocess/Prior.pm:
--------------------------------------------------------------------------------
package TGI::Mutpro::Preprocess::Prior;
#
#----------------------------------
# $Authors: Beifang Niu and Adam D Scott
# $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $
# $Revision: 2 $
# $URL: $
# $Doc: $ do prioritization
#----------------------------------
#
use strict;
use warnings;

use Carp;
use Getopt::Long;
use IO::File;
use FileHandle;

# Build the object with default cutoffs and immediately run the
# prioritization (process() parses @ARGV).
sub new {
    my $class = shift;
    my $this = {};
    $this->{_OUTPUT_DIR} = undef;
    $this->{_PVALUE_CUTOFF} = 0.05;
    $this->{_3D_CUTOFF} = 20;
    $this->{_1D_CUTOFF} = 0;
    $this->{_STAT} = undef;
    bless $this, $class;
    $this->process();
    return $this;
}

# For every UniProt ID listed in hugo.uniprot.pdb.transcript.csv that has a
# COSMIC-annotated proximity file, write the filtered (prioritized) pairs.
sub process {
    my $this = shift;
    $this->setOptions();
    #### processing ####
    # do prioritization
    my ( $fhunipro , $proximityDir , $cosmicDir , $prioritizationDir ) = $this->getInputs();
    while ( my $line = <$fhunipro> ) {
        chomp $line;
        my ( undef, $uniprotId, ) = split /\t/, $line;
        # Only use lines that carry a Uniprot ID
        next if ( not defined $uniprotId or $uniprotId !~ /\w+/ );
        # COSMIC-annotated proximity file
        my $cosmicFile = "$cosmicDir\/$uniprotId\.ProximityFile\.csv";
        next unless( -e $cosmicFile );
        my $outputFile = "$prioritizationDir\/$uniprotId\.ProximityFile\.csv";
        $this->doPrior( $cosmicFile , $outputFile , $uniprotId );
    }
    $fhunipro->close();
    return 0;
}

# Parse the command line into the object; dies with the usage text when the
# options are invalid or the output directory is missing.
sub setOptions {
    my ( $this ) = @_;
    my ( $help, $options );
    unless( @ARGV ) { die $this->help_text(); }
    $options = GetOptions (
        'output-dir=s' => \$this->{_OUTPUT_DIR},
        'p-value-cutoff=f' => \$this->{_PVALUE_CUTOFF},
        '3d-distance-cutoff=i' => \$this->{_3D_CUTOFF},
        'linear-cutoff=i' => \$this->{_1D_CUTOFF},
        'help' => \$help,
    );
    if ( $help ) { warn $this->help_text(); exit 0; }
    unless( $options ) { die $this->help_text(); }
    unless( $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D::Prior::setOptions error: You must provide output directory!', "\n"; die $this->help_text(); }
    unless( -d $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D::Prior::setOptions error: You must provide a valid output directory!', "\n"; die $this->help_text(); }
    return;
}

# Derive the input/output locations from the output directory, create the
# prioritization directory when needed, and open the UniProt ID file.
# Returns ( open file handle , proximity dir , COSMIC dir , prioritization dir ).
sub getInputs {
    my ( $this ) = @_;
    my $UniprotIdFile = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.transcript.csv";
    my $proximityDir = "$this->{_OUTPUT_DIR}\/proximityFiles";
    my $cosmicDir = "$proximityDir\/cosmicanno";
    my $prioritizationDir = "$this->{_OUTPUT_DIR}\/prioritization";
    unless( -d $cosmicDir ) { warn "HotSpot3D::Prior::getInputs error: You must provide a valid COSMIC annotations directory!\n"; die $this->help_text(); }
    unless( -e $prioritizationDir ) { mkdir($prioritizationDir) || die "HotSpot3D::Prior::getInputs error: can not make prioritization result files directory\n"; }
    my $fhunipro = FileHandle->new;
    unless( $fhunipro->open( $UniprotIdFile , "r" ) ) { die "HotSpot3D::Prior::getInputs error: Could not open Uniprot ID file!\n" };
    return ( $fhunipro , $proximityDir , $cosmicDir , $prioritizationDir );
}

# prioritization based on
# COSMIC annotation results
sub doPrior {
    my ( $this , $proximityFile , $outputFile , $uniprotId ) = @_;
    my $outputContent = $this->getProximityInformation( $uniprotId , $proximityFile );
    $this->writeOutput( $uniprotId , $outputFile , $outputContent );
    return;
}

# Read one COSMIC-annotated proximity file and collect the residue pairs that
# pass the linear-distance, p-value and 3D-distance cutoffs.
# Returns a hash ref: {first end}{second end}{"distance pdb p_value"} = 1,
# where a pair seen in both orientations is stored under the orientation that
# was seen first.
sub getProximityInformation {
    my ( $this , $uniprotId , $proximityFile ) = @_;
    # read COSMIC annotation information
    my $fhproximity = FileHandle->new;
    unless( $fhproximity->open( $proximityFile , "r" ) ) { die "HotSpot3D::Prior::getProximityInformation error: Could not open COSMIC annotated proximity file!\n" };
    print STDOUT $uniprotId." HotSpot3D::Prior::getProximityInformation - collecting proximity data from: ".$proximityFile."\n";
    my $pValueCutoff = $this->{_PVALUE_CUTOFF};
    my $spatialCutoff = $this->{_3D_CUTOFF};
    my $linearCutoff = $this->{_1D_CUTOFF};
    # hash for filtering same pairs
    # but keep distances and P_values
    my %outputContent;
    while ( my $line = <$fhproximity> ) {
        if ($line =~ /^WARNING:/) {
            warn "HotSpot3D::Prior::doPrior warning: no chains were found for a structure in ".$proximityFile."\n";
            next;
        }
        next if ($line =~ /UniProt_ID1/);
        chomp($line);
        my @fields = split /\t/, $line;
        # Columns 0..16 are read below, so anything shorter is malformed and is
        # skipped (it used to be reported but processed anyway).
        if ( scalar @fields < 17 ) {
            warn "HotSpot3D::Prior::doPrior warning: bad line in ".$proximityFile.": ".$line."\n";
            next;
        }
        # Positions and offsets of both ends must be numeric and not N/A.
        my @coordinateColumns = ( 2 , 3 , 9 , 10 );
        next if ( grep { $fields[$_] =~ /N\/A/ } @coordinateColumns );
        next if ( grep { $fields[$_] !~ /\d+/ } @coordinateColumns );
        my $oneEndContent = join("\t", @fields[0..6]);
        my $twoEndContent = join("\t", @fields[7..13]);
        my $proInfo = join(" ", @fields[14..16]);
        my $uniprotCoorOneEnd = $fields[2] + $fields[3];
        my $uniprotCoorTwoEnd = $fields[9] + $fields[10];
        my $linearDistance = abs($uniprotCoorOneEnd - $uniprotCoorTwoEnd);
        next if ( ( $fields[0] eq $fields[7] ) and
                  ( $linearDistance <= $linearCutoff) );
        next if ( $fields[16] > $pValueCutoff );
        next if ( $fields[14] > $spatialCutoff );
        # load infor into %outputContent hash; reuse the reversed orientation
        # only when it already exists and the forward one does not
        # (checked with exists so that no empty entries are autovivified)
        my $forwardSeen = ( exists $outputContent{$oneEndContent}
                            and exists $outputContent{$oneEndContent}{$twoEndContent} );
        my $reverseSeen = ( exists $outputContent{$twoEndContent}
                            and exists $outputContent{$twoEndContent}{$oneEndContent} );
        if ( $reverseSeen and not $forwardSeen ) {
            $outputContent{$twoEndContent}{$oneEndContent}{$proInfo} = 1;
        } else {
            $outputContent{$oneEndContent}{$twoEndContent}{$proInfo} = 1;
        }
    }
    $fhproximity->close();
    return \%outputContent;
}

# Write the collected pairs: one line per pair, followed by every
# "distance pdb p_value" entry of that pair, each terminated by "|".
sub writeOutput {
    my ( $this , $uniprotId , $outputFile , $outputContent ) = @_;
    my $fhout = FileHandle->new;
    unless( $fhout->open( $outputFile , "w" ) ) { die "HotSpot3D::Prior::writeOutput error: Could not open prioritization output file to write: ".$outputFile."\n" };
    print STDOUT $uniprotId." HotSpot3D::Prior::writeOutput - writing prioritizations to file: ".$outputFile."\n";
    # write prioritization result into file
    $fhout->print( "UniProt_ID1\tChain1\tPosition1\tOffset1\t" );
    $fhout->print( "Residue_Name1\tFeature1\tCOSMIC1\t" );
    $fhout->print( "UniProt_ID2\tChain2\tPosition2\tOffset2\t" );
    $fhout->print( "Residue_Name2\tFeature2\tCOSMIC2\t" );
    $fhout->print( "Distance\tPDB_ID\tP_Value\n" );
    # Keys are sorted so that the file is reproducible between runs.
    foreach my $mutation1 ( sort keys %{$outputContent} ) {
        foreach my $mutation2 ( sort keys %{$outputContent->{$mutation1}} ) {
            my @contents = sort keys %{$outputContent->{$mutation1}->{$mutation2}};
            $fhout->print( $mutation1."\t".$mutation2."\t".join( "" , map { $_."|" } @contents )."\n" );
        }
    }
    $fhout->close();
    return;
}

# Usage message.
# NOTE(review): everything above the --linear-cutoff description was lost when
# this text was extracted; it is rebuilt here from the options and defaults
# declared in new()/setOptions() and the filters in getProximityInformation().
# Confirm the wording against the released module.
sub help_text{
    my $this = shift;
        return <<HELP

Usage: hotspot3d prior [options]

                             REQUIRED
--output-dir                 Output directory of proximity files

                             OPTIONAL
--p-value-cutoff             P-value cutoff (<=), default is 0.05
--3d-distance-cutoff         3D distance cutoff (<=), default is 20
--linear-cutoff              Linear distance cutoff (> peptides), default is 0

--help                       this message

HELP

}

1;

--------------------------------------------------------------------------------
/lib/TGI/Mutpro/Preprocess/Statis.pm:
--------------------------------------------------------------------------------
package TGI::Mutpro::Preprocess::Statis;
#
#----------------------------------
# $Authors: Beifang Niu & Adam D Scott
# $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $
# $Revision: 1 $
# $URL: $
# $Doc: $ statistics-related information
#----------------------------------
#
use strict;
use warnings;

use Carp;
use Cwd;
use Getopt::Long;
use LWP::Simple;
use IO::File;
use FileHandle;

# Build the object (output directory defaults to the current directory) and
# immediately run the p-value calculation (process() parses @ARGV).
sub new {
    my $class = shift;
    my $this = {};
    $this->{_OUTPUT_DIR} = getcwd;
    $this->{_STAT} = undef;
    bless $this, $class;
    $this->process();
    return $this;
}

# Annotate every proximity file listed in hugo.uniprot.pdb.csv with p-values.
sub process {
    my $this = shift;
    $this->setOptions();
    #### processing ####
    # pvalue calculation program
    my $fh = $this->getInputFile( );
    my $proDir = $this->getInputDir( );
    my $pvaluesDir = $this->getOutputDir( $proDir );
    $this->calculatePValues( $pvaluesDir , $fh , $proDir );
    return 0;
}

# Parse the command line; dies with the usage text on invalid options or a
# missing output directory.
sub setOptions {
    my $this = shift;
    my ( $help, $options );
    unless( @ARGV ) { die $this->help_text(); };
    $options = GetOptions (
        'output-dir=s' => \$this->{_OUTPUT_DIR},
        'help' => \$help,
    );
    if ( $help ) { warn $this->help_text(); exit 0; };
    unless( $options ) { die $this->help_text(); };
    unless( $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D::Statis::setOptions error: You must provide a output directory!', "\n"; die $this->help_text(); };
    unless( -e $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D::Statis::setOptions error: The output directory does not exist!', "\n"; die $this->help_text(); };
    return;
}

# Return the pvalues directory inside $proDir, creating it when missing.
sub getOutputDir {
    my ( $this , $proDir ) = @_;
    my $pvaluesDir = "$proDir/pvalues";
    unless( -e $pvaluesDir ) { mkdir($pvaluesDir) || die "HotSpot3D::Statis::getOutputDir error: can not make pvalues directory!\n"; };
    return $pvaluesDir;
}

# Open hugo.uniprot.pdb.csv from the output directory and return the handle.
sub getInputFile {
    my $this = shift;
    my $hugoUniprot = "$this->{_OUTPUT_DIR}/hugo.uniprot.pdb.csv";
    unless( -e $hugoUniprot ) { die "HotSpot3D::Statis::getInputFile error: no hugo uniprot file!\n"; };
    my $fh = FileHandle->new;
    unless( $fh->open( $hugoUniprot , "r" ) ) { die "HotSpot3D::Statis::getInputFile error: Could not open hugo uniprot file!\n" };
    return $fh;
}

# Return the proximityFiles directory; dies when it does not exist.
sub getInputDir {
    my $this = shift;
    my $proDir = "$this->{_OUTPUT_DIR}/proximityFiles";
    unless( -e $proDir ) { die "HotSpot3D::Statis::getInputDir error: no proximity file directory!\n"; };
    return $proDir;
}

# Read the whole hugo/uniprot/pdb table from $fh (which is closed here) and
# process every UniProt ID that has PDB structures.
sub calculatePValues {
    my ( $this , $pvaluesDir , $fh , $proDir ) = @_;
    my @entireFile = <$fh>;
    $fh->close();
    foreach my $line (@entireFile) {
        chomp $line;
        my ( undef, $uniprotId, $pdb ) = split /\t/, $line;
        next unless ( defined $uniprotId and defined $pdb );
        # Only use Uniprot IDs with PDB structures
        next if ( $pdb eq "N/A" || $uniprotId !~ /\w+/ );
        $this->calculatePValuesOfProtein( $uniprotId , $pvaluesDir , $proDir );
    }
    return;
}

# Annotate the proximity file of one UniProt ID; an output file that ends up
# without data lines is removed again.
sub calculatePValuesOfProtein {
    my ( $this , $uniprotId , $pvaluesDir , $proDir ) = @_;
    # proximity file
    my $proximityFile = "$proDir\/$uniprotId\.ProximityFile\.csv";
    # Nothing to do for this protein. (This used to be "next", which only
    # worked by jumping out of the sub into the caller's loop.)
    return unless(-e $proximityFile);
    my $outputFile = "$pvaluesDir\/$uniprotId\.ProximityFile\.csv";
    # p_value calculating
    my $numberlines = $this->getPvalue( $uniprotId , $proximityFile , $outputFile );
    #delete file if null
    if ($numberlines == 0) { unlink( $outputFile ) or warn "HotSpot3D::Statis::calculatePValuesOfProtein warning: failed to delete $outputFile: $!"; }
    return;
}

# True when the split proximity line @$fieldsRef passes the format checks used
# by both passes of getPvalue (IDs are word characters, chains look like "[A]").
sub _isUsableProximityLine {
    my ( $fieldsRef ) = @_;
    return 0 if ( scalar @{$fieldsRef} < 11 );
    return 0 if ( $fieldsRef->[0] !~ /^\w+$/ );
    return 0 if ( $fieldsRef->[5] !~ /^\w+$/ );
    return 0 if ( $fieldsRef->[1] !~ /^\[[A-Z]\]$/ );
    return 0 if ( $fieldsRef->[6] !~ /^\[[A-Z]\]$/ );
    return 1;
}

# pvalue calculating
# Pass 1 collects the distinct distances (column 11) of $proximityfile; the
# p-value of a distance is the number of distinct smaller distances divided by
# the number of distinct distances. Pass 2 copies every usable line to
# $outputf with that p-value appended. Returns the number of lines written.
sub getPvalue {
    my ( $this , $uniprotId , $proximityfile , $outputf ) = @_;
    my $fh = FileHandle->new;
    unless( $fh->open( $proximityfile , "r" ) ) { die "HotSpot3D::Statis::getPvalue error: Could not open hugo uniprot file '$proximityfile'!\n" };
    my %distances;
    # get distances list
    while ( my $line = <$fh> ) {
        next if ($line =~ /^WARNING:/);
        next if ($line =~ /UniProt_ID1/);
        chomp($line);
        my @t = split /\t/, $line;
        next unless ( _isUsableProximityLine( \@t ) );
        my $distance = $t[10];
        if ( $distance !~ /^-?\d+\.?\d*$/ ) {
            warn "Wrong distance : $distance \n";
            next;
        }
        $distances{$distance} = 1;
    }
    $fh->close();
    # sort and calculate p_value
    my %pvalues;
    my @sorted = sort { $a <=> $b } keys %distances;
    my $total = scalar( @sorted );
    foreach my $rank ( 0 .. $#sorted ) {
        $pvalues{$sorted[$rank]} = $rank / $total;
    }

    $fh = FileHandle->new;
    unless( $fh->open( $proximityfile , "r" ) ) { die "HotSpot3D::Statis::getPvalue error: Could not open hugo uniprot file '$proximityfile'!\n" };
    my $fho = FileHandle->new;
    unless( $fho->open( $outputf , "w" ) ) { die "HotSpot3D::Statis::getPvalue error: Could not open file '$outputf' to write!\n" };
    print STDOUT $uniprotId." HotSpot3D::Statis::getPvalue - making p-value annotated output: ".$outputf."\n";
    my $numberlines = 0;
    # load p_values
    $fho->print( "UniProt_ID1\tChain1\tPosition1\tOffset1\tResidue_Name1\t" );
    $fho->print( "UniProt_ID2\tChain2\tPosition2\tOffset2\tResidue_Name2\t" );
    $fho->print( "Distance\tPDB_ID\tP_Value\n" );
    while ( my $line = <$fh> ) {
        next if ($line =~ /^WARNING:/);
        next if ($line =~ /UniProt_ID1/);
        chomp($line);
        my @t = split /\t/, $line;
        next unless ( _isUsableProximityLine( \@t ) );
        my $distance = $t[10];
        if ( $distance !~ /^-?\d+\.?\d*$/ ) {
            warn "Wrong distance : $distance \n";
            next;
        }
        my $rounded = sprintf( "%.6f", $pvalues{$distance} );
        $fho->print( $line."\t".$rounded."\n" );
        $numberlines++;
    }
    $fh->close();
    $fho->close();
    return $numberlines;
}

# Usage message.
# NOTE(review): the body of this usage text was lost when the file was
# extracted; it is rebuilt here from the options declared in setOptions().
# Confirm the wording against the released module.
sub help_text{
    my $this = shift;
        return <<HELP

Usage: hotspot3d statis [options]

                             REQUIRED
--output-dir                 Output directory of proximity files

--help                       this message

HELP

}

1;

--------------------------------------------------------------------------------
/lib/TGI/ProteinVariant.pm:
--------------------------------------------------------------------------------
package TGI::ProteinVariant;
# NOTE(review): the header of this file and the opening lines of new() were
# lost when the file was extracted; only the minimum needed to make the
# surviving code load is restored here. Confirm against the released module.
use strict;
use warnings;

# Create an empty protein variant (transcript and amino acid change are "").
sub new {
    my $class = shift;
    my $this = {};
    $this->{'transcript'} = "";
    $this->{'amino_acid_change'} = "";
    #$this->{'domain'} = undef;
    bless $this, $class;
    return $this;
}

# Blank both fields; returns the object.
sub reset {
    my $this = shift;
    # This used to call enst(), which is not defined in the visible code;
    # transcript() is the accessor for the same field.
    $this->transcript( "" );
    $this->aminoAcidChange( "" );
    return $this;
}

# Print "transcript<delim>amino acid change" to the selected handle
# (delimiter defaults to ":").
sub print {
    my $this = shift;
    my $delim = ":";
    if ( @_ ) { $delim = shift; }
    print join( $delim , ( $this->transcript() , $this->aminoAcidChange() ) );
}

# Set transcript and amino acid change (in that order); returns the object.
sub set {
    my $this = shift;
    $this->transcript( shift );
    $this->aminoAcidChange( shift );
    return $this;
}

# Get/set the transcript.
sub transcript {
    my $this = shift;
    if ( @_ ) { $this->{'transcript'} = shift; }
    return $this->{'transcript'};
}

# Get/set the amino acid change.
sub aminoAcidChange {
    my $this = shift;
    if ( @_ ) { $this->{'amino_acid_change'} = shift; }
    return $this->{'amino_acid_change'};
}

1;

--------------------------------------------------------------------------------
/lib/TGI/Variant.pm:
--------------------------------------------------------------------------------
package TGI::Variant;
#
#----------------------------------
# $Authors: Adam Scott
# $Date: 2016-11-01
# $Revision: $
# $URL: $
# $Doc: $ variant class
#
#----------------------------------
#
use strict;
use warnings;

use Data::Dumper;

use TGI::ProteinVariant;

# Create an empty variant: all scalar fields are "" and the list of protein
# variants is empty.
sub new {
    my $class = shift;
    my $this = {};
    $this->{'gene'} = "";
    $this->{'chromosome'} = "";
    $this->{'start'} = "";
    $this->{'stop'} = "";
    $this->{'reference'} = "";
    $this->{'alternate'} = "";
    $this->{'proteinVariants'} = [];
    bless $this, $class;
    return $this;
}

# Blank every field and empty the protein variant list; returns the object.
sub reset {
    my $this = shift;
    $this->gene( "" );
    $this->chromosome( "" );
    $this->start( "" );
    $this->stop( "" );
    $this->reference( "" );
    $this->alternate( "" );
    $this->proteinVariants( [] );
    return $this;
}

# Print the gene and the HGVS-like genomic description, separated by one space.
# NOTE(review): an optional delimiter argument is accepted but has never been
# used in the output; kept that way so existing output does not change.
sub print {
    my $this = shift;
    my $delim = "\t";
    if ( @_ ) { $delim = shift; }
    print $this->gene()." ".$this->hgvsg();
}

# Get/set the gene.
sub gene {
    my $this = shift;
    if ( @_ ) { $this->{'gene'} = shift; }
    return $this->{'gene'};
}

# Get/set the chromosome.
sub chromosome {
    my $this = shift;
    if ( @_ ) { $this->{'chromosome'} = shift };
    return $this->{'chromosome'};
}

# Get/set the start position.
sub start {
    my $this = shift;
    if ( @_ ) { $this->{'start'} = shift };
    return $this->{'start'};
}

# Get/set the stop position.
sub stop {
    my $this = shift;
    if ( @_ ) { $this->{'stop'} = shift };
    return $this->{'stop'};
}

# Get/set the reference allele.
sub reference {
    my $this = shift;
    if ( @_ ) { $this->{'reference'} = shift };
    return $this->{'reference'};
}

# Get/set the alternate allele.
sub alternate {
    my $this = shift;
    if ( @_ ) { $this->{'alternate'} = shift };
    return $this->{'alternate'};
}

# Build "<chromosome>:g.<start>[-<stop>][<reference>[><alternate>]]"; the
# optional parts are added only when the corresponding field is true.
sub hgvsg {
    my $this = shift;
    my $hgvsg = "";
    $hgvsg .= $this->chromosome().":g.";
    $hgvsg .= $this->start();
    if ( $this->stop() ) {
        $hgvsg .= "-".$this->stop();
    }
    if ( $this->reference() ) {
        $hgvsg .= $this->reference();
        if ( $this->alternate() ) {
            $hgvsg .= ">";
            $hgvsg .= $this->alternate();
        }
    }
    return $hgvsg;
}

# Set gene, chromosome, start, stop, reference and alternate (in that order);
# returns the object.
sub set {
    my $this = shift;
    $this->gene( shift );
    $this->chromosome( shift );
    $this->start( shift );
    $this->stop( shift );
    $this->reference( shift );
    $this->alternate( shift );
    return $this;
}

# Append one protein variant to the list; returns the object.
sub addProteinVariant {
    my $this = shift;
    if ( @_ ) { push @{$this->{'proteinVariants'}} , shift; }
    return $this;
}

# Return the protein variant at the given index; without an index, or with an
# index past the end of the list, the first protein variant is returned.
sub proteinVariant {
    my $this = shift;
    if ( @_ ) {
        my $i = shift;
        if ( scalar @{$this->{'proteinVariants'}} > $i ) {
            return $this->{'proteinVariants'}->[$i];
        }
    }
    return $this->{'proteinVariants'}->[0];
}

# Get the array ref of protein variants; when an array ref is passed it
# replaces the list first (reset() relies on this to empty the list).
sub proteinVariants {
    my $this = shift;
    if ( @_ ) { $this->{'proteinVariants'} = shift; }
    return $this->{'proteinVariants'};
}

1;

--------------------------------------------------------------------------------
/scripts/DensityScripts/ClustersLines.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
y = read.table(args[1])
z = read.table(args[2])

RD<-y[[2]]
ID<-y[[1]]

x0<-z[[1]]
y0<-z[[3]]
x1<-z[[2]]+1
y1<-z[[3]]

Cluster<-z[[5]] # cluster ID
xpos<- c(0:(length(ID)-1)) # positions to put tick marks on the x-axis

# paste0: paste() would put a space between the directory and the file name
pdf(paste0("./Results/",args[3]),width=23.6,height=13.3)
par(mar=c(8,5,5,1))
barplot(RD,names.arg=ID,ylab="Reachabilty Distance (A)",main=paste("Reachability Plot: Epsilon=",args[4],"MinPts=",args[5]),col="Red", border=NA, space=0, las=2, cex.names=0.4)
segments (x0,y0,x1,y1) # horizontal lines to show clusters
segments (xpos+0.5,0,xpos+0.5,-5) # tick marks on the x-axis
text(x1+1,y0,Cluster, cex=0.4) # cluster ID labels
dev.off()

# args: 1-RD.out, 2-clusters.out, 3-pdf_file_name, 4-epsilon, 5-MinPts

--------------------------------------------------------------------------------
/scripts/DensityScripts/DensityAll.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

use LWP::Simple;
use FileHandle;
use Data::Dumper qw(Dumper);
use Scalar::Util qw(looks_like_number);
use List::Util qw[min max];

######################## Command Line Arguments ################################

# quit unless we have the correct number of command-line args
my $num_args = $#ARGV + 1;
if ($num_args != 4) {
    print "\nUsage: DensityAll.pl InputFileName_in_./Test epsilon MinPts PDB_ID/\n\n";
    exit;
}

my ( $PairwiseFileName , $Epsilon , $MinPts , $PDBName ) = @ARGV;

##################################################################################

# Run one pipeline script with the given arguments. The list form of system()
# keeps the shell away from the arguments; a failing step stops the pipeline
# because the later steps read its output.
sub run_step {
    my ( $script , @arguments ) = @_;
    system( "perl" , $script , @arguments ) == 0
        or die "DensityAll.pl: $script failed (exit status ".( $? >> 8 ).")\n";
    return;
}

print "\n Running OpticsWithR.pl....\n\n";
run_step( "OpticsWithR.pl" , $Epsilon , $MinPts , $PairwiseFileName );

print "\n Running SuperClustersID.pl....\n\n";
run_step( "SuperClustersID.pl" , "RD.$Epsilon.$MinPts.$PairwiseFileName" , $Epsilon , $MinPts );

print "\n Running DensityVisual.pl....\n\n";
run_step( "DensityVisual.pl" , "RD.$Epsilon.$MinPts.$PairwiseFileName.SuperClustersID.clusters" , "./Test/$PairwiseFileName" , $PDBName );

print "\n Your Reachability plot with cluster marks, Clusters file, and the Pymol script has been produced.\n";
print "Done.\n";
--------------------------------------------------------------------------------
/scripts/DensityScripts/EasyClustersLines.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
y = read.table(args[1])
z = read.table(args[2])

RD<-y[[2]]
ID<-y[[1]]

x0<-z[[1]]
y0<-z[[3]]
x1<-z[[2]]+1
y1<-z[[3]]

Cluster<-z[[5]]
xpos<- c(0:(length(ID)-1))

# paste0: paste() would put a space between the directory and the file name
pdf(paste0("./Results/",args[3]),width=23.6,height=13.3)
par(mar=c(8,5,5,1))
barplot(RD,ylab="Reachabilty Distance (A)",main=paste("Reachability Plot: Epsilon=",args[4],"MinPts=",args[5]),col="Red", border=NA, xaxt="n", xlab="" , space=0)#names.arg=ID, space=0, las=2, cex.names=0.4
segments (x0,y0,x1,y1)
segments (xpos+0.5,0,xpos+0.5,-0.05)
segments (xpos+0.5,-0.05,xpos+0.2,-0.08)
text(x1+1,y0,Cluster, cex=0.4)

#op<- par( mar = c(10,1,0,1))
text( x=xpos-1.35, y=-0.22 ,labels=ID, srt=45, cex=0.3, xpd=TRUE) # par()$usr[3]-0.1*(par()$usr[4]-par()$usr[3]),
#par(op)
dev.off()

# args: 1-RD.out, 2-clusters.out, 3-pdf_file_name, 4-epsilon, 5-MinPts

--------------------------------------------------------------------------------
/scripts/DensityScripts/HorizClustersLines.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Horizontal reachability bar plot with one line segment and label per cluster.
args = commandArgs(trailingOnly=TRUE)
y = read.table(args[1])
z = read.table(args[2])

RD<-y[[2]]
ID<-y[[1]]

y0<-z[[1]]
x0<-z[[3]]
y1<-z[[2]]+1
x1<-z[[3]]

Cluster<-z[[5]]

# paste0: paste() would put a space between the directory and the file name
pdf(paste0("./Results/",args[3]),width=13.3,height=23.6)
par(mar=c(8,5,5,1))
barplot(RD,names.arg=ID,main=paste("Reachability Plot: Epsilon=",args[4],"MinPts=",args[5]),col="Red", cex.names=0.4, horiz=TRUE,border=NA, space=0, las=2, xlab="Reachabilty Distance (A)") # ylab="Reachabilty Distance (A)"
segments (x0,y0,x1,y1)
text(x0,y1+0.5,Cluster, cex=0.4)
dev.off()

# args: 1-RD.out, 2-clusters.out, 3-pdf_file_name, 4-epsilon, 5-MinPts
--------------------------------------------------------------------------------
/scripts/DensityScripts/MembershipProbability.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Dot plot of cluster membership probability per variant and cluster.
args = commandArgs(trailingOnly=TRUE)

d <- read.table(args[1], header=TRUE)

library(ggplot2)

# keep the variants in file order on the x-axis
d$Variant <- factor(d$Variant, levels = d$Variant)

sp <- ggplot(d, aes(x=Variant,y=ClusterID)) + geom_point(aes(colour = Probability), size=3) + scale_colour_gradient(low = "red", high = "blue")

#sp <- ggplot(d, aes(x=Variant,y=ClusterID)) + geom_point(aes(size=Probability))

sp+theme_bw() + theme(axis.text.x=element_text(angle=90, size=6)) + ggtitle(paste("Cluster Membership Probabilities for",args[2],"runs (",args[3],")"))
# paste0: paste() would put spaces around the gene name inside the file name
ggsave(paste0("./Results/",args[3],".ProbabilityPlot.pdf"), width = 23.6, height = 13.3)

#args 1=ProbabilityData, 2=Number of runs, 3=Gene
--------------------------------------------------------------------------------
/scripts/DensityScripts/OpticsWithR.pl:
--------------------------------------------------------------------------------
use strict;
use warnings;

use LWP::Simple;
use FileHandle;
use Data::Dumper qw(Dumper);
use Scalar::Util qw(looks_like_number);
use List::Util qw[min max];

######################## Command Line Arguments ################################

# quit unless we have the correct number of command-line args
my $num_args = $#ARGV + 1;
if ($num_args != 3) {
    print "\nUsage: OpticsWithR.pl epsilon MinPts InputFileName_in_./Test/\n\n";
    exit;
}

my $Epsilon = $ARGV[0];
my $MinPts = $ARGV[1];
print "Preparing Reachability Data\n";

############# Extracting the set of objects ##############

# %SetOfNodes: node name => { distances => { neighbor name => distance } ,
#                             processInfo => "False"/"True" }
my %SetOfNodes;

my $file = "./Test/$ARGV[2]";
open( my $pairwise_fh , '<' , $file ) || die "Can't open $file: $!";

while ( my $line = <$pairwise_fh> ) {
    chomp $line;
    my @tabs = split(/\t/,$line);
    # Column 20 is read below; shorter lines cannot describe a pair.
    next if ( scalar @tabs < 20 );
    # The distance is the first five characters of column 20.
    my $dis = substr( $tabs[19] , 0 , 5 );
    my $key1 = CombineWords($tabs[0],$tabs[4]);
    my $value1 = CombineWords($tabs[9],$tabs[13]);

    $SetOfNodes{$key1}{distances}{$value1} = $dis;
    $SetOfNodes{$value1}{distances}{$key1} = $dis;
}
close( $pairwise_fh );
###### For variants in the same residue and chain

foreach my $key ( keys %SetOfNodes ) {
    #print "key= $key\n";
    # Skip names that do not have the expected shape instead of reusing the
    # captures left over from the previous key.
    next unless ( $key =~ /(\w+)\:\D\.(\D+\d+)\D/ );
    my $keyGene = $1;
    my $keyRes = $2;
    # \Q..\E: the captured text is matched literally
    my @hits = grep { /\Q$keyGene\E\:\D\.\Q$keyRes\E\D/ } keys %SetOfNodes;
    #print Dumper \@hits;
    foreach my $hit (@hits) {
        if ( $hit ne $key ) {
            $SetOfNodes{$key}{distances}{$hit} = "0";
            $SetOfNodes{$hit}{distances}{$key} = "0";
        }
    }
}

foreach my $i (keys %SetOfNodes) {
    $SetOfNodes{$i}{processInfo} = "False";
}

#print Dumper \%SetOfNodes;
print "Number of Objects = ";
print scalar keys %SetOfNodes;
print "\n";

# Nodes with at least MinPts recorded neighbors are visited first.
my @SetOfCores;
my @SetOfEdges;
foreach my $key ( keys %SetOfNodes ) {
    # keys() needs a real hash: calling it on a hash reference is a compile
    # error on Perl 5.24 and later.
    if ( scalar keys %{ $SetOfNodes{$key}{distances} } >= $MinPts ) {
        push @SetOfCores, $key;
    }
    else {
        push @SetOfEdges, $key;
    }
}
my @SetOfCoresThenEdges = ( @SetOfCores, @SetOfEdges );

###########################################################

# Each entry is [ node name , reachability distance (undef = none) , core distance ]
my @OrderedNodes;

################# Main OPTICS function ####################

foreach my $p ( @SetOfCoresThenEdges ) {
    next unless ( $SetOfNodes{$p}{processInfo} eq "False" );
    ########## Expand Cluster Order ###########
    my %OrderSeeds; # candidate node => best reachability distance found so far
    my %neighbors = %{GetNeighbors($p,$Epsilon,\%SetOfNodes)}; # neighbor => separation
    $SetOfNodes{$p}{processInfo} = "True"; # set as processed
    my $RD = undef;
    my $CD = GetCoreDistance(\%neighbors,$MinPts);
    push @OrderedNodes, [$p,$RD,$CD];
    next unless ( defined $CD );
    OrderSeedsUpdate(\%neighbors,$p,$CD, \%OrderSeeds, \%SetOfNodes);
    while (scalar keys %OrderSeeds != 0) {
        my @SeedKeys = sort { $OrderSeeds{$a} <=> $OrderSeeds{$b} } keys %OrderSeeds;
        my $CurrentObject = $SeedKeys[0]; # CurrentObject is the object having the least RD in OrderSeeds
        %neighbors = %{GetNeighbors($CurrentObject,$Epsilon,\%SetOfNodes)};
        $SetOfNodes{$CurrentObject}{processInfo} = "True"; # set as processed
        $RD = $OrderSeeds{$CurrentObject};
        $CD = GetCoreDistance(\%neighbors,$MinPts);
        push @OrderedNodes, [$CurrentObject,$RD,$CD];
        delete $OrderSeeds{$CurrentObject};
        if (defined $CD) {
            # Current object is a core: update the seed list with its neighbors.
            OrderSeedsUpdate(\%neighbors,$CurrentObject,$CD, \%OrderSeeds, \%SetOfNodes);
        }
    }
}

###################### Reachability Plot ########################

# One line per node in visiting order: name, tab, space, reachability distance.
my $OrderedFile = "./Results/RD.$Epsilon.$MinPts.$ARGV[2]";
open( my $ordered_fh , '>' , $OrderedFile ) or die "Can't open $OrderedFile for writing: $!";
foreach my $node ( @OrderedNodes ) {
    my ( $name , $reachability ) = @{$node};
    # 10 is just a large number to denote infinity (no reachability distance)
    $reachability = 10 unless ( defined $reachability );
    print {$ordered_fh} "$name\t $reachability\n";
}
close( $ordered_fh ) or die "Can't close $OrderedFile: $!";

#system ("Rscript PlotR.R $OrderedFile RD.$Epsilon.$MinPts.$ARGV[2].pdf $Epsilon $MinPts");

print "Done.\n";

######################### Functions ############################

# Return a hash ref (neighbor name => distance) holding every distance recorded
# for $Obj in $Set_ref.
# NOTE(review): $Epsilon is accepted but not applied as a filter here; confirm
# whether the pairwise input is expected to be cut off at epsilon already.
sub GetNeighbors {
    my ($Obj, $Epsilon, $Set_ref)=@_;
    my %neighborHash;
    foreach my $i (keys %{$Set_ref->{$Obj}->{distances}}) {
        $neighborHash{$i} = "$Set_ref->{$Obj}->{distances}->{$i}";
    }
    return \%neighborHash;
}

# Return the MinPts-th smallest neighbor distance, or undef when there are
# fewer than MinPts neighbors.
sub GetCoreDistance {
    my ($neighbors_ref, $MinPts)=@_;
    return undef if ( scalar keys %{$neighbors_ref} < $MinPts );
    my @distances = sort { $a <=> $b } values %{$neighbors_ref};
    return $distances[$MinPts-1]; # MinPt^th-distance
}

# For every unprocessed neighbor of $CenterObject, record
# max( core distance , distance to the neighbor ) in %$OrderSeeds_ref unless a
# smaller value is already stored there.
sub OrderSeedsUpdate {
    my ($neighbors_ref, $CenterObject, $CD, $OrderSeeds_ref, $Set_ref) = @_;
    foreach my $q (keys %{$neighbors_ref}) {
        next unless ( $Set_ref->{$q}{processInfo} eq "False" );
        my $new_r_dist = max ($CD,$neighbors_ref->{$q});
        if ( not exists $OrderSeeds_ref->{$q} or $new_r_dist < $OrderSeeds_ref->{$q} ) {
            $OrderSeeds_ref->{$q}="$new_r_dist";
        }
    }
    return;
}

# Join two words with ":".
sub CombineWords {
    my ($word1,$word2)=@_;
    return $word1.":".$word2;
}
--------------------------------------------------------------------------------
/scripts/DensityScripts/PlotR.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
y = read.table(args[1])

RD<-y[[2]]
ID<-y[[1]]

pdf(args[2],width=23.6,height=13.3)
par(mar=c(8,5,5,1))
barplot(RD,names.arg=ID,ylab="Reachabilty Distance (A)",main=paste("Reachability Plot: Epsilon=",args[3],"MinPts=",args[4]),col="Red", border=NA, space=0, las=2, cex.names=0.4)
dev.off()



--------------------------------------------------------------------------------
/scripts/DensityScripts/README.md:
-------------------------------------------------------------------------------- 1 | Density Scripts 2 | =========== 3 | 4 | These scripts can be used for density-based clustering and visualization. 5 | 6 | Usage 7 | ----- 8 | 9 | Usage: perl DensityAll.pl pairwise_file_name_in_./Test/ Epsilon MinPts PDB_ID 10 | 11 | Results will be copied to the ./Results/ directory. 12 | 13 | The DensityAll.pl script runs the following scripts in order: 14 | 15 | 1) OpticsWithR.pl : OPTICS clustering - outputs an ordered list of variants with their corresponding reachability distances (Output file name: RD.$Epsilon.$MinPts.pairwise_file_name). 16 | 17 | 2) SuperClustersID.pl : Performs clustering. Whenever an important event happens, such as two clusters merging or a new cluster appearing, it records the clusters with IDs. 18 | 19 | (Output file name1: RD.$Epsilon.$MinPts.pairwise_file_name.SuperClustersID.clusters, 20 | Output file name2: RD.$Epsilon.$MinPts.pairwise_file_name.SuperClustersID.plot, 21 | Output file name3: SuperClustersID.RD.$Epsilon.$MinPts.pairwise_file_name.pdf , 22 | Output file name4: RD.$Epsilon.$MinPts.pairwise_file_name.clusters.shiny.R). 23 | 24 | 3) DensityVisual.pl : Writes a PyMOL script for visualization. 25 | 26 | 4) ClusterProbability.pl : Determines the membership probability of each variant in clusters. 27 | 28 | Additional Software 29 | ----------------------------- 30 | 31 | Install RStudio and the R package "shiny" for better visualization of the reachability plot. 



--------------------------------------------------------------------------------
/scripts/MafSimulatorScripts/simulate.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
#
#----------------------------------
# $Author: R Jay Mashl
# $Date: 2016-10-10 10:42:04 -0500 $
# $Revision: 0.2 $ Revised by Amila Weerasinghe (2017-08-01)
# $URL: $
# $Doc: $ driver for simulated MAF generator for HotSpot3D
#----------------------------------
#
# Parses the command line into an options hash, validates the options needed
# by the chosen --command, and dispatches to the matching MafSimulator routine.
use strict;
use warnings;
no warnings 'uninitialized';

use Getopt::Long;
use TGI::Mutpro::Main::MafSimulator;

# Defaults
my $distributionsOutDefault = "distributions.out";
my $mafStemOutDefault       = "maf";
my $numShufflesDefault      = 5;
my $offsetDefault           = 0;
my $sitesOutDefault         = "sites.out";
my $sizeHistoOutDefault     = "sizes.histo";
my $statsOutDefault         = "stats.out";
my $transcriptHeaderDefault = "transcript_name";
my $aminoAcidHeaderDefault  = "amino_acid_change";

# Commands and options
my $command;
my $opts = {
    'clustersListFile'     => undef,
    'clusterVarCounts'     => undef,
    'debug'                => 0,
    'distributionsFn'      => $distributionsOutDefault,
    'entryNumberOffset'    => $offsetDefault,
    'geneList'             => undef,
    'geneListFile'         => undef,
    'histosListFile'       => undef,
    'maf'                  => undef,
    'mafStemOutFn'         => $mafStemOutDefault,
    'numShuffles'          => $numShufflesDefault,
    'randomUnique'         => 0,
    'sitesFn'              => $sitesOutDefault,
    'sizeHistoFn'          => $sizeHistoOutDefault,
    'srcMaf'               => undef,
    'statsFn'              => $statsOutDefault,
    'transcripts'          => undef,
    'transcript_id_header' => $transcriptHeaderDefault,
    'amino_acid_header'    => $aminoAcidHeaderDefault,
};

GetOptions(
    'clustersListFile=s'     => \$opts->{'clustersListFile'},
    'clusterVarCounts=s'     => \$opts->{'clusterVarCounts'},
    'command=s'              => \$command,
    'debug'                  => \$opts->{'debug'},
    'distributions=s'        => \$opts->{'distributionsFn'},
    'geneList=s'             => \$opts->{'geneList'},
    'geneListFile=s'         => \$opts->{'geneListFile'},
    'histosListFile=s'       => \$opts->{'histosListFile'},
    'maf=s'                  => \$opts->{'maf'},
    'num-shuffles=i'         => \$opts->{'numShuffles'},
    'offset=i'               => \$opts->{'entryNumberOffset'},
    'out-maf-stem=s'         => \$opts->{'mafStemOutFn'},
    'random-unique'          => \$opts->{'randomUnique'},
    'site-stats=s'           => \$opts->{'statsFn'},
    'sites=s'                => \$opts->{'sitesFn'},
    'size-histo=s'           => \$opts->{'sizeHistoFn'},
    'source-maf=s'           => \$opts->{'srcMaf'},
    'transcripts=s'          => \$opts->{'transcripts'},
    'amino-acid-header=s'    => \$opts->{'amino_acid_header'},
    'transcript-id-header=s' => \$opts->{'transcript_id_header'},
) or abort();

# Check for errors
if( !$command ) { abort(); }
if( $command eq "getCoverage" && ( !defined $opts->{'transcripts'} || !defined $opts->{'maf'} ) ) {
    print "\nError: Please specify both a transcripts file and a maf file.\n\n"; abort();
}
if( $command eq "randomize" && ( !defined $opts->{'sitesFn'} || !defined $opts->{'statsFn'} ) ) {
    print "\nError: Please specify both sites and stats files.\n\n"; abort();
}
if( $command eq "generateMafs" && ( !defined $opts->{'srcMaf'} || !defined $opts->{'distributionsFn'} ) ) {
    print "\nError: Please specify source maf (template) file and random distributions files\n\n"; abort();
}
if( $command eq "getSizeHisto" ) {
    if( !defined $opts->{'clustersListFile'} ) {
        print "\nError: Please specify clusters list file for HotSpot3D *.clusters files\n\n"; abort();
    } elsif( !defined $opts->{'distributionsFn'} ) {
        print "\nError: Please specify distributions file\n\n"; abort();
    }
}
if( defined $opts->{'geneList'} && defined $opts->{'geneListFile'} ) {
    print "\nError: Please specify either a gene list or a gene list file, not both.\n\n"; abort();
}

my $result;

if(    $command eq "getCoverage"  ) { $result = MafSimulator::getCoverage( $opts );  }
elsif( $command eq "randomize"    ) { $result = MafSimulator::randomize( $opts );    }
elsif( $command eq "generateMafs" ) { $result = MafSimulator::generateMafs( $opts ); }
elsif( $command eq "getSizeHisto" ) { $result = MafSimulator::getSizeHisto( $opts ); }
elsif( $command eq "mergeHistos"  ) { $result = MafSimulator::mergeHistos( $opts );  }
else { print "\nError: Please specify a valid command.\n\n"; abort(); }


# help(): returns the usage text, one synopsis per supported --command.
# NOTE(review): the <...> argument placeholders were missing from the text this
# was rebuilt from; the generic names below should be checked against the
# upstream script.
sub help {
    return <<EOF;

Usage:

$0 --command getCoverage  --transcripts <file> --maf <file> [--geneList <gene1>,<gene2>,... | --geneListFile <file>] [--sites <file> [default: $sitesOutDefault]] [--site-stats <file> [default: $statsOutDefault]] [--amino-acid-header <string> [default: $aminoAcidHeaderDefault]] [--transcript-id-header <string> [default: $transcriptHeaderDefault]]

$0 --command randomize    --sites <file> --site-stats <file> [--distributions <file>] [--geneList <gene1>,<gene2>,... | --geneListFile <file>] [--num-shuffles <int> [default: $numShufflesDefault]] [--random-unique] [--offset <int> [default: $offsetDefault]]

$0 --command generateMafs --source-maf <file> --distributions <file> [--out-maf-stem <string> [default: $mafStemOutDefault]] [--geneList <gene1>,<gene2>,... | --geneListFile <file>] [--offset <int> [default: $offsetDefault]]

$0 --command getSizeHisto --clustersListFile <file> --distributions <file> [--size-histo <file> [default: sizes.histo]] [--clusterVarCounts <file>]

$0 --command mergeHistos  --histosListFile <file> [--size-histo <file> [default: sizes.histo]]


EOF
}

# abort(): prints the usage text and ends the program.
# NOTE(review): the exit status is 0 even on the error paths above; kept as-is
# so existing callers see the same status.
sub abort {
    print help();
    exit 0;
}

--------------------------------------------------------------------------------
/scripts/README.annotations:
--------------------------------------------------------------------------------
## HGNC download is from gene families site at HGNC, where gene is in the 2nd column & family name is in the 11th column
## Protein kinase lists can be obtained from numerous sources. The format expected for use below is gene\tfamily
## Drug class info can be obtained via DrugBank/DrugPort, NIH, and others. The format used below expects two columns drug\tclass
## Ensembl .gtf was necessary for longest transcript calculation. Run determine_transcript_lengths.pl to get gene, transcript, length (gtl) data.
# annotated .clusters with a variety of details. These simply append columns, and so they can accumulate.
annotate_clusters_MAF.pl #expects gtl data and the .maf used for clustering
annotate_clusters_PDB.pl #needs HotSpot3D data
annotate_clusters_domains.pl #needs HotSpot3D data
annotate_clusters_drug_class.pl #expects a drug class list
annotate_clusters_HGNC_Kinase.pl #expects HGNC download & protein kinase list
annotate_clusters_families.pl #expects HGNC download

# determine cluster presence/representation for PDB structures associated with the gene
clusterPDBPresence.drug.pl #for drug-mutation pairs/clusters
clusterPDBPresence.pl #for mutation-mutation pairs/clusters

genePDBPresence.pl #for genes instead of clusters

#determine number of structures for each gene
nStructures.pl

--------------------------------------------------------------------------------
/scripts/addRandomWeight.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#25 September 2017 - Adam D Scott -
#
# Copies a MAF to a new file with one extra tab-separated column: the header
# line (any line containing "Hugo") gets "RandomWeight", every other line gets
# a random integer from -20 to 19.

use strict;
use warnings;

use IO::File;
use FileHandle;

my $usage = 'perl addRandomWeight.pl <maf> <output>
';

die $usage unless @ARGV == 2;
my ( $maf , $output ) = @ARGV;

my $in_fh = FileHandle->new( $maf , "r" );
if ( not defined $in_fh ) { die "ADSERROR: Could not open/read ".$maf."\n"; }

my $out_fh = FileHandle->new( $output , "w" );
if ( not defined $out_fh ) { die "ADSERROR: Could not open/write ".$output."\n"; }

while ( my $line = <$in_fh> ) {
    chomp( $line );
    # int( rand( 40 ) ) is 0..39, so the weight ranges over -20..19.
    my $weight = ( $line =~ /Hugo/ ) ? "RandomWeight" : int( rand( 40 ) ) - 20;
    $out_fh->print( $line."\t".$weight."\n" );
}
$in_fh->close();
$out_fh->close();

--------------------------------------------------------------------------------
/scripts/annotate_clusters_HGNC_Kinase.pl:
--------------------------------------------------------------------------------
#!/bin/perl
#15 March 2015 - Adam D Scott -
#
# Appends a Gene_Families column (HGNC families plus "Protein Kinase") to a
# .clusters file and prints a per-family mutation summary to STDOUT.

use strict;
use warnings;

my $usage = 'perl annotate_clusters_HGNC_Kinase.pl <hgnc> <prokin> <clusters> <output>
';

die $usage unless @ARGV == 4;
my ( $hgnc , $prokin , $clusters , $output ) = @ARGV;

my $proteinkinase = "Protein Kinase";
my $unclassified = "Unclassified";
my $notgene = "NA";

my %families;    # gene => { family => 1 }
my %fams;        # family => summed value of clusters column 7
open ( my $hgnc_fh , '<' , $hgnc ) or die "Cannot open $hgnc: $!";
while ( my $row = <$hgnc_fh> ) {
    chomp $row;
    my @line = split "\t" , $row;

    # gene is the 2nd column, family name the 11th
    my $gene = $line[1];
    my $family = $line[10];
    $families{$gene}{$family} = 1;
    $fams{$family} = 0;
}
close $hgnc_fh;

open ( my $prokin_fh , '<' , $prokin ) or die "Cannot open $prokin: $!";
while ( my $row = <$prokin_fh> ) {
    chomp $row;
    my @line = split "\t" , $row;

    my $gene = $line[0];
    $families{$gene}{$proteinkinase} = 1;
    $fams{$proteinkinase} = 0;
}
close $prokin_fh;

# NOTE(review): %counted is never filled, so $total is incremented for every
# p. row rather than once per gene - confirm which denominator is intended.
my %counted;
my $total = 0;
open ( my $out_fh , '>' , $output ) or die "Cannot open $output: $!";
open ( my $clusters_fh , '<' , $clusters ) or die "Cannot open $clusters: $!";
while ( my $row = <$clusters_fh> ) {
    chomp $row;
    if ( $row =~ /Cluster/ ) {
        print {$out_fh} $row."\tGene_Families\n";
        next;
    }
    my @line = split "\t" , $row;

    my $gene = $line[1];
    my $aachange = $line[2];
    my $mutations = $line[6];
    my @families;
    if ( $aachange =~ /p\./ ) {
        if ( exists $families{$gene} ) {
            @families = sort keys %{$families{$gene}};
            $fams{$_} += $mutations foreach @families;
        } else {
            @families = ( $unclassified );
            $fams{$unclassified} += $mutations;
        }
        if ( not exists $counted{$gene} ) {
            $total++;
        }
    } else {
        # rows without a p. change are not gene mutations
        @families = ( $notgene );
    }
    print {$out_fh} join( "\t" , ( @line , join( "|" , @families ) ) )."\n";
}
close $clusters_fh;
close $out_fh or die "Cannot close $output: $!";

print "Gene_Family\tMutations\tPercentage\n";
foreach my $family ( keys %fams ) {
    my $n = $fams{$family};
    if ( $n > 0 ) {
        my $percent = 100*$n/$total;
        print $family."\t".$n."\t";
        printf "%.3f\n" , $percent;
    }
}

--------------------------------------------------------------------------------
/scripts/annotate_clusters_MAF.pl:
--------------------------------------------------------------------------------
#!/bin/perl
#16 January 2015 - Adam D Scott -
#
# Joins a .clusters file with the MAF used for clustering and a
# gene/transcript/length (gtl) table, writing "<maf>.v2.<clusters basename>".

use strict;
use warnings;

my $usage = 'perl annotate_clusters_MAF.pl <maf> <clusters> <gtl>
';
die $usage unless @ARGV == 3;
my ( $maf , $clusters , $gtl ) = @ARGV;

# 1-based MAF column numbers
my $barcol = 16;
my $chrcol = 33;
my $startcol = 34;
my $refcol = 36;
my $varcol = 37;
my $genecol = 39;
my $transcol = 40;
my $typecol = 46;
my $cposcol = 47;
my $AAcol = 48;

my %mutations;      # gene => AA => variant => transcript => cpos => ClinVar columns
my %alternates;     # gene => variant => transcript => cpos => AA
my %transcripts;    # gene => transcript => length

open ( my $gtl_fh , '<' , $gtl ) or die "Cannot open $gtl: $!";
while ( my $row = <$gtl_fh> )
{
    chomp $row;
    my ( $gene , $trans , $length ) = split( "\t" , $row );
    $transcripts{$gene}{$trans} = $length;
}
close $gtl_fh;

# %usetrans keeps exactly one entry per gene: its longest transcript.
my %usetrans;
foreach my $gene ( keys %transcripts )
{
    my @ts = keys %{$transcripts{$gene}};
    my $long = $ts[0];
    foreach my $trans ( @ts )
    {
        if ( $transcripts{$gene}{$long} < $transcripts{$gene}{$trans} )
        {
            $long = $trans;
        }
    }
    $usetrans{$gene}{$long} = $transcripts{$gene}{$long};
}

my %counts;    # gene => variant => barcode => 1
open ( my $maf_fh , '<' , $maf ) or die "Cannot open $maf: $!";
while ( my $row = <$maf_fh> )
{
    chomp $row;
    my @line = split( "\t" , $row );

    my $gene = $line[$genecol-1];
    my $barcode = $line[$barcol-1];
    my $chr = $line[$chrcol-1];
    my $start = $line[$startcol-1];
    my $stop = $line[$startcol];    # the column right after the start column
    my $ref = $line[$refcol-1];
    my $var = $line[$varcol-1];
    my $type = $line[$typecol-1];
    my $trans = $line[$transcol-1];
    my $cpos = $line[$cposcol-1];
    my $AA = $line[$AAcol-1];

    next unless $type =~ /missense|in_frame/;

    # The last two columns are used only when the final one mentions ClinVar.
    my $cv1 = $line[-2];
    my $cv2 = $line[-1];
    if ( $cv2 !~ /ClinVar/ ) {
        $cv1 = "NULL";
        $cv2 = "NULL";
    }

    my $vari = $chr."\t".$start."\t".$stop."\t".$ref."\t".$var;
    $mutations{$gene}{$AA}{$vari}{$trans}{$cpos} = $cv1."\t".$cv2;
    $alternates{$gene}{$vari}{$trans}{$cpos} = $AA;
    $counts{$gene}{$vari}{$barcode} = 1;
}
close $maf_fh;

my $na = "NA";
my @c = split( "\/" , $clusters );
my $outfile = "$maf.v2.$c[-1]";
my %clusterlines;
open ( my $out_fh , '>' , $outfile ) or die "Cannot open $outfile: $!";
print {$out_fh} "Cluster_ID\tGene\tAAchange\tDegree\tCloseness_Centrality\tGeodesic\tFrequency\tTranscript\tc_position\tChromosome\tStart\tStop\tReference\tVariant\tClinVarAnnotation\tClinVarCitation\tLongest_Transcript\tLongest_AAchange\tLongest_c_position\n";
open ( my $clusters_fh , '<' , $clusters ) or die "Cannot open $clusters: $!";
while ( my $row = <$clusters_fh> )
{
    chomp $row;
    my @line = split( "\t" , $row );

    my ( $id , $gene , $AA ) = @line;

    if ( exists $mutations{$gene}{$AA} )
    {
        # the single key of %{$usetrans{$gene}} is the longest transcript
        my @l = keys %{$usetrans{$gene}};
        my $long = $l[0];
        foreach my $vari ( keys %{$mutations{$gene}{$AA}} )
        {
            # frequency column = number of distinct barcodes with this variant
            $line[6] = scalar keys %{$counts{$gene}{$vari}};
            my @othercpos = keys %{$alternates{$gene}{$vari}{$long}};
            my $othercpos = $othercpos[0];
            my $otherAA = $alternates{$gene}{$vari}{$long}{$othercpos};
            foreach my $trans ( sort keys %{$mutations{$gene}{$AA}{$vari}} )
            {
                foreach my $cpos ( sort keys %{$mutations{$gene}{$AA}{$vari}{$trans}} )
                {
                    my $val = $mutations{$gene}{$AA}{$vari}{$trans}{$cpos};
                    # later transcript/cpos pairs overwrite earlier ones
                    $clusterlines{$id}{$gene}{$vari}{$long} = join( "\t" , ( @line , $trans , $cpos , $vari , $val , $long , $otherAA , $othercpos ) );
                }
            }
        }
    } elsif ( $AA !~ /^p\./ ) { #is drug
        print {$out_fh} join( "\t" , ( @line , $na , $na , $na , $na , $na , $na , $na ) )."\n";
    }
}
close $clusters_fh;

foreach my $id ( keys %clusterlines )
{
    foreach my $gene ( sort keys %{$clusterlines{$id}} )
    {
        foreach my $vari ( sort keys %{$clusterlines{$id}{$gene}} )
        {
            foreach my $trans ( sort keys %{$usetrans{$gene}} )
            {
                print {$out_fh} $clusterlines{$id}{$gene}{$vari}{$trans}."\n";
            }
        }
    }
}
close $out_fh or die "Cannot close $outfile: $!";

--------------------------------------------------------------------------------
/scripts/annotate_clusters_PDB.pl:
--------------------------------------------------------------------------------
#!/bin/perl
#3 April 2015 - Adam D Scott -
#
# Appends a Protein_Domain column to a .clusters file, using the domain
# columns of a pairwise file and a cleaned drug file.

use strict;
use warnings;

my $usage = 'perl annotate_clusters_PDB.pl <pairwise> <drugClean> <clusters>
';

die $usage unless @ARGV == 3;
my ( $pairwise , $drugClean , $clusters ) = @ARGV;

my %domains;    # gene => AA change => domain => 1
my %variants;
open ( IN , "<$pairwise" ) or die "Cannot open $pairwise: $!";
while( <IN> ) {
    chomp; if ( /Gene/ ) { next; }
    my @line = split( "\t" , $_ );
    my $gene1 = $line[0];
    my $AA1 = $line[4];
    my $domain1 = $line[7];
    $domain1 = &filter_eco( $domain1 );
    my $gene2 = $line[9];
    my $AA2 = $line[13];
    my $domain2 = $line[16];
    $domain2 = &filter_eco( $domain2 );

    $domains{$gene1}{$AA1}{$domain1} = 1;
$domains{$gene2}{$AA2}{$domain2} = 1; 30 | } 31 | close IN; 32 | 33 | open ( IN , "<$drugClean" ) or die "Cannot open $drugClean: $!"; 34 | while( ) { 35 | chomp; if ( /Gene/ ) { next; } 36 | my @line = split( "\t" , $_ ); 37 | my $gene1 = $line[5]; 38 | my $AA1 = $line[6]; 39 | my $domain1 = $line[7]; 40 | $domain1 = &filter_eco( $domain1 ); 41 | 42 | $domains{$gene1}{$AA1}{$domain1} = 1; 43 | } 44 | close IN; 45 | 46 | open ( OUT , ">domains.$clusters" ) or die "Cannot open domains.$clusters: $!"; 47 | open ( IN , "<$clusters" ) or die "Cannot open $clusters: $!"; 48 | while ( ) { 49 | chomp; 50 | if ( /Cluster/ ) { 51 | print OUT $_."\tProtein_Domain\n"; 52 | next; 53 | } 54 | my @line = split( "\t" , $_ ); 55 | my $genedrug = $line[1]; 56 | my $AAgene = $line[2]; 57 | 58 | if ( exists $domains{$genedrug} ) { 59 | my @domains = keys %{$domains{$genedrug}{$AAgene}}; 60 | print OUT join( "\t" , ( @line , join( "|" , sort @domains ) ) )."\n"; 61 | } else { 62 | print OUT join( "\t" , @line )."\tNULL\n"; 63 | } 64 | } 65 | close IN; 66 | close OUT; 67 | 68 | sub filter_eco { 69 | my ( $domain ) = @_; 70 | 71 | $domain =~ s/ {ECO.*//; 72 | $domain =~ s/^{ECO.*//; 73 | if ( length( $domain ) == 0 || $domain =~ /N\/A/ ) { 74 | $domain = "NULL"; 75 | } 76 | 77 | return $domain; 78 | } 79 | -------------------------------------------------------------------------------- /scripts/annotate_clusters_domains.pl: -------------------------------------------------------------------------------- 1 | #!/bin/perl 2 | #3 April 2015 - Adam D Scott - 3 | 4 | use strict; 5 | use warnings; 6 | 7 | my $usage = 'perl annotate_clusters_domains.pl 8 | '; 9 | 10 | die $usage , unless @ARGV == 3; 11 | my ( $pairwise , $drugClean , $clusters ) = @ARGV; 12 | 13 | my %domains; 14 | my %variants; 15 | open ( IN , "<$pairwise" ) or die "Cannot open $pairwise: $!"; 16 | while( ) { 17 | chomp; if ( /Gene/ ) { next; } 18 | my @line = split( "\t" , $_ ); 19 | my $gene1 = $line[0]; 20 | my $AA1 = 
$line[4]; 21 | my $domain1 = $line[7]; 22 | $domain1 = &filter_eco( $domain1 ); 23 | my $gene2 = $line[9]; 24 | my $AA2 = $line[13]; 25 | my $domain2 = $line[16]; 26 | $domain2 = &filter_eco( $domain2 ); 27 | 28 | $domains{$gene1}{$AA1}{$domain1} = 1; 29 | $domains{$gene2}{$AA2}{$domain2} = 1; 30 | } 31 | close IN; 32 | 33 | open ( IN , "<$drugClean" ) or die "Cannot open $drugClean: $!"; 34 | while( ) { 35 | chomp; if ( /Gene/ ) { next; } 36 | my @line = split( "\t" , $_ ); 37 | my $gene1 = $line[5]; 38 | my $AA1 = $line[6]; 39 | my $domain1 = $line[7]; 40 | $domain1 = &filter_eco( $domain1 ); 41 | 42 | $domains{$gene1}{$AA1}{$domain1} = 1; 43 | } 44 | close IN; 45 | 46 | open ( OUT , ">domains.$clusters" ) or die "Cannot open domains.$clusters: $!"; 47 | open ( IN , "<$clusters" ) or die "Cannot open $clusters: $!"; 48 | while ( ) { 49 | chomp; 50 | if ( /Cluster/ ) { 51 | print OUT $_."\tProtein_Domain\n"; 52 | next; 53 | } 54 | my @line = split( "\t" , $_ ); 55 | my $genedrug = $line[1]; 56 | my $AAgene = $line[2]; 57 | 58 | if ( exists $domains{$genedrug} ) { 59 | my @domains = keys %{$domains{$genedrug}{$AAgene}}; 60 | print OUT join( "\t" , ( @line , join( "|" , sort @domains ) ) )."\n"; 61 | } else { 62 | print OUT join( "\t" , @line )."\tNULL\n"; 63 | } 64 | } 65 | close IN; 66 | close OUT; 67 | 68 | sub filter_eco { 69 | my ( $domain ) = @_; 70 | 71 | $domain =~ s/ {ECO.*//; 72 | $domain =~ s/^{ECO.*//; 73 | if ( length( $domain ) == 0 || $domain =~ /N\/A/ ) { 74 | $domain = "NULL"; 75 | } 76 | 77 | return $domain; 78 | } 79 | -------------------------------------------------------------------------------- /scripts/annotate_clusters_drug_class.pl: -------------------------------------------------------------------------------- 1 | #!/bin/perl 2 | #26 February 2015 - Adam D Scott - 3 | 4 | use strict; 5 | use warnings; 6 | 7 | my $usage = 'perl drug_class_annotate_clusters.pl 8 | '; 9 | 10 | die $usage , unless @ARGV == 3; 11 | my ( $clusters , 
$drugclasses , $output ) = @ARGV; 12 | 13 | my %clusters; 14 | my %classes; 15 | 16 | open ( IN , "<$drugclasses" ) or die "Cannot open $drugclasses: $!"; 17 | while ( ) { 18 | chomp; 19 | my @line = split( "\t" , $_ ); 20 | if ( /Total/ ) { next; } 21 | ##from two-column association list: drug \t class 22 | my $drug = $line[0]; 23 | my $class = $line[1]; 24 | $class =~ s/\"//g; 25 | $class =~ s/\;/\|/g; 26 | if ( $class ) { 27 | $classes{$drug}{$class} = 1; 28 | } else { 29 | $classes{$drug}{"Unclassified"} = 1; 30 | } 31 | } 32 | close IN; 33 | 34 | my $unclassified = "Unclassified"; 35 | my $notdrug = "NA"; 36 | my %listclasses; 37 | open ( IN , "<$clusters" ) or die "Cannot open $clusters: $!"; 38 | while ( ) { 39 | chomp; if ( /Cluster/ ) { next; } 40 | my @line = split( "\t" , $_ ); 41 | 42 | #print join( "\t" , @line )."\n"; 43 | my $id = $line[0]; 44 | my $drug = $line[1]; 45 | my $gene = $line[1]; 46 | my $AA = $line[2]; 47 | if ( exists $classes{$drug} ) { 48 | my @class = sort keys %{$classes{$drug}}; 49 | $clusters{$id}{$drug} = join( "\t" , ( @line , join( "|" , @class ) ) ); 50 | } else { 51 | if ( $AA =~ /^p\./ ) { 52 | $clusters{$id}{$gene.$AA} = join( "\t" , ( @line , $notdrug ) ); 53 | } else { 54 | $clusters{$id}{$gene.$AA} = join( "\t" , ( @line , $unclassified ) ); 55 | } 56 | } 57 | } 58 | close IN; 59 | 60 | open ( OUT , ">$output" ); 61 | foreach my $id ( keys %clusters ) { 62 | foreach my $spec ( keys %{$clusters{$id}} ) { 63 | print OUT $clusters{$id}{$spec}."\n"; 64 | } 65 | } 66 | close OUT; 67 | 68 | #open ( OUT , ">$output.table" ); 69 | #print OUT "Drug\tClass\n"; 70 | #foreach my $drug ( keys %classes ) { 71 | # foreach my $class ( keys %{$classes{$drug}} ) { 72 | # print OUT join( "\t" , ( $drug , $class ) )."\n"; 73 | # } 74 | #} 75 | #close OUT; 76 | -------------------------------------------------------------------------------- /scripts/annotate_clusters_families.pl: 
--------------------------------------------------------------------------------
#!/bin/perl
#15 March 2015 - Adam D Scott -
#
# Appends an HGNC_Gene_Families column to a .clusters file and prints a
# per-family mutation summary to STDOUT.

use strict;
use warnings;

my $usage = 'perl annotate_clusters_families.pl <families> <clusters> <output>
';

die $usage unless @ARGV == 3;
my ( $families , $clusters , $output ) = @ARGV;

my %families;    # gene => { family => 1 }
my %fams;        # family => summed value of clusters column 7
open ( my $families_fh , '<' , $families ) or die "Cannot open $families: $!";
while ( my $row = <$families_fh> ) {
    chomp $row;
    my @line = split "\t" , $row;

    # gene is the 2nd column, family name the 11th
    my $gene = $line[1];
    my $family = $line[10];
    $families{$gene}{$family} = 1;
    $fams{$family} = 0;
}
close $families_fh;

# NOTE(review): %counted is never filled, so $total is incremented for every
# p. row rather than once per gene - confirm which denominator is intended.
my %counted;
my $unclassified = "Unclassified";
my $notgene = "NA";
my $total = 0;
open ( my $out_fh , '>' , $output ) or die "Cannot open $output: $!";
open ( my $clusters_fh , '<' , $clusters ) or die "Cannot open $clusters: $!";
while ( my $row = <$clusters_fh> ) {
    chomp $row;
    if ( $row =~ /Cluster/ ) {
        print {$out_fh} $row."\tHGNC_Gene_Families\n";
        next;
    }
    my @line = split "\t" , $row;

    my $gene = $line[1];
    my $aachange = $line[2];
    my $mutations = $line[6];
    my @families;
    if ( $aachange =~ /p\./ ) {
        if ( exists $families{$gene} ) {
            @families = sort keys %{$families{$gene}};
            $fams{$_} += $mutations foreach @families;
        } else {
            @families = ( $unclassified );
            $fams{$unclassified} += $mutations;
        }
        if ( not exists $counted{$gene} ) {
            $total++;
        }
    } else {
        # rows without a p. change are not gene mutations
        @families = ( $notgene );
    }
    print {$out_fh} join( "\t" , ( @line , join( "|" , @families ) ) )."\n";
}
close $clusters_fh;
close $out_fh or die "Cannot close $output: $!";

print "Gene_Family\tMutations\tPercentage\n";
foreach my $family ( keys %fams ) {
    my $n = $fams{$family};
    if ( $n > 0 ) {
        my $percent = 100*$n/$total;
        print $family."\t".$n."\t";
        printf "%.3f\n" , $percent;
    }
}

--------------------------------------------------------------------------------
/scripts/clusterPDBPresence.drug.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#24 June 2016 - Adam Scott -
#
# Reports, per PDB structure, which drug/mutation cluster members are present:
# "<output>.chains" has one row per structure/gene/chain/cluster and
# "<output>.xmer" has one row per cluster/structure.

use strict;
use warnings;

use IO::File;
use FileHandle;

my $usage = 'perl clusterPDBPresence.drug.pl <drugportFile> <clustersFile> <output>
';

die $usage unless @ARGV == 3;
my ( $drugportFile , $clustersFile , $output ) = @ARGV;

my $IN1 = FileHandle->new( "$drugportFile" , "r" );
if ( not defined $IN1 ) { die "ADSERROR: Could not open/read $drugportFile\n"; }
my $IN2 = FileHandle->new( "$clustersFile" , "r" );
if ( not defined $IN2 ) { die "ADSERROR: Could not open/read $clustersFile\n"; }
my $OUT1 = FileHandle->new( "$output.chains" , "w" );
if ( not defined $OUT1 ) { die "ADSERROR: Could not open/write $output.chains\n"; }
my $OUT2 = FileHandle->new( "$output.xmer" , "w" );
if ( not defined $OUT2 ) { die "ADSERROR: Could not open/write $output.xmer\n"; }

my %structures;    # gene => drug or mutation => pdb => chain => position
my %represent;     # pdb => gene => chain => cluster => "name:position" => recurrence
while ( my $line = <$IN1> ) {
    chomp( $line );
    next if ( $line =~ /Drug/ );
    #1	Drug
    #3	PDB_ID
    #4	Chain
    #5	Compound_Location
    #7	Gene
    #11	Amino_Acid_Change
    #12	Chain
    #13	Mutation_Location_In_PDB
    #14	Res_Name
    my ( $drug , $pdb , $chain1 , $drugPosition , $gene2 , $mutation2 , $chain2 , $residue2 ) = (split( "\t" , $line ))[0,2,3,4,6,10,11,12];
    # chains arrive wrapped in square brackets
    $chain1 =~ s/\[(.*)\]/$1/;
    $chain2 =~ s/\[(.*)\]/$1/;
    $structures{$gene2}{$drug}{$pdb}{$chain1} = $drugPosition;
    $structures{$gene2}{$mutation2}{$pdb}{$chain2} = $residue2;
}
$IN1->close();

while ( my $line = <$IN2> ) {
    next if $line =~ /Cluster/;
    chomp( $line );
    my ( $cluster , $cgene , $mutation , $recurrence ) = (split( "\t" , $line ))[0,1,2,6];
    # when column 3 is not a protein change the two columns are swapped
    # before the lookup
    if ( $mutation !~ /^p\./ ) {
        ( $cgene , $mutation ) = ( $mutation , $cgene );
    }
    foreach my $pdb ( keys %{$structures{$cgene}{$mutation}} ) {
        foreach my $chain ( keys %{$structures{$cgene}{$mutation}{$pdb}} ) {
            $represent{$pdb}{$cgene}{$chain}{$cluster}{$mutation.":".$structures{$cgene}{$mutation}{$pdb}{$chain}} = $recurrence;
        }
    }
}
$IN2->close();

$OUT1->print( "PDB_ID\tGene\tChain\tCluster\tnMutations\tnResidues\tTotalRecurrence\tMutations|Position\n" );
my %complex;    # cluster => pdb => gene => chain => row written to .chains
foreach my $pdb ( sort keys %represent ) {
    foreach my $gene ( sort keys %{$represent{$pdb}} ) {
        foreach my $chain ( sort keys %{$represent{$pdb}{$gene}} ) {
            foreach my $cluster ( sort keys %{$represent{$pdb}{$gene}{$chain}} ) {
                my $members = $represent{$pdb}{$gene}{$chain}{$cluster};
                my @mutations;
                my %residues;
                my $recurrence = 0;
                foreach my $mutpos ( sort keys %{$members} ) {
                    my ( $mutation , $position ) = split( ":" , $mutpos );
                    # only protein changes add to the recurrence total
                    if ( $mutation =~ m/p\./ ) {
                        $recurrence += $members->{$mutpos};
                    }
                    push @mutations , join( "|" , ( $mutation , $position ) );
                    $residues{$position} = 1;
                }
                my @outline = ( $pdb , $gene , $chain , $cluster , scalar @mutations , scalar keys %residues , $recurrence , join( ";" , @mutations ) );
                $complex{$cluster}{$pdb}{$gene}{$chain} = \@outline;
                $OUT1->print( join( "\t" , @outline )."\n" );
            }
        }
    } #foreach pdb in represent => cluster
} #foreach cluster in represent
$OUT1->close();

# NOTE(review): this header names eight columns but each row below has seven
# (gene and chains share one "gene|chain/chain" field) - confirm the header.
$OUT2->print( "Cluster\tPDB_ID\tGene\tChain\tnMutationsDrugs\tnPositions\tTotalRecurrence\tMutationsDrugs|Position\n" );
foreach my $cluster ( sort keys %complex ) {
    foreach my $pdb ( sort keys %{$complex{$cluster}} ) {
        my ( @mutations , @geneChains );
        my ( $mutations , $residues , $recurrence );
        foreach my $gene ( sort keys %{$complex{$cluster}{$pdb}} ) {
            my @chains;
            foreach my $chain ( sort keys %{$complex{$cluster}{$pdb}{$gene}} ) {
                my @outline = @{$complex{$cluster}{$pdb}{$gene}{$chain}};
                $mutations += $outline[4];
                $residues += $outline[5];
                $recurrence += $outline[6];
                push @chains , $chain;
                # each entry is the chain, a backslash, then that chain's list
                push @mutations , $chain."\\".$outline[-1];
            } #foreach chain
            push @geneChains , $gene."|".join( "/" , @chains );
        } #foreach gene
        my @complexLine = ( $cluster , $pdb , join( ";" , @geneChains ) , $mutations , $residues , $recurrence , join( ";" , @mutations ) );
        $OUT2->print( join( "\t" , @complexLine )."\n" );
    } #foreach pdb
} #foreach cluster
$OUT2->close();

--------------------------------------------------------------------------------
/scripts/clusterPDBPresence.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl 2 | #26 April 2016 - Adam Scott - 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use IO::File; 8 | use FileHandle; 9 | 10 | my $usage = 'perl clusterPDBPresence.pl 11 | '; 12 | 13 | die $usage , unless @ARGV == 3; 14 | my ( $pairwiseFile , $clustersFile , $output ) = @ARGV; 15 | 16 | my $IN1 = FileHandle->new( "$pairwiseFile" , "r" ); 17 | if ( not defined $IN1 ) { die "ADSERROR: Could not open/read $pairwiseFile\n"; } 18 | my $IN2 = FileHandle->new( 
"$clustersFile" , "r" ); 19 | if ( not defined $IN2 ) { die "ADSERROR: Could not open/read $clustersFile\n"; } 20 | my $OUT1 = FileHandle->new( "$output.chains" , "w" ); 21 | if ( not defined $OUT1 ) { die "ADSERROR: Could not open/write $output.chains\n"; } 22 | my $OUT2 = FileHandle->new( "$output.xmer" , "w" ); 23 | if ( not defined $OUT2 ) { die "ADSERROR: Could not open/write $output.xmer\n"; } 24 | 25 | my %structures; 26 | my %represent; 27 | my $minMax = {}; 28 | while ( my $line = <$IN1> ) { 29 | chomp( $line ); 30 | my ( $gene1 , $mutation1 , $chain1 , $residue1 , $gene2 , $mutation2 , $chain2 , $residue2 , $pdbInfos ) = (split( "\t" , $line ))[0,4,5,6,9,13,14,15,19]; 31 | $chain1 =~ s/\[(.*)\]/$1/; 32 | $chain2 =~ s/\[(.*)\]/$1/; 33 | my @pdbInfos = split( '\|' , $pdbInfos ); 34 | foreach my $pdbInfo ( @pdbInfos ) { 35 | my $pdb = (split( ' ' , $pdbInfo ))[1]; 36 | $structures{$gene1}{$mutation1}{$pdb}{$chain1} = $residue1; 37 | $structures{$gene2}{$mutation2}{$pdb}{$chain2} = $residue2; 38 | &checkMin( $minMax , $pdb , $gene1 , $chain1 , $residue1 ); 39 | &checkMin( $minMax , $pdb , $gene2 , $chain2 , $residue2 ); 40 | &checkMax( $minMax , $pdb , $gene1 , $chain1 , $residue1 ); 41 | &checkMax( $minMax , $pdb , $gene2 , $chain2 , $residue2 ); 42 | } #foreach pdbInfo in pdbInfos 43 | } 44 | $IN1->close(); 45 | 46 | sub checkMin { 47 | my ( $minMax , $pdb , $gene , $chain , $residue ) = @_; 48 | return unless ( $residue =~ /^\d+$/ ); 49 | if ( exists $minMax->{$pdb}->{$gene}->{$chain}->{'min'} ) { 50 | if ( $minMax->{$pdb}->{$gene}->{$chain}->{'min'} <= $residue ) { 51 | return; 52 | } 53 | } 54 | $minMax->{$pdb}->{$gene}->{$chain}->{'min'} = $residue; 55 | } 56 | 57 | sub checkMax { 58 | my ( $minMax , $pdb , $gene , $chain , $residue ) = @_; 59 | return unless ( $residue =~ /^\d+$/ ); 60 | if ( exists $minMax->{$pdb}->{$gene}->{$chain}->{'max'} ) { 61 | if ( $minMax->{$pdb}->{$gene}->{$chain}->{'max'} >= $residue ) { 62 | return; 63 | } 64 | } 65 | 
$minMax->{$pdb}->{$gene}->{$chain}->{'max'} = $residue; 66 | } 67 | 68 | while ( my $line = <$IN2> ) { 69 | if ( $line !~ /Cluster/ ) { 70 | chomp( $line ); 71 | my ( $cluster , $cgene , $mutation , $recurrence ) = (split( "\t" , $line ))[0,1,2,6]; 72 | foreach my $pdb ( keys %{$structures{$cgene}{$mutation}} ) { 73 | foreach my $chain ( keys %{$structures{$cgene}{$mutation}{$pdb}} ) { 74 | $represent{$pdb}{$cgene}{$chain}{$cluster}{$mutation.":".$structures{$cgene}{$mutation}{$pdb}{$chain}} = $recurrence; 75 | } 76 | } 77 | } 78 | } 79 | $IN2->close(); 80 | 81 | $OUT1->print( "PDB_ID\tGene\tChain\tCluster\tMinResidue\tMaxResidue\tnMutations\tnResidues\tTotalRecurrence\tMutations|Position\n" ); 82 | my %complex; 83 | foreach my $pdb ( sort keys %represent ) { 84 | foreach my $gene ( sort keys %{$represent{$pdb}} ) { 85 | foreach my $chain ( sort keys %{$represent{$pdb}{$gene}} ) { 86 | $chain =~ m/\[(.*)\]/; 87 | foreach my $cluster ( sort keys %{$represent{$pdb}{$gene}{$chain}} ) { 88 | my ( $mutres , $mutation , $position ); 89 | my @mutations; 90 | my %residues; 91 | my $recurrence = 0; 92 | foreach my $mutpos ( sort keys %{$represent{$pdb}{$gene}{$chain}{$cluster}} ) { 93 | ( $mutation , $position ) = split( ":" , $mutpos ); 94 | $recurrence += $represent{$pdb}{$gene}{$chain}{$cluster}{$mutpos}; 95 | $mutres = join( "|" , ( $mutation , $position ) ); 96 | push @mutations , $mutres; 97 | $residues{$position} = 1; 98 | } 99 | my @logline = ( $pdb , $gene , $chain , $cluster , $mutation , $position , $recurrence ); 100 | print join( "\t" , @logline )."\n"; 101 | if ( exists $minMax->{$pdb}->{$gene}->{$chain} ) { 102 | my $min = $minMax->{$pdb}->{$gene}->{$chain}->{'min'}; 103 | my $max = $minMax->{$pdb}->{$gene}->{$chain}->{'max'}; 104 | my @outline = ( $pdb , $gene , $chain , $cluster , $min , $max , scalar @mutations , scalar keys %residues , $recurrence , join( ";" , @mutations ) ); 105 | $complex{$cluster}{$pdb}{$gene}{$chain} = \@outline; 106 | $OUT1->print( 
join( "\t" , @outline )."\n" ); 107 | } 108 | } 109 | } 110 | } #foreach pdb in represent => cluster 111 | } #foreach cluster in represent 112 | $OUT1->close(); 113 | 114 | $OUT2->print( "Cluster\tPDB_ID\tGene\tChain\tnMutations\tnResidues\tTotalRecurrence\tMutations|Position\n" ); 115 | foreach my $cluster ( sort keys %complex ) { 116 | foreach my $pdb ( sort keys %{$complex{$cluster}} ) { 117 | my ( @mutations , @geneChains ); 118 | my ( $mutations , $residues , $recurrence ); 119 | foreach my $gene ( sort keys %{$complex{$cluster}{$pdb}} ) { 120 | my @chains; 121 | foreach my $chain ( sort keys %{$complex{$cluster}{$pdb}{$gene}} ) { 122 | my @outline = @{$complex{$cluster}{$pdb}{$gene}{$chain}}; 123 | $mutations += $outline[6]; 124 | $residues += $outline[7]; 125 | $recurrence += $outline[8]; 126 | push @chains , $chain; 127 | push @mutations , $chains[-1]."\\"; 128 | $mutations[-1] .= $outline[-1]; 129 | } #foreach chain 130 | push @geneChains , $gene."|".join( "/" , @chains ); 131 | } #foreach gene 132 | my @complexLine = ( $cluster , $pdb , join( ";" , @geneChains ) , $mutations , $residues , $recurrence , join( ";" , @mutations ) ); 133 | $OUT2->print( join( "\t" , @complexLine )."\n" ); 134 | } #foreach pdb 135 | } #foreach cluster 136 | $OUT2->close(); 137 | -------------------------------------------------------------------------------- /scripts/determine_transcript_lengths.pl: -------------------------------------------------------------------------------- 1 | #!/bin/perl 2 | #10 February 2015 - Adam D Scott - 3 | 4 | use strict; 5 | use warnings; 6 | 7 | my $usage = 'perl determine_transcript_lengths.pl 8 | '; 9 | 10 | die $usage , unless @ARGV == 1; 11 | my ( $gtf ) = @ARGV; 12 | 13 | my %transcripts; 14 | 15 | open ( IN , "<$gtf" ) or die "Cannot open $gtf: $!"; 16 | while ( ) 17 | { 18 | chomp; 19 | my @line = split( "\t" , $_ ); 20 | 21 | if ( $line[2] eq "CDS" ) 22 | { 23 | my $length = $line[4] - $line[3]; 24 | my $gene = $line[-1]; 25 | $gene =~ 
s/.*gene_name \"([\w\\\$.-]+)\"; gene_biotype.*/$1/; 26 | my $trans = $line[-1]; 27 | $trans =~ s/.*transcript_id \"(ENST\d+)\"; exon_number.*/$1/; 28 | my $exon = $line[-1]; 29 | $exon =~ s/.*exon_number \"(\d+)\"\; gene_name.*/$1/; 30 | #print $gene."\t".$trans."\t".$exon."\t".$length."\n"; 31 | $transcripts{$gene}{$trans}{$exon} = $length+1; 32 | } 33 | } 34 | close IN; 35 | 36 | foreach my $gene ( sort keys %transcripts ) 37 | { 38 | foreach my $trans ( sort keys %{$transcripts{$gene}} ) 39 | { 40 | my $length = 0; 41 | foreach my $exon ( keys %{$transcripts{$gene}{$trans}} ) 42 | { 43 | $length += $transcripts{$gene}{$trans}{$exon}; 44 | } 45 | print $gene."\t".$trans."\t".$length."\n"; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scripts/filter_PDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # May 2017 Kuan-lin Huang 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use IO::File; 8 | use FileHandle; 9 | 10 | my $usage = 'perl filter_PDB.pl 11 | '; 12 | 13 | die $usage , unless @ARGV == 2; 14 | 15 | my $pairwise = shift; 16 | my $pass_PDB = shift; 17 | my $pass_pairwise_out = $pairwise.".pass"; 18 | 19 | # pass PDB structures 20 | my $pass_PDB_structures = parse_pass_PDB($pass_PDB); 21 | # valid phosphosites 22 | my %phosphorylated_aa = ('S' => 1,'T' => 1,'Y' => 1,'D' => 1,'H' => 1); 23 | 24 | open ( IN , "<$pairwise" ) or die "Cannot open $pairwise: $!"; 25 | my $OUT = FileHandle->new( $pass_pairwise_out , "w" ); 26 | if ( not defined $OUT ) { die "ADSERROR: Could not open/write $pass_pairwise_out\n"; } 27 | 28 | my $head = ; 29 | chomp $head; 30 | #print "$head\n"; 31 | 32 | my $hasBadSites = {}; 33 | while ( ) 34 | { 35 | chomp; 36 | my @line = split "\t" , $_; 37 | # filter out structures; include only passing structures 38 | my $structureFail = 0; 39 | foreach my $info ( split /\|/ , $line[-1] ) { 40 | my $PDB_structure = ( split( /\ / , 
$info ) )[1]; 41 | if ( not exists($pass_PDB_structures->{$PDB_structure} ) ) { 42 | print STDERR "Discard pairs due to not-passed PDB structure: $PDB_structure\n"; 43 | print STDERR "$_\n"; 44 | $structureFail = 1; 45 | last; 46 | } 47 | } 48 | next if ( $structureFail ); 49 | 50 | # filter out phosphosites that is not S, T, Y, D, or H (known phosphorylated aa in human) 51 | # musites columns 12,15; sites columns 3,6,11,14 52 | if ( $pairwise =~ /\.musites$/ ) { 53 | my ( $site2 , $feature2 ) = @line[12,15]; 54 | my $ok = &filterSite( $site2 , $feature2 , $_ , $hasBadSites ); 55 | if ( not $ok ) { next; } 56 | } elsif ( $pairwise =~ /\.sites$/ ) { 57 | my ( $site1 , $feature1 , $site2 , $feature2 ) = @line[3,6,11,14]; 58 | my $ok = &filterSite( $site1 , $feature1 , $_ , $hasBadSites ); 59 | if ( not $ok ) { next; } 60 | $ok = &filterSite( $site2 , $feature2 , $_ , $hasBadSites ); 61 | if ( not $ok ) { next; } 62 | } 63 | 64 | #print "$_\n"; 65 | print $OUT "$_\n"; 66 | } 67 | 68 | close(IN); 69 | $OUT->close(); 70 | 71 | print STDOUT "# These structures had bad sites but were not filtered by PDB ID:\n"; 72 | foreach my $pdb ( sort keys %{$hasBadSites} ) { 73 | print STDOUT $pdb."\n"; 74 | } 75 | 76 | sub filterSite { 77 | my ( $site , $feature , $line , $hasBadSites ) = @_; 78 | my $aa = substr( $site , 2 , 1 ); 79 | if ( $feature =~ /Phospho/ && !exists( $phosphorylated_aa{$aa} ) ) { 80 | my $distInfo = ( split( /\t/ , $line ) )[-1]; 81 | foreach my $info ( split( /\|/ , $distInfo ) ) { 82 | print STDERR $info."\n"; 83 | my $pdb = ( split( /\ / , $info ) )[1]; 84 | $hasBadSites->{$pdb} = 1; 85 | } 86 | print STDERR "Discard phosphosite due to non-phosphorylated amino acid: $aa\n"; 87 | print STDERR "$line\n"; 88 | return 0; 89 | } 90 | return 1; 91 | } 92 | 93 | sub parse_pass_PDB { 94 | my $pass_PDB =shift; 95 | my $pass_PDB_structures ={}; 96 | open(FILE, $pass_PDB ) or die "Unable to open file $pass_PDB due to $!"; 97 | while() { 98 | chomp; 99 | my @line = 
split "\t" , $_ ; 100 | next if ( scalar @line < 4 ); 101 | my $structure = $line[2]; 102 | $pass_PDB_structures->{$structure}=1; 103 | } 104 | close FILE; 105 | return $pass_PDB_structures; 106 | } 107 | -------------------------------------------------------------------------------- /scripts/genePDBPresence.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #26 April 2016 - Adam Scott - 3 | # customized to genes (instead of cluster) by Kuan Jan. 2017 4 | 5 | use strict; 6 | use warnings; 7 | 8 | use IO::File; 9 | use FileHandle; 10 | 11 | my $usage = 'perl clusterPDBPresence.pl 12 | '; 13 | 14 | die $usage , unless @ARGV == 2; 15 | my ( $pairwiseFile , $output ) = @ARGV; 16 | 17 | my $IN1 = FileHandle->new( "$pairwiseFile" , "r" ); 18 | if ( not defined $IN1 ) { die "ADSERROR: Could not open/read $pairwiseFile\n"; } 19 | # my $IN2 = FileHandle->new( "$clustersFile" , "r" ); 20 | # if ( not defined $IN2 ) { die "ADSERROR: Could not open/read $clustersFile\n"; } 21 | my $OUT1 = FileHandle->new( "$output.gene.chains" , "w" ); 22 | if ( not defined $OUT1 ) { die "ADSERROR: Could not open/write $output.gene.chains\n"; } 23 | my $OUT2 = FileHandle->new( "$output.gene.xmer" , "w" ); 24 | if ( not defined $OUT2 ) { die "ADSERROR: Could not open/write $output.gene.xmer\n"; } 25 | 26 | my %structures; 27 | my %represent; 28 | my $minMax = {}; 29 | while ( my $line = <$IN1> ) { 30 | chomp( $line ); 31 | my ( $gene1 , $mutation1 , $chain1 , $residue1 , $gene2 , $mutation2 , $chain2 , $residue2 , $pdbInfos ) = (split( "\t" , $line ))[0,4,5,6,9,13,14,15,19]; 32 | $chain1 =~ s/\[(.*)\]/$1/; 33 | $chain2 =~ s/\[(.*)\]/$1/; 34 | my @pdbInfos = split( '\|' , $pdbInfos ); 35 | foreach my $pdbInfo ( @pdbInfos ) { 36 | my $pdb = (split( ' ' , $pdbInfo ))[1]; 37 | $structures{$gene1}{$mutation1}{$pdb}{$chain1} = $residue1; 38 | $structures{$gene2}{$mutation2}{$pdb}{$chain2} = $residue2; 39 | &checkMin( $minMax , $pdb , $gene1 , 
$chain1 , $residue1 ); 40 | &checkMin( $minMax , $pdb , $gene2 , $chain2 , $residue2 ); 41 | &checkMax( $minMax , $pdb , $gene1 , $chain1 , $residue1 ); 42 | &checkMax( $minMax , $pdb , $gene2 , $chain2 , $residue2 ); 43 | 44 | $represent{$pdb}{$gene1}{$chain1}{$mutation1.":".$residue1} = 1; 45 | $represent{$pdb}{$gene2}{$chain2}{$mutation2.":".$residue2} = 1; 46 | } #foreach pdbInfo in pdbInfos 47 | 48 | } 49 | $IN1->close(); 50 | 51 | sub checkMin { 52 | my ( $minMax , $pdb , $gene , $chain , $residue ) = @_; 53 | return unless ( $residue =~ /^\d+$/ ); 54 | if ( exists $minMax->{$pdb}->{$gene}->{$chain}->{'min'} ) { 55 | if ( $minMax->{$pdb}->{$gene}->{$chain}->{'min'} <= $residue ) { 56 | return; 57 | } 58 | } 59 | $minMax->{$pdb}->{$gene}->{$chain}->{'min'} = $residue; 60 | } 61 | 62 | sub checkMax { 63 | my ( $minMax , $pdb , $gene , $chain , $residue ) = @_; 64 | return unless ( $residue =~ /^\d+$/ ); 65 | if ( exists $minMax->{$pdb}->{$gene}->{$chain}->{'max'} ) { 66 | if ( $minMax->{$pdb}->{$gene}->{$chain}->{'max'} >= $residue ) { 67 | return; 68 | } 69 | } 70 | $minMax->{$pdb}->{$gene}->{$chain}->{'max'} = $residue; 71 | } 72 | 73 | $OUT1->print( "PDB_ID\tGene\tChain\tMinResidue\tMaxResidue\tnUniqMutations\tnUniqResidues\tMutations|Position\n" ); 74 | my %complex; 75 | foreach my $pdb ( sort keys %represent ) { 76 | foreach my $gene ( sort keys %{$represent{$pdb}} ) { 77 | foreach my $chain ( sort keys %{$represent{$pdb}{$gene}} ) { 78 | $chain =~ m/\[(.*)\]/; 79 | #foreach my $cluster ( sort keys %{$represent{$pdb}{$gene}{$chain}} ) { 80 | my ( $mutres , $mutation , $position ); 81 | my @mutations; 82 | my %residues; 83 | my $recurrence = 0; 84 | foreach my $mutpos ( sort keys %{$represent{$pdb}{$gene}{$chain}} ) { 85 | ( $mutation , $position ) = split( ":" , $mutpos ); 86 | $recurrence += $represent{$pdb}{$gene}{$chain}{$mutpos}; 87 | $mutres = join( "|" , ( $mutation , $position ) ); 88 | push @mutations , $mutres; 89 | $residues{$position} = 1; 90 | 
} 91 | my @logline = ( $pdb , $gene , $chain , $mutation , $position , $recurrence ); 92 | print join( "\t" , @logline )."\n"; 93 | if ( exists $minMax->{$pdb}->{$gene}->{$chain} ) { 94 | my $min = $minMax->{$pdb}->{$gene}->{$chain}->{'min'}; 95 | my $max = $minMax->{$pdb}->{$gene}->{$chain}->{'max'}; 96 | my @outline = ( $pdb , $gene , $chain , $min , $max , scalar @mutations , scalar keys %residues , join( ";" , @mutations ) ); 97 | $complex{$pdb}{$gene}{$chain} = \@outline; 98 | $OUT1->print( join( "\t" , @outline )."\n" ); 99 | } 100 | #} 101 | } 102 | } #foreach pdb in represent => cluster 103 | } #foreach cluster in represent 104 | $OUT1->close(); 105 | 106 | $OUT2->print( "PDB_ID\tGene\tChain\tnUniqMutations\tnUniqResidues\tMutations|Position\n" ); 107 | #foreach my $cluster ( sort keys %complex ) { 108 | foreach my $pdb ( sort keys %complex ) { 109 | my ( @mutations , @geneChains ); 110 | my ( $mutations , $residues , $recurrence ); 111 | foreach my $gene ( sort keys %{$complex{$pdb}} ) { 112 | my @chains; 113 | foreach my $chain ( sort keys %{$complex{$pdb}{$gene}} ) { 114 | my @outline = @{$complex{$pdb}{$gene}{$chain}}; 115 | $mutations += $outline[5]; 116 | $residues += $outline[6]; 117 | push @chains , $chain; 118 | push @mutations , $chains[-1]."\\"; 119 | $mutations[-1] .= $outline[-1]; 120 | } #foreach chain 121 | push @geneChains , $gene."|".join( "/" , @chains ); 122 | } #foreach gene 123 | my @complexLine = ( $pdb , join( ";" , @geneChains ) , $mutations , $residues , join( ";" , @mutations ) ); 124 | $OUT2->print( join( "\t" , @complexLine )."\n" ); 125 | } #foreach pdb 126 | #} #foreach cluster 127 | $OUT2->close(); 128 | -------------------------------------------------------------------------------- /scripts/hotspot3d.main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Adam D Scott (adamscott@wustl.edu) 3 | # Wrapper for all Main steps of HotSpot3D except for visual module. 
4 | 5 | function run { 6 | maf=$1 7 | dataDir=$2 8 | prefix=$3 9 | 10 | pairwiseFile="${prefix}.pairwise" 11 | intraFile="${prefix}.pairwise.singleprotein.collapsed" 12 | interFile="${prefix}.pairwise.complex.collapsed" 13 | clustersIntraFile="${intraFile}.clusters" 14 | clustersInterFile="${interFile}.clusters" 15 | summaryIntraFile="${clustersIntraFile}.summary" 16 | summaryInterFile="${clustersInterFile}.summary" 17 | 18 | echo "hotspot3d search --maf-file ${maf} --data-dir ${dataDir} --output-prefix ${prefix} 1> ${prefix}.search.out 2> ${prefix}.search.err" 19 | hotspot3d search --maf-file ${maf} --data-dir ${dataDir} --output-prefix ${prefix} 1> ${prefix}.search.out 2> ${prefix}.search.err 20 | 21 | echo "hotspot3d post --maf-file ${maf} --input-prefix ${prefix} 1> ${prefix}.post.out 2>${prefix}.post.err" 22 | hotspot3d post --maf-file ${maf} --input-prefix ${prefix} 1> ${prefix}.post.out 2>${prefix}.post.err 23 | 24 | echo "hotspot3d cluster --collapsed-pairs-file ${intraFile} --pairwise-file ${pairwiseFile} --output-file ${clustersIntraFile} --maf-file ${maf} 1> ${prefix}.cluster.out 2> ${prefix}.cluster.err" 25 | hotspot3d cluster --collapsed-pairs-file ${intraFile} --pairwise-file ${pairwiseFile} --output-file ${clustersIntraFile} --maf-file ${maf} 1> ${prefix}.cluster.out 2> ${prefix}.cluster.err 26 | 27 | echo "hotspot3d summary --clusters-file ${clustersIntraFile} --output-file ${summaryIntraFile} 1> ${prefix}.summary.out 2> ${prefix}.summary.err" 28 | hotspot3d summary --clusters-file ${clustersIntraFile} --output-file ${summaryIntraFile} 1> ${prefix}.summary.out 2> ${prefix}.summary.err 29 | } 30 | 31 | defaultOut="hotspot3d.results" 32 | defaultDir="./" 33 | if [ ! -z $1 ]; then 34 | if [ ! -z $2 ]; then 35 | if [ ! 
-z $3 ]; then 36 | run $1 $2 $3 37 | else 38 | run $1 $2 ${defaultOut} 39 | fi 40 | else 41 | run $1 ${defaultDir} ${defaultOut} 42 | fi 43 | else 44 | echo "bash $0 /preprocessing-dir/ \"output-prefix\"" 45 | fi 46 | -------------------------------------------------------------------------------- /scripts/nStructures.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #08 February 2017 - Adam D Scott - 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use IO::File; 8 | use FileHandle; 9 | 10 | my $usage = 'perl nStructures.pl 11 | '; 12 | 13 | die $usage , unless @ARGV == 2; 14 | my ( $hupt , $output ) = @ARGV; 15 | 16 | my $IN1 = FileHandle->new( "$hupt" , "r" ); 17 | if ( not defined $IN1 ) { die "ADSERROR: Could not open/read $hupt\n"; } 18 | 19 | my $OUT = FileHandle->new( "$output" , "w" ); 20 | if ( not defined $OUT ) { die "ADSERROR: Could not open/write $output\n"; } 21 | 22 | my $nStructures = 0; 23 | my $nGenesWithStructure = 0; 24 | my $nGenes = 0; 25 | while ( my $line = <$IN1> ) { 26 | chomp( $line ); 27 | my @line = split( /\t/ , $line ); 28 | next if ( $line[2] eq "N/A" ); 29 | my @pdbs = split( /\ / , $line[2] ); 30 | my $nStructureForGene = scalar @pdbs; 31 | if ( $nStructureForGene > 0 ) { 32 | $nGenesWithStructure += 1; 33 | $nStructures += $nStructureForGene; 34 | $OUT->print( join( "\t" , ( $line[0] , $nStructureForGene ) )."\n" ); 35 | } 36 | } 37 | $IN1->close(); 38 | $OUT->close(); 39 | print join( "\t" , ( $nStructures , $nGenesWithStructure , ( $nStructures / $nGenesWithStructure ) ) )."\n"; 40 | -------------------------------------------------------------------------------- /t/foo.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Test::Most; 5 | use TGI::Mutpro::Preprocess::Complicated; 6 | 7 | TGI::Mutpro::Preprocess::Complicated::print_stuff(); 8 | ok(1, '1 is definitely okay'); 9 | 10 | done_testing; 11 
| 12 | --------------------------------------------------------------------------------