├── README.md ├── analysis ├── CNV │ ├── analyzeCNV.R │ └── cnvExpressAssoc.R ├── CharGer_analysis │ ├── PCGP │ │ ├── liftover_CharGer_result_PCGP.py │ │ ├── plot_Charged_PCGP.R │ │ ├── run_charger.sh │ │ └── work.log.sh │ └── append_readcounts │ │ └── append_readcounts.pl ├── LOH │ ├── plotPathVarLOH.R │ ├── prepFileForAllelicImbalance.pl │ └── pvalue_per_site_unified_background.pl ├── PCA_IBD_dist │ ├── plot_relatedness.R │ └── run_plink_pca.ibd.rel.sh ├── README.md ├── RPPA_effect │ ├── PathVarRPPAAssoc.R │ ├── RPPA_effect.R │ ├── plotPathVarRPPA.R │ └── plotPathVarRPPAassoc.R ├── association_test │ ├── TFT_functions.R │ ├── analyzeAssocTFT.DDR.R │ ├── analyzeAssocTFT.R │ ├── plotPathVarAssoc.R │ ├── postProcessAssoc.R │ ├── qqman.R │ ├── single_var_association.py │ ├── single_var_association_ethni.py │ └── work.log.sh ├── burden_assoc │ ├── TFT.R │ ├── analyze_path_burden.R │ └── label_onco_var_ExAC.R ├── clinical_association │ ├── PathVarAAOassoc.R │ ├── PathVarEthnicStats.R │ └── plotPathVarEthnic.R ├── data_integration │ ├── combine_data.R │ ├── integrative_analysis.R │ └── pathVar_integrative_analysis.R ├── dependency_files.R ├── expression_effect │ ├── PathVarExpressAssoc.R │ ├── expression_effect.R │ ├── plotPathVarExpressAssoc.R │ └── plotPathVarExpression.R ├── family_history │ └── fam_history.R ├── functional_assay │ └── plot_result.R ├── gene_list │ └── examine_gene_list.R ├── global_aes_out.R ├── hotspot3d │ ├── cluster_analysis.R │ ├── spotlightVar.R │ └── work.log.sh ├── mutation_signature │ ├── 1_germlineVsMutationSiganture.R │ ├── 2_plotPathVarMutsigAssoc.R │ ├── 3_somaticVsMutationSiganture.R │ └── 4_plotSomaticMutsigAssoc.R ├── nominate_variants │ └── nominateVars.R ├── pathogenic_variants │ ├── label_onco_var.R │ └── plot_path_var_dist.R ├── pleiotropy │ └── pleiotropy.R ├── process_files │ ├── genotype │ │ ├── cancer_type.txt │ │ ├── merge_genotype_by_cancer.sh │ │ └── merge_log_gcloud.sh │ └── germline │ │ ├── google-cloud-ISB │ │ ├── 
README.md │ │ ├── analysisID_lists │ │ │ ├── ACC.ids │ │ │ ├── BLCA.ids │ │ │ ├── BRCA.ids │ │ │ ├── CESC.ids │ │ │ ├── CHOL.ids │ │ │ ├── COAD.ids │ │ │ ├── DLBC.ids │ │ │ ├── ESCA.ids │ │ │ ├── GBM.ids │ │ │ ├── HNSC.ids │ │ │ ├── KICH.ids │ │ │ ├── KIRC.ids │ │ │ ├── KIRP.ids │ │ │ ├── LGG.ids │ │ │ ├── LIHC.ids │ │ │ ├── LUAD.ids │ │ │ ├── LUSC.ids │ │ │ ├── MESO.ids │ │ │ ├── OV.ids │ │ │ ├── PAAD.ids │ │ │ ├── PCPG.ids │ │ │ ├── PRAD.ids │ │ │ ├── READ.ids │ │ │ ├── SARC.ids │ │ │ ├── SKCM.ids │ │ │ ├── STAD.ids │ │ │ ├── TGCT.ids │ │ │ ├── THCA.ids │ │ │ ├── THYM.ids │ │ │ ├── UCEC.ids │ │ │ ├── UCS.ids │ │ │ └── UVM.ids │ │ ├── annotate.not-in-exac.sh │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ ├── ExAC_config.toml │ │ │ ├── filter_VCF_AF_AD.py │ │ │ └── variant_QC_annotation.sh │ │ ├── make_lists.sh │ │ └── unused │ │ │ └── Dockerfile │ │ ├── local │ │ ├── ExAC_config.toml │ │ ├── calc_vcf_concordance.py │ │ ├── combine_CharGer2VCF.py │ │ ├── create_ROI_genotype_VCF.sh │ │ ├── expand_csq.py │ │ ├── filter_VCF_AD.py │ │ ├── filter_VCF_AF_AD.py │ │ ├── filter_VCF_AF_AD_keepExAConly.py │ │ ├── filter_merge_germline_by_cancer.sh │ │ ├── filter_merge_germline_by_sample.sh │ │ ├── make.bsub.commands.sh │ │ ├── post_CharGer.sh │ │ ├── recalc_AF_PM2.py │ │ ├── replace_vcf_header_sample_with_source_TCGA.pl │ │ ├── run_VEP.v85.sh │ │ ├── run_calc_vcf_concordance.sh │ │ ├── run_charger_on_vep_VCF.sh │ │ ├── update_vcfHeader_to_TCGA.sh │ │ ├── variant_QC_annotation.sh │ │ └── work.log.sh │ │ ├── merge_germline_cloud.sh │ │ ├── readme.txt │ │ └── var_freq │ │ ├── batch_run_vcf_var_freq_filter.sh │ │ ├── run_vcf_var_freq_filter.sh │ │ └── vcf_var_freq_filter.pl ├── sample_listing │ ├── compile_compare_samples.R │ └── make_clin_summary_table.R ├── segregation_analysis │ ├── batch_run_segregation.sh │ ├── find_relatives.R │ ├── find_segregating_var.sh │ ├── find_shared_var_relatives.py │ └── segregation.log.sh └── variant_QC │ ├── batch_run_pseq_stats.sh │ ├── 
batch_run_pseq_vcfstats.sh │ ├── batch_run_vcfstats.sh │ ├── plot_concordance.R │ ├── plot_pseq_stats.R │ ├── run_pseq_stats.sh │ └── separate_batch_ethnicity_effects.R ├── doc ├── 20170118_TCGA_Germline_Abstract.docx └── notes.txt ├── germline_somatic_analysis ├── dependency_files.R ├── global_aes_out.R ├── load_somatic.R ├── mutation_signature │ ├── 1_germlineVsMutationSiganture.R │ ├── 2_plotPathVarMutsigAssoc.R │ ├── 3_somaticVsMutationSiganture.R │ └── 4_plotSomaticMutsigAssoc.R ├── somatic_germline │ ├── plotSomaticGermline.R │ ├── plotSomaticGermlineByCancer.R │ ├── somaticDriver_germline_fisher.R │ ├── somaticDriver_germline_fisher_byCancer.R │ └── somatic_germline_fisher.R └── somatic_germline_overlap │ └── somatic_germline_overlap_genes.R └── util └── edit_vcf_samplenames ├── list_vcf_source-sample_pairs.pl ├── replace_vcf_header_sample_with_source.pl ├── uniqueify_merged_samplenames.driver.sh └── uniqueify_merged_samplenames.template.sh /README.md: -------------------------------------------------------------------------------- 1 | # PanCanAtlasGermline # 2 | 3 | ## Pathogenic Germline Variants in 10,389 Adult Cancers ## 4 | >The TCGA PanCanAtlas germline analysis working group is investigating germline variants in the largest sequencing cohort of cancer to date: 10,389 cases in 33 cancer types. 5 | ```Pathogenic Germline Variants in 10,389 Adult Cancers. 6 | 7 | Huang KL, Mashl RJ, Wu Y, Ritter DI, Wang J, Oh C, Paczkowska M, Reynolds S, Wyczalkowski MA, Oak N, Scott AD, Krassowski M, Cherniack AD, Houlahan KE, Jayasinghe R, Wang LB, Zhou DC, Liu D, Cao S, Kim YW, Koire A, McMichael JF, Hucthagowder V, Kim TB, Hahn A, Wang C, McLellan MD, Al-Mulla F, Johnson KJ; Cancer Genome Atlas Research Network, Lichtarge O, Boutros PC, Raphael B, Lazar AJ, Zhang W, Wendl MC, Govindan R, Jain S, Wheeler D, Kulkarni S, Dipersio JF, Reimand J, Meric-Bernstam F, Chen K, Shmulevich I, Plon SE, Chen F, Ding L. 8 | 9 | Cell. 2018 Apr 5;173(2):355-370.e14. 
doi: 10.1016/j.cell.2018.03.039. 10 | 11 | PMID: 29625052 12 | ``` 13 | 14 | ## Data Access ## 15 | >De-identified variant-level data for prioritized VUS and pathogenic variants: Table S2 of the publication. 16 | 17 | >The protected variants+sample ID and the full callset (Authorized User only): 18 | 19 | >GDC link: https://gdc.cancer.gov/about-data/publications/PanCanAtlas-Germline-AWG 20 | 21 | >Compressed VCF file of the combined, filtered variant calls using GATK, VarScan2, and Pindel on WES data of the 10,389 final passed-QC samples. - PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.vcf.gz 22 | 23 | >Tabix file of the compressed VCF file of the combined, filtered variant calls using GATK, VarScan2, and Pindel on WES data of the 10,389 final passed-QC samples. - PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.vcf.gz.tbi 24 | 25 | >Prioritized, cancer-related variants discovered in 10,389 cases. Please use the "Overall_Classification" column to distinguish between Pathogenic, Likely Pathogenic and Prioritized VUSs. - PCA_pathVar_integrated_filtered_adjusted.tsv 26 | 27 | ## PanCanAtlas Germline Working Group info (for members) ## 28 | >Wiki: https://wiki.nci.nih.gov/display/TCGAM/PanCanAtlas+Germline+AWG 29 | >Synapse: syn4602499 30 | 31 | ## Directory set-up ## 32 | > The analyses directory should be set up to sit alongside the TCGA_data directory to allow proper data sourcing. Due to the protected nature of some of those files, they are not shared publicly. 
#!/bin/python
#03 February 2016 - Kuan-Lin Huang @ WashU -

import sys
import getopt


def main():
    """Append CharGer annotations to a tab-delimited variant table.

    Command line (read from sys.argv):
        argv[1]  CharGer result TSV. The first line is its header; every other
                 line is indexed by columns 1, 2, 4 and 5 (0-based) joined
                 with "_".
        argv[2]  Variant table TSV. The first line is its header; every other
                 line is keyed by columns 16-19 (0-based), with any "chr"
                 prefix removed from column 16.

    Writes to stdout the first 35 columns of the variant table followed by the
    matching CharGer line (empty when there is no match). Rows of the variant
    table with 20 or fewer columns are dropped.

    Exits through sys.exit() with a message when an argument is missing or a
    file cannot be opened.
    """

    def usage():
        # print(...) with one parenthesised argument behaves the same under
        # Python 2 and Python 3.
        print("""
        liftover_CharGer_result_PCGP.py : append CharGer results to a variant table

        USAGE: liftover_CharGer_result_PCGP.py [-h] <CharGer result TSV> <variant table TSV>
         -h    print this message
        """)

    def open_or_exit(path):
        # Stop right away with the real file name instead of printing a
        # message and then failing later on an undefined file handle.
        try:
            return open(path, "r")
        except IOError:
            sys.exit("File " + path + " does not exist!")

    args = sys.argv[1:]
    if len(args) > 0 and args[0] == "-h":
        usage()
        sys.exit()
    # both the CharGer result and the variant table are required
    if len(args) < 2:
        usage()
        sys.exit("input file missing")

    # index the CharGer result by chrom_start_ref_alt
    charGerF = open_or_exit(args[0])
    # Only the line terminator is removed: a plain strip() would also eat
    # leading/trailing tabs (empty columns) and shift every column index.
    CharGerHeader = charGerF.readline().rstrip("\r\n")
    varCharGer = {}
    for line in charGerF:
        line = line.rstrip("\r\n")
        F = line.split("\t")
        if len(F) >= 6:  # columns 1, 2, 4 and 5 must all be present
            var = "_".join(F[1:3] + F[4:6])
            varCharGer[var] = line  # a later duplicate overwrites an earlier one
    charGerF.close()

    inputF = open_or_exit(args[1])

    # clean the header: name blank columns, drop "#" and surrounding blanks
    headerF = inputF.readline().rstrip("\r\n").split("\t")
    missing = 1
    for k in range(len(headerF)):
        if headerF[k] == "":
            headerF[k] = "Missing" + str(missing)
            missing += 1
        headerF[k] = headerF[k].replace("#", "").strip()

    print("\t".join(headerF[0:35]) + "\t" + CharGerHeader)

    # annotate every sufficiently long row of the variant table
    for line in inputF:
        F = line.rstrip("\r\n").split("\t")
        if len(F) > 20:
            # "chr17" / "CHR17" -> "17" so the key matches the CharGer file
            F[16] = F[16].upper().replace("CHR", "")
            var = "_".join(F[16:20])  # chrom, start, ref, alt
            CharGerAnno = varCharGer.get(var, "")
            print("\t".join(F[0:35]) + "\t" + CharGerAnno)

    inputF.close()


if __name__ == "__main__":
    main()
-d ${results} ]; then 11 | mkdir ${results} 12 | fi 13 | 14 | queue="ding-lab" 15 | #queue="long" 16 | group="/khuang" 17 | 18 | sample="/gscmnt/gc3020/dinglab/medseq/Germline/projects/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AD_varOnly.vep.vcf" 19 | output="${results}charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv" 20 | command="python CharGer/bin/charger -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} > ${results}charger.PCGP_AD.out" 21 | log="${results}charger.PCGP_AD.log" 22 | echo "bsub -g ${group} -q ${queue} -oo ${log} \"${command}\"" 23 | 24 | sample="/gscmnt/gc3020/dinglab/medseq/Germline/projects/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AR_varOnly.vep.vcf" 25 | output="${results}charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv" 26 | command="python CharGer/bin/charger -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} > ${results}charger.PCGP_AR.out" 27 | log="${results}charger.PCGP_AR.log" 28 | echo "bsub -g ${group} -q ${queue} -oo ${log} \"${command}\"" 29 | -------------------------------------------------------------------------------- /analysis/CharGer_analysis/PCGP/work.log.sh: -------------------------------------------------------------------------------- 1 | #0. run charger 2 | bash run_charger.sh 3 | 4 | #1. 
##### plotPathVarLOH.R #####
# Kuan-lin Huang @ WashU 201711
# plot assoc results for pathogenic variants
#
# Writes four PDFs under out/ from the `pathVarOT` table:
#   pathVarLOH.pdf                        normal VAF vs tumor VAF, by gene class
#   LOH_var_count_by_gene.pdf             variant counts per gene, by LOH category
#   LOH_var_VAFratio_by_gene.pdf          tumor/normal VAF ratio per gene (jitter)
#   LOH_var_VAFratio_by_gene_dotplot.pdf  same ratio as a dot plot
#
# NOTE(review): `pathVarOT`, `theme_nogrid()` and the ggplot2 functions are not
# defined in this script; presumably they come from the two files sourced
# below -- confirm.

# The path uses a plain space: "\ " is not a valid escape sequence inside an R
# string and makes the whole script fail at parse time.
bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/LOH"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

# every ggsave() below writes into out/, so make sure it exists
dir.create("out", showWarnings = FALSE)

# Single definition of the LOH palette, shared by the colour and fill scales:
# Significant = red, Suggestive = light green, None = blue.
getLOHColors = function() {
  c("Significant" = "#e31a1c", "Suggestive" = "#b2df8a", "None" = "#1f78b4")
}

# Manual colour scale for the LOH significance categories.
getLOHColorScale = function() {
  color.scale = scale_color_manual(name="Loss of Heterozygosity", values=getLOHColors())
  return(color.scale)
}

# Manual fill scale for the LOH significance categories.
getLOHFillScale = function() {
  color.scale = scale_fill_manual(name="Loss of Heterozygosity", values=getLOHColors())
  return(color.scale)
}

#pathVarOT$LOH_Sig = factor(pathVarOT$LOH_Sig, levels=c("None","Suggestive","Significant"))

### 1. normal VAF vs tumor VAF, one panel per gene classification ###
p = ggplot(pathVarOT,aes(x=normalVAF, y =tumorVAF, color = LOH_Sig))
p = p + facet_grid(.~Gene_Classification,drop=T)
p = p + geom_point(alpha=0.3, stroke=0) + theme_bw() + theme_nogrid() #+ guides(color=FALSE)
p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) # identity line: tumor VAF == normal VAF
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))
p = p + expand_limits(x = 0, y = 0)
p = p + coord_equal() + getLOHColorScale()
p = p + labs(x = "Normal VAF", y = "Tumor VAF")
p
fn = "out/pathVarLOH.pdf"
ggsave(file=fn, width=6, useDingbats=FALSE)

### 2. variant counts per gene, split by LOH category ###
# find LOH gene percentage
geneLOH = data.frame(table(pathVarOT$HUGO_Symbol,pathVarOT$LOH_Sig))
colnames(geneLOH) = c("Gene","LOH_category","Count")

geneLOHsig = geneLOH[geneLOH$LOH_category=="Significant",]
# only genes with more than two significant-LOH variants are plotted
sig_gene = geneLOHsig[geneLOHsig$Count>2,]$Gene
# x-axis order: decreasing number of significant-LOH variants
sig_gene_order = geneLOHsig$Gene[order(geneLOHsig$Count,decreasing = T)]
geneLOH_g = geneLOH[geneLOH$Gene %in% sig_gene,]

geneLOH_g$Gene = factor(geneLOH_g$Gene, levels=sig_gene_order)
geneLOH_g$LOH_category = factor(geneLOH_g$LOH_category, levels=c("None","Suggestive","Significant"))

p = ggplot(geneLOH_g,aes(x = Gene, y = Count, fill = LOH_category))
p = p + geom_bar(stat = "identity") + theme_bw() + theme_nogrid()
p = p + labs(x = "Gene", y="Count of variants")
p = p + getLOHFillScale()
p = p + theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12))
p = p + theme(legend.position = "top")
p
fn = 'out/LOH_var_count_by_gene.pdf'
ggsave(file=fn, height = 6, width = 6, useDingbats=FALSE)

### 3. tumor/normal VAF ratio per gene (jittered points) ###
# clamp the ratio to [0.5, 2] for display only; the original column is kept
pathVarOT$TumorByNormalVAFPlot = pathVarOT$TumorByNormalVAF
pathVarOT$TumorByNormalVAFPlot[pathVarOT$TumorByNormalVAFPlot> 2 ] = 2
pathVarOT$TumorByNormalVAFPlot[pathVarOT$TumorByNormalVAFPlot< 0.5 ] = 0.5
p = ggplot(pathVarOT[pathVarOT$HUGO_Symbol %in% sig_gene,],aes(x = HUGO_Symbol, y = TumorByNormalVAFPlot, color = LOH_Sig, fill = HUGO_Symbol))
p = p + geom_point(position = position_jitter(w = 0.2, h = 0), alpha = 0.3)
#p = p + geom_violin(alpha = 0.3)
p = p + labs(x = "Gene", y="Tumor VAF / Normal VAF") + theme_bw()
#p = p + getVarFillScale()
p = p + theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12))
p = p + theme(legend.position = "none")
p
fn = 'out/LOH_var_VAFratio_by_gene.pdf'
ggsave(file=fn, height = 6, width = 6, useDingbats=FALSE)

### 4. same ratio as a stacked dot plot ###
p = ggplot(pathVarOT[pathVarOT$HUGO_Symbol %in% sig_gene,],aes(x=HUGO_Symbol,y = TumorByNormalVAFPlot, color = LOH_Sig, fill = LOH_Sig))
#p = p + facet_grid(.~Gene_Classification, scale = "free", space = "free", drop=T)
p = p + geom_dotplot(dotsize=0.6,binwidth=.015, binaxis= "y",stackdir ="centerwhole",alpha=0.5)
p = p + theme_bw()
p = p + labs(x = "Gene", y="Tumor VAF / Normal VAF")
p = p + scale_y_continuous(breaks = seq(0.5,2.0, by= 0.5))
#p = p + getVarColorScale()
p = p + theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12))
p = p + theme(legend.position = "none")
p
fn = 'out/LOH_var_VAFratio_by_gene_dotplot.pdf'
ggsave(file=fn, height = 6, width = 6, useDingbats=FALSE)
#!/usr/bin/perl
#03 August 2017 - Adam D Scott -
# adopted by Kuan @ WashU 11/1/2017 for PCA file
#
# Reads a gzipped, tab-delimited variant table and writes two files in the
# current directory, each with the same 14-column header:
#   ready.missense.<suffix>     rows whose variant class is missense
#   ready.truncation.<suffix>   every other row that is kept
# Rows whose consequence column mentions a synonymous, stop-retained,
# start-lost, non-coding-exon or UTR variant are dropped, as is any line
# containing "HGVSg" (the header).

use strict;
use warnings;

use IO::File;
use FileHandle;

my $usage = "perl prepFileForAllelicImbalance.pl <variant_table.tsv.gz> <output_suffix>\n";

die $usage unless @ARGV == 2;
my ( $charger , $preAI ) = @ARGV;

# 0-based positions in the input table. REF/ALT/TYPE are the 5th-7th of the
# columns 3..9 that are emitted as HUGO_Symbol CHR START STOP REF ALT TYPE.
my $REF_COL  = 7;
my $ALT_COL  = 8;
my $TYPE_COL = 9;

# Input columns copied to the output, in header order:
# gene..type, normal ref/var counts, normal VAF, tumor ref/var counts,
# tumor VAF, sample.
my @OUTPUT_COLS = ( 3 .. 9 , 99 , 100 , 104 , 102 , 103 , 105 , 1 );

my $header = "HUGO_Symbol\tCHR\tSTART\tSTOP\tREF\tALT\tTYPE\tNormalRef\tNormalVar\tNormalVAF\tTumorRef\tTumorVar\tTumorVAF\tSample\n";

# Consequence terms that exclude a row altogether.
my $skip_re = qr/synonymous_variant|stop_retained_variant|start_lost|non_coding_transcript_exon_variant|UTR/;

# Map a consequence string to the short variant class written in the TYPE
# column. The tests run in sequence on the evolving label, so an earlier
# relabel can prevent (or allow) a later one.
sub normalize_variant_class {
	my ( $type , $ref , $alt ) = @_;
	if ( $type =~ /missense/ ) { $type = "missense"; }
	# An insertion has "-" as REF, a deletion has "-" as ALT. These tests used
	# to look at the CHR/START columns and therefore never matched.
	if ( $type =~ /frameshift/ and $ref eq "-" ) { $type = "frame_shift_ins"; }
	if ( $type =~ /frameshift/ and $alt eq "-" ) { $type = "frame_shift_del"; }
	if ( $type =~ /splice/ and $ref eq "-" ) { $type = "splice_site_ins"; }
	if ( $type =~ /splice/ and $alt eq "-" ) { $type = "splice_site_del"; }
	if ( $type =~ /splice/ and $ref ne "-" and $alt ne "-" ) { $type = "splice_site"; }
	if ( $type =~ /stop_gained/ ) { $type = "nonsense"; }
	if ( $type =~ /stop_lost/ ) { $type = "nonstop"; }
	return $type;
}

# List-form pipe open: the file name is handed to gunzip as one argument, with
# no shell in between, so paths containing spaces or shell metacharacters work.
open( my $IN1 , '-|' , 'gunzip' , '-c' , $charger )
	or die "ADSERROR: Could not start gunzip on $charger: $!\n";

my $missense_file = "ready.missense.".$preAI;
my $OUTmissense = FileHandle->new( $missense_file , "w" );
if ( not defined $OUTmissense ) { die "ADSERROR: Could not open/write $missense_file: $!\n"; }

my $truncation_file = "ready.truncation.".$preAI;
my $OUTtruncation = FileHandle->new( $truncation_file , "w" );
if ( not defined $OUTtruncation ) { die "ADSERROR: Could not open/write $truncation_file: $!\n"; }

$OUTmissense->print( $header );
$OUTtruncation->print( $header );

while ( my $line = <$IN1> ) {
	next if ( $line =~ /HGVSg/ );
	chomp( $line );
	# limit -1 keeps trailing empty columns, so column positions stay stable
	my @fields = split( /\t/ , $line , -1 );
	if ( @fields <= $TYPE_COL ) {
		# blank or truncated line: there is no consequence column to classify
		warn "ADSWARNING: skipping line $. of $charger (only ".scalar( @fields )." columns)\n";
		next;
	}
	next if ( $fields[$TYPE_COL] =~ $skip_re );

	$fields[$TYPE_COL] = normalize_variant_class( $fields[$TYPE_COL] , $fields[$REF_COL] , $fields[$ALT_COL] );

	# expected fields: $gene, $chr, $strt, $stop, $ref, $var, $type,
	#                  $num_norm_refs, $num_norm_vars, $nvaf,
	#                  $num_tumr_refs, $num_tumr_vars, $tvaf,
	#                  $sample_name
	# Columns missing from a short row are written as empty strings.
	my $outString = join( "\t" , map { defined $_ ? $_ : "" } @fields[@OUTPUT_COLS] );
	if ( $fields[$TYPE_COL] =~ /missense/ ) { $OUTmissense->print( $outString."\n" ); }
	else { $OUTtruncation->print( $outString."\n" ); }
}

# buffered write errors only surface when the handle is closed
$OUTmissense->close() or die "ADSERROR: Could not finish writing $missense_file: $!\n";
$OUTtruncation->close() or die "ADSERROR: Could not finish writing $truncation_file: $!\n";

# close() on a pipe also reports the child's exit status; without this check a
# missing or corrupt input silently yields header-only output files.
close( $IN1 ) or die "ADSERROR: gunzip failed on $charger (wait status $?)\n";
| # merge with sample 16 | sample_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.rel.rel.id" 17 | sample = read.table(header=F, quote = "", sep="\t", row.names =NULL, file = sample_f, stringsAsFactors=FALSE) 18 | samples = sample[,1] 19 | row.names(rel) = samples 20 | colnames(rel) = samples 21 | rel$sample = samples 22 | 23 | # make a small version of the table and take a quick look 24 | rel_m = melt(rel[1:10,],id.var="sample") 25 | rel_m[rel_m$sample ==rel_m$variable,] 26 | 27 | # plotting 28 | p = ggplot(data=rel_m) 29 | p = p + geom_bar(aes(x=value),stat="bin",bins=100) 30 | #p = p + scale_colour_gradientn(na.value="grey", colours=getPalette(100))#, limits=c(-4.2,4.2)) 31 | p = p + theme_bw() + scale_y_log10() #+ expand_limits(y=1)#+ guides(fill=FALSE) 32 | p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 1,alpha=.7) 33 | p 34 | fn = paste(pd, "PanCanAtlas_rel_10samples_hist.pdf", sep="_") 35 | ggsave(file=fn, useDingbats=FALSE) 36 | 37 | 38 | ### IBD 39 | dist_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.dist.dist" 40 | dist = read.table(header=F, quote = "", sep="\t", file = dist_f) 41 | 42 | # merge with sample 43 | sample_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.dist.dist.id" 44 | sample = read.table(header=F, quote = "", sep="\t", row.names =NULL, file = sample_f, stringsAsFactors=FALSE) 45 | samples = sample[,1] 46 | row.names(dist) = samples 47 | colnames(dist) = samples 48 | dist$sample = samples 49 | 50 | # make a small version of the table and take a quick look 51 | dist_m = melt(dist[1:10,],id.var="sample") 52 | dist_m[dist_m$sample ==dist_m$variable,] 53 | 54 | # plotting 55 | p = ggplot(data=dist_m) 56 | p = p + geom_bar(aes(x=value),stat="bin",bins=100) 57 | #p = p + scale_colour_gradientn(na.value="grey", colours=getPalette(100))#, limits=c(-4.2,4.2)) 58 | p = p + theme_bw() + scale_y_log10() #+ expand_limits(y=1)#+ guides(fill=FALSE) 59 | p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 
1,alpha=.7) 60 | p 61 | fn = paste(pd, "PanCanAtlas_dist_10samples_hist.pdf", sep="_") 62 | ggsave(file=fn, useDingbats=FALSE) -------------------------------------------------------------------------------- /analysis/PCA_IBD_dist/run_plink_pca.ibd.rel.sh: -------------------------------------------------------------------------------- 1 | # use plink 1.9 to: 2 | # calculate PCA/IBS/relatedness, before and after pruning 3 | plink --vcf all.normal.merge.vcf.gz --pca --out all.normal.merge.vcf.pca 4 | plink --vcf all.normal.merge.vcf.gz --pca --maf 0.15 --out all.normal.merge.vcf.MAF0.15.pca & 5 | 6 | plink --vcf all.normal.merge.vcf.gz --distance square --indep 50 5 2 --maf 0.05 --out all.normal.merge.indep_50_5_2.vcf.dist 7 | 8 | plink --vcf all.normal.merge.vcf.gz --make-rel square --indep 50 5 2 --maf 0.05 --out all.normal.merge.indep_50_5_2.vcf.rel 9 | 10 | rm -f *temporary* 11 | 12 | # make tar balls 13 | tar -cvzf plink_dist_rel.tar *indep* 14 | tar -cvzf plink_pca.tar *pca* 15 | 16 | # transfer to local 17 | gcloud compute copy-files --zone us-central1-f huangkuanlin@kuan-merge-genotype-bigmem:~/plink*tar plink_out 18 | # transfer to storage 19 | gsutil cp plink*tar gs://dinglab/isb-cgc/tcga/analysis_files -------------------------------------------------------------------------------- /analysis/README.md: -------------------------------------------------------------------------------- 1 | Store each analyses-specific script and outputs 2 | =============================================== 3 | # Directory set-up # 4 | > this analyses directory should be set up to live parallelly to the TCGA_data directory to allow proper data sourcing. Due to the protected nature of some of those files they are not shared publicly. 
5 | 6 | # Dependency files # 7 | >global_aes_out.R: used to define plotting functions and color codes for the other analysis R scripts 8 | 9 | >dependency_files.R: load and prep files used for downstream analysis 10 | 11 | # Analysis workflow # 12 | 13 | # Genotype data analysis # 14 | ## 1. process_files/: 15 | > Scripts in process_files/genotype/ are used to preprocess germline variant calls to classified variants. 16 | 17 | ## 2. PCA_IBD_dist/: 18 | > Scripts used for PCA and relatedness analysis 19 | 20 | # Exomic germline variants analysis # 21 | ## 1. process_files/: 22 | > Scripts in process_files/germline/ are used to preprocess germline variant calls to classified variants. Steps downstream of GATK/VarScan/Pindel merged calls are described in process_files/germline/local/work.log.sh. 23 | 24 | ## 2. sample_listing/: 25 | > Compare PCA germline sample list (based on ISB-CGC manifest) against PCA clinical sample and MC3 sample. Generate population level summary for the 10,467 final samples. 26 | 27 | ## 3. pathogenic_variants/: 28 | > Scripts in pathogenic_variants/ are used to further filter variants based on readcount and cancer relevance. 29 | 30 | ## 4. association_test/: 31 | > Conduct association test using ExAC and ExAC-nonTCGA data. 32 | 33 | ## 5. LOH/: 34 | > Conduct LOH analysis using readcount data. 35 | 36 | ## 6. expression_effect/: 37 | > Calculate cohort level expression quantile and retrieve RSEM expression value for each sample within cancer types. 38 | 39 | ## 7. data_integration/: 40 | > Combine multi-level data for pathogenic variants. 41 | 42 | ## 8. hotspot3d/: 43 | > Co-clustering with somatic mutations using HotSpot3D. 44 | 45 | ## 9. data_integration/: 46 | > Integrate pathogenic variant data with other molecular and omics data 47 | > plot results from data_integration 48 | 49 | ## All subsequent analyses require the integrated file ## 50 | > source dependency_files.R to read in these additional data 51 | 52 | ## 10. 
##### RPPA_effect.R #####
# Kuan-lin Huang @ WashU 2016 May , updated 2017 Nov.
# analyze cohort level RPPA data and convert to different matrices in a sample-gene format
#
# For every *.rppa.txt file (one per cancer type) the marker-by-sample table is
# turned into a long table holding the raw expression and its within-marker
# empirical quantile; all cancers are stacked, normal samples removed, a gene
# name derived from the marker, and the result written to
# out/pancan_RPPA_quantile_all.tsv.
#
# NOTE(review): melt() is not loaded here; presumably it comes from one of the
# sourced files -- confirm.

bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/RPPA_effect"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

# empirical CDF of x evaluated at perc
ecdf_fun = function(x,perc) ecdf(x)(perc)

### preprocess RPPA file ###
# The path uses a plain space: "\ " is not a valid escape sequence inside an R
# string and makes the whole script fail at parse time.
fileNames = Sys.glob("/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/oncogene_signaling/data/RPPA/*.rppa.txt")
if (length(fileNames) == 0) stop("no *.rppa.txt files found; check the RPPA data directory")
all_tables = vector("list")

for (fileName in fileNames) {
  # cancer type = file name up to its first dot
  pathParts = strsplit(fileName, split="/")[[1]]
  cancer2 = pathParts[length(pathParts)]
  cancer = gsub("\\..*","",cancer2)

  exp_table = read.table(header=TRUE, sep="\t", file=fileName)
  exp_table_q = exp_table
  # seq_len() rather than 1:nrow(): the latter yields c(1, 0) for an empty table
  for (i in seq_len(nrow(exp_table_q))){
    rowValues = unlist(exp_table_q[i,-1]) # first column is the marker name
    # ecdf() needs at least one observed value; an all-NA marker stays NA
    if (all(is.na(rowValues))) next
    min_RPPA = min(rowValues,na.rm=T)
    if (min_RPPA<0){ rowValues = rowValues - min_RPPA}
    # quantile of every sample within this marker's own distribution
    exp_table_q[i,-1] = ecdf_fun(rowValues,rowValues)
  }
  exp_table.m = melt(exp_table, id.var="Composite.Element.REF")
  exp_table_q.m = melt(exp_table_q, id.var="Composite.Element.REF")
  colnames(exp_table.m) = c("marker","sample","expression")
  colnames(exp_table_q.m) = c("marker","sample","quantile")
  exp_table_c = merge(exp_table.m,exp_table_q.m,by=c("marker","sample"))
  exp_table_c$cancer = cancer
  all_tables[[cancer]] = exp_table_c
}
RPPA = do.call(rbind,all_tables)

# Sample barcode: first 16 characters, with the dots that read.table() put in
# the column names turned back into dashes. The truncation used to be thrown
# away because the gsub() was applied to the untruncated `sample` column.
RPPA$sample_l = substr(as.character(RPPA$sample), start=1, stop=16)
RPPA$sample_l = gsub("\\.","-",RPPA$sample_l)
RPPA$bcr_patient_barcode = substr(RPPA$sample_l, start=1, stop=12)

# character 14 of the barcode equal to "1" marks a normal sample
RPPA$status = "tumor"
status_n = substr(RPPA$sample_l, start=14, stop=14)
RPPA$status[status_n=="1"] = "normal"
RPPA = RPPA[RPPA$status != "normal",]# exclude normal from BRCA for now

# gene = marker text before the first "|", with a few names harmonised
RPPA$genes = gsub("\\|.*","",RPPA$marker)
RPPA$genes[RPPA$genes=="MAPK1 MAPK3"] = "MAPK3"
RPPA$genes[RPPA$genes=="PIK3R1 PIK3R2"] = "PIK3R1"
RPPA$genes[RPPA$genes=="PIK3R1/2"] = "PIK3R1"
RPPA$genes[RPPA$genes=="PIK3CA "] = "PIK3CA"
RPPA$genes[RPPA$genes=="PDK1"] = "PDPK1"

dir.create("out", showWarnings = FALSE) # write.table() does not create folders
fn = "out/pancan_RPPA_quantile_all.tsv"
write.table(RPPA, file=fn, quote=F, sep="\t", col.names=T, row.names=F)
= ggplot(pathVarP_RPPA_fg_p,aes(x=marker,y=quantile, fill=binary_type)) 23 | p = p + facet_grid(.~Gene_Classification, scale = "free", space = "free", drop=T) 24 | p = p + geom_dotplot(dotsize=1.2,binwidth=.01, binaxis= "y",colour=NA,stackdir ="centerwhole") 25 | p = p + geom_text(aes(label=ifelse(Gene_Classification=="Oncogene" & quantile>0.75, sub("^p\\.","",HGVSp_short),NA)),size=2.5) # label high-quantile oncogene variants; strip only the literal leading "p." (the unescaped pattern "p." matched "p" + any character anywhere in the string) 26 | p = p + theme_bw() 27 | p = p + ylab("RPPA Expression Quantile") + xlab("Protein with Germline Variant") 28 | p = p + scale_y_continuous(breaks = seq(0,1, by= 0.25)) 29 | #p = p + getVarColorScale() 30 | p = p + theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12)) 31 | p 32 | fn = "out/pathVarRPPAExpression_byGene.pdf" 33 | ggsave(file=fn, w=6, h=6,useDingbats=FALSE) 34 | 35 | # ### somatic information ### 36 | # somatic_f = "/Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/somatic_germline/pancan.merged.v0.2.4.filtered.maf.gene_vclass_HGVSp_sample.txt" 37 | # somatic = read.table(header=T, quote = "", sep="\t", file = somatic_f, stringsAsFactors=FALSE) 38 | # colnames(somatic) = c("HUGO_Symbol","Somatic_Variant_Classification","sample","Somatic_HGVSp") 39 | # somatic$sample = gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$sample) 40 | # # one entry per sample 41 | # somatic_class_agg = aggregate(somatic[c('Somatic_Variant_Classification','Somatic_HGVSp')], by=somatic[c('sample',"HUGO_Symbol")], paste, collapse = ",") 42 | # germ_so = merge(germ_clin_abb,somatic_class_agg, by =c("sample","HUGO_Symbol"), all=T) 43 | # 44 | # ### clustering information### 45 | # cluster_var_f = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/germline_hotspot/20161010_germline_ARD_ASD_run/pan8000_somatic_germline_combined.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.p0.05.r10.clusters_wcounts.tsv" 46 | # 
cluster_var = read.table(header=T, quote = "", sep="\t", stringsAsFactors = F, fill =T, file = cluster_var_f) 47 | # 48 | # cluster_germsoma_f = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/germline_hotspot/20161010_germline_ARD_ASD_run/pan8000_somatic_germline_combined.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.p0.05.r10.clusters.summary_wcounts_cc10.3_wgermsoma.tsv" 49 | # cluster_germsoma = read.table(header=T, quote = "", sep="\t", fill =T, file = cluster_germsoma_f, stringsAsFactors=FALSE) 50 | # 51 | # cluster_var_in_hybrid = cluster_var[cluster_var$Cluster %in% cluster_germsoma$Cluster_ID,] 52 | # germ_clin_var = germ_clin[paste(germ_clin$chromosome_name, germ_clin$start, germ_clin$stop) %in% 53 | # paste(cluster_var_in_hybrid$Chromosome, cluster_var_in_hybrid$Start, cluster_var_in_hybrid$Stop),] 54 | 55 | 56 | -------------------------------------------------------------------------------- /analysis/RPPA_effect/plotPathVarRPPAassoc.R: -------------------------------------------------------------------------------- 1 | ##### plotPathVarExpressAssoc.R ##### 2 | # Kuan-lin Huang @ WashU 2016 August updated 2017 3 | # conduct association of pathVarPline variants with AAO 4 | 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/RPPA_effect" 6 | setwd(bdir) 7 | source("../global_aes_out.R") 8 | source("../dependency_files.R") 9 | 10 | tn = "out/pathVarRPPAAssoc.txt" 11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 12 | 13 | ### plotting ### 14 | tt$gene = as.character(tt$gene) 15 | tt$marker = as.character(tt$marker) 16 | tt$FDR_plot = tt$FDR 17 | 18 | # # using GLM test result 19 | # tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6) 20 | # p = ggplot(data=tt) 21 | # p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5) 22 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3) 23 | # #p = p + 
geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer)) 24 | # p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, marker,NA))) 25 | # p = p + getPCACancerColor() 26 | # p = p + labs(x="Coefficient",y= "-log10(FDR)") 27 | # p = p + geom_vline(xintercept = 0, alpha=0.5) 28 | # p = p + theme_bw() + 29 | # theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 30 | # p 31 | # fn = 'out/RPPAAssocVolcanoGLM.pdf' 32 | # ggsave(fn,w = 5, h = 5, useDingbat=F) 33 | 34 | # # using the Wilcox test result 35 | # p = ggplot(data=tt) 36 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5) 37 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3) 38 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer)) 39 | # p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, marker,NA))) 40 | # p = p + getPCACancerColor() 41 | # p = p + labs(x="Coefficient",y= "-log10(FDR)") 42 | # p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-1.6,1.6) 43 | # p = p + theme_bw() + 44 | # theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 45 | # p 46 | # fn = 'out/rppaExpressAssocVolcanoWCOX.pdf' 47 | # ggsave(fn,w = 5, h = 5, useDingbat=F) 48 | 49 | tt$association = "None" 50 | tt$association[tt$FDR<0.15] = "Suggestive" 51 | tt$association[tt$FDR<0.05] = "Significant" 52 | #tt$association = factor(tt$association,level=c("None","Suggestive","Significant")) 53 | 54 | # using the Wilcox test result: plot by gene 55 | p = ggplot(data=tt,aes(x=coefficient,y=cancer,color = cancer)) 56 | # p = p + 
geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5) 57 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2) 58 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.05,gene,NA))) 59 | p = p + getPCACancerColor() 60 | p = p + labs(x="Coefficient",y= "Cancer") # y aesthetic of this plot is the cancer type, not -log10(FDR) 61 | p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-1.7,1.7) 62 | p = p + theme_bw() + 63 | theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 64 | p 65 | fn = 'out/rppaExpressAssocByGene.pdf' 66 | ggsave(fn,w = 5, h = 5, useDingbats=FALSE) # full argument name instead of relying on partial matching 67 | -------------------------------------------------------------------------------- /analysis/association_test/TFT_functions.R: -------------------------------------------------------------------------------- 1 | ##### burden test functions ##### 2 | TFT = function(data){ 3 | ExAC_nonTCGA_AN = max(data$ExAC_nonTCGA_AN, na.rm=T) 4 | TCGA_AN = max(data$TCGA_AN, na.rm=T) 5 | ExAC_nonTCGA_AC = sum(data$ExAC_nonTCGA_AC, na.rm=T) 6 | TCGA_AC = sum(data$TCGA_AC, na.rm=T) 7 | 8 | p = NA; OR = NA 9 | 10 | fisher_elements = as.numeric(c(ExAC_nonTCGA_AN,ExAC_nonTCGA_AC,TCGA_AN,TCGA_AC)) 11 | 12 | if (fisher_elements[1] > 0 && fisher_elements[3] > 0 && fisher_elements[2] >= 0 && fisher_elements[4] >= 0){ 13 | test.table = matrix(as.numeric(fisher_elements), nrow=2) 14 | f.test = fisher.test(test.table) 15 | OR = f.test$estimate 16 | p = f.test$p.value 17 | } 18 | result_row = c(fisher_elements,p,OR) 19 | 20 | return(result_row) 21 | } 22 | 23 | run_TFT = function(data, AF_thres = 0.01){ 24 | # some clean-ups 25 | data = data[data$ExAC_AC/data$ExAC_AN < AF_thres,] 26 | first_Q = summary(data$ExAC_AN)[2] 27 | data = data[data$ExAC_AN > first_Q,] # require enough samples with observed data; 114600 = first quantile 28 | num_genes = length(unique(data$gene_symbol)) 29 | # burden test: TFT: http://slideplayer.com/slide/8660600/ 30 | 
stats = matrix(,nrow=num_genes,ncol=7) 31 | 32 | for (i in 1:num_genes){ 33 | gene = unique(data$gene_symbol)[i] 34 | data_g = data[data$gene_symbol==gene,] 35 | gene_stat = TFT(data_g) 36 | stats[i,] = c(gene, gene_stat) 37 | } 38 | colnames(stats) = c("gene","nonTCGA_AN","total_nonTCGA_AC","TCGA_AN","total_TCGA_AC","P","OR") 39 | stats = data.frame(stats,stringsAsFactors = F) 40 | stats[,2:7] = sapply(stats[,2:7],as.numeric,2) 41 | 42 | #stats$P = as.numeric(stats$P) 43 | stats$FDR = p.adjust(stats$P, method="BH") 44 | stats = stats[order(stats$P),] 45 | return(stats) 46 | } 47 | 48 | plot_burden_result = function(stats){ 49 | p = ggplot(data=stats, aes(x=total_nonTCGA_AC,y=total_TCGA_AC, color=OR)) 50 | p = p + geom_point(aes(size=-log(FDR)),alpha=0.5) 51 | p = p + geom_text(aes(label=ifelse(FDR<0.05,gene, NA))) 52 | p = p + labs(x = "Total nonTCGA variant counts", y = "Total TCGA variant counts") + theme_bw() 53 | p = p + theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14), 54 | axis.text.y = element_text(colour="black", size=14)) 55 | p = p + geom_abline(slope=15202/106210, alpha=0.8) 56 | return(p) 57 | } 58 | 59 | run_plot_burden = function(data,AF_thres=0.01){ 60 | data_name = deparse(substitute(data)) 61 | data_stats = run_TFT(data,AF_thres = AF_thres) 62 | # write result 63 | tn = paste("out/burden",data_name, "AF", AF_thres, "TFT_stats.tsv", sep="_") 64 | write.table(data_stats, file=tn, quote=F, sep = '\t', col.names=NA) 65 | # plot result 66 | plot_burden_result(data_stats) 67 | fn = paste("out/burden",data_name, "AF", AF_thres, "point.pdf", sep="_") 68 | ggsave(file=fn, h=6,w=6, useDingbats=FALSE) 69 | } -------------------------------------------------------------------------------- /analysis/association_test/analyzeAssocTFT.DDR.R: -------------------------------------------------------------------------------- 1 | ##### analyzeAssocTFT.DDR.R ##### 2 | # Kuan-lin Huang @ WashU 2017 Oct. 
3 | 4 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test") 5 | source("../global_aes_out.R") 6 | source("TFT_functions.R") 7 | 8 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.DDRgene.tsv" 9 | data = read.table( fn, sep="\t", head=T, fill=T,stringsAsFactors = F) 10 | 11 | ##### data annotating ##### 12 | data$multi_allele = F 13 | #pos = gsub("(^[0-9]+:[0-9]+):.*","\\1",data$Var) 14 | data$pos = sapply(as.character(data$Var),function(x) paste(strsplit(x,":")[[1]][1:2],collapse = ":")) 15 | duplicated_pos = data$pos[duplicated(data$pos)] 16 | data$multi_allele[data$pos %in% duplicated_pos] = T 17 | 18 | data$missense = FALSE 19 | data$missense[grep("missense",data$impact)] = TRUE 20 | 21 | data$truncation = FALSE 22 | for (type in vep_truncations){ 23 | data$truncation[grep(type,data$impact)] = TRUE 24 | } 25 | 26 | data$inframe = FALSE 27 | for (type in vep_inframe){ 28 | data$inframe[grep(type,data$impact)] = TRUE 29 | } 30 | 31 | data$variant_type = "other" 32 | data$variant_type[data$missense] = "missense" 33 | data$variant_type[data$inframe] = "inframe" 34 | data$variant_type[data$truncation] = "truncation" 35 | data$HGVSpAbbre = gsub(".*:","",data$HGVSp) 36 | 37 | ##### some exploration into top candidates ##### 38 | data_sig = data[data$P < 0.00001 & data$ExAC_AC/data$ExAC_AN < 0.01 & !data$multi_allele,] 39 | 40 | plot_top_counts(data_sig, n=30,x_string="gene_symbol",fill_string="variant_type") 41 | fn = "out/DDR_277gene.p0.00001.dis.top30gene.pdf" 42 | ggsave(file=fn, height=5,w=10, useDingbats=FALSE) 43 | 44 | # sele_genes = c("BRCA1","BRCA2","ATM","PALB2","BRIP1","MSH6","FANCI","FANCM") 45 | # data_sig_g = data_sig[data_sig$gene_symbol %in% sele_genes,] 46 | # p = ggplot(data=data_sig_g, aes(x=gene_symbol,y=-log10(P), color=variant_type)) 47 | # p = p + geom_point(alpha=0.5) 48 | # p = p + geom_text(aes(label=ifelse(variant_type=="other",as.character(impact), as.character(HGVSpAbbre)))) 49 
| # p = p + labs(x = "Gene", y = "-log10(P)") + theme_bw() 50 | # p = p + theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14), 51 | # axis.text.y = element_text(colour="black", size=14)) 52 | # p 53 | # fn = "out/DDR_seleGene.var.sig.pdf" 54 | # ggsave(file=fn, height=10,w=10,useDingbats=FALSE) 55 | 56 | ##### burden test ##### 57 | DDR_truncation = data[data$variant_type=="truncation" & !data$multi_allele,] 58 | run_plot_burden(DDR_truncation, AF_thres=0.01) 59 | run_plot_burden(DDR_truncation, AF_thres=0.0001) 60 | 61 | DDR_truncation_noMulti = data[data$variant_type=="truncation" & !data$multi_allele,] 62 | run_plot_burden(DDR_truncation_noMulti, AF_thres=0.01) 63 | run_plot_burden(DDR_truncation_noMulti, AF_thres=0.0001) 64 | 65 | DDR_missense = data[data$variant_type=="missense" & !data$multi_allele,] 66 | run_plot_burden(DDR_missense, AF_thres=0.01) 67 | run_plot_burden(DDR_missense, AF_thres=0.001) 68 | run_plot_burden(DDR_missense, AF_thres=0.0001) 69 | 70 | DDR_missense_noMulti = data[data$variant_type=="missense" & !data$multi_allele,] 71 | run_plot_burden(DDR_missense_noMulti, AF_thres=0.01) 72 | run_plot_burden(DDR_missense_noMulti, AF_thres=0.0001) 73 | 74 | DDR_FivePrimeUTR_noMulti = data[data$impact=="5_prime_UTR_variant" & !data$multi_allele,] 75 | run_plot_burden(DDR_FivePrimeUTR_noMulti, AF_thres=0.01) 76 | run_plot_burden(DDR_FivePrimeUTR_noMulti, AF_thres=0.0001) 77 | 78 | DDR_ThreePrimeUTR_noMulti = data[data$impact=="3_prime_UTR_variant" & !data$multi_allele,] 79 | run_plot_burden(DDR_ThreePrimeUTR_noMulti, AF_thres=0.01) 80 | run_plot_burden(DDR_ThreePrimeUTR_noMulti, AF_thres=0.0001) -------------------------------------------------------------------------------- /analysis/association_test/analyzeAssocTFT.R: -------------------------------------------------------------------------------- 1 | ##### analyzeAssoc.R ##### 2 | # Kuan-lin Huang @ WashU 2017 Oct. 
3 | 4 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test") 5 | source("../global_aes_out.R") 6 | source("TFT_functions.R") 7 | 8 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.NFE.152gene.tsv" 9 | data = read.table( fn, sep="\t", head=T, fill=T,stringsAsFactors = F) 10 | 11 | ##### data annotating ##### 12 | data$multi_allele = F 13 | #pos = gsub("(^[0-9]+:[0-9]+):.*","\\1",data$Var) 14 | data$pos = sapply(as.character(data$Var),function(x) paste(strsplit(x,":")[[1]][1:2],collapse = ":")) 15 | duplicated_pos = data$pos[duplicated(data$pos)] 16 | data$multi_allele[data$pos %in% duplicated_pos] = T 17 | 18 | data$missense = FALSE 19 | data$missense[grep("missense",data$impact)] = TRUE 20 | 21 | data$truncation = FALSE 22 | for (type in vep_truncations){ 23 | data$truncation[grep(type,data$impact)] = TRUE 24 | } 25 | 26 | data$inframe = FALSE 27 | for (type in vep_inframe){ 28 | data$inframe[grep(type,data$impact)] = TRUE 29 | } 30 | 31 | data$variant_type = "other" 32 | data$variant_type[data$missense] = "missense" 33 | data$variant_type[data$inframe] = "inframe" 34 | data$variant_type[data$truncation] = "truncation" 35 | data$HGVSpAbbre = gsub(".*:","",data$HGVSp) 36 | 37 | ##### some exploration into top candidates ##### 38 | data_sig = data[data$P < 0.00001 & data$ExAC_AC/data$ExAC_AN < 0.01 & !data$multi_allele,] 39 | 40 | plot_top_counts(data_sig, n=30,x_string="gene_symbol",fill_string="variant_type") 41 | fn = "out/NFE_152gene.p0.00001.dis.top30gene.pdf" 42 | ggsave(file=fn, height=5,w=10, useDingbats=FALSE) 43 | 44 | sele_genes = c("BRCA1","BRCA2","ATM","PALB2","BRIP1","MSH6","FANCI","FANCM") 45 | data_sig_g = data_sig[data_sig$gene_symbol %in% sele_genes,] 46 | p = ggplot(data=data_sig_g, aes(x=gene_symbol,y=-log10(P), color=variant_type)) 47 | p = p + geom_point(alpha=0.5) 48 | p = p + geom_text(aes(label=ifelse(variant_type=="other",as.character(impact), as.character(HGVSpAbbre)))) 49 | p = p + 
labs(x = "Gene", y = "-log10(P)") + theme_bw() 50 | p = p + theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14), 51 | axis.text.y = element_text(colour="black", size=14)) 52 | p 53 | fn = "out/NFE_seleGene.var.sig.pdf" 54 | ggsave(file=fn, height=10,w=10,useDingbats=FALSE) 55 | 56 | ##### burden test ##### 57 | NFE_truncation = data[data$variant_type=="truncation" & !data$multi_allele,] 58 | run_plot_burden(NFE_truncation, AF_thres=0.01) 59 | run_plot_burden(NFE_truncation, AF_thres=0.0001) 60 | 61 | NFE_truncation_noMulti = data[data$variant_type=="truncation" & !data$multi_allele,] 62 | run_plot_burden(NFE_truncation_noMulti, AF_thres=0.01) 63 | run_plot_burden(NFE_truncation_noMulti, AF_thres=0.0001) 64 | 65 | NFE_missense = data[data$variant_type=="missense" & !data$multi_allele,] 66 | run_plot_burden(NFE_missense, AF_thres=0.01) 67 | run_plot_burden(NFE_missense, AF_thres=0.001) 68 | run_plot_burden(NFE_missense, AF_thres=0.0001) 69 | 70 | NFE_missense_noMulti = data[data$variant_type=="missense" & !data$multi_allele,] 71 | run_plot_burden(NFE_missense_noMulti, AF_thres=0.01) 72 | run_plot_burden(NFE_missense_noMulti, AF_thres=0.0001) 73 | 74 | NFE_FivePrimeUTR_noMulti = data[data$impact=="5_prime_UTR_variant" & !data$multi_allele,] 75 | run_plot_burden(NFE_FivePrimeUTR_noMulti, AF_thres=0.01) 76 | run_plot_burden(NFE_FivePrimeUTR_noMulti, AF_thres=0.0001) 77 | 78 | NFE_ThreePrimeUTR_noMulti = data[data$impact=="3_prime_UTR_variant" & !data$multi_allele,] 79 | run_plot_burden(NFE_ThreePrimeUTR_noMulti, AF_thres=0.01) 80 | run_plot_burden(NFE_ThreePrimeUTR_noMulti, AF_thres=0.0001) 81 | -------------------------------------------------------------------------------- /analysis/association_test/postProcessAssoc.R: -------------------------------------------------------------------------------- 1 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test") 2 | #fn = 
"test.fisher.tsv" #"ExAC.r1.sites.vep.biallelic.combine.fisher.tsv" 3 | # fn = "ExAC.r1.sites.vep.biallelic.combine.fisher.tsv" 4 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.NFE.152gene.tsv" 5 | data = read.table( fn, sep="\t", fill=T, head=T) 6 | title = "ExAC.r1.fisher.NFE.152g" 7 | 8 | data_MAF0.01 = data[data$ExAC_AC/data$ExAC_AN < 0.01,] 9 | # fn = "ExAC.r1.sites.vep.biallelic.combine.fisher.maf0.01.tsv" 10 | # write.table(file=fn, data_MAF0.01, quote=F, sep="\t", row.names = F, col.names=F) 11 | 12 | #Genomic correction 13 | sink( sprintf( 'out/lambda.%s.MAF0.01.txt', title) ) 14 | data2=data_MAF0.01[!is.na(data_MAF0.01$P),]# & -log10(data$P)<20,] 15 | ch = qchisq( data2$P, 1, lower.tail=F) 16 | data2$ch=ch 17 | theMedian = median(ch) 18 | theLambda = median(ch)/0.456 19 | cat( "median=", theMedian, "lambda=", theLambda, "\n") 20 | sink() 21 | 22 | source( 'qqman.R' ) 23 | jpeg( sprintf( 'out/qqplot.%s.MAF0.01.jpg', title), width=1200, height=1200) 24 | qq( data2$P ) 25 | dev.off() 26 | 27 | library(reshape2) 28 | 29 | split_var = colsplit(string=data2$Var, pattern=":", names=c("CHR", "BP","ID","REF","ALT")) 30 | data3 = cbind(data2,split_var) 31 | data4 = data3[!(data3$CHR %in% c("X","Y")),] 32 | data4$CHR = as.numeric(data4$CHR) 33 | jpeg( sprintf( 'out/manhattan.%s.MAF0.01.jpg', title) , width=1200, height=1200) 34 | manhattan( data4[,c("CHR","BP","P")], main=title) 35 | dev.off() -------------------------------------------------------------------------------- /analysis/association_test/work.log.sh: -------------------------------------------------------------------------------- 1 | # data location: /gscmnt/gc3014/dinglab/ExAC/VCF 2 | 3 | # normalize exac vcf and subset to only rare variants 4 | bsubl -oo ExAC.r1.multi2biallelic.log '~/bin/bcftools-1.5/bcftools norm -m - ExAC.r1.sites.vep.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.vcf.gz' 5 | bsubl -oo ExAC_nonTCGA.r1.multi2biallelic.log '~/bin/bcftools-1.5/bcftools norm -m - 
ExAC_nonTCGA.r1.sites.vep.vcf.gz | bgzip -c > ExAC_nonTCGA.r1.sites.vep.biallelic.vcf.gz' 6 | bsubl -oo tabix.ExAC.r1.sites.vep.biallelic.log 'tabix -p vcf ExAC.r1.sites.vep.biallelic.vcf.gz' 7 | bsubl -oo tabix.ExAC_nonTCGA.r1.sites.vep.biallelic.log 'tabix -p vcf ExAC_nonTCGA.r1.sites.vep.biallelic.vcf.gz' 8 | # annotate nonTCGA frequency using vcfanno 9 | bsub -q bigmem -R"select[mem>80000] rusage[mem=80000]" -M 80000000 -oo ExAC.r1.vcfanno.log '~/bin/vcfanno_linux64.1 -p 8 ExAC_nonTCGA_config.toml ExAC.r1.sites.vep.biallelic.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.combine.vcf.gz' 10 | bsubl -oo tabix.ExAC.r1.sites.vep.biallelic.combine.log 'tabix -p vcf ExAC.r1.sites.vep.biallelic.combine.vcf.gz' 11 | # get rare variants 12 | bsubl -oo ExAC.r1.getRareAllele.log '~/bin/bcftools-1.5/bcftools view --max-af 0.0005 ExAC.r1.sites.vep.biallelic.combine.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.combine.maxAF0.01.vcf.gz' 13 | mv ExAC.r1.sites.vep.biallelic.combine.maxAF0.01.vcf.gz ExAC.r1.sites.vep.biallelic.combine.maxAF0.0005.vcf.gz 14 | # Each ethnicity only sums up to AN_Adj; not AN 15 | 16 | # his workflow: 17 | # use vcf anno to combine nonTCGA AC/AN to ExAC vcf 18 | vcfanno -p 8 ExAC_nonTCGA_config.toml ExAC.r0.3.1.sites.vep.vcf.gz | bgzip -c > ExAC.r0.3.1.combine_all.vep.vcf.gz 19 | 20 | # conduct single variant assoc test 21 | bsubl -oo single_var_association.log 'python2.7 single_var_association.py VCF/ExAC.r1.sites.vep.biallelic.combine.vcf.gz ExAC.r1.sites.vep.biallelic.combine.fisher.anno.tsv' 22 | 23 | ### from here: 24 | # filter out the non-rare variants; convert to tab delimited 25 | # may need to limit only to regions with sufficient coverage 26 | # Do burden test on variants with specific consequence and AF bins 27 | 28 | gzcat ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.tsv.gz | grepList /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/reference_files/20160713_Rahman_KJ_KH_152_gene_table_list.txt 9 > 
ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.152gene.tsv 29 | -------------------------------------------------------------------------------- /analysis/burden_assoc/TFT.R: -------------------------------------------------------------------------------- 1 | # TFT function for testing associations 2 | 3 | TFT = function(data){ 4 | 5 | p = NA; OR = NA 6 | 7 | fisher_elements = as.numeric(data) 8 | 9 | if (fisher_elements[1] > 0 && fisher_elements[3] > 0 && fisher_elements[2] >= 0 && fisher_elements[4] >= 0){ 10 | test.table = matrix(as.numeric(fisher_elements), nrow=2) 11 | f.test = fisher.test(test.table, alternative = "greater") 12 | OR = f.test$estimate 13 | p = f.test$p.value 14 | } 15 | result_row = c(fisher_elements,p,OR) 16 | 17 | return(result_row) 18 | 19 | } 20 | 21 | run_TFT = function(data, AF_thres = 0.01){ 22 | data=data[data$Freq > AF_thres,] 23 | num_genes = length(unique(data$Gene)) 24 | # burden test: TFT: http://slideplayer.com/slide/8660600/ 25 | stats = matrix(,nrow=num_genes,ncol=7) 26 | 27 | for (i in 1:num_genes){ 28 | gene = unique(data$Gene)[i] 29 | #data_g = data[data$gene_symbol==gene,] 30 | data_g = c(53105,data$ExAC_Count[data$Gene == gene],data$Sample_size[1],data$Count[data$Gene == gene]) 31 | gene_stat = TFT(data_g) 32 | stats[i,] = c(as.character(gene), gene_stat) 33 | } 34 | colnames(stats) = c("Gene","ExAC_nonTCGA_AN","ExAC_nonTCGA_AC","cohort_AN","cohort_AC","P","OR") 35 | stats = data.frame(stats,stringsAsFactors = F) 36 | stats[,2:7] = sapply(stats[,2:7],as.numeric,2) 37 | 38 | #stats$P = as.numeric(stats$P) 39 | #stats$FDR = p.adjust(stats$P, method="BH") 40 | #stats = stats[order(stats$P),] 41 | return(stats) 42 | } 43 | 44 | run_TFT_against_others = function(data_c, PCA_count_byCancer, all_cancer_stat_m_suggest, AF_thres = 0.01){ 45 | data_c=data_c[data_c$Freq > AF_thres,] 46 | cancer = data_c$Cancer[1] 47 | num_genes = length(unique(data_c$Gene)) 48 | # burden test: TFT: http://slideplayer.com/slide/8660600/ 49 | stats 
= matrix(,nrow=num_genes,ncol=7) 50 | 51 | for (i in 1:num_genes){ 52 | gene = unique(data_c$Gene)[i] 53 | 54 | sig_cancers = all_cancer_stat_m_suggest$Cancer[all_cancer_stat_m_suggest$Gene==gene] 55 | data_other_c = PCA_count_byCancer[!(PCA_count_byCancer$Cancer %in% c(sig_cancers,as.character(cancer))),] 56 | 57 | data_other_c_g = data_other_c[data_other_c$Gene == gene,] 58 | #data_c_g = data_c[data_c$gene_symbol==gene,] 59 | data_c_g = c(sum(data_other_c_g$Sample_size),sum(data_other_c_g$Count),data_c$Sample_size[i],data_c$Count[data_c$Gene == gene]) 60 | gene_stat = TFT(data_c_g) 61 | stats[i,] = c(as.character(gene), gene_stat) 62 | } 63 | colnames(stats) = c("Gene","other_cancers_AN","other_cancers_AC","cohort_AN","cohort_AC","P","OR") 64 | stats = data.frame(stats,stringsAsFactors = F) 65 | stats[,2:7] = sapply(stats[,2:7],as.numeric,2) 66 | 67 | #stats$P = as.numeric(stats$P) 68 | #stats$FDR = p.adjust(stats$P, method="BH") 69 | #stats = stats[order(stats$P),] 70 | return(stats) 71 | } 72 | -------------------------------------------------------------------------------- /analysis/burden_assoc/label_onco_var_ExAC.R: -------------------------------------------------------------------------------- 1 | ##### label_onco_var_ExAC.R ##### 2 | # Kuan-lin Huang @ WashU 2017 Oct 3 | # find non-cancer pathogenic variant in the ExAC cohort 4 | 5 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/burden_assoc" 6 | setwd(bdir) 7 | source("../global_aes_out.R") 8 | source("../dependency_files.R") 9 | 10 | fn = "charged.ExAC.r1.sites.vep.biallelic.combine.exon.all.patho.expanded.tsv" 11 | variants = read.table(sep="\t",header=T,file=fn, stringsAsFactors=FALSE, quote = "",fill=TRUE) 12 | 13 | gene_fn = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/reference_files/20160713_Rahman_KJ_KH_152_gene_table_list.txt" 14 | predisposition_genes = as.vector(t(read.table(sep="\t",header=F,file=gene_fn, stringsAsFactors=FALSE, quote = ""))) 
15 | 16 | cat("Original count of variants: ",sum(variants$ExAC_nonTCGA_AC_Adj),"\n") 17 | 18 | ##### classify whether a variant is cancer-relevant ##### 19 | cancer_terms = c("tumor","cancer","neoplasia") 20 | 21 | variants$predisposition_gene = F 22 | variants$predisposition_gene[variants$HUGO_Symbol %in% predisposition_genes] = TRUE 23 | variants$cancer_term_trait = FALSE 24 | for (term in cancer_terms){ 25 | variants$cancer_term_trait[grep(term,tolower(variants$ClinVar_Traits))] = TRUE 26 | } 27 | variants$cancer_term_trait[grep("oma$",tolower(variants$ClinVar_Traits))] = TRUE 28 | 29 | table(variants$predisposition_gene,variants$cancer_term_trait) 30 | variants$cancer_related = F 31 | variants$cancer_related[variants$predisposition_gene | variants$cancer_term_trait] = T 32 | 33 | #table(variants$ClinVar_Traits[variants$cancer_term_trait])[table(variants$ClinVar_Traits[variants$cancer_term_trait])>3] 34 | 35 | # variant frequency annotation 36 | 37 | #### rare frequency filter ##### 38 | variants = variants[variants$ExAC_AF < 0.0005,] # not less 39 | 40 | variants_cancer = variants[variants$cancer_related,] 41 | cat("Number of these variants that are cancer-relevant:",sum(variants_cancer$ExAC_nonTCGA_AC_Adj),"\n") 42 | 43 | variants_cancer$Overall_Classification = "Uncertain Significance" 44 | variants_cancer$Overall_Classification[variants_cancer$CharGer_Classification=="Pathogenic"] = "Likely Pathogenic" 45 | variants_cancer$Overall_Classification[variants_cancer$ClinVar_Pathogenicity=="Pathogenic"] = "Pathogenic" 46 | variants_cancer$Overall_Classification[grep("PS1",variants_cancer$Positive_Evidence)] = "Pathogenic" 47 | variants_cancerP = variants_cancer[variants_cancer$Overall_Classification %in% c("Likely Pathogenic", "Pathogenic"),] 48 | 49 | cat("Number of these variants that are cancer-relevant and pathogenic:",sum(variants_cancerP$ExAC_nonTCGA_AC_Adj),"\n") 50 | 51 | # tn = "out/ExAC_pathogenic_variants.tsv" 52 | # write.table(variants_cancerP, 
quote=F, sep="\t", file = tn, row.names = F) 53 | -------------------------------------------------------------------------------- /analysis/clinical_association/PathVarEthnicStats.R: -------------------------------------------------------------------------------- 1 | ##### plotPathVarEthnic.R ##### 2 | # Kuan-lin Huang @ WashU 201711 3 | # plot assoc results for pathogenic variants 4 | 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/clinical_association" 6 | setwd(bdir) 7 | source("../global_aes_out.R") 8 | source("../dependency_files.R") 9 | source("plotPathVarEthnic.R") 10 | 11 | out_table=NULL 12 | 13 | for (cancer in unique(count.F_25$Cancer)){ 14 | count_c = count.F_25[count.F_25$Cancer==cancer,] 15 | ethnicities = unique(count_c$Ethnicity) 16 | caucasian_all = count_c[!is.na(count_c$Ethnicity) & count_c$Ethnicity=="Europian","CohortCount"][1] 17 | 18 | if ("Asian" %in% ethnicities){ 19 | asian_all = count_c[!is.na(count_c$Ethnicity) & count_c$Ethnicity=="Asian","CohortCount"][1] 20 | for (gene in unique(count_c$Gene)){ 21 | count_c_g = count_c[count_c$Gene==gene,] 22 | caucasian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="Europian","CarrierCount"] 23 | asian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="Asian","CarrierCount"] 24 | if (!length(caucasian_carrier)){caucasian_carrier=0} 25 | if (!length(asian_carrier)){asian_carrier=0} 26 | dist_nums = c(caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier) 27 | test.t = matrix(dist_nums,nrow=2) 28 | f.test = fisher.test(test.t) 29 | OR = f.test$estimate 30 | p = f.test$p.value 31 | 32 | row_stat = cbind(cancer, gene, "Europian", "Asian", caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier, OR, p) 33 | out_table = rbind(out_table, row_stat) 34 | } 35 | } 36 | if ("African American" %in% ethnicities){ 37 | asian_all = count_c[!is.na(count_c$Ethnicity) & 
count_c$Ethnicity=="African American","CohortCount"][1] 38 | for (gene in unique(count_c$Gene)){ 39 | count_c_g = count_c[count_c$Gene==gene,] 40 | caucasian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="Europian","CarrierCount"] 41 | asian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="African American","CarrierCount"] 42 | if (!length(caucasian_carrier)){caucasian_carrier=0} 43 | if (!length(asian_carrier)){asian_carrier=0} 44 | dist_nums = c(caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier) 45 | test.t = matrix(dist_nums,nrow=2) 46 | f.test = fisher.test(test.t) 47 | OR = f.test$estimate 48 | p = f.test$p.value 49 | 50 | row_stat = cbind(cancer, gene, "Europian", "African American", caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier, OR, p) 51 | out_table = rbind(out_table, row_stat) 52 | } 53 | } 54 | if ("American" %in% ethnicities){ 55 | asian_all = count_c[!is.na(count_c$Ethnicity) & count_c$Ethnicity=="American","CohortCount"][1] 56 | for (gene in unique(count_c$Gene)){ 57 | count_c_g = count_c[count_c$Gene==gene,] 58 | caucasian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="Europian","CarrierCount"] 59 | asian_carrier = count_c_g[!is.na(count_c_g$Ethnicity) & count_c_g$Ethnicity=="American","CarrierCount"] # comparison-group carriers; label must match the "American" cohort lookup above (a stray "]" previously made this never match) 60 | if (!length(caucasian_carrier)){caucasian_carrier=0} 61 | if (!length(asian_carrier)){asian_carrier=0} 62 | dist_nums = c(caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier) 63 | test.t = matrix(dist_nums,nrow=2) 64 | f.test = fisher.test(test.t) 65 | OR = f.test$estimate 66 | p = f.test$p.value 67 | 68 | row_stat = cbind(cancer, gene, "Europian", "American", caucasian_all-caucasian_carrier,caucasian_carrier,asian_all-asian_carrier,asian_carrier, OR, p) 69 | out_table = rbind(out_table, row_stat) 70 | } 71 | } 72 | } 73 | out_table = as.data.frame(out_table) 74
| colnames(out_table) = c("cancer","gene","EthnicityA","EthnicityB","EthnicityA_noncarrier","EthnicityA_carrier" 75 | ,"EthnicityB_noncarrier","EthnicityB_carrier","OR","P") 76 | out_table = out_table[as.numeric(as.character(out_table$EthnicityB_carrier))>1,] 77 | out_table$P = as.numeric(as.character(out_table$P)); out_table$FDR = p.adjust(out_table$P, method="fdr") # P is character/factor after the cbind/rbind above: coerce to numeric so the adjustment and the ordering below are numeric. MAW new, calculates FDR based on the method from, 78 | out_table=out_table[order(out_table$P, decreasing=FALSE),] 79 | tn = "out/pathVar_2ethni_TFT_assoc.txt" 80 | write.table(out_table, quote=F, sep="\t", file = tn, row.names = F) 81 | -------------------------------------------------------------------------------- /analysis/expression_effect/expression_effect.R: -------------------------------------------------------------------------------- 1 | ##### expression_effect.R ##### 2 | # Kuan-lin Huang @ WashU 2016 May , updated 2017 Nov. 3 | # analyze cohort level RNA-Seq data and convert to different matrices in a sample-gene format 4 | 5 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/expression_effect" 6 | setwd(bdir) 7 | source("../../global_aes_out.R") 8 | source("../../dependency_files.R") 9 | 10 | 11 | ## function ## 12 | unfactorize = function(df){ 13 | for(i in which(sapply(df, class) == "factor")) df[[i]] = as.numeric(as.character(df[[i]])) 14 | return(df) 15 | } 16 | 17 | ecdf_fun = function(x,perc) ecdf(x)(perc) 18 | 19 | expression_effect = function(m){ 20 | cat("##### EXPRESSION ANALYSIS #####\n") 21 | minNum = 5 22 | m = as.matrix(m) 23 | num = nrow(m) 24 | m2 = as.matrix(m[rowSums(!is.na(m)) >= minNum, ]) 25 | num_NA= nrow(m2) 26 | cat(paste("Original number of markers:", num, "; NA filtered:", num_NA, "\n", sep=" ")) 27 | 28 | # initiate tables 29 | outlier = matrix(,nrow=dim(m2)[1],ncol=dim(m2)[2]) 30 | row.names(outlier) = row.names(m2) 31 | colnames(outlier) = colnames(m2) 32 | exp_score = outlier 33 | exp_quantile = outlier 34 | 35 | # gene-wise expression score and quantile score 36 | for (i in
1:nrow(m2)){ 37 | #IQR = quantile(m2[i,], probs=0.75, na.rm=T) - quantile(m2[i,], probs=0.25, na.rm=T) 38 | exp_score[i,] = m2[i,]#(m2[i,] - quantile(m2[i,], probs=0.50, na.rm=T))/IQR 39 | exp_quantile[i,] = ecdf_fun(m2[i,],m2[i,]) 40 | } 41 | 42 | return(list("exp_score"=exp_score, "exp_quantile"=exp_quantile)) 43 | } 44 | 45 | glist_f = read.table(header=FALSE, stringsAsFactors = F, file = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/reference_files/CancerGeneListV5-2014-04-18.add-Rahman.add-Fanconi-Gene.txt") 46 | glist = as.vector(t(glist_f)) 47 | 48 | fileNames = Sys.glob("/Users/khuang/Box\ Sync/PhD/collaborations/premed_2015/data/All_gene_RNASeq/raw_output/*RSEM_hugo.txt") 49 | # # get rid of COADREAD 50 | # CR = "/Users/khuang/Box Sync/PhD/collaborations/premed_2015/data/All_gene_RNASeq/raw_output/COADREAD_RSEM_hugo.txt" 51 | # fileNames = fileNames[-which(fileNames == CR)] 52 | 53 | exp_score_tables = vector("list") 54 | exp_quantile_tables = vector("list") 55 | exp_tables = vector("list") 56 | 57 | for (fileName in fileNames) { 58 | cancer2 = strsplit(fileName, split="/")[[1]][11] 59 | cancer = gsub("_.*","",cancer2) 60 | #cat(paste(cancer,"\n")) 61 | #exp_table = read.table(row.names=1,header=TRUE, sep="\t", file=fileName) 62 | exp_table = read.table(header=TRUE, sep="\t", file=fileName) 63 | exp_table = exp_table[exp_table$Hybridization.REF %in% glist,] 64 | 65 | row.names(exp_table) = make.names(exp_table[,1],unique=T) 66 | exp_table = exp_table[,-c(1,2)] 67 | 68 | # get tumor only 69 | normal = substr(colnames(exp_table), 14, 14) 70 | exp_table = exp_table[,normal=="0"] 71 | 72 | exp_table = unfactorize(exp_table) 73 | exp_table = log2(exp_table+1) 74 | new_colname = vector() 75 | for (name in colnames(exp_table)){ 76 | splitted_sname = strsplit(name, split = "\\.")[[1]] 77 | new_name = paste(splitted_sname[1],splitted_sname[2],splitted_sname[3],sep=".") 78 | new_colname = c(new_colname,new_name) 79 | } 80 | colnames(exp_table) = 
new_colname 81 | 82 | if (dim(exp_table)[1] == 0 || dim(exp_table)[2] == 0){next;} 83 | 84 | exp_results = expression_effect(exp_table) 85 | 86 | exp_score_table_m = melt(exp_results$exp_score) 87 | exp_score_table_m$cancer = cancer 88 | exp_score_tables[[cancer]] = exp_score_table_m 89 | 90 | exp_quantile_table_m = melt(exp_results$exp_quantile) 91 | exp_quantile_table_m$cancer = cancer 92 | exp_quantile_tables[[cancer]] = exp_quantile_table_m 93 | 94 | exp_tables[[cancer]] 95 | } 96 | 97 | exp_score_tables_all = do.call(rbind,exp_score_tables) 98 | colnames(exp_score_tables_all) = c("gene_name","sample","log2RSEM","cancer") 99 | exp_quantile_tables_all = do.call(rbind,exp_quantile_tables) 100 | colnames(exp_quantile_tables_all) = c("gene_name","sample","expression_quantile","cancer") 101 | exp_score_tables_all$sample = gsub("\\.","-",exp_score_tables_all$sample) 102 | exp_quantile_tables_all$sample = gsub("\\.","-",exp_quantile_tables_all$sample) 103 | 104 | fn = "out/pancan_exp_log2RSEM_all.tsv" 105 | write.table(exp_score_tables_all, file=fn, quote=F, sep="\t", col.names=T, row.names=F) 106 | fn = "out/pancan_exp_quantile_all.tsv" 107 | write.table(exp_quantile_tables_all, file=fn, quote=F, sep="\t", col.names=T, row.names=F) 108 | -------------------------------------------------------------------------------- /analysis/expression_effect/plotPathVarExpressAssoc.R: -------------------------------------------------------------------------------- 1 | ##### plotPathVarExpressAssoc.R ##### 2 | # Kuan-lin Huang @ WashU 2016 August updated 2017 3 | # conduct association of pathVarPline variants with AAO 4 | 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/expression_effect" 6 | setwd(bdir) 7 | source("../global_aes_out.R") 8 | source("../dependency_files.R") 9 | 10 | tn = "out/pathVarExpressAssoc.txt" 11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 12 | 13 | ### plotting ### 14 | tt$gene = as.character(tt$gene) 15 
tt$FDR_plot = tt$FDR

# # using GLM test result
# tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6)
# p = ggplot(data=tt)
# p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5)
# #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
# #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
# p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
# p = p + getPCACancerColor()
# p = p + labs(x="Coefficient",y= "-log10(FDR)")
# p = p + geom_vline(xintercept = 0, alpha=0.5)
# p = p + theme_bw() +
# theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
# p
# fn = 'out/geneExpressAssocVolcanoGLM.pdf'
# ggsave(fn,w = 5, h = 5, useDingbat=F)

# # using the Wilcox test result
# p = ggplot(data=tt)
# p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
# #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
# #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
# p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
# p = p + getPCACancerColor()
# p = p + labs(x="Coefficient",y= "-log10(FDR)")
# p = p + geom_vline(xintercept = 0, alpha=0.5)
# p = p + theme_bw() +
# theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
# p
# fn = 'out/geneExpressAssocVolcanoWCOX.pdf'
# ggsave(fn,w = 5, h = 5, useDingbat=F)

# three-level significance label driven by the FDR column
tt$association = "None"
tt$association[tt$FDR<0.15] = "Suggestive"
tt$association[tt$FDR<0.05] = "Significant"
#tt$association = factor(tt$association,level=c("None","Suggestive","Significant"))

# one point per gene-cancer test: coefficient on x, cancer type on y;
# genes with FDR < 0.05 are labelled
p = ggplot(data=tt,aes(x=coefficient,y=cancer,color = cancer))
# p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
p = p + geom_point(aes(shape=association),alpha=0.3,size=2)
p = p + geom_text_repel(aes(label=ifelse(FDR<0.05,gene,NA)))
p = p + getPCACancerColor()
# FIX: the axis titles were x="Cancer", y="-log10(FDR)", left over from the volcano
# plots above; they now describe the aesthetics actually mapped here.
p = p + labs(x="Coefficient",y= "Cancer")
p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-3.1,3.1)
p = p + theme_bw() +
  theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())
p
fn = 'out/geneExpressAssocByGene.pdf'
ggsave(fn, width = 5, height = 5, useDingbats=FALSE)

--------------------------------------------------------------------------------
/analysis/family_history/fam_history.R:
--------------------------------------------------------------------------------
##### fam_history.R #####
# Kuan-lin Huang @ WashU 2017 updated Nov.

setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/family_history/")
source("../global_aes_out.R")
source("../dependency_files.R")

##### individual level stats #####

# full family-history table and its two-column summary
fileName = "clinical_PANCAN_patient_cancerhistory.111817.tsv"
famHist = read.table(header=TRUE, sep="\t", file=fileName, fill=T, quote="",stringsAsFactors = F)
str(famHist)

fileNameB = "clinical_PANCAN_patient_cancerhistory.2col.111817.tsv"
famHistB = read.table(header=TRUE, sep="\t", file=fileNameB, fill=T, quote="",stringsAsFactors = F)
str(famHistB)

# patients whose second summary column is "Yes"
fam_hist_samples = famHistB$bcr_patient_barcode[famHistB[,2]=="Yes"]

famHistPos = famHist[famHist$bcr_patient_barcode %in% fam_hist_samples,]

for (col in colnames(famHistPos)[2:14]){
  print(col)
  print(table(famHistPos[,col]))
}

# carriers (pathVarP) that also have a positive family history
fam_var_merge = merge(pathVarP, famHistPos, by="bcr_patient_barcode")

# NOTE(review): columns 182:194 are presumably the family-history columns after the
# merge -- confirm whenever the column layout of pathVarP changes.
for (col in colnames(fam_var_merge)[182:194]){
  print(col)
  print(table(fam_var_merge[,col]))
}

# one family-history tag per carrier; later assignments overwrite earlier ones
fam_var_merge$fam_tag = "Other"
fam_var_merge$fam_tag[grep("reast",fam_var_merge$relative_family_cancer_hx_text)] = "Breast cancer"
fam_var_merge$fam_tag[grep("rostate",fam_var_merge$relative_family_cancer_hx_text)] = "Prostate cancer"
fam_var_merge$fam_tag[fam_var_merge$family_history_of_stomach_cancer=="YES"] = "Stomach cancer"
fam_var_merge$fam_tag[fam_var_merge$number_of_first_degree_relatives_with_cancer_diagnosis==1] = "First degree relatives"
table(fam_var_merge$fam_tag)
table(fam_var_merge$HUGO_Symbol)
table(fam_var_merge$HUGO_Symbol,fam_var_merge$fam_tag)
table(fam_var_merge$cancer)

# bar plot: carriers per variant, one facet per gene
fam_var_merge_noNA = fam_var_merge[!is.na(fam_var_merge$HGVSp_short),]
p = ggplot(fam_var_merge_noNA,aes(x = HGVSp_short, fill = cancer))
p = p + facet_grid(.~HUGO_Symbol, scale="free", space="free")
p = p + geom_bar() + theme_bw() + theme_nogrid() + getPCACancerFill()
p = p + labs(x = "Variant", y="Number of carriers with family history")
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=14))
p = p + theme(legend.position = "top")
p
fn = 'out/family_var_count.pdf'
ggsave(file=fn, width=15, height=6, useDingbats=FALSE)

# variants / genes seen in more than one carrier
dup_var = fam_var_merge[duplicated(fam_var_merge$HGVSp_short),c("HUGO_Symbol","HGVSp_short","cancer")]
cat("Duplicated variants","\n")
dup_var
dup_genes = fam_var_merge$HUGO_Symbol[duplicated(fam_var_merge$HUGO_Symbol)]
fam_var_merge_g = fam_var_merge[fam_var_merge$HUGO_Symbol %in% dup_genes,]

# gene x cancer scatter, every variant labelled
p = ggplot(fam_var_merge_g,aes(y = HUGO_Symbol, x = cancer, color=fam_tag))
#p = p + facet_grid(Gene_Classification~., scale="free", space="free")
p = p + geom_jitter(alpha=0.5,size=1.5, height = 0.2,width = 0.22,shape=16, stroke=0)
# FIX: size/angle/vjust are constants, so they are set outside aes(); inside aes() they
# were treated as mapped variables (size went through the size scale and got a legend).
p = p + geom_text_repel(aes(label=as.character(HGVSp_short)),size=3,angle=0,vjust=1.5)
p = p + theme_bw()
#p = p + geom_hline(yintercept = -log10(0.05),alpha=0.3)
#p = p + xlim(0,6)
#p = p + getVarColorScale()
p = p + labs( x="TCGA case cancer type", y = "Gene") + scale_colour_discrete(name = "Case family history")
p = p + theme(axis.text.x = element_text(colour="black", size=14, angle=90, vjust = 0.5),axis.text.y = element_text(colour="black", size=14))
#p = p + coord_equal()
p
fn = "out/fam_history_var.pdf"
ggsave(fn, height=6, width=8, useDingbats=FALSE)

# same scatter, labelling only the variants listed in dup_var
p = ggplot(fam_var_merge_g,aes(y = HUGO_Symbol, x = cancer, color=fam_tag))
#p = p + facet_grid(Gene_Classification~., scale="free", space="free")
p = p + geom_jitter(alpha=0.4,size=3, height = 0.1,width = 0.1,shape=16, stroke=0)
# the label refers to the plot data column directly instead of fam_var_merge_g$...
p = p + geom_text(aes(label=ifelse(as.character(HGVSp_short) %in% dup_var$HGVSp_short,HGVSp_short,NA)),size=3,angle=0,vjust=1.5)
p = p + theme_bw()
#p = p + geom_hline(yintercept = -log10(0.05),alpha=0.3)
#p = p + xlim(0,6)
#p = p + getVarColorScale()
p = p + labs( x="TCGA case cancer type", y = "Gene") + scale_colour_discrete(name = "Case family history")
p = p + theme(axis.text.x = element_text(colour="black", size=14, angle=90, vjust = 0.5),axis.text.y = element_text(colour="black", size=14))
#p = p + coord_equal()
p
fn = "out/fam_history_var_dupVarOnly.pdf"
ggsave(fn, height=6, width=8, useDingbats=FALSE)

# bar plot: carriers per gene, coloured by family-history tag, one facet per cancer
p = ggplot(fam_var_merge_g,aes(x = HUGO_Symbol, fill = fam_tag))
p = p + facet_grid(.~cancer, scale="free", space="free")
p = p + geom_bar() + theme_bw() + theme_nogrid() #+ getPCACancerFill()
p = p + labs(x = "Gene", y="Number of carriers with family history") + scale_fill_discrete(name = "Case family history")
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=14))
p = p + theme(legend.position = "top")
p
fn = "out/fam_history_var_famtag.pdf"
ggsave(fn, height=4, width=8, useDingbats=FALSE)

--------------------------------------------------------------------------------
/analysis/functional_assay/plot_result.R:
--------------------------------------------------------------------------------
##### plot_result.R #####
# Kuan-lin Huang @ WashU 2017 July
# plot RET functional experiment results
library(ggplot2)
setwd("/Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/functional_assay/")

args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)==0) {
  # FIX: the message ended in a literal ".n"; a newline escape was intended
  stop("At least one argument must be supplied (input file).\n", call.=FALSE)
} else {
  # the first argument is the input file; any further arguments are ignored
  # (previously fn stayed undefined when more than one argument was given)
  fn = args[1]
}
#fn = "20170717_CO/All_Results_RET_Gel1_75_4_071717.txt"
# output PDFs are named after the input file (directory part removed)
outFile = paste("output/",gsub(".*/","",fn),".pdf",sep="")
outFile2 = paste("output/",gsub(".*/","",fn),"2.pdf",sep="")

results = read.table(fn, sep="\t", header=T)

colnames(results) = gsub("\\.","_",colnames(results))
#colnames(results)
# Sample is split on underscores: text after the last "_" is the ligand,
# text before the first "_" is the mutant
results$Ligand = gsub(".*_","",results$Sample)
results$Mut = gsub("_.*","",results$Sample)

# unique sample labels, kept in file order on the x axis
results$Sample = factor(make.names(results$Sample,unique = T), levels=make.names(results$Sample,unique = T))

# plot 1: MAPK_RET_GAPDH_WT column
p = ggplot(results,aes(x=Sample, y=MAPK_RET_GAPDH_WT, fill=Ligand))
p = p + facet_grid(.~Ligand, drop=T, space="free",scale="free")
p = p + geom_bar(stat="identity") + theme_bw() #+ theme_nogrid()
p = p + labs(x = "Sample", y="MAPK/RET/GAPDH (normalized to WT)")
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=12, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=12))
p = p + theme(legend.position="bottom")
p = p + geom_hline(yintercept = 1, alpha=0.3)
#p = p + geom_text(aes(label=Number_of_Phosphosites), vjust=-0.25)
p
#fn = "output/All_Results_RET_Gel1_75_4_071717.pdf"
ggsave(file=outFile, useDingbats=FALSE)

# plot 2: MAPK_GAPDH_WT column
p = ggplot(results,aes(x=Sample, y=MAPK_GAPDH_WT, fill=Ligand))
p = p + facet_grid(.~Ligand, drop=T, space="free",scale="free")
p = p + geom_bar(stat="identity") + theme_bw() #+ theme_nogrid()
# FIX: the y label was copied from plot 1 ("MAPK/RET/GAPDH") although this plot shows MAPK_GAPDH_WT
p = p + labs(x = "Sample", y="MAPK/GAPDH (normalized to WT)")
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=12, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=12))
p = p + theme(legend.position="bottom")
p = p + geom_hline(yintercept = 1, alpha=0.3)
#p = p + geom_text(aes(label=Number_of_Phosphosites), vjust=-0.25)
p

ggsave(file=outFile2, useDingbats=FALSE)
--------------------------------------------------------------------------------
/analysis/gene_list/examine_gene_list.R:
--------------------------------------------------------------------------------
##### examine_gene_list.R #####
# Kuan-lin Huang @ WashU 201802
# examine the curated gene list

bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/gene_list"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

library(readxl)

### annotate 152 gene table with oncogene/TSG
CPG_fn = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table.xlsx"
CPG = data.frame(readxl::read_excel(CPG_fn))
colnames(CPG)[1]="Gene"
# genes found in both lists end up as "Oncogene" (assigned last)
CPG$Gene_Classification = "Not classified"
CPG$Gene_Classification[CPG$Gene %in% all_TSGs] = "Tumor Suppressor Gene"
CPG$Gene_Classification[CPG$Gene %in% all_oncogenes] = "Oncogene"
# write.table(CPG[c("Gene","Gene_Classification")], file="/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table_gClass.tsv", quote=F, sep="\t", col.names=T, row.names=F)

### examine the number of pathogenic variants found in each category of curated genes
cat("Source of 152 CPGs","\n")
table(CPG$Source)

# variant counts per gene and classification
gene_var_count = data.frame(table(pathVarP$HUGO_Symbol,pathVarP$Overall_Classification))
colnames(gene_var_count) = c("Gene","Classification","Count")
# write.table(CPG_count[c("Gene","Classification", "Count")], file="/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table_gVarCount.tsv", quote=F, sep="\t", col.names=T, row.names=F)

# variant counts per gene, left-joined onto the curated gene table
gene_var_count2= data.frame(table(pathVarP$HUGO_Symbol))
colnames(gene_var_count2) = c("Gene","Count")
CPG_count = merge(CPG,gene_var_count2, by="Gene", all.x=T)
# genes without any variant in pathVarP get a count of zero
CPG_count$Count[is.na(CPG_count$Count)] = 0

# shorten the verbose source descriptions used in the spreadsheet
source_relabel = c(
  "Cancer Gene Census Germline download 1/5/2016 (http://cancer.sanger.ac.uk/census/ )" = "Cancer Gene Census Germline",
  "Reference (see PMID)" = "Curated from literature",
  "personal communication; related to DICER1" = "Personal communication"
)
for (long_name in names(source_relabel)){
  CPG_count$Source[CPG_count$Source==long_name] = source_relabel[[long_name]]
}

# variant count per gene, grouped by the source of the gene; genes with more than
# 5 variants from sources other than "Rahman 114 CPG" are labelled as Gene-Count
#p = p + facet_grid(.~Classification)
p = ggplot(CPG_count,aes(x=Source, y = Count )) +
  geom_jitter(height =0, width = 0.3, alpha=0.5,aes(fill=Source, color=Source)) +
  geom_violin(alpha=0.5, stroke=0,aes(fill=Source, color=Source)) +
  geom_label_repel(aes(label=ifelse(Source != "Rahman 114 CPG" & Count > 5, paste(Gene,Count,sep="-"),NA))) +
  theme_bw() +
  labs(y="Count of likely pathogenic and pathogenic variant", x = "Source of gene") +
  theme(legend.position = "None",
        axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5),
        axis.text.y = element_text(colour="black", size=12),
        axis.ticks = element_blank())
p

fn = 'out/source_var_count.pdf'
ggsave(fn,h=7,w=4,useDingbat=F)
--------------------------------------------------------------------------------
/analysis/hotspot3d/cluster_analysis.R:
--------------------------------------------------------------------------------
##### cluster_analysis.R #####
# Kuan-lin Huang @ WashU 201711
# annotate hotspot3d clusters with somatic / germline variant membership

bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/hotspot3d"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

# somatic mutations in the genes of interest
s_fn = "mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf"
somatic = read.table(sep="\t",header=T, quote="",stringsAsFactors = F, file=s_fn)

# hotspot3d cluster file
fn = "PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters"
cluster = read.table(sep="\t",header=T, quote="",stringsAsFactors = F, file=fn)

colnames(cluster)[2:3] = c("HUGO_Symbol","HGVSp_short")

# a site is identified by "gene protein-change"; look it up in each variant set
site_key = paste(cluster$HUGO_Symbol,cluster$HGVSp_short)
in_somatic = site_key %in% paste(somatic$Hugo_Symbol,somatic$HGVSp_Short)
in_pathVarP = site_key %in% paste(pathVarP$HUGO_Symbol,pathVarP$HGVSp_short)
in_pathVar = site_key %in% paste(pathVar$HUGO_Symbol,pathVar$HGVSp_short)

# Somatic    : site seen in the somatic MAF
# Germline   : not somatic, but present in pathVarP
# Colocalized: somatic site that is also present in pathVar
# NOTE(review): Germline is matched against pathVarP, Colocalized against pathVar -- confirm this is intended.
cluster$type = NA
cluster$type[in_somatic] = "Somatic"
cluster$type[!in_somatic & in_pathVarP] = "Germline"
cluster$type[in_somatic & in_pathVar] = "Colocalized"

# keep every site of any cluster that contains a germline or colocalized site
cluster_w_germ = cluster$Cluster[cluster$type %in% c("Germline","Colocalized")]
cluster_germ = cluster[cluster$Cluster %in% cluster_w_germ,]

cat("Number of sites co-clustered: ","\n")
table(cluster_germ$type)

cat("Number of unique clusters: ",length(unique(cluster_germ$Cluster)),"\n")

# gene of the first listed site of each retained cluster
table(cluster_germ$HUGO_Symbol[!duplicated(cluster_germ$Cluster)])

tn = "PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters_annotated.tsv"
write.table(cluster_germ, quote=F, sep="\t", file = tn, row.names = F)

--------------------------------------------------------------------------------
/analysis/hotspot3d/spotlightVar.R:
--------------------------------------------------------------------------------
##### spotlightVar.R #####
# Kuan-lin Huang @ WashU 201711
# plot special variants

bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/hotspot3d"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

# check if there is statistical enrichment of overlaps
# $ awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic_merged
# 49586385
exonSize = 49586385
nPath = nrow(pathVarP)

### somatic mutation
# $ gzcat /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz | cut -f1,4 | sort | uniq -c | awk '$1 >2 && $3 != "."' | wc -l
# 68537
# Poisson test: are pathogenic variants at sites with more than 2 co-localized somatic
# mutations more frequent than the background rate 68537/exonSize?
numSomaticOverlap = nrow(pathVarP[pathVarP$colocalized_somatic_mutation_count > 2,])
somaticMutRate = 68537/exonSize
poisson.test(numSomaticOverlap, T = nPath, r = somaticMutRate, conf.level = 0.95, alternative = "greater")

### PCGP germ var
# /charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv
# 551 /Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv
# vpn-10-1-24-5:analysis khuang$ wc -l /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv
# 239 /Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv

# same test for variants flagged PCGP; background rate is (551 + 239)/exonSize
numPCGPOverlap = nrow(pathVarP[pathVarP$PCGP,])
pcgpMutRate = (551 + 239)/exonSize
poisson.test(numPCGPOverlap, T = nPath, r = pcgpMutRate, conf.level = 0.95, alternative = "greater")

# write file
pathVarP_hot = pathVarP[pathVarP$colocalized_somatic_mutation_count > 2 | pathVarP$PCGP,]
write.table(file = "out/colocalize_var.tsv", pathVarP_hot,quote=F, sep = '\t',row.names = F)

# plot
pathVarPOT_hot = pathVarPOT[pathVarPOT$colocalized_somatic_mutation_count > 2 | pathVarPOT$PCGP,]
table(pathVarPOT_hot$HUGO_Symbol)
pathVarPOT_hot$somatic_count_plot = pathVarPOT_hot$colocalized_somatic_mutation_count
# FIX: strip only the literal leading "p." prefix. The previous gsub("p.", "", ...) used "."
# as a regex wildcard and removed every "p" plus the following character anywhere in the label.
pathVarPOT_hot$HGVSp_short_plot = sub("^p\\.","",pathVarPOT_hot$HGVSp_short)
#pathVarPOT_hot$somatic_count_plot[pathVarPOT_hot$somatic_count_plot> 100 ] = 100

# gene (y) against co-localized somatic mutation count (x, log10 scale), coloured by PCGP
# flag; the first occurrence of each protein change is labelled
p = ggplot(pathVarPOT_hot,aes(y=HUGO_Symbol, x =somatic_count_plot, color = PCGP))
#p = p + facet_grid(PCGP~Gene_Classification,drop=T,scale="free",space="free")
p = p + facet_grid(Gene_Classification~ .,drop=T,scale="free_y",space="free_y")
p = p + geom_point(stroke=0) + theme_bw() #+ guides(color=FALSE)
#p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5)
p = p + geom_text_repel(aes(label=ifelse(duplicated(HGVSp_short),NA,HGVSp_short_plot)))
p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))
p = p + scale_x_log10()
# NOTE(review): x = 0 has no finite position on a log10 axis -- confirm this limit is still wanted
p = p + expand_limits(x = 0)
#p = p + coord_equal() + getLOHColorScale()
p = p + labs(x = "Co-localizing somatic mutation count", y = "Gene")
p
fn = "out/pathVarP_spotlight.pdf"
ggsave(file=fn, width=7, height=5, useDingbats=FALSE)

--------------------------------------------------------------------------------
/analysis/hotspot3d/work.log.sh:
--------------------------------------------------------------------------------
#1. get variants
#get somatic mafs with genes of interest
cut -f1,5-7,9,11,13,16,37,38 /gscmnt/gc2741/ding/Drivers/Data/mc3.v0.2.8.PUBLIC.code.filtered.maf | grepList PCA.all.genes.txt 0 > mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf
# my @mafcols = ( $mafcols{"Hugo_Symbol"},
# $mafcols{"Chromosome"},
# $mafcols{"Start_Position"},
# $mafcols{"End_Position"},
# $mafcols{"Variant_Classification"},
# $mafcols{"Reference_Allele"},
# $mafcols{"Tumor_Seq_Allele2"},
# $mafcols{"Tumor_Sample_Barcode"},
# $mafcols{$this->{"transcript_id_header"}},
#Hugo_Symbol Chromosome Start_Position End_Position Variant_Classification Reference_Allele Tumor_Seq_Allele2 Tumor_Sample_Barcode HGVSp_Short Transcript_ID

# get germline file
awkt '$13 == "missense_variant" {print $2,$9,$5,$10,"Missense_Mutation",$11,$12,$1,$116,$6}' PCA_pathVar_integrated_filtered.tsv > PCA_pathVar_integrated_filtered_hotspot3d.tsv

#2. combine the somatic file with germline file


cat mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf PCA_pathVar_integrated_filtered_hotspot3d.tsv > PCA_somatic_germline_combined.maf
# keep the header plus rows whose 9th column matches the pattern "p."
awk 'NR==1 || $9 ~ "p."' PCA_somatic_germline_combined.maf > PCA_somatic_germline_combined_filtered.maf
# keep missense rows; column 6 is printed twice and columns 9/10 are swapped in the output
awk 'NR==1 || $5=="Missense_Mutation" {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$6"\t"$7"\t"$8"\t"$10"\t"$9}' PCA_somatic_germline_combined_filtered.maf > PCA_somatic_germline_combined_missense.maf

#3. hotspot run; following steps on the github
bsubl -oo proximity_search.log 'hotspot3d search --maf-file=PCA_somatic_germline_combined_missense.maf --prep-dir=/gscmnt/gc2706/dinglab/medseq/Structure_Projects/Preprocessing_Output_20141023/'

bsubl -oo post.log 'hotspot3d post --maf-file=PCA_somatic_germline_combined_missense.maf'

bsubl -oo cluster.log 'hotspot3d cluster --pairwise-file=3D_Proximity.pairwise --maf-file=PCA_somatic_germline_combined_missense.maf --vertex-type=recurrence'

bsubl -oo summary.log 'hotspot3d summary --clusters-file=PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters'

#4. post processing: plotting
# get presence
perl ~/bin/hotspot3d_KH/scripts/clusterPDBPresence.pl 3D_Proximity.pairwise PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters PCA

# plot some of the previous top candidates
musite=canonical.combined.mc3.musites
cluster=PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters

# limit cluster file to clusters of interest first

# visualize
grep1 245.0 ${cluster} > tmp.clusters
hotspot3d visual --pairwise-file=3D_Proximity.pairwise --clusters-file=tmp.clusters --pdb=2IVT --output-file=pml_scripts/PCA.2IVT.RET.pml --script-only
grep1 245.0 ${cluster} > tmp.clusters
hotspot3d visual --pairwise-file=3D_Proximity.pairwise --clusters-file=tmp.clusters --pdb=2X2M --output-file=pml_scripts/PCA.2X2M.RET.pml --script-only

--------------------------------------------------------------------------------
/analysis/mutation_signature/4_plotSomaticMutsigAssoc.R:
--------------------------------------------------------------------------------
##### plotPathVarMutsigAssoc.R #####
# Kuan-lin Huang 2018

source("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/global_aes_out.R")
source("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/dependency_files.R")

# germline association results: only used to choose which genes to plot
g_tn = "out/pathVarMutsigAssoc.txt"
g_tt = read.table(sep="\t",header=T,file=g_tn, stringsAsFactors=FALSE)

# somatic association results: the data that is plotted
tn = "out/somaticMutMutsigAssoc.txt"
tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)

### plotting ###
# signatures ordered numerically, Signature-1 ... Signature-30
tt$signature = factor(tt$signature, levels = paste0("Signature-", 1:30))

# three-level significance label driven by the FDR column
tt$association = "None"
tt$association[tt$FDR<0.15] = "Suggestive"
tt$association[tt$FDR<0.05] = "Significant"
tt$gene = as.character(tt$gene)

# -log(FDR) (natural log), capped at 5
tt$FDR_plot = -log(tt$FDR)
tt$FDR_plot[tt$FDR_plot > 5 ] = 5

#uniqG = unique(tt$gene[tt$FDR<0.05])
uniqG = unique(g_tt$gene[g_tt$FDR<0.05]) # plot just the germline genes for now
ttG = tt[tt$gene %in% uniqG,]

# heatmap: coefficient per cancer x signature, one facet row per gene;
# tiles with FDR < 0.05 get a black outline
getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
#p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
#p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
p = ggplot(data=ttG) +
  facet_grid(gene~.,space="free",scale="free") +
  geom_tile(data=ttG,aes(y=cancer, x=signature, fill= coefficient), linetype="blank") +
  scale_fill_gradientn(name= "Coefficient", colours=getPalette(100), na.value=NA, limit=c(0,NA)) +
  geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=cancer, x=signature), color="black",fill=NA, size=1.5) +
  theme_bw() + theme_nogrid() +
  theme(axis.title = element_blank(),
        axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5),
        axis.text.y = element_text(colour="black", size=12),
        axis.ticks = element_blank())
# labelled variant is displayed only; p itself stays unlabelled
p + labs(x="Signature",y = "Cancer")

fn = 'out/SomaticWithmutSignatureHeatmap.pdf'
ggsave(fn,h=25,useDingbat=F)

# plot by gene: coefficient (x) per cancer (y), one facet row per gene
# p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
# NOTE(review): signature is a factor here, so ifelse() yields its integer level code as
# the label (k for Signature-k) -- confirm that numeric labels are intended.
p = ggplot(data=ttG,aes(x=coefficient,y=cancer,color = cancer)) +
  facet_grid(gene~.,space="free",scale="free") +
  geom_point(aes(shape=association),alpha=0.3,size=2) +
  geom_text_repel(aes(label=ifelse(FDR<0.0005,signature,NA))) +
  getPCACancerColor() +
  labs(x="Cancer",y= "-log10(FDR)") +
  geom_vline(xintercept = 0, alpha=0.5) + #+ xlim(-3.1,3.1)
  theme_bw() +
  theme(axis.text.x = element_text(colour="black", size=12),
        axis.text.y = element_text(colour="black", size=12),
        axis.ticks = element_blank())
# the axis titles set above are replaced in the displayed plot
p + labs(x = "coefficient",y="cancer")
fn = 'out/SomaticWithmutSignatureByGene.pdf'
ggsave(fn,h=28,useDingbat=F)

--------------------------------------------------------------------------------
/analysis/pleiotropy/pleiotropy.R:
--------------------------------------------------------------------------------
##### pleiotropy.R #####
# Kuan-lin Huang @ WashU 201802
# list ClinVar-pathogenic missense variants whose trait is not a cancer term

bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/pleiotropy"
setwd(bdir)
source("../global_aes_out.R")
source("../dependency_files.R")

pathVarP_otherSymptoms = pathVarP[!pathVarP$cancer_term_trait &
pathVarP$binary_type=="Missense" & pathVarP$ClinVar_Pathogenicity == "Pathogenic",] 11 | pathVarP_otherSymptoms_sele = pathVarP_otherSymptoms[,c(2,15,23,27,108,109)] 12 | pathVarP_otherSymptoms_sele_uni = pathVarP_otherSymptoms_sele[!duplicated(pathVarP_otherSymptoms_sele$HGVSp),] 13 | 14 | dim(pathVarP_otherSymptoms_sele_uni) 15 | 16 | fn = "out/pleitropy_vars_inCPG.tsv" 17 | write.table(pathVarP_otherSymptoms_sele_uni, file=fn, quote=F, sep="\t", col.names=T, row.names=F) 18 | -------------------------------------------------------------------------------- /analysis/process_files/genotype/cancer_type.txt: -------------------------------------------------------------------------------- 1 | ACC 2 | BLCA 3 | BRCA 4 | CESC 5 | CHOL 6 | COAD 7 | DLBC 8 | ESCA 9 | GBM 10 | HNSC 11 | KICH 12 | KIRC 13 | KIRP 14 | LAML 15 | LGG 16 | LIHC 17 | LUAD 18 | LUSC 19 | MESO 20 | OV 21 | PAAD 22 | PCPG 23 | PRAD 24 | READ 25 | SARC 26 | SKCM 27 | STAD 28 | TGCT 29 | THCA 30 | THYM 31 | UCEC 32 | UCS 33 | UVM -------------------------------------------------------------------------------- /analysis/process_files/genotype/merge_genotype_by_cancer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: merge_genotype_by_cancer.sh <file with one cancer type per line>; for each type: fetch the genotyping tarball, merge the VCFs selected by the grep pattern into ${cancer}.normal.merge.vcf.gz, index, upload, clean up 3 | # raise the open-file limit before the loop so vcf-merge can open every VCF of a large cohort (e.g. breast cancer) 4 | ulimit -n 2500 5 | while IFS='' read -r line || [[ -n "$line" ]]; do 6 | # skip blank lines: an empty cancer type would turn the cleanup below into "rm -rf /*" 7 | [[ -z "$line" ]] && continue 8 | echo "Processing cancer type: $line" 9 | cancer="${line%\\n}" 10 | #get file 11 | gsutil ls gs://dinglab/isb-cgc/tcga/genotyping/tarballs/genotyping.${cancer}.tar 12 | gsutil cp gs://dinglab/isb-cgc/tcga/genotyping/tarballs/genotyping.${cancer}.tar . 
13 | tar -xvf genotyping.${cancer}.tar 14 | 15 | #merge (pattern quoted so the shell cannot glob-expand it before grep sees it) 16 | vcf-merge $(ls -1 ${cancer}/*.vcf.gz | grep 'TCGA-..-....-1.*' | perl -pe 's/\n/ /g') | bgzip -c > ${cancer}.normal.merge.vcf.gz 17 | #index 18 | tabix -p vcf ${cancer}.normal.merge.vcf.gz 19 | #upload 20 | gsutil cp ${cancer}.normal.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/genotyping/merge 21 | # delete files (":?" aborts instead of expanding to "/*" if cancer is ever empty) 22 | rm -rf "${cancer:?}"/* 23 | rm -rf genotyping.${cancer}.tar 24 | done < "$1" -------------------------------------------------------------------------------- /analysis/process_files/genotype/merge_log_gcloud.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 2 vCPU; 60GiB <- not enough memory; maybe because vcf-merge open up lots of vcf to do re-indexing? 4 | # 8vCPU; 52GiB mem, 60GiB 5 | 6 | # set unlimit file number higher so it works for breast cancer 7 | ulimit -n 2500 8 | 9 | nohup bash merge_genotype_by_cancer.sh cancer_type.txt > merge_genotype_by_cancer.log & 10 | 11 | # merge 12 | vcf-merge $(ls -1 *.normal.merge.vcf.gz | perl -pe 's/\n/ /g') > all.normal.merge.vcf 13 | # job ended because disk run out of space (60G, zip into partial vcf) 14 | bgzip -c all.normal.merge.vcf > all.normal.merge_partial.vcf.gz 15 | tabix -p vcf all.normal.merge_partial.vcf.gz 16 | 17 | # get the remaining SNPs 18 | nohup vcftools --gzvcf BRCA.normal.merge.vcf.gz --gzdiff all.normal.merge_partial.vcf.gz --diff-site --out inBRCA_v_inAllPartial & 19 | awk -F '\t' '$4=="1"{print $1"\t"$2}' inBRCA_v_inAllPartial.diff.sites_in_files > leftover.sites_in_files.positions.txt 20 | # get each cancer types leftover vcf 21 | for file in *.normal.merge.vcf.gz; do 22 | echo $file 23 | echo leftover.$file 24 | vcftools --gzvcf $file --positions leftover.sites_in_files.positions.txt --recode --stdout | bgzip -c > leftover.$file 25 | done 26 | # tabix 27 | for file in 
leftover*.normal.merge.vcf.gz; do 28 | tabix -p vcf $file 29 | done 30 | 31 | # merge the remaining sites 32 | nohup vcf-merge $(ls -1 leftover*.normal.merge.vcf.gz | perl -pe 's/\n/ /g') > leftover.all.normal.merge.vcf & 33 | bgzip -c leftover.all.normal.merge.vcf > leftover.all.normal.merge.vcf.gz 34 | tabix -p vcf leftover.all.normal.merge.vcf.gz 35 | 36 | # the last line is broken; clean it up 37 | zcat all.normal.merge_partial.vcf.gz | head -n -1 | bgzip -c > all.normal.merge_partial_cleaned.vcf.gz 38 | tabix -p vcf all.normal.merge_partial_cleaned.vcf.gz 39 | 40 | # merge both and upload 41 | vcf-concat all.normal.merge_partial_cleaned.vcf.gz leftover.all.normal.merge.vcf.gz | bgzip -c > all.normal.merge.vcf.gz & 42 | tabix -p vcf all.normal.merge.vcf.gz 43 | # $ zcat all.normal.merge.vcf.gz | wc -l 44 | # 522763 45 | 46 | gsutil cp all.normal.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/genotyping/merge -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/README.md: -------------------------------------------------------------------------------- 1 | Preliminary steps (any order): 2 | 3 | * Build the Docker image (docker/Dockerfile) and push it to the repository of choice 4 | 5 | * Install dsub 6 | 7 | see [https://github.com/googlegenomics/dsub](https://github.com/googlegenomics/dsub) 8 | 9 | * Make lists of sample IDs to process (if needed) 10 | 11 | ./make_lists.sh 12 | 13 | This action creates the directory analysisID_lists/ and files therein 14 | 15 | 16 | 17 | Main steps 18 | 19 | 1. In script annotate.not-in-exac.sh, check for the appropriate paths and filenames, dsub location, docker image repository location, as well as cloud project information and container parameters, etc. 20 | 21 | 2. Activate dsub. 22 | 23 | 3. Run 24 | 25 | ./annotate.not-in-exac.sh $cancerType 26 | 27 | This action typically launches several hundred jobs (one per analysis ID), which is the recommended approach. 
28 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/analysisID_lists/CHOL.ids: -------------------------------------------------------------------------------- 1 | 00b0d456-bc7c-43cc-9c23-e9bdd89af52f 2 | 024e4bf7-2391-478b-9de5-89cd17b89fa1 3 | 08f1a00b-9ed6-4cc6-bb35-68bf3510700e 4 | 0af33967-eaf5-4b31-8a3e-474950c128cd 5 | 0c177782-f4b7-46e9-9f84-d50ec1aa9308 6 | 0ed3bf36-969d-4e5c-b30d-240d463da44b 7 | 10a331fa-b9a8-4dea-b5a8-f65d36c8c7cd 8 | 1746e80c-3a23-4277-8977-ae5cf486e52a 9 | 174c632d-f035-483a-9f79-d9b5e18aaa03 10 | 1b8e6945-69d9-4015-b178-af2e4e54f4d5 11 | 1c4f3554-4788-4de1-aada-e95960974373 12 | 1efacff6-9331-4bfd-9680-ce62df20af39 13 | 1f76a26a-b2eb-4cf5-926a-ba39129f68a9 14 | 21c32d9a-e76f-4f35-941d-45c15158dede 15 | 23ae3329-9c10-4d83-b64b-c5c578cb2bf6 16 | 23e27f4e-1037-43aa-b764-4fb311f3ffdb 17 | 26f66a14-b7de-4072-9b9b-5f5f67103b4b 18 | 27f90585-a7ed-492a-88e9-b52d7e4dd3bf 19 | 280d5745-fde7-46e0-ae7e-e454b0033659 20 | 3118c963-8446-4d4a-8146-6d46f1465780 21 | 343bc2ac-6a8f-4c5a-80b5-2c94bf32dd02 22 | 3b6a787b-33c9-4119-8f3b-94b96029f2d7 23 | 3c0257cf-12bf-4ec7-aacf-ac52d68dda71 24 | 3c5e8db6-8f6b-4d5e-aa27-53ec9fb57214 25 | 3de7df2c-2375-4c9c-aa17-f1437f2c9489 26 | 3e1b05dc-b62c-46d7-a20a-bf2781992cc4 27 | 3fbcfb4e-4034-4ab1-911e-28d4aea07791 28 | 44280092-b8cc-47b2-a561-2e5957f20713 29 | 47632a94-5413-43df-a26a-8bd7292824f7 30 | 4b60d271-7397-46bd-aaee-e3c9b5375a79 31 | 4d85b3fd-53f3-4a52-9322-4a68de4ef7ad 32 | 4f1bc424-598a-445f-8566-c135d2f806f4 33 | 548ddecc-f07c-4b7b-a3fd-08a8466f7c35 34 | 549afc0b-a2a8-4653-bc72-141eaf63ec8c 35 | 59919629-a114-478c-9d48-52a861fedf40 36 | 5ce81c93-c432-4db9-aae9-08f2ca5c5cd1 37 | 5dd3429f-feb7-43d4-ad64-ff30b53c041f 38 | 5f068ddf-6dec-4559-acf3-637730b9e004 39 | 5fddf1ad-bf26-48e6-baef-09c8ee208b53 40 | 61468067-f567-4119-870d-48b930e68c43 41 | 61f9d026-7b71-47b3-a78f-39edd32ad99c 42 | 657a9e5d-4e7d-4568-9e25-a72901cbfc12 
43 | 6587bc6d-9f8d-421e-999b-8d88e409c3f8 44 | 68d5f8e2-0050-4caf-b12d-1b08f033bfc7 45 | 6d064547-7125-4072-ac1d-1bcf1948597d 46 | 6debc21c-b1e2-47fe-bd66-28bc99c613ed 47 | 6e8ecd99-5aa1-4614-87b7-c374787e75fc 48 | 71b53b01-d6ec-4410-8692-f3a4317db5d9 49 | 722842ff-fd56-4965-9f31-c12bd81de159 50 | 722d4179-a064-4e83-b39a-75b5ef361d13 51 | 723ab076-7381-4364-bd04-6a04d8011e8a 52 | 72b856e7-8ce1-4ae8-8d69-a7f4b76b97f9 53 | 73ffa0ba-a122-4be8-ab74-8116c432d361 54 | 75634c49-7875-4c65-870b-8983b185547c 55 | 790e85fa-b916-4b51-8512-37ac7d36f6de 56 | 79c7d7fb-8ac8-4872-b0f5-fdfab8091391 57 | 7d134ea2-8519-4a35-8b00-593a6e73d881 58 | 84e45fce-d12f-4b6e-ba7a-8c6f5b2eadfb 59 | 88312e9f-a42d-45f8-b720-043b7783cd96 60 | 899cb43c-9ef0-4997-a5cc-a90bdbbb6f17 61 | 89e67c77-33a8-4cd0-b866-a9dbfe09e26d 62 | 8de6d5d1-bbb0-4027-bd49-c42e6fb7ef01 63 | 914c8d2d-051f-4a34-8c44-679d10c9c0a8 64 | 92d0abec-0f49-46f5-83ee-1ef2c2b29cd1 65 | 93838731-fec0-4d4a-9b7e-dcadcf083dc6 66 | 9414c05a-029c-430c-96c3-fa87405e2b1e 67 | 995516f3-253f-4c05-ad4b-b857a9135a8e 68 | 996b5802-eb44-4b6e-b6a0-5eaf64a29178 69 | 9c70ced1-7eda-41e5-a59b-91a9c7c55b15 70 | 9e75ab9c-a7dd-4d12-8f7e-2aa855b0061a 71 | 9f1fe80d-3319-4fa4-a0ec-7c622ccea401 72 | a0953af0-0593-4e93-82b7-5d850685a397 73 | a0d82814-b3df-42e0-9a85-dc7d0c5d00fc 74 | a1c00a3b-5164-4898-b7e8-832e8a78622f 75 | a67a55d6-39f6-44d5-a90e-7af3f925434f 76 | a6c27c5e-0346-4d36-919e-0ccd80a76c20 77 | aa58211e-379f-463b-b370-a9adad976ba0 78 | ab45bb9d-a326-4143-b6a4-c6e4b6bc1b40 79 | b06c9621-e956-49a7-b8b5-3a44695ae9e7 80 | b6b5dcfa-d692-4940-83cb-3a343aef28e6 81 | b6e63155-cd48-45d3-a787-c0c3ca330c16 82 | ba3f3a2d-2ffd-461c-a9c6-9cbfe967ff0e 83 | bac40ece-2ec0-4807-a35d-748fdb0d45f7 84 | bcb1ea92-64b0-4d2d-a751-9034aaa0748c 85 | bef9ef3c-b6c8-4cf0-aab4-1facf08a04a7 86 | bf95def0-ed0a-40b0-9e6b-824464c9d037 87 | c8319529-1328-4372-ac3e-b38faf22b122 88 | cc44026d-7056-4627-a7b7-4eb7e42ae7a2 89 | cc96310f-aad1-4fa2-ae04-a7239432f338 90 | 
cd64550f-4cbd-480f-afb8-3cb6940ffa49 91 | ce5363ef-2de7-451f-9718-3a2f130d955a 92 | d0ff754e-7757-426d-834e-759e43b41a85 93 | d5049a47-27de-4463-b213-9123199fe8ba 94 | d5610dde-7d1e-4917-b700-3cba9158041b 95 | d7a5d40b-b2f3-4b46-a613-b2ff5fa24480 96 | dbeafaed-2acb-485f-9e88-92ee7e94b168 97 | dc084a43-a6ee-49fe-b704-780af79e687d 98 | dd344f2e-aadd-49ab-a644-f18fcc343ef5 99 | e4785c4d-bfaa-485b-b124-d3ab0a19aef8 100 | e49f3ac3-4d27-4fc2-86e9-7adb7a91e208 101 | e894afd0-a924-4951-ac22-b6841231e0a2 102 | e962ce93-c3b8-4524-8955-1f826e27756d 103 | ea95e3b2-e3a2-4493-8a60-eaf9bb0b5363 104 | ec0f44d5-e3f7-4b09-b44a-f1dc9731ce31 105 | f0359e15-f01f-425d-b231-fb53ea4ccd71 106 | f2c9cbc1-9383-4266-8dba-4167eb127e50 107 | f34b80b1-0a0f-4e37-8ecc-2e13b4c1553b 108 | f53f8656-b74f-47d6-8bae-d86c6da684d1 109 | f73c413b-a6cf-4da6-a833-14893ebf39ae 110 | fd5f514c-bc98-4042-a13f-40915c6f715d 111 | fe83094b-a94c-46be-a74d-faf572d73b10 112 | feaeac67-aee8-4b27-a10d-42f3ddfb6266 113 | feeb857a-30b1-41ac-96dc-482f85b04a2d 114 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/analysisID_lists/DLBC.ids: -------------------------------------------------------------------------------- 1 | 0706b153-fb32-445d-8ab5-b4ff0d75a7ff 2 | 0b2ff723-4788-497a-be41-789db913ac9d 3 | 1283e672-5e91-4e41-bdfa-46ee5e17b255 4 | 15d7b6b0-2e09-4bd5-bbc8-98af6ee83fe0 5 | 17da368c-d609-4792-97f6-c454ba732c71 6 | 1e727a47-150a-40e7-8bcb-c7b3897d5276 7 | 1fdae439-6285-496c-ae97-f64265e8110e 8 | 26d61be3-88d4-4714-a25e-09a347d1645e 9 | 29e221f7-34bb-4811-b943-2d1f186a1c29 10 | 2c10b72d-e10b-4284-a0d3-c40f0af522f7 11 | 2db07d8b-22f4-4799-9bb9-d1bb58fcf1ec 12 | 2f749f44-fe34-4882-ab91-e231ba6a67f9 13 | 30b796bd-7020-4a64-800a-7b39d624cc7c 14 | 335fa833-2125-480b-b009-e89ba8a610cf 15 | 355d4cae-8061-4d6c-b737-7209d721e8a9 16 | 3dd0c890-265b-4bd1-ae0d-22ced50c24fe 17 | 3e0f1eaf-b928-4d5d-9997-7e9a366f4cf4 18 | 
48b53f33-2a09-47b4-becc-a063f3c7bfc6 19 | 498edf5e-1cab-4579-9170-bf6ce90a59df 20 | 5352e2a7-8b20-4909-8bae-5467c710bd8f 21 | 54050a4a-8577-4eed-902b-c54c1f519a65 22 | 570a8c41-9433-46bb-8283-1870307ed2cb 23 | 58308fe4-a9a3-40e7-a417-44732826c70f 24 | 5a2510f2-0d30-41d3-956c-9dccace0fbb3 25 | 5adc8875-a9a9-441b-995a-2f62f45d0577 26 | 5b6aae67-7801-4842-9996-502d5d872b1b 27 | 5bced86b-7bed-4a51-8c9e-3c0b4ebcdf3c 28 | 668491ad-c0e1-4055-8b4d-5368111b7f72 29 | 676bcdca-e088-418a-8079-435c6e14519c 30 | 69d5e60c-8dd9-4927-8888-14d182da9c65 31 | 6aa7bb0c-0929-4332-abeb-a9140691a8d1 32 | 6c89c9aa-9d84-42dc-bcc2-66a8a7bbd3e5 33 | 78885547-bfe9-4252-814a-ee5ca91369c0 34 | 799f0ecb-99d2-4bd8-a05e-9542f5c0d960 35 | 7bf895e1-2184-449c-a256-058a52d0d540 36 | 7d0c1dda-ef6f-4e74-9e19-0b793c972d12 37 | 7f3492d5-1cd6-41cb-aee3-32f670add6fc 38 | 80c661c8-78d0-4ff5-8cf3-360ff312e36a 39 | 82dad9bd-3e63-4e4c-af18-e394b4d0f17a 40 | 8b7408a3-ce15-40d8-9d3e-b969d373d8f9 41 | 92762a6f-4d78-457d-843e-7a470df7c827 42 | 95c41569-cbc5-46fd-a39c-3680c41a3f03 43 | 9754651c-7be9-4140-8d83-2ee34fcebb4e 44 | 9863378d-5320-4e88-b0db-cca0145b6ccc 45 | 9b256771-6d08-48c7-9696-49ce822961a0 46 | 9c02043f-9daf-437c-8d81-6c214ca5e098 47 | 9e24e779-e612-4256-a41c-667dcacc0728 48 | 9f3f4b94-4adc-4724-b449-cf6c63bdd5d1 49 | a4aa6fdc-4136-4d5d-8858-e918bdbbb3cc 50 | a6aa1829-cbf5-4775-b21d-7d63bfa7a1a4 51 | a78918b5-8182-4eaf-99d7-443e68db17c2 52 | a87753ea-3962-4fa8-a335-7e7ca1197a8e 53 | a8abdd19-9a22-4e82-966f-cf1c44d475d0 54 | abe975a5-4cf2-4d0a-82d7-5b1474673bf7 55 | ac923ff6-77a9-43ef-b7b1-298e5b9c0e1f 56 | aec76bf9-f389-4744-997d-90157466f650 57 | b1f6abd2-b661-4790-a55c-3df2dbd3b105 58 | b27c2a2d-fdf1-4e70-a78a-d18be45f54c4 59 | b3e9db29-8551-4e73-b65d-3c7c3d7d4619 60 | b703c2d0-10a6-4988-b5bb-45c746364ebc 61 | b70b0300-0a53-4d60-b6fc-edef9ba05bc8 62 | bf7633ac-eb47-48d9-95d5-84bee621f230 63 | c12d7063-94fd-4b29-a52d-81581ac47fbc 64 | c5e0c935-5018-47ce-abbc-e200ae808d3b 65 | 
cd223dab-94b1-4b87-9ff3-9af759ba1d21 66 | ce46292f-25b6-4b84-9631-1198f75de4b4 67 | d273ceec-6525-4a8e-b71a-eef9ea7025cb 68 | d7901b09-70bc-472e-a790-9a3134d28d86 69 | d7916479-6e6b-4926-83e3-002682cbd0bd 70 | d8a691e9-1b11-4666-87f4-188a12e716c5 71 | dd3e9cb7-ef09-4f4b-b940-a5959953d11c 72 | ddf60239-37d2-4d30-9523-3a359ece1b3c 73 | de117caa-acbe-4fe8-bd89-6e1b2743881d 74 | de5f55b5-dec3-465b-be46-a073f89fbe0d 75 | deebc37c-0f59-4de3-8727-072b3c60fdda 76 | e1d1e2ce-5eaa-4487-921f-a6c791fb5e7a 77 | e3b54861-5f84-4341-84a8-3a74f1f42e51 78 | e820d45d-cba3-453f-b618-063236575f78 79 | ea038362-67b8-4c50-bb49-45731ea0c90a 80 | ede52a3c-ea6c-4f85-9dc2-6e2efaa8eab3 81 | f1fdc4ed-bc79-4670-99ae-cd1e20795d36 82 | f44c54ed-40cb-4a51-a526-ec7c211a008c 83 | f7d459b9-3353-4fcd-b477-32e5964d9217 84 | f811068c-70c8-4825-9567-16cc0037cf03 85 | f9ba0957-2226-4c24-a1d3-dd5f693e8c80 86 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/analysisID_lists/UCS.ids: -------------------------------------------------------------------------------- 1 | 08e12a8e-5220-4770-a50d-00ee4046ccaf 2 | 0b8434ac-f295-437d-acdd-08e6d428936d 3 | 0e1c1fa0-ae6f-4f34-aea9-657a801f04d9 4 | 103a746b-f2df-4f06-a5a0-90b92d823458 5 | 10fd317c-6a26-4da3-9694-373e7b2066cd 6 | 1222db1c-c99c-411c-9f51-6b0ec158a49d 7 | 17cea4ac-0b5f-42fe-a1b7-362311349965 8 | 184000f8-0963-4ce3-87dd-5acdab06c56e 9 | 18e815cd-6225-489a-ac9b-37132b635b53 10 | 1b65b29b-22bd-4706-867f-0d953e818453 11 | 1c7b7652-0311-45bf-9cb8-18b3b7caced2 12 | 1e515b00-58c4-4ce4-8c19-73a27eaaa707 13 | 2299c370-db0f-4d36-abce-894a039d6895 14 | 22d1b470-710d-415c-bdae-786e14ff68e3 15 | 22dfbcbd-2b8d-49ac-9a50-02dffe45263e 16 | 25c94f94-2154-4024-a9cb-b3c8c09eec5d 17 | 26fd12f9-0c59-470d-8ef2-31ad48ded9d8 18 | 29a60b4a-8b42-4394-ad99-a6704da8274c 19 | 30085e74-ced4-43d2-9eb9-cc0ff4402d7d 20 | 556d0725-a072-4f94-9467-2f7a148d9e40 21 | 585388db-c4fc-4b1d-b161-ebe19a4acd4c 22 | 
5b1a7d51-92f7-4003-aee2-0fa68764e400 23 | 5b969f8b-2484-4cdf-ae77-fd77d40a2ac5 24 | 5c29cdfc-5120-423b-98ad-ba2fbb91833d 25 | 5d582226-0942-447b-a834-7c7d578b652a 26 | 7c116e6b-566e-4e20-aafc-aa68ea775fb8 27 | 7e82d505-51cf-472f-b850-519e50e4152f 28 | 823e8f5e-34ff-4751-a696-6e1270e57ba9 29 | 8eef9582-d240-4df8-b8e9-b5e521060653 30 | 9170a2b3-fd9f-4849-84a1-eeb1f89f2d02 31 | 948e4ecc-676b-44d4-90ad-4539f82cafc3 32 | 994d9165-bf0f-4412-a4af-16e4a0193cfe 33 | 9957321a-c3d4-4a6b-a78b-a29f843c36ba 34 | 9c8afe1b-1690-4f8d-8771-9f277a6ea1c9 35 | a0cd9f50-fbc7-4d76-badb-204a58de275f 36 | a4603f50-94d0-49f5-8686-b3579e5c393a 37 | a4edf06b-db69-448e-9317-18629e102b91 38 | aa25fc96-c66b-4629-9e23-70ca0661cb6c 39 | b61f6646-0e70-4a54-ae8b-87c4ba21880d 40 | be3f17b4-65d7-45a4-8bc9-686c298184fa 41 | c0a5553c-b623-45ae-a369-42fefff00109 42 | c704a3b7-05ee-4340-854a-dcdd7f8168a1 43 | c7716f57-e9ed-4acc-a3d6-d8ff7c15a801 44 | d2c491bf-7ce4-44cb-b735-dde9cd6bffc3 45 | d7736ff5-ab09-40b0-b634-70367b17fb42 46 | db042e45-e59b-42ab-8031-2da836a8cd65 47 | de931de2-38c0-4b8e-b21e-35cebbe9d2b4 48 | e07b0cb2-d492-4dad-94ed-81d53cd1f4aa 49 | e0974afe-5847-4fd3-b78d-43dce6851a2d 50 | e2d04a29-694d-446a-80d1-b957b1f0f9e4 51 | e2dc9062-a426-402c-a0f0-f147580d8bcd 52 | e58159cb-ff8c-4b7c-b136-f044a6424c46 53 | eccb6457-4629-49d7-91d3-0329a9943bc9 54 | ed64e08c-40c2-4712-aaf7-6625dcad4aaf 55 | f77313cf-8249-4b8b-a230-ab42afdd8bc5 56 | f8238d45-38a4-4b43-941f-fe5500ee4d40 57 | faaae401-c7b6-40f3-9491-28adcfb1550a 58 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/analysisID_lists/UVM.ids: -------------------------------------------------------------------------------- 1 | 06b67b89-65ff-43d5-ada7-5e8e6413815f 2 | 0a06f8b7-c37e-4b21-aa9a-2e7acce85904 3 | 0b5c74aa-2ae3-4f6c-a65a-f6c2debad86d 4 | 0b6c8bcb-be31-4e16-8359-2b4080fb403d 5 | 0cf5dd7c-1d8d-4f8e-a66d-be3556515c8e 6 | 0da5e271-e403-4b84-a5ea-6a2148472aef 7 | 
0e41d859-bc51-4c6a-b880-5789a8ee83ac 8 | 0e9e81c5-9391-4adb-945c-8eaf5859ef20 9 | 10f63d37-d925-496b-bf16-2c4fa71fdc05 10 | 127793c7-a726-4216-b9a7-ad9641e03d15 11 | 12dd6b53-4000-46be-a442-2486f6fe257d 12 | 13725399-7ec0-4620-975d-69e0b9d50002 13 | 14a68f08-85b3-451d-b4bb-30bf6a6df3c5 14 | 151daea0-6553-437e-bcc7-c7d68c155b95 15 | 196e5009-adcf-47ec-ad6a-0c0d399c45cc 16 | 1ae6cc79-178e-4693-b340-3290d2086782 17 | 1dd3d2dd-cf62-4760-8560-ca03622b2f6a 18 | 1e55a54d-8ae3-4d1b-92f1-e5f6e8cc54da 19 | 21b3e8bd-76ce-4957-8de7-bff48af5e05c 20 | 245edb11-fa42-4dc6-abb9-00f3ddda65fe 21 | 266f9b4a-f427-4131-928e-2ccb53a6de7d 22 | 2a17109e-4b46-4ac2-8bc9-89549a0a65e5 23 | 2a8318e1-6565-4fcd-8764-ecb2f208d671 24 | 2b97651a-1306-462a-97fe-7bf22054da81 25 | 2d9cb3b8-5ed2-438b-873f-fa8e46a857af 26 | 2ec8feb1-e07e-4803-9c65-555c1f4b1415 27 | 301e9730-5c5e-4934-a951-eca76df91b95 28 | 3da21087-0c6f-4972-a8d1-15847c5ab2b8 29 | 3e08b7f7-abae-43d3-b996-a85cf07f443f 30 | 4685b14f-1153-42a2-ae0a-06c9d833b2a5 31 | 512df98e-abd6-487a-8955-46f73ed9eb75 32 | 521e8b6c-287f-45ba-a08c-dfb8ab2f5d6c 33 | 56e324f4-24cc-49ce-bb7b-8f0cc8881a1d 34 | 608584ee-2611-4312-8a21-a53ce20f58ac 35 | 61ae63c7-6581-4bb5-9856-476f9077ad7b 36 | 6b38ee33-5bd2-455e-b0a3-c82968e7fde7 37 | 71e03403-d24e-4039-9940-f88fce01ade5 38 | 73e00332-2716-47c6-8f73-61b0d4730454 39 | 7432dc4c-5af1-4aa5-b22a-6f2d45e11219 40 | 74eb2254-3398-4680-93e3-cbc60b83e01b 41 | 79ee7ade-d019-454a-b53a-fc8fec750f19 42 | 7b869ca3-86df-461e-b1f2-bd43960da45c 43 | 7b8db7d1-9978-4038-b321-8ec7e7a02210 44 | 7e18cd39-cf37-43c4-9fd2-0aa522bdbd10 45 | 7fe12666-cba9-44b2-a6f9-1c078b88121b 46 | 84a97aa6-3c93-48b1-b7e9-e063f80606cd 47 | 852bc893-6939-4558-bb15-cebb997a9797 48 | 869dc9d7-a30a-400b-80d5-596924a73a63 49 | 8be24848-b089-4cdb-be6e-0d617b1fe16c 50 | 8ed4629c-1e6c-43f1-8275-f1494f94859b 51 | 9347d4b7-88d5-45d1-96dc-5e781185bfdf 52 | 960d692d-c144-4aa0-b0a1-acfc3a43ee2f 53 | 96ae97a7-7631-4bd3-a7c0-d42761042f1f 54 | 
a5aa583d-bbb8-4129-a7a4-8542a647398f 55 | a6bc4b41-a8ec-47cd-ab2d-a07e2b70f2e3 56 | a9f94e7b-32b5-48f3-a4a6-ef76d5408463 57 | ac91dae2-608c-4a0b-bf22-282d6274fcf8 58 | b11c7d83-d013-4a95-ad65-86a421c6c0c9 59 | b3a011d8-11ae-4d39-90cd-c5f12d7fb963 60 | b677aba6-a979-49cd-9de4-dedc5ff775e6 61 | bcfd1131-c6b1-4692-acf9-5000afe79d89 62 | c4430ed7-4733-4b2d-9606-63ee509abf09 63 | c6208775-c4bc-4869-a7fb-d0d820b19a99 64 | c7e4a802-a691-48dc-b447-1e4181feb5eb 65 | ced4f609-8f81-4323-b739-bc41e7304b44 66 | d0035037-a84c-4819-b379-a76f2c455224 67 | d436eb15-6638-472c-a011-196fd98606c8 68 | d539f558-316e-4e83-a4d8-3ef06c3df63e 69 | d9d24327-ee9f-462c-8672-b48c31bc4656 70 | dcc7f7a7-bac8-4932-b1f8-ee1ef1327c94 71 | df51e932-1fe3-45d1-ae96-d6de6d8e592a 72 | e113d903-5773-47ce-855f-fe43446783c8 73 | e3a6ef74-634e-4c97-8ead-dcacdfea4258 74 | ef9a88a0-106c-42e0-8c22-ec86bed608b8 75 | f42dfc88-d12b-490b-aadd-8fb7cd08da3b 76 | f440c9d3-d62e-4ffc-afc1-7b444d191c99 77 | f50234f7-56a7-4b50-8249-bb19940d0f70 78 | fbad6458-9565-4454-a03a-8ec83e6fb60c 79 | fc50c7f1-e67d-43d4-a211-543491d05f44 80 | fcf2b05a-7fdd-43fc-bf51-9e219e39526d 81 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/annotate.not-in-exac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # by Jay Mashl, June 2017 4 | # PanCanAtlas Germline 5 | 6 | #for cancerType in ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LGG LIHC LUAD LUSC MESO OV PAAD PCPG PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM ; do 7 | cancerType=$1 8 | 9 | analysisList=analysisID_lists/$cancerType.ids 10 | 11 | for analysisId in $(cat $analysisList) ; do 12 | 13 | # variable input 14 | inputPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine 15 | VCFIN=prefilter.snp_indel.vcf.gz 16 | 17 | 
outputPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine 18 | #VCFOUT=${VCFIN/%vcf.gz/annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz} 19 | VCFOUT=${VCFIN/%vcf.gz/annotated.ExAC_AF.0.01.ExAConly.AD.3.ROI.vcf.gz} # updated for the updated run with only ExAC rare 20 | 21 | logsPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine/annotate_logs 22 | 23 | # fixed input 24 | EXAC=gs://dinglab/jay/annotation/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz 25 | BEDFILE=gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed 26 | 27 | ~/dsub/dsub/dsub \ 28 | --project isb-cgc-06-0004 \ 29 | --zones "us-central1-*" \ 30 | --logging $logsPath \ 31 | --input VCFIN=${inputPath}/${VCFIN} VCFINIDX=${inputPath}/${VCFIN}.tbi EXAC=${EXAC} EXACIDX=${EXAC}.tbi BEDFILE=${BEDFILE} \ 32 | --output VCFOUT=${outputPath}/${VCFOUT} VCFOUTIDX=${outputPath}/${VCFOUT}.tbi \ 33 | --command 'cd $(dirname ${VCFIN}) && mv $EXAC $EXACIDX $BEDFILE $(dirname ${VCFIN}) && /usr/local/bin/variant_QC_annotation.sh $(basename ${VCFIN}) && mv $(basename ${VCFOUT}) $(basename ${VCFOUTIDX}) $(dirname ${VCFOUT})' \ 34 | --disk-size 20 \ 35 | --min-ram 4 \ 36 | --min-cores 1 \ 37 | --name annotate \ 38 | --image gcr.io/isb-cgc-06-0004/dinglab_pca_analysis:0.1 \ 39 | --scopes https://www.googleapis.com/auth/compute https://www.googleapis.com/auth/devstorage.full_control https://www.googleapis.com/auth/genomics https://www.googleapis.com/auth/logging.write https://www.googleapis.com/auth/monitoring.write 40 | 41 | done 42 | 43 | #done 44 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM google/cloud-sdk 3 | 4 | LABEL maintainer="R. 
Jay Mashl " 5 | LABEL program="PanCanAtlas analysis" 6 | LABEL version="0.1" 7 | 8 | RUN apt-get update && apt-get -y install \ 9 | autoconf \ 10 | build-essential \ 11 | libncurses-dev \ 12 | perl \ 13 | pkg-config \ 14 | unzip \ 15 | wget \ 16 | zlib1g-dev \ 17 | && rm -rf /var/lib/apt/lists/* 18 | 19 | # install vcfanno 20 | WORKDIR /usr/local/bin 21 | RUN wget -O vcfanno https://github.com/brentp/vcfanno/releases/download/v0.2.6/vcfanno_linux64 22 | RUN chmod +x ./vcfanno 23 | 24 | # install vcftools 25 | WORKDIR /usr/local/src 26 | RUN wget -O v0.1.14.zip https://github.com/vcftools/vcftools/archive/v0.1.14.zip && unzip v0.1.14.zip && rm -f v0.1.14.zip && cd vcftools-0.1.14 && export ZLIB_LIBS=-lz && export ZLIB_CFLAGS=-I/usr/include && ./autogen.sh && ./configure --prefix=/usr/local && make && make install 27 | 28 | # install samtools 29 | WORKDIR /usr/local/src 30 | RUN wget -O samtools-1.2.tar.bz2 https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 && tar xjf samtools-1.2.tar.bz2 && rm -f samtools-1.2.tar.bz2 31 | RUN cd samtools-1.2/htslib-1.2.1 && ./configure && make && cp bgzip htsfile tabix /usr/local/bin/ && cp libhts.so.1 /usr/local/lib/ && /sbin/ldconfig 32 | WORKDIR /usr/local/src 33 | RUN cd samtools-1.2 && make && cp samtools /usr/local/bin/ 34 | 35 | # scripts 36 | WORKDIR /usr/local/bin 37 | COPY variant_QC_annotation.sh filter_VCF_AF_AD.py ExAC_config.toml ./ 38 | RUN chmod +x ./variant_QC_annotation.sh 39 | 40 | 41 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/docker/ExAC_config.toml: -------------------------------------------------------------------------------- 1 | [[postannotation]] 2 | fields=["ExAC_AC_Adj", "ExAC_AN_Adj"] 3 | name="ExAC_AF_Adj" 4 | op="div2" 5 | type="Float" 6 | 7 | [[annotation]] 8 | #file="/Users/khuang/Box Sync/PhD/germline/ExAC/ExAC.r1.sites.vep.vcf.gz" 9 | file="ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" 10 
| fields=["AC_Adj","AN_Adj"] 11 | ops=["self","self"] 12 | names=["ExAC_AC_Adj","ExAC_AN_Adj"] 13 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/docker/filter_VCF_AF_AD.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 3 | 4 | import sys 5 | import getopt 6 | import gzip 7 | 8 | class autovivification(dict): 9 | '''Implementation of perl's autovivification feature.''' 10 | def __init__( self , *args , **kwargs ): 11 | super( autovivification , self ).__init__( *args , **kwargs ) 12 | self.itemlist = super( autovivification , self ).keys() 13 | def __getitem__(self, item): 14 | try: 15 | return dict.__getitem__(self, item) 16 | except KeyError: 17 | value = self[item] = type(self)() 18 | return value 19 | 20 | def main(): 21 | def usage(): 22 | print """ 23 | filter_AD_VCF.py : why do I exist? 24 | 25 | USAGE: filter_AD_VCF.py [-h] 26 | -h print this message 27 | input file 28 | """ 29 | 30 | if len(sys.argv) == 4: 31 | vcfFH= sys.argv[1] 32 | MAF_thres = float(sys.argv[2]) 33 | AD_thres = int(sys.argv[3]) 34 | else: 35 | usage() 36 | sys.exit() 37 | 38 | try: 39 | vcfF = open(vcfFH,"r") 40 | except IOError: 41 | print("VCF file does not exist!") 42 | 43 | outFstring = "ExAC_AF." + str(MAF_thres) + ".AD." 
+ str(AD_thres) + ".vcf" 44 | outF = vcfFH.replace("vcf",outFstring) 45 | outFH = open(outF, "w") 46 | 47 | all_var = 0 48 | nonpass_MAF_var = 0 49 | nonpass_AD_var = 0 50 | pass_var = 0 51 | 52 | for line in vcfF: 53 | line=line.strip() 54 | # print the info lines 55 | if line.startswith("#"): 56 | outFH.write(line + "\n") 57 | else: 58 | F = line.split("\t") 59 | all_var = all_var + 1 60 | 61 | info_f = str(F[7]).split(";") 62 | format_f = str(F[8]).split(":") 63 | geno_f = str(F[9]).split(":") 64 | AD_index = -1 65 | 66 | ### MAF filter 67 | nonpass_MAF = False 68 | for info in info_f: 69 | # find the cases with annotated ExAC frequency 70 | if info.startswith("ExAC_AF_Adj"): 71 | ExAC_AF = info.replace("ExAC_AC_Adj=","") 72 | if "," in ExAC_AF: 73 | ExAC_AFs = ExAC_AF.split(",") 74 | if ExAC_AFs[0] > MAF_thres: # need to assume it's the first allele 75 | nonpass_MAF = True 76 | else: 77 | if ExAC_AF > MAF_thres: 78 | nonpass_MAF = True 79 | if nonpass_MAF: 80 | nonpass_MAF_var = nonpass_MAF_var + 1 81 | continue 82 | 83 | 84 | ### AD filter 85 | nonpass_AD = False 86 | for i in range(0,len(format_f)): 87 | if str(format_f[i]) == "AD": 88 | AD_index = i 89 | 90 | genotype = str(geno_f[AD_index]) 91 | # GATK and Pindel calls 92 | # second int for alt allele 93 | if "," in genotype: 94 | genotypes = genotype.split(",") 95 | if int(genotypes[1]) < AD_thres: 96 | nonpass_AD = True 97 | nonpass_AD_var = nonpass_AD_var + 1 98 | # varscan calls 99 | else: 100 | if int(genotype) < AD_thres: 101 | nonpass_AD = True 102 | nonpass_AD_var = nonpass_AD_var + 1 103 | 104 | if not nonpass_MAF and not nonpass_AD: 105 | pass_var = pass_var + 1 106 | outFH.write(line + "\n") 107 | 108 | # filter summary 109 | print "number of total variants:", all_var 110 | print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var 111 | print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var 112 | print "number of total passed variants:", pass_var 113 
| # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres: 114 | # print line 115 | # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres: 116 | # print line 117 | outFH.close() 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/docker/variant_QC_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # by Kuan-lin Huang 2017 May @ WashU 3 | echo "start time" 4 | date 5 | # requires: vcftools & vcfanno (https://github.com/brentp/vcfanno) 6 | 7 | LOCAL=/usr/local/bin 8 | 9 | ### AF and AD frequency filter ### 10 | vcfFile=$1 11 | 12 | # gsutil cp gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed 13 | bedFile=all_CDS_ncRNA_ENCODE_multicell_ROI.bed 14 | 15 | vcfannoConfigFile=$LOCAL/ExAC_config.toml 16 | AF_thres=0.01 17 | AD_thres=3 18 | 19 | # annotate with ExAC frequency 20 | annotated_VCF=${vcfFile/vcf.gz/annotated.vcf} 21 | echo "using vcfanno to annotate" ${vcfFile} "into" ${annotated_VCF} 22 | vcfanno ${vcfannoConfigFile} ${vcfFile} > ${annotated_VCF} 23 | 24 | filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.AD.${AD_thres}.vcf} 25 | echo "filtering" ${annotated_VCF} "into" ${filtered_VCF} 26 | python $LOCAL/filter_VCF_AF_AD.py ${annotated_VCF} $AF_thres $AD_thres 27 | date 28 | 29 | ### extract ROI (region of interest) ### 30 | extracted_VCF=${filtered_VCF/vcf/ROI.vcf.gz} 31 | echo "extracting" ${filtered_VCF} "based on" $bedFile "into" ${extracted_VCF} 32 | vcftools --vcf $filtered_VCF \ 33 | --bed $bedFile \ 34 | --keep-INFO-all --recode -c | bgzip -c > ${extracted_VCF} 35 | 36 | # tabix 37 | echo "Indexing extracted VCF" 38 | tabix -p vcf ${extracted_VCF} 39 | # remove intermediate VCF files 40 | rm -f $annotated_VCF 41 | rm -f $filtered_VCF 42 | date 43 | 44 | # possible option: calculate concordance with genotype file here 
[in this case you may want to batch the jobs by cancer types as I have already extracted those to ENCODE and exon regions in the VM: kuan-merge-genotype-bigmem] 45 | 46 | # [alternative: merge first across samples in a separate workflow] 47 | # [alternative: do a 5% MAF filter for the merged VCF within cohort to remove potential pipeline artifacts] 48 | 49 | # annotate the resulting VCF using VEP 50 | 51 | # run CharGer 52 | 53 | # move the resulting VCF and files back to storage 54 | 55 | echo "end time" 56 | date 57 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/make_lists.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # by Jay Mashl, June 2017 4 | # PanCanAtlas Germline 5 | 6 | # Get analysisId 7 | LISTS_DIR=analysisID_lists 8 | mkdir -p $LISTS_DIR 9 | for cancerType in ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LGG LIHC LUAD LUSC MESO OV PAAD PCPG PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM ; do 10 | echo $cancerType 11 | listFile=$LISTS_DIR/$cancerType.ids 12 | if [ ! 
-e $listFile ] ; then 13 | gsutil ls -d gs://dinglab/isb-cgc/tcga/germline/production/$cancerType/* | sed -e 's/\// /g' | awk '{print $NF}' > $listFile 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /analysis/process_files/germline/google-cloud-ISB/unused/Dockerfile: -------------------------------------------------------------------------------- 1 | ##### bioconda: https://hub.docker.com/r/bioconda/bioconda-builder/~/dockerfile/ ##### 2 | 3 | FROM centos:centos5 4 | 5 | # add tools useful for compilation 6 | RUN rpm -Uvh http://dl.fedoraproject.org/pub/epel/5/x86_64/epel-release-5-4.noarch.rpm 7 | # Install wget first so we can download devtools-2 and autotools repos 8 | RUN yum install -y wget && \ 9 | yum clean all 10 | RUN wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo 11 | RUN yum install -y \ 12 | bzip2 \ 13 | git \ 14 | gcc \ 15 | gcc-c++ \ 16 | patch \ 17 | make \ 18 | gcc44 \ 19 | gcc44-c++ \ 20 | cmake \ 21 | unzip \ 22 | byacc \ 23 | devtoolset-2-gcc \ 24 | devtoolset-2-binutils \ 25 | devtoolset-2-gcc-c++ \ 26 | devtoolset-2-gcc-gfortran \ 27 | autotools-latest \ 28 | pkgconfig \ 29 | which \ 30 | file \ 31 | gpg \ 32 | # Needed for perl-db-file 33 | db4-devel \ 34 | # install X11 dependencies and openGL/mesa 35 | xorg-x11-apps \ 36 | mesa-libGLU-devel \ 37 | && yum clean all 38 | 39 | 40 | 41 | # install conda 42 | RUN mkdir -p /tmp/conda-build && wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b -p /anaconda 43 | ENV PATH=/opt/rh/devtoolset-2/root/usr/bin:/opt/rh/autotools-latest/root/usr/bin:/anaconda/bin:$PATH 44 | ENV LANG en_US.UTF-8 45 | ENV LC_ALL en_US.UTF-8 46 | RUN mkdir -p /anaconda/conda-bld/linux-64 /anaconda/conda-bld/osx-64 # workaround for bug in current conda (conda issue #466) 47 | 48 | # setup conda 49 | ADD requirements.txt requirements.txt 50 | RUN conda 
update conda 51 | RUN conda install -y --file requirements.txt 52 | RUN conda update conda-build 53 | RUN conda index /anaconda/conda-bld/linux-64 /anaconda/conda-bld/osx-64 54 | RUN conda config --add channels bioconda 55 | RUN conda config --add channels r 56 | RUN conda config --add channels file://anaconda/conda-bld 57 | RUN conda install -y toposort 58 | 59 | # setup entrypoint (assuming that repo is mounted under /bioconda-recipes) 60 | ENTRYPOINT ["/bioconda-recipes/scripts/build-packages.py"] 61 | CMD [] 62 | 63 | ##### vcftools: https://hub.docker.com/r/biocontainers/vcftools/~/dockerfile/ ##### 64 | # Base Image 65 | FROM biocontainers/biocontainers:latest 66 | 67 | # Metadata 68 | LABEL base.image="biocontainers:latest" 69 | LABEL version="1" 70 | LABEL software="vcftools" 71 | LABEL software.version="0.1.14" 72 | LABEL description="A set of tools written in Perl and C++ for working with VCF files, such as those generated by the 1000 Genomes Project" 73 | LABEL website="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html" 74 | LABEL documentation="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html" 75 | LABEL license="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html" 76 | LABEL tags="Genomics" 77 | 78 | # Maintainer 79 | MAINTAINER Saulo Alves Aflitos 80 | 81 | USER root 82 | 83 | ENV ZIP=vcftools-0.1.14.tar.gz 84 | ENV URL=https://github.com/vcftools/vcftools/releases/download/v0.1.14/ 85 | ENV FOLDER=vcftools-0.1.14 86 | ENV DST=/tmp 87 | 88 | RUN wget $URL/$ZIP -O $DST/$ZIP && \ 89 | tar xvf $DST/$ZIP -C $DST && \ 90 | rm $DST/$ZIP && \ 91 | cd $DST/$FOLDER && \ 92 | ./configure && \ 93 | make && \ 94 | make install && \ 95 | cd / && \ 96 | rm -rf $DST/$FOLDER 97 | 98 | USER biodocker 99 | 100 | WORKDIR /data/ 101 | 102 | 103 | ##### vcfanno: http://brentp.github.io/vcfanno/#installation ##### 104 | 105 | #RUN conda install -c bioconda vcfanno 106 | 107 | RUN wget 
https://github.com/brentp/vcfanno/releases/download/v0.2.6/vcfanno_linux64 && \ 108 | cp vcfanno_linux64 /usr/local/bin/vcfanno_linux64 && \ 109 | rm vcfanno_linux64 110 | 111 | ##### vcflib: https://hub.docker.com/r/itsjeffreyy/vcflib/~/dockerfile/ ##### 112 | # Base image ubuntu:16.04 113 | FROM ubuntu:16.04 114 | 115 | # Author 116 | MAINTAINER Jeffreyy Chun-Hui Yu 117 | 118 | # install the system requirement 119 | RUN \ 120 | apt-get update --fix-missing -yq \ 121 | && apt-get install -q -y wget g++ gcc make bzip2 git autoconf automake make g++ gcc build-essential zlib1g-dev libgsl0-dev curl git wget unzip tabix libncurses5-dev 122 | 123 | WORKDIR /opt 124 | 125 | # install vcflib 126 | RUN \ 127 | git clone --recursive https://github.com/vcflib/vcflib.git \ 128 | && cd vcflib \ 129 | && make 130 | 131 | ENV PATH=/opt/vcflib/bin:$PATH 132 | 133 | #clean tar balls 134 | RUN \ 135 | rm -rf /var/lib/apt/lists/* \ 136 | && apt-get autoremove -y 137 | 138 | 139 | # set path 140 | WORKDIR /root 141 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/ExAC_config.toml: -------------------------------------------------------------------------------- 1 | [[postannotation]] 2 | fields=["ExAC_AC_Adj", "ExAC_AN_Adj"] 3 | name="ExAC_AF_Adj" 4 | op="div2" 5 | type="Float" 6 | 7 | [[annotation]] 8 | #file="/Users/khuang/Box Sync/PhD/germline/ExAC/ExAC.r1.sites.vep.vcf.gz" 9 | file="ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" 10 | fields=["AC_Adj","AN_Adj"] 11 | ops=["self","self"] 12 | names=["ExAC_AC_Adj","ExAC_AN_Adj"] 13 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/create_ROI_genotype_VCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gcloud compute copy-files --zone us-central1-f 
../../../../TCGA_data/reference_files/all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic huangkuanlin@kuan-merge-genotype-bigmem:~/ 3 | gcloud compute copy-files --zone us-central1-f ../../../../TCGA_data/reference_files/ROI_MultiCell_perid.txt huangkuanlin@kuan-merge-genotype-bigmem:~/ 4 | 5 | while IFS='' read -r line || [[ -n "$line" ]]; do 6 | echo "Processing cancer type: $line" 7 | cancer="${line%\\n}" 8 | 9 | # extract exonic region (runs in the background, in parallel with the ENCODE extraction below) 10 | vcftools --gzvcf ${cancer}.normal.merge.vcf.gz \ 11 | --bed all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic \ 12 | --keep-INFO-all --recode -c | bgzip -c > ${cancer}.normal.merge.allCDS.vcf.gz & 13 | 14 | # extract encode region 15 | vcftools --gzvcf ${cancer}.normal.merge.vcf.gz \ 16 | --bed ROI_MultiCell_perid.txt \ 17 | --keep-INFO-all --recode -c | bgzip -c > ${cancer}.normal.merge.ENCODE.vcf.gz 18 | 19 | wait # index: let the backgrounded allCDS extraction finish writing before tabix reads its output 20 | tabix -p vcf ${cancer}.normal.merge.allCDS.vcf.gz & 21 | tabix -p vcf ${cancer}.normal.merge.ENCODE.vcf.gz 22 | 23 | done < cancer_type.txt -------------------------------------------------------------------------------- /analysis/process_files/germline/local/expand_csq.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #Oct 2017 - Kuan-Lin Huang @ WashU - 3 | 4 | import sys 5 | import getopt 6 | import gzip 7 | 8 | 9 | def main(): 10 | def usage(): 11 | print """ 12 | combine_CharGer2VCF.py : combine CharGer output to its originating VCF file 13 | 14 | USAGE: liftover_CharGer_result.py [-h] 15 | -h print this message 16 | input file 17 | """ 18 | 19 | if len(sys.argv) >= 3: 20 | vcfheadFH = sys.argv[1] 21 | CharGerFH= sys.argv[2] 22 | else: 23 | usage() 24 | sys.exit() 25 | 26 | try: 27 | vcfheadF = open(vcfheadFH,"r") 28 | except IOError: 29 | print("File , CharGerFH, does not exist!") 30 | csq_header = "" 31 | for line in vcfheadF: 32 | line = line.strip() 33 | if line.startswith("##INFO= 26 | -h print this message 27
| input file 28 | """ 29 | 30 | if len(sys.argv) == 3: 31 | vcfFH= sys.argv[1] 32 | AD_thres = int(sys.argv[2]) 33 | else: 34 | usage() 35 | sys.exit() 36 | 37 | try: 38 | vcfF = gzip.open(vcfFH,"r") 39 | except IOError: 40 | print("VCF file does not exist!") 41 | 42 | 43 | for line in vcfF: 44 | line=line.strip() 45 | # print the info lines 46 | if line.startswith("#"): 47 | print line 48 | else: 49 | F = line.split("\t") 50 | 51 | ref = str(F[3]) 52 | info_f = str(F[7]).split(";") 53 | format_f = str(F[8]).split(":") 54 | geno_f = str(F[9]).split(":") 55 | AD_index = -1 56 | 57 | ### reference filter 58 | nonpass_ref = False 59 | if ref == "N": 60 | nonpass_ref = True 61 | 62 | ### AD filter 63 | nonpass_AD = False 64 | for i in range(0,len(format_f)): 65 | if str(format_f[i]) == "AD": 66 | AD_index = i 67 | 68 | genotype = str(geno_f[AD_index]) 69 | # GATK and Pindel calls 70 | # second int for alt allele 71 | if "," in genotype: 72 | genotypes = genotype.split(",") 73 | if int(genotypes[1]) < AD_thres: 74 | nonpass_AD = True 75 | # varscan calls 76 | else: 77 | if int(genotype) < AD_thres: 78 | nonpass_AD = True 79 | 80 | if not nonpass_ref and not nonpass_AD: 81 | #outFH.write(line + "\n") 82 | print line 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/filter_VCF_AF_AD.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 3 | 4 | import sys 5 | import getopt 6 | import gzip 7 | 8 | class autovivification(dict): 9 | '''Implementation of perl's autovivification feature.''' 10 | def __init__( self , *args , **kwargs ): 11 | super( autovivification , self ).__init__( *args , **kwargs ) 12 | self.itemlist = super( autovivification , self ).keys() 13 | def __getitem__(self, item): 14 | try: 15 | return dict.__getitem__(self, item) 16 | 
except KeyError: 17 | value = self[item] = type(self)() 18 | return value 19 | 20 | def main(): 21 | def usage(): 22 | print """ 23 | filter_AD_VCF.py : why do I exist? 24 | 25 | USAGE: filter_AD_VCF.py [-h] 26 | -h print this message 27 | input file 28 | """ 29 | 30 | if len(sys.argv) == 4: 31 | vcfFH= sys.argv[1] 32 | MAF_thres = float(sys.argv[2]) 33 | AD_thres = int(sys.argv[3]) 34 | else: 35 | usage() 36 | sys.exit() 37 | 38 | try: 39 | vcfF = open(vcfFH,"r") 40 | except IOError: 41 | print("VCF file does not exist!") 42 | 43 | outFstring = "ExAC_AF." + str(MAF_thres) + ".AD." + str(AD_thres) + ".vcf" 44 | outF = vcfFH.replace("vcf",outFstring) 45 | outFH = open(outF, "w") 46 | 47 | all_var = 0 48 | nonpass_MAF_var = 0 49 | nonpass_AD_var = 0 50 | pass_var = 0 51 | 52 | for line in vcfF: 53 | line=line.strip() 54 | # print the info lines 55 | if line.startswith("#"): 56 | outFH.write(line + "\n") 57 | else: 58 | F = line.split("\t") 59 | all_var = all_var + 1 60 | 61 | info_f = str(F[7]).split(";") 62 | format_f = str(F[8]).split(":") 63 | geno_f = str(F[9]).split(":") 64 | AD_index = -1 65 | 66 | ### MAF filter 67 | nonpass_MAF = False 68 | for info in info_f: 69 | # find the cases with annotated ExAC frequency 70 | if info.startswith("ExAC_AF_Adj"): 71 | ExAC_AF = info.replace("ExAC_AC_Adj=","") 72 | if "," in ExAC_AF: 73 | ExAC_AFs = ExAC_AF.split(",") 74 | if ExAC_AFs[0] > MAF_thres: # need to assume it's the first allele 75 | nonpass_MAF = True 76 | else: 77 | if ExAC_AF > MAF_thres: 78 | nonpass_MAF = True 79 | if nonpass_MAF: 80 | nonpass_MAF_var = nonpass_MAF_var + 1 81 | continue 82 | 83 | 84 | ### AD filter 85 | nonpass_AD = False 86 | for i in range(0,len(format_f)): 87 | if str(format_f[i]) == "AD": 88 | AD_index = i 89 | 90 | genotype = str(geno_f[AD_index]) 91 | # GATK and Pindel calls 92 | # second int for alt allele 93 | if "," in genotype: 94 | genotypes = genotype.split(",") 95 | if int(genotypes[1]) < AD_thres: 96 | nonpass_AD = True 97 | 
nonpass_AD_var = nonpass_AD_var + 1 98 | # varscan calls 99 | else: 100 | if int(genotype) < AD_thres: 101 | nonpass_AD = True 102 | nonpass_AD_var = nonpass_AD_var + 1 103 | 104 | if not nonpass_MAF and not nonpass_AD: 105 | pass_var = pass_var + 1 106 | outFH.write(line + "\n") 107 | 108 | # filter summary 109 | print "number of total variants:", all_var 110 | print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var 111 | print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var 112 | print "number of total passed variants:", pass_var 113 | # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres: 114 | # print line 115 | # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres: 116 | # print line 117 | outFH.close() 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/filter_VCF_AF_AD_keepExAConly.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #Aug 2017 - Kuan-Lin Huang @ WashU - 3 | 4 | import sys 5 | import getopt 6 | import gzip 7 | 8 | class autovivification(dict): 9 | '''Implementation of perl's autovivification feature.''' 10 | def __init__( self , *args , **kwargs ): 11 | super( autovivification , self ).__init__( *args , **kwargs ) 12 | self.itemlist = super( autovivification , self ).keys() 13 | def __getitem__(self, item): 14 | try: 15 | return dict.__getitem__(self, item) 16 | except KeyError: 17 | value = self[item] = type(self)() 18 | return value 19 | 20 | def main(): 21 | def usage(): 22 | print """ 23 | filter_AD_VCF.py : why do I exist? 
24 | 25 | USAGE: filter_AD_VCF.py [-h] 26 | -h print this message 27 | input file 28 | """ 29 | 30 | if len(sys.argv) == 4: 31 | vcfFH= sys.argv[1] 32 | MAF_thres = float(sys.argv[2]) 33 | AD_thres = int(sys.argv[3]) 34 | else: 35 | usage() 36 | sys.exit() 37 | 38 | try: 39 | vcfF = open(vcfFH,"r") 40 | except IOError: 41 | print("VCF file does not exist!") 42 | 43 | outFstring = "ExAC_AF." + str(MAF_thres) + ".ExAConly.AD." + str(AD_thres) + ".vcf" 44 | outF = vcfFH.replace("vcf",outFstring) 45 | outFH = open(outF, "w") 46 | 47 | all_var = 0 48 | nonpass_MAF_var = 0 49 | nonpass_AD_var = 0 50 | pass_var = 0 51 | 52 | for line in vcfF: 53 | line=line.strip() 54 | # print the info lines 55 | if line.startswith("#"): 56 | outFH.write(line + "\n") 57 | else: 58 | F = line.split("\t") 59 | all_var = all_var + 1 60 | 61 | info_f = str(F[7]).split(";") 62 | format_f = str(F[8]).split(":") 63 | geno_f = str(F[9]).split(":") 64 | AD_index = -1 65 | 66 | # only keep rare, ExAC variants 67 | ### MAF filter 68 | nonpass_MAF = True 69 | for info in info_f: 70 | # find the cases with annotated ExAC frequency 71 | if info.startswith("ExAC_AF_Adj"): 72 | ExAC_AF = info.replace("ExAC_AF_Adj=","") 73 | if "," in ExAC_AF: 74 | ExAC_AFs = ExAC_AF.split(",") 75 | if float(ExAC_AFs[0]) < MAF_thres: # need to assume it's the first allele 76 | nonpass_MAF = False 77 | else: 78 | if float(ExAC_AF) < MAF_thres: 79 | nonpass_MAF = False 80 | if nonpass_MAF: 81 | nonpass_MAF_var = nonpass_MAF_var + 1 82 | continue 83 | 84 | 85 | ### AD filter 86 | nonpass_AD = False 87 | for i in range(0,len(format_f)): 88 | if str(format_f[i]) == "AD": 89 | AD_index = i 90 | 91 | genotype = str(geno_f[AD_index]) 92 | # GATK and Pindel calls 93 | # second int for alt allele 94 | if "," in genotype: 95 | genotypes = genotype.split(",") 96 | if int(genotypes[1]) < AD_thres: 97 | nonpass_AD = True 98 | nonpass_AD_var = nonpass_AD_var + 1 99 | # varscan calls 100 | else: 101 | if int(genotype) < AD_thres: 102 | 
nonpass_AD = True 103 | nonpass_AD_var = nonpass_AD_var + 1 104 | 105 | if not nonpass_MAF and not nonpass_AD: 106 | pass_var = pass_var + 1 107 | outFH.write(line + "\n") 108 | 109 | # filter summary 110 | print "number of total variants:", all_var 111 | print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var 112 | print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var 113 | print "number of total passed variants:", pass_var 114 | # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres: 115 | # print line 116 | # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres: 117 | # print line 118 | outFH.close() 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/filter_merge_germline_by_cancer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cancer=$1 3 | 4 | # raise the open-file limit (the merge below is handed one VCF per sample) 5 | ulimit -n 2500 6 | echo "Processing cancer type: $cancer" 7 | echo "Start time" 8 | date 9 | mkdir -p $cancer 10 | samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/) 11 | for sample in $samples; do 12 | sampleNamePre=${sample##*$cancer/} 13 | sampleName=${sampleNamePre%/} 14 | echo "Sample "$sample 15 | echo "Copying vcf for "$sampleName 16 | gsutil cp ${sample}combine/prefilter.snp_indel.vcf.gz ${cancer}/${sampleName}.prefilter.snp_indel.vcf.gz 17 | python filter_VCF_AD.py ${cancer}/${sampleName}.prefilter.snp_indel.vcf.gz 5 | bgzip -c > ${cancer}/${sampleName}.AD.5.vcf.gz 18 | tabix -p vcf ${cancer}/${sampleName}.AD.5.vcf.gz 19 | # copy the filtered VCF back to storage 20 | gsutil cp ${cancer}/${sampleName}.AD.5.vcf.gz* ${sample}combine/ 21 | done 22 | 23 | # here we may need to limit to the best BAM 24 | 25 | #merge 26 | ~/bin/bcftools-1.5/bcftools merge --output-type z --output ${cancer}.merge.vcf.gz $(ls -1 ${cancer}/*.AD.5.vcf.gz |
perl -pe 's/\n/ /g') 27 | #index 28 | tabix -p vcf ${cancer}.merge.vcf.gz 29 | #upload 30 | #gsutil cp ${cancer}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge 31 | # delete files 32 | #rm -rf ${cancer}/*.vcf.gz 33 | #rm -rf ${cancer}/*.vcf.gz.tbi 34 | echo "End time" 35 | date 36 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/filter_merge_germline_by_sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | batch=${1##*pca_table_split/} 4 | echo "Start time" 5 | date 6 | echo "Processing batch "$batch 7 | echo "" 8 | mkdir ${batch} 9 | 10 | while IFS='' read -r line || [[ -n "$line" ]]; do 11 | sample="${line%\\n}" 12 | 13 | sampleNamePre=${sample##*production/} 14 | sampleNamePre2=${sampleNamePre%/combine/prefilter.snp_indel.vcf.gz} 15 | sampleName=${sampleNamePre2#*/} 16 | echo "Sample "$sample 17 | echo "Copying vcf for "$sampleName 18 | gsutil cp ${sample} ${batch}/${sampleName}.prefilter.snp_indel.vcf.gz 19 | python filter_VCF_AD.py ${batch}/${sampleName}.prefilter.snp_indel.vcf.gz 5 | bgzip -c > ${batch}/${sampleName}.AD.5.vcf.gz 20 | tabix -p vcf ${batch}/${sampleName}.AD.5.vcf.gz 21 | # copy the filtered VCF back to storage 22 | gsutil cp ${batch}/${sampleName}.AD.5.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/release1.0/individualVCF/ 23 | # remove original VCF 24 | rm -f ${batch}/${sampleName}.prefilter.snp_indel.vcf.gz 25 | done < "$1" 26 | 27 | #merge 28 | ~/bin/bcftools-1.5/bcftools merge --output-type z --output ${batch}.merge.vcf.gz $(ls -1 ${batch}/*.AD.5.vcf.gz | perl -pe 's/\n/ /g') 29 | #index 30 | tabix -p vcf ${batch}.merge.vcf.gz 31 | #upload 32 | #gsutil cp ${batch}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge 33 | # delete files 34 | #rm -rf ${batch}/*.vcf.gz 35 | #rm -rf ${batch}/*.vcf.gz.tbi 36 | echo "End time" 37 | date 38 | 
-------------------------------------------------------------------------------- /analysis/process_files/germline/local/make.bsub.commands.sh: -------------------------------------------------------------------------------- 1 | for file in pca_table_split/*; do echo "bsubl -oo filter.merge."${file##*pca_table_split/}".log 'bash filter_merge_germline_by_sample.sh "$file"'"; done 2 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/post_CharGer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | input="Charged_VEP/" 4 | results="ChargedSample/" 5 | variantBuffer=2000 6 | queue="long" 7 | group="/khuang" 8 | forks=4 9 | mem=20000000 10 | 11 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/Charged_VEP/" 12 | 13 | if [ ! -d ${results} ]; then 14 | mkdir ${results} 15 | fi 16 | 17 | for i in {1..22} X Y 18 | do 19 | tsv=${input}charged.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.tsv 20 | vcf=AnnotatedVCFs/anno.PCA.r1.TCGAbarcode.merge.exon.chr${i}.norm.vcf.gz 21 | out=${results}charged.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.samples.tsv 22 | runCMD="python combine_CharGer2VCF.py ${tsv} ${vcf} > ${out}" 23 | log="${out}.log" 24 | echo "bsub -g ${group} -q ${queue} -n ${forks} -M ${mem} -oo ${log} \"${runCMD}\"" 25 | done 26 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/recalc_AF_PM2.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #Oct 2017 - Kuan-Lin Huang @ WashU - 3 | 4 | import sys 5 | import getopt 6 | import gzip 7 | 8 | 9 | def main(): 10 | def usage(): 11 | print """ 12 | combine_CharGer2VCF.py : combine CharGer output to its originating VCF file 13 | 14 | USAGE: liftover_CharGer_result.py [-h] 15 | -h print this message 16 | input file 17 
| """ 18 | 19 | if len(sys.argv) >= 2: 20 | CharGerFH= sys.argv[1] 21 | else: 22 | usage() 23 | sys.exit() 24 | 25 | #open CharGer file 26 | try: 27 | charGerF = open(CharGerFH,"r") 28 | except IOError: 29 | print("File , CharGerFH, does not exist!") 30 | 31 | CharGerHeader = charGerF.readline().strip() 32 | print CharGerHeader + "\tExAC_adj_AF" 33 | #read input file 34 | for line in charGerF: 35 | line=line.strip() 36 | F = line.split("\t") 37 | AF = 0 38 | AFstr = F[80] 39 | AFstr1 = AFstr.split("&") 40 | AFrelevantStr = "" 41 | for AFstrings in AFstr1: 42 | if AFstrings.startswith(F[7]): 43 | AFrelevantStr = AFstrings 44 | if len(AFrelevantStr.split(":")) >1: 45 | AF = float( AFrelevantStr.split(":")[1] ) 46 | if len(F) > 14 and "PM2" in F[14]: 47 | score = F[18] 48 | if AF > 0.0005: 49 | F[18] = int(F[18])-2 # score 50 | F[14] = F[14].replace("PM2,","") 51 | if int(F[18]) < 5: 52 | continue 53 | F[18] = str(F[18]) 54 | AF = str(AF) 55 | F.append(AF) 56 | print "\t".join(F) 57 | 58 | charGerF.close() 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/replace_vcf_header_sample_with_source_TCGA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Jay Mashl, July 2017 4 | # Syntax: uncompressed vcf | $thisScript 5 | # adapted by Kuan Oct 2017 for bcftools output and update directly to TCGA ID 6 | 7 | 8 | use strict; 9 | use warnings; 10 | 11 | my @myList=(); 12 | my @a; 13 | my $samples={}; 14 | # load the lookup table: column 1 of the TSV is the key, column 2 the value later written into the header 15 | my $fn = "/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/TCGA_data/sampleQC/pca_table.20171017.tsv"; 16 | #my $fn = "../../../../TCGA_data/sampleQC/pca_table.20171017.tsv"; 17 | open ( my $in_fh , '<' , $fn ) or die "Cannot open $fn: $!"; 18 | while ( my $row = <$in_fh> ) 19 | { 20 | chomp $row; 21 | my @line = split "\t" , $row; 22 | $samples->{$line[0]}=$line[1]; 23 | } 24 | close $in_fh; 25 | 26 |
while(<STDIN>) { 27 | chomp; 28 | if( /^#/ ) { 29 | # get list of input filenames given to merge 30 | #if( /vcf-merge/ ) { 31 | if( /bcftools_mergeCommand/ ){ 32 | @a = split /\s+/; 33 | #for(my $i = 1; $i < scalar @a; $i++) { 34 | for(my $i = 5; $i < scalar @a; $i++) { # tokens from index 5 onward are taken as the merge input files 35 | my %data = ('inputfile' => $a[ $i ], 'samplename' => ""); 36 | push @myList, \%data; 37 | } 38 | } 39 | if( /^#CHROM/ ) { 40 | @a = split /\t/; 41 | for(my $i = 9 ; $i < scalar @a; $i++) { 42 | # $myList[ $i - 9 ]{'samplename'} = $a[ $i ]; 43 | 44 | # in this application, extract unique identifier from first field 45 | my @b = split /\./, $myList[ $i - 9 ]{'inputfile'}; 46 | my @c = split /\//, $b[0]; 47 | my $TCGA = $samples->{$c[1]}; 48 | $a[ $i ] = $TCGA; 49 | } 50 | } 51 | 52 | #Print 53 | if( /^#CHROM/ ) { 54 | print join("\t", @a),"\n"; 55 | } else { 56 | print $_,"\n"; 57 | } 58 | 59 | } else { 60 | last; # first non-header line: only the header is emitted, so stop reading 61 | } 62 | } 63 | 64 | #for(my $j=0 ; $j < scalar @myList; $j++) { 65 | # print $j,"\t", $myList[$j]{'inputfile'},"\t", $myList[$j]{'samplename'}, "\n"; 66 | #} 67 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/run_VEP.v85.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #(ads&rjm) 2016-09-22 4 | 5 | # use new perl 6 | .
/gscmnt/gc2525/dinglab/rmashl/Software/perl/set_envvars 7 | #which ${PERL_BIN} 8 | #exit 9 | #ads_vep="/gscmnt/gc2706/dinglab/medseq/LabCode/AdamDS/ensembl-vep/vep" 10 | #ads_cachevep="/gscmnt/gc2706/dinglab/medseq/LabCode/AdamDS/VEP/.vep/" 11 | vep_cmd="/gscmnt/gc2525/dinglab/rmashl/Software/perl/perl-5.22.0/bin/perl /gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/ensembl-tools-release-85/scripts/variant_effect_predictor/variant_effect_predictor.pl" 12 | cachedir="/gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/" 13 | reffasta="/gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/homo_sapiens/85_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa" 14 | assembly="GRCh37" 15 | #opts="--plugin ExAC,/gscmnt/gc2706/dinglab/medseq/ExAC/VCF/ExAC.r0.3.1.sites.vep.vcf.gz" #"--everything" 16 | opts="--everything" 17 | results="new_run/AnnotatedVCFs/" 18 | variantBuffer=2000 19 | queue="long" 20 | group="/khuang" 21 | forks=4 22 | mem=20000000 23 | export SAMTOOLSDIR="/gscmnt/gc2525/dinglab/rmashl/Software/bin/samtools/1.2/bin" 24 | expoSAMTOOLS="$SAMTOOLSDIR/samtools" 25 | 26 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/VCF" 27 | 28 | if [ ! 
-d ${results} ]; then 29 | mkdir ${results} 30 | fi 31 | 32 | for i in {1..22} X Y 33 | do 34 | vcf=PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.gz 35 | out=PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf 36 | runVEP="perl format.pl $inputDir $vcf; ${vep_cmd} ${opts} --offline --cache --dir ${cachedir} --assembly ${assembly} --format vcf --vcf -i new_run/preVEP/anno.$out -o ${results}anno.${out} --force_overwrite --fasta ${reffasta} --fork ${forks} --buffer_size ${variantBuffer};" 37 | log="${results}anno.${vcf}.log" 38 | echo "bsub -g ${group} -q ${queue} -n ${forks} -M ${mem} -oo ${log} \"${runVEP}\"" 39 | done 40 | 41 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/run_calc_vcf_concordance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | date 3 | 4 | cancer=$1 5 | outFileExon=${cancer}.exon.QCstat.tsv 6 | outFileEncode=${cancer}.encodeROI.QCstat.tsv 7 | touch $outFileExon 8 | touch $outFileEncode 9 | 10 | samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/) 11 | for sample in $samples; do 12 | sampleNamePre=${sample##*$cancer/} 13 | sampleName=${sampleNamePre%/} 14 | echo "Copying vcf for "$sampleName 15 | 16 | gsutil cp ${sample}combine/*gz ${sampleName}.prefilter.snp_indel.vcf.gz 17 | gsutil cp ${sample}combine/*gz.tbi ${sampleName}.prefilter.snp_indel.vcf.gz.tbi 18 | 19 | #echo "running" ${cancer}.normal.merge.vcf.gz ${sampleName}.prefilter.snp_indel.vcf.gz $outFile 20 | python calc_vcf_concordance.py ${sampleName}.prefilter.snp_indel.vcf.gz ${cancer}.normal.merge.allCDS.vcf.gz $outFileExon 21 | python calc_vcf_concordance.py ${sampleName}.prefilter.snp_indel.vcf.gz ${cancer}.normal.merge.ENCODE.vcf.gz $outFileEncode 22 | rm -f ${sampleName}.prefilter.snp_indel.vcf.gz* 23 | 24 | done 25 | date 26 | -------------------------------------------------------------------------------- 
/analysis/process_files/germline/local/run_charger_on_vep_VCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #setEnvironment=". /gscuser/ascott/python2_7_13_env" 4 | 5 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/new_run/AnnotatedVCFs" 6 | 7 | #clinvar="/gscmnt/gc2706/dinglab/medseq/ClinVar/MacArthurLab/DataSnapshots/201701_ead5de/clinvar_alleles.tsv.gz" 8 | clinvar="/gscmnt/gc2706/dinglab/medseq/ClinVar/MacArthurLab/clinvar/output/b37/single/clinvar_alleles.single.b37.tsv.gz" 9 | mmGenes="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/20160301_Rahman_KJ_KH_gene_table_CharGer.txt" 10 | mmVariants="/gscmnt/gc2737/ding/Analysis/VariantLists/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP.vcf" 11 | hotspot="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/MC3.noHypers.mericUnspecified.d10.r20.v114.clusters" 12 | rareThreshold="0.0005" 13 | 14 | results="new_run/Charged_VEP/" 15 | if [ ! 
-d ${results} ]; then 16 | mkdir ${results} 17 | fi 18 | 19 | #queue="bigmem" 20 | queue="long" 21 | queue="ding-lab" 22 | group="/khuang" 23 | 24 | 25 | for i in {1..22} X Y; 26 | do 27 | sample="$inputDir/anno.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf" 28 | vcf="PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf" 29 | output="${results}charged.${vcf}.tsv" 30 | command="charger --include-vcf-details -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} --rare-threshold ${rareThreshold} > ${results}charger.${vcf}.out" 31 | log="${results}charger.${vcf}.log" 32 | echo "bsub -R\"select[type==LINUX64 && mem>80000] rusage[mem=80000]\" -M 60000000 -g ${group} -q ${queue} -oo ${log} \"${command}\"" 33 | done 34 | -------------------------------------------------------------------------------- /analysis/process_files/germline/local/update_vcfHeader_to_TCGA.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Kuan, Oct 2017 adapted from Jay's script 3 | 4 | for file in PCA_*.merge.vcf.gz; do 5 | echo '------' 6 | echo "Reheadering "$file 7 | chunk=${file%.merge.vcf.gz} 8 | echo '------' 9 | gunzip -dc $file | perl replace_vcf_header_sample_with_source_TCGA.pl > $chunk.merge.newheader.txt 10 | tabix -r $chunk.merge.newheader.txt $file > $chunk.merge.TCGAbarcode.vcf.gz 11 | tabix -p vcf $chunk.merge.TCGAbarcode.vcf.gz 12 | 13 | rm -f $chunk.merge.newheader.txt #$file $file.tbi 14 | done -------------------------------------------------------------------------------- /analysis/process_files/germline/local/variant_QC_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # by Kuan-lin Huang 2017 May @ WashU 3 | echo "start time" 4 | date 5 | # requires: vcftools & vcfanno (https://github.com/brentp/vcfanno) 6 | 7 | ### AF and AD frequency filter ### 8 | vcfFile=$1 9 | 
bedFile=../../../../TCGA_data/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed 10 | vcfannoConfigFile=ExAC_config.toml 11 | AF_thres=0.01 12 | AD_thres=3 13 | 14 | # annotate with ExAC frequency 15 | annotated_VCF=${vcfFile/vcf.gz/annotated.vcf} 16 | echo "using vcfanno to annotate" ${vcfFile} "into" ${annotated_VCF} 17 | vcfanno ${vcfannoConfigFile} ${vcfFile} > ${annotated_VCF} 18 | 19 | # # note the previous versions only takes variants not appearing in ExAC due to bug in filter_VCF_AF_AD.py 20 | # filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.AD.${AD_thres}.vcf} 21 | # echo "filtering" ${annotated_VCF} "into" ${filtered_VCF} 22 | # python filter_VCF_AF_AD.py ${annotated_VCF} $AF_thres $AD_thres 23 | 24 | # now we only need variants that are rare in ExAC 25 | filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.ExAConly.AD.${AD_thres}.vcf} 26 | echo "filtering" ${annotated_VCF} "into" ${filtered_VCF} 27 | python filter_VCF_AF_AD_keepExAConly.py ${annotated_VCF} $AF_thres $AD_thres 28 | date 29 | 30 | ### extract ROI (region of interest) ### 31 | extracted_VCF=${filtered_VCF/vcf/ROI.vcf.gz} 32 | echo "extracting" ${filtered_VCF} "based on" $bedFile "into" ${extracted_VCF} 33 | vcftools --vcf $filtered_VCF \ 34 | --bed $bedFile \ 35 | --keep-INFO-all --recode -c | bgzip -c > ${extracted_VCF} 36 | 37 | # tabix 38 | echo "Indexing extracted VCF" 39 | tabix -p vcf ${extracted_VCF} 40 | # remove intermediate VCF files 41 | rm -f $annotated_VCF 42 | rm -f $filtered_VCF 43 | date 44 | 45 | # possible option: calculate concordance with genotype file here [in this case you may want to batch the jobs by cancer types as I have already extracted those to ENCODE and exon regions in the VM: kuan-merge-genotype-bigmem] 46 | 47 | # [alternative: merge first across samples in a separate workflow] 48 | # [alternative: do a 5% MAF filter for the merged VCF within cohort to remove potential pipeline artifacts] 49 | 50 | # annotate the resulting VCF using VEP 51 | 52 | # 
run CharGer 53 | 54 | # move the resulting VCF and files back to storage 55 | 56 | echo "end time" 57 | date 58 | -------------------------------------------------------------------------------- /analysis/process_files/germline/merge_germline_cloud.sh: -------------------------------------------------------------------------------- 1 | # do cancer type by cancer type to avoid file size 2 | $ cat cancer_type.txt 3 | ACC 4 | BLCA 5 | BRCA 6 | CESC 7 | CHOL 8 | COAD 9 | DLBC 10 | ESCA 11 | GBM 12 | HNSC 13 | KICH 14 | KIRC 15 | KIRP 16 | LAML # there are no LAML samples so this was throwing errors earlier 17 | LGG 18 | LIHC 19 | LUAD 20 | LUSC 21 | MESO 22 | OV 23 | PAAD 24 | PCPG 25 | PRAD 26 | READ 27 | SARC 28 | SKCM 29 | STAD 30 | TGCT 31 | THCA 32 | THYM 33 | UCEC 34 | UCS 35 | UVM 36 | 37 | $ cat merge_germline_by_cancer.sh 38 | #!/bin/bash 39 | while IFS='' read -r line || [[ -n "$line" ]]; do 40 | echo "Processing cancer type: $line" 41 | echo "Start time" 42 | date 43 | cancer="${line%\\n}" 44 | mkdir $cancer 45 | samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/) 46 | for sample in $samples; do 47 | sampleNamePre=${sample##*$cancer/} 48 | sampleName=${sampleNamePre%/} 49 | echo "Copying vcf for "$sampleName 50 | #gsutil cp ${sample}combine/*gz ${sampleName}.prefilter.snp_indel.vcf.gz 51 | #gsutil cp ${sample}combine/*gz.tbi ${sampleName}.prefilter.snp_indel.vcf.gz.tbi 52 | gsutil cp ${sample}combine/prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz ${cancer}/${sampleName}.prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz 53 | gsutil cp ${sample}combine/prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz.tbi ${cancer}/${sampleName}.prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz.tbi 54 | done 55 | 56 | #merge 57 | vcf-merge $(ls -1 ${cancer}/*.vcf.gz | perl -pe 's/\n/ /g') > ${cancer}.merge.vcf 58 | bgzip -c ${cancer}.merge.vcf > ${cancer}.merge.vcf.gz 59 | #index 60 | tabix -p vcf 
${cancer}.merge.vcf.gz 61 | #upload 62 | gsutil cp ${cancer}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge 63 | # delete files 64 | rm -rf ${cancer}.merge.vcf 65 | rm -rf ${cancer}/*ROI.vcf.gz 66 | rm -rf ${cancer}/*ROI.vcf.gz.tbi 67 | echo "End time" 68 | date 69 | done < "$1" 70 | 71 | # set unlimit file number higher so it works for breast cancer 72 | 73 | ulimit -n 2500 74 | 75 | nohup bash merge_germline_by_cancer.sh cancer_type.txt > merge_germline_by_cancer.log & 76 | nohup bash merge_germline_by_cancer2.sh cancer_type2.txt > merge_germline_by_cancer2.log & 77 | nohup bash merge_germline_by_cancer2.sh cancer_type3.txt > merge_germline_by_cancer3.log & 78 | nohup bash merge_germline_by_cancer2.sh cancer_type4.txt > merge_germline_by_cancer4.log & 79 | nohup bash merge_germline_by_cancer2.sh CESC.txt > merge_germline_by_cancerCESC.log & 80 | 81 | # there is no LAML? if so delete 82 | rm -rf LAML* 83 | 84 | ### merge verything 85 | # copy file to big VM 86 | echo "Start time" 87 | date 88 | gsutil cp gs://dinglab/isb-cgc/tcga/germline/production/merge/* . 
89 | #merge 90 | echo "Merging" 91 | #vcf-merge $(ls -1 *.vcf.gz | perl -pe 's/\n/ /g') > PCA.merge.vcf 92 | bcftools merge --output-type z --output PCA.merge.vcf.gz $(ls -1 *.vcf.gz | perl -pe 's/\n/ /g') 93 | #echo "Zipping" 94 | #bgzip -c PCA.merge.vcf > PCA.merge.vcf.gz 95 | #index 96 | echo "Indexing" 97 | tabix -p vcf PCA.merge.vcf.gz 98 | #upload 99 | gsutil cp PCA.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge 100 | echo "End time" 101 | date 102 | 103 | # # note: bcftools turned out much more efficient and can directly generate zipped file 104 | 105 | # # note: other tools 106 | # # copy local files to VM 107 | # gcloud compute copy-files --zone us-central1-f /Users/khuang/Downloads/GenomeAnalysisTK-3.7.tar.bz2 huangkuanlin@kuan-merge-germline-bigmem:~/ 108 | # gcloud compute copy-files --zone us-central1-f /Users/khuang/Downloads/picard-2.9.0.zip huangkuanlin@kuan-merge-germline-bigmem:~/ 109 | 110 | # # set up 111 | # wget ftp://genome.wustl.edu/pub/reference/GRCh37-lite/GRCh37-lite.fa.gz 112 | # gunzip GRCh37-lite.fa.gz 113 | # samtools faidx GRCh37-lite.fa -------------------------------------------------------------------------------- /analysis/process_files/germline/readme.txt: -------------------------------------------------------------------------------- 1 | # describe workflow to conduct variant QC, concordance calculation and annotation 2 | # by Kuan-lin Huang and Jay Mashl 2017 May @ WashU 3 | 4 | variant_QC_annotation.sh which runs script: 5 | filter_VCF_AF_AD.py 6 | 7 | and other dependencies: 8 | vcfanno 9 | ExAC_config.toml [vcfanno configuration file] 10 | ExAC_nonTCGA frequency file (ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/subsets/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz and ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/subsets/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz.tbi) 11 | vcftools 12 | bed file: gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed 13 | 14 | 15 | 
Note: # this is rough ballpark number check the exact number of this sample H_LS-E2-A10B-10A-01D-A10M-09 in the most recent ppt 16 | Variant QC and filter: 17 | 1. Annotate AF, filter with AF and AD [3 min] 18 | 226K -> 102K variants 19 | 2. Extract variants in ROI (exon + encode all cell regulatory region), index [28 min] 20 | 102K -> 21K variants 21 | 22 | 23 | Concordance [may be added in to the last variant QC and filter step before annotation] 24 | run_calc_vcf_concordance.sh which runs: 25 | calc_vcf_concordance.py 26 | 27 | kuan-merge-genotype-bigmem which has the genotype VCF already trimmed down to region of interests: 28 | ${cancer}.normal.merge.allCDS.vcf.gz 29 | ${cancer}.normal.merge.ENCODE.vcf.gz -------------------------------------------------------------------------------- /analysis/process_files/germline/var_freq/batch_run_vcf_var_freq_filter.sh: -------------------------------------------------------------------------------- 1 | # not in ExAC 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz) 3 | for file in $VCFs; do 4 | bash run_vcf_var_freq_filter.sh $file & 5 | 6 | NPROC=$(($NPROC+1)) 7 | if [ "$NPROC" -ge 8 ]; then 8 | wait 9 | NPROC=0 10 | fi 11 | done 12 | 13 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz* 14 | 15 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz 16 | bash run_vcf_var_freq_filter.sh $file 17 | -------------------------------------------------------------------------------- /analysis/process_files/germline/var_freq/run_vcf_var_freq_filter.sh: -------------------------------------------------------------------------------- 1 | # run vcf frequency filter to rule out (1) alleles with greater than 
5% AF in the PCA cohort and without any variant in the final sample list 2 | # update AN/AC fields to the final cohort of 9401 samples 3 | 4 | # not in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz 5 | 6 | 7 | file=$1 8 | gsutil cp $file* . 9 | vcfName=${file##*/} 10 | outVCF=${vcfName/.vcf.gz/.cohortAF0.05.vcf.gz} 11 | echo "Filtering ${vcfName} into $outVCF" 12 | # frequency check, recalculate AC, AN, and AF based on the cohort 13 | perl vcf_var_freq_filter.pl --vcf $vcfName | bgzip -c > $outVCF 14 | tabix -p $outVCF 15 | ls -klh ${outVCF}* 16 | 17 | gsutil cp ${outVCF}* gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/cohort_AF_filtered 18 | rm -f $vcfName 19 | rm -f ${outVCF}* -------------------------------------------------------------------------------- /analysis/process_files/germline/var_freq/vcf_var_freq_filter.pl: --------------------------------------------------------------------------------
# Kuan Huang @ WashU 2017 Aug1
# reference: # https://github.com/zhuochenbioinfo/VcfStat
# filter variants based on a specific frequency using the cohort freq
# update AN/AC fields
#
# Reads a multi-sample VCF (plain or .gz), counts every allele from the GT
# field of each sample column, and keeps only ALT alleles that are observed at
# least once and whose cohort frequency is below $AF_threshold.
#   - kept alleles are renumbered 1..k; genotype calls are rewritten to match
#   - calls to a dropped allele become "." (missing) instead of being left
#     pointing at a different allele
#   - AC, AF and AN in INFO are replaced (or appended when absent)
#   - records with no surviving ALT allele are not printed
# The filtered VCF is written to STDOUT; summary counts are written to
# "<vcf>cohortAF.filter.log".
#
# NOTE(review): per-allele FORMAT fields other than GT (e.g. AD, PL) are not
# re-ordered, exactly as in the previous version.

use strict;
use warnings;
use Getopt::Long;

my ($vcf, $CHROM);

my $usage = "USAGE:\nperl $0 --vcf <in.vcf|in.vcf.gz> > <out.vcf>\n";
$usage .= "<in.vcf|in.vcf.gz> is the input vcf file. [Necessary].\n";

GetOptions(
    "vcf=s" => \$vcf,
    "chr=s" => \$CHROM,    # accepted for backward compatibility; not used below
) or die $usage;

die $usage unless defined $vcf;
my $log = $vcf."cohortAF.filter.log";

# Open both vcf and vcf.gz. The list form of the pipe open hands the file name
# to gunzip directly, so it is never interpreted by a shell.
my $in;
if ($vcf =~ /\.gz$/) {
    open($in, '-|', 'gunzip', '-c', $vcf) or die "can't open pipe to $vcf: $!";
}
else {
    open($in, '<', $vcf) or die "can't open $vcf: $!";
}
open(my $log_fh, '>', $log) or die "can't open $log for writing: $!";

# arbitrary cut-off for now
# NOTE(review): allele counts are divided by the number of samples, not by the
# number of chromosomes (2 x samples), and AN is set to the same value. This
# is kept as it was; confirm that it is the intended definition.
my $sample_size  = 9401;
my $AF_threshold = 0.05;

my $num_pass_alleles    = 0;
my $filtered_alleles    = 0;
my $filtered_vars       = 0;
my $nonexisting_alleles = 0;

while (my $line = <$in>) {
    chomp $line;

    # meta-information (##...) and the #CHROM header pass through unchanged
    if ($line =~ /^#/) {
        print $line."\n";
        next;
    }

    my ($chr, $pos, $id, $ref, $alts_join, $qual, $filter, $info, $format, @datas) = split /\t/, $line;
    unless (defined $info) {
        warn "Skipping malformed VCF line $. (fewer than 8 columns)\n";
        next;
    }

    my @alleles       = ($ref, split(/,/, $alts_join));
    my @alleles_count = (0) x @alleles;

    # count each allele index called in the GT (first) sub-field of a sample
    foreach my $spot (@datas) {
        my ($gt) = split /:/, $spot, 2;
        next unless defined $gt;
        foreach my $allele (split /[\/|]/, $gt) {
            next unless $allele =~ /^\d+$/ && $allele < @alleles;
            $alleles_count[$allele]++;
        }
    }

    # keep only alleles that exist in the cohort and are rare in it
    my %new_index_of = (0 => 0);    # old allele index -> new index; ref stays 0
    my (@pass_alleles, @pass_alleles_count, @pass_alleles_freq);
    for my $i (1 .. $#alleles) {
        my $count = $alleles_count[$i];
        if ($count == 0) {
            $nonexisting_alleles++;
            next;
        }

        my $AF = $count / $sample_size;
        if ($AF < $AF_threshold) {
            $num_pass_alleles++;
            push @pass_alleles,       $alleles[$i];
            push @pass_alleles_count, $count;
            push @pass_alleles_freq,  $AF;
            $new_index_of{$i} = scalar @pass_alleles;
        }
        else {
            $filtered_alleles++;
        }
    }

    # only print the line if at least one alternative allele passed
    if (@pass_alleles == 0) {
        $filtered_vars++;
        next;
    }

    # renumber the genotype calls in a single pass per sample
    $_ = remap_genotype($_, \%new_index_of) foreach @datas;

    # update AC, AF and AN in the INFO field
    $info = set_info_field($info, 'AF', join(",", @pass_alleles_freq));
    $info = set_info_field($info, 'AC', join(",", @pass_alleles_count));
    $info = set_info_field($info, 'AN', $sample_size);

    my @out = ($chr, $pos, $id, $ref, join(",", @pass_alleles), $qual, $filter, $info);
    push @out, $format, @datas if defined $format;
    print join("\t", @out)."\n";
}

close($in) or warn "Problem closing input $vcf: $!\n";

print {$log_fh} "Pass alleles: $num_pass_alleles\n\n";
print {$log_fh} "Nonexisting alleles: $nonexisting_alleles\n";
print {$log_fh} "Filtered alleles: $filtered_alleles\n";
print {$log_fh} "Total filtered variants: $filtered_vars\n";
my $total_filtered_alleles = $nonexisting_alleles + $filtered_alleles;
print {$log_fh} "Total filtered alleles: $total_filtered_alleles\n";
close($log_fh) or die "can't close $log: $!";

# Rewrite the allele indices in the GT sub-field of one sample column.
#   $spot         - the sample column, e.g. "0/2:12,3:15"
#   $new_index_of - hash ref mapping old allele index -> new allele index
# Indices without an entry in the map (dropped alleles) become ".".
# Everything after the first ":" is returned untouched.
sub remap_genotype {
    my ($spot, $new_index_of) = @_;
    my ($gt, $rest) = split /:/, $spot, 2;
    return $spot unless defined $gt;
    $gt =~ s/(\d+)/exists $new_index_of->{$1 + 0} ? $new_index_of->{$1 + 0} : '.'/ge;
    return defined $rest ? "$gt:$rest" : $gt;
}

# Return $info with "$key=$value" replacing an existing $key entry, or
# appended when $key is not present. Keys are matched as whole entries, so
# updating "AF" never touches "MLEAF".
sub set_info_field {
    my ($info, $key, $value) = @_;
    my @entries = $info eq '.' ? () : split(/;/, $info);
    my $found = 0;
    foreach my $entry (@entries) {
        next unless $entry =~ /^\Q$key\E(?:=|$)/;
        $entry = "$key=$value";
        $found = 1;
    }
    push @entries, "$key=$value" unless $found;
    return join(";", @entries);
}
-------------------------------------------------------------------------------- /analysis/sample_listing/make_clin_summary_table.R: -------------------------------------------------------------------------------- 1 | ##### 
compile_compare_samples.R ##### 2 | # Kuan-lin Huang @ WashU 2017 Oct. 3 | # make a clinical supplementary table for pan-germline manuscript 4 | 5 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/sample_listing") 6 | source("../global_aes_out.R") 7 | 8 | clin_f = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/clinical/PanCan_ClinicalData_V4_wAIM_filtered10389.txt" 9 | clin_full = read.table(header=T, quote = "", sep="\t", fill =T, file = clin_f, stringsAsFactors=FALSE) 10 | clin = clin_full[,c("bcr_patient_barcode", "type","age_at_initial_pathologic_diagnosis","gender","race")] 11 | colnames(clin) = c("sample","cancer","age_at_onset","gender","ethnicity") 12 | 13 | clin$ethnicity[clin$ethnicity %in% c("","[Not Available]","[Not Evaluated]","[Unknown]")]=NA 14 | 15 | s_c_list_f = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/sampleQC/pca_table.20171118.filtered.wclin.tsv" 16 | sample_cancer = read.table(header=T, quote = "", sep="\t", file = s_c_list_f, stringsAsFactors=FALSE) 17 | sample_cancer = sample_cancer[,c("bcr_patient_barcode", "cancer")] 18 | colnames(sample_cancer) = c("sample","cancer") 19 | 20 | sample_cancer_clin = merge(sample_cancer,clin, by = c("sample","cancer"), all.x =T) 21 | mean(sample_cancer_clin$age_at_onset, na.rm=T) 22 | sample_cancer_clin$gender[sample_cancer_clin$gender==""]=NA 23 | 24 | cancer_count = data.frame(table(data = sample_cancer_clin$cancer )) 25 | cancer_ethni_count = as.data.frame(table(sample_cancer_clin$cancer,sample_cancer_clin$ethnicity)) 26 | cancer_ethni_count_d = dcast(cancer_ethni_count, formula = Var1 ~ Var2, value.var = "Freq") 27 | cancer_gender_count = as.data.frame(table(sample_cancer_clin$cancer,sample_cancer_clin$gender)) 28 | cancer_gender_count_d = dcast(cancer_gender_count, formula = Var1 ~ Var2, value.var = "Freq") 29 | cancer_gender_count_d$FemalePercent=cancer_gender_count_d$FEMALE/(cancer_gender_count_d$FEMALE+ cancer_gender_count_d$MALE) 
30 | cancer_aao = data.frame(aggregate(data = sample_cancer_clin, age_at_onset~cancer, FUN = "mean")) 31 | cancer_aao_sd = data.frame(aggregate(data = sample_cancer_clin, age_at_onset~cancer, FUN = "sd")) 32 | colnames(cancer_count) = c("Cancer","Sample size") 33 | colnames(cancer_ethni_count_d)[1] = c("Cancer") 34 | colnames(cancer_aao) = c("Cancer", "Average_AAO") 35 | colnames(cancer_aao_sd) = c("Cancer", "AAO_SD") 36 | colnames(cancer_gender_count_d)[1]="Cancer" 37 | cancer_gender_count_d_sum = cancer_gender_count_d[,c("Cancer","FemalePercent")] 38 | 39 | cancer_count_gender = merge(cancer_count,cancer_gender_count_d_sum,by="Cancer") 40 | cancer_count_wethni = merge(cancer_count_gender,cancer_ethni_count_d, by="Cancer") 41 | cancer_count_wethni_waao = merge(cancer_count_wethni, cancer_aao, by = "Cancer") 42 | cancer_count_wethni_waao_sd = merge(cancer_count_wethni_waao, cancer_aao_sd, by = "Cancer") 43 | cancer_count_wethni_waao_sd$AAO = paste(round(cancer_count_wethni_waao_sd$Average_AAO,1), "+/-", round(cancer_count_wethni_waao_sd$AAO_SD,1)) 44 | colnames(cancer_count_wethni_waao_sd) = tolower(colnames(cancer_count_wethni_waao_sd)) 45 | colnames(cancer_count_wethni_waao_sd) = paste(toupper(substring(colnames(cancer_count_wethni_waao_sd), 1,1)), substring(colnames(cancer_count_wethni_waao_sd), 2),sep="") 46 | all_sum = sum(as.numeric(cancer_count_wethni_waao_sd[,2])) 47 | all_sum_ethni = sapply(cancer_count_wethni_waao_sd[,c(4:8)],sum) 48 | gender_ratio = sum(sample_cancer_clin$gender=="FEMALE",na.rm=T)/(sum(sample_cancer_clin$gender=="FEMALE",na.rm=T)+sum(sample_cancer_clin$gender=="MALE",na.rm=T)) 49 | all_aao = round(mean(sample_cancer_clin$age_at_onset, na.rm=T),1) 50 | all_aao_sd = round(sd(sample_cancer_clin$age_at_onset, na.rm=T),1) 51 | all_row = c("All", all_sum, gender_ratio, all_sum_ethni,all_aao, all_aao_sd, paste(all_aao, "+/-", all_aao_sd)) 52 | cancer_count_wethni_waao_sd$Cancer = as.character(cancer_count_wethni_waao_sd$Cancer) 53 | 
cancer_count_wethni_waao_sd = rbind(cancer_count_wethni_waao_sd,all_row) 54 | 55 | cancer_count_wethni_waao_sd_p = cancer_count_wethni_waao_sd[,!(colnames(cancer_count_wethni_waao_sd) %in% c("Average_aao","Aao_sd"))] 56 | #colnames(cancer_count_wethni_waao_sd_p) = c("Cancer", "Sample size", "American indian", "Asian", "African american", "Pacific islander", "White", "Age at onset") 57 | 58 | tn = "out/cancer_count_wethni_waao.txt" 59 | write.table(cancer_count_wethni_waao_sd_p, quote=F, sep="\t", file = tn, row.names = F) 60 | -------------------------------------------------------------------------------- /analysis/segregation_analysis/batch_run_segregation.sh: -------------------------------------------------------------------------------- 1 | # not in ExAC 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz) 3 | for file in $VCFs; do 4 | bash find_segregating_var.sh $file & 5 | 6 | NPROC=$(($NPROC+1)) 7 | if [ "$NPROC" -ge 8 ]; then 8 | wait 9 | NPROC=0 10 | fi 11 | done 12 | 13 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz* 14 | 15 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz 16 | bash find_segregating_var.sh $file 17 | -------------------------------------------------------------------------------- /analysis/segregation_analysis/find_relatives.R: -------------------------------------------------------------------------------- 1 | ##### find_relatives.R ##### 2 | # Kuan-lin Huang @ WashU 2017 July 3 | 4 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/segregation_analysis") 5 | source("../global_aes_out.R") 6 | system("mkdir out") 7 | # ethnicity assignment 8 | ethni_fn = "/Users/khuang/Box\ 
Sync/PhD/germline/PanCanAtlasGermline/analysis/PCA_IBD_dist/out/2017-04-24/2017-04-24_GDAN_AIM_PCA_ethnicity_assigned_WashU.tsv" 9 | ethni = read.table(header=T, sep = '\t',file=ethni_fn) 10 | ethni_short = ethni[,c("Case","cancer","washu_assigned_ethnicity")] 11 | 12 | 13 | #genome file for all the pi HAT 14 | rel_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.ibd.genome.PI_HAT0.05.tsv" 15 | rel = read.table(header=T, quote = "", sep="\t", row.names =NULL, file = rel_f, stringsAsFactors=FALSE) 16 | rel_short = rel[rel$Z1 + rel$Z2 > 0.2,] 17 | rel_short$Sample1 = gsub("(.{12}).*","\\1",as.character(rel_short$FID1)) 18 | rel_short$Sample2 = gsub("(.{12}).*","\\1",as.character(rel_short$FID2)) 19 | 20 | colnames(ethni_short) = c("Sample1","cancer1","assigned_ethnicity1") 21 | rel_short_m1 = merge(rel_short,ethni_short, by="Sample1",all.x=T) 22 | colnames(ethni_short) = c("Sample2","cancer2","assigned_ethnicity2") 23 | rel_short_m2 = merge(rel_short_m1,ethni_short, by="Sample2",all.x=T) 24 | rel_short_m2$same_sample = (rel_short_m2$Sample1 == rel_short_m2$Sample2) 25 | 26 | rel_short_m2_same = rel_short_m2[rel_short_m2$assigned_ethnicity2==rel_short_m2$assigned_ethnicity1,] 27 | rel_short_m2_same_withethni = rel_short_m2_same[rel_short_m2_same$assigned_ethnicity1!="unknown",] 28 | rel_short_m2_same_withethni$assigned_ethnicity1 = as.character(rel_short_m2_same_withethni$assigned_ethnicity1) 29 | 30 | p = ggplot(data=rel_short_m2_same_withethni,aes(x = Z1, y=Z2, color=same_sample)) 31 | p = p + facet_grid(.~assigned_ethnicity1, drop=T, scales="free",space="free") 32 | p = p + geom_point(alpha=0.2) 33 | p = p + theme_bw() #+ expand_limits(y=1)#+ guides(fill=FALSE) 34 | p = p + xlim(0,1) + ylim(0,1) 35 | #p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 1,alpha=.7) 36 | p 37 | fn = paste(pd, "PanCanAtlas_rel_z1.z2_withinEthni.pdf", sep="_") 38 | ggsave(file=fn, height=5, width=12, useDingbats=FALSE) 39 | 40 | rel_short_m2_same_withethni_ofinterest 
= rel_short_m2_same_withethni[!rel_short_m2_same_withethni$same_sample,] 41 | rel_short_m2_same_withethni_ofinterest = rel_short_m2_same_withethni_ofinterest[(rel_short_m2_same_withethni_ofinterest$Z2 > 0.125 & rel_short_m2_same_withethni_ofinterest$Z1 > 0.25) | 42 | (rel_short_m2_same_withethni_ofinterest$Z1 > 0.20) ,] 43 | 44 | tn = paste("out/TCGA_z1_z2_relatives.tsv", sep="_") 45 | write.table(rel_short_m2_same_withethni_ofinterest,quote=F, sep = '\t', row.names = FALSE,file=tn) 46 | 47 | rel_short_m2_same_withethni_ofinterest2 = rel_short_m2_same_withethni_ofinterest[(rel_short_m2_same_withethni_ofinterest$Z2 > 0.125 & rel_short_m2_same_withethni_ofinterest$Z1 > 0.25) | 48 | (rel_short_m2_same_withethni_ofinterest$Z1 > 0.625) ,] 49 | #rel_short_m2_same_withethni_ofinterest2[,c("Sample1","Sample2","cancer1","cancer2")] 50 | tn = paste("out/TCGA_z1_z2_relatives_strict.tsv", sep="_") 51 | write.table(rel_short_m2_same_withethni_ofinterest2,quote=F, sep = '\t', row.names = FALSE,file=tn) 52 | -------------------------------------------------------------------------------- /analysis/segregation_analysis/find_segregating_var.sh: -------------------------------------------------------------------------------- 1 | # run segregation python script 2 | file=$1 3 | gsutil cp $file* . 
4 | vcfName=${file##*/} 5 | echo "Filtering ${vcfName} into $outVCF" 6 | # frequency check, recalculate AC, AN, and AF based on the cohort 7 | python find_shared_var_relatives.py TCGA_z1_z2_relatives.tsv $vcfName 8 | ls -klh ${outVCF}* 9 | 10 | rm -f $vcfName -------------------------------------------------------------------------------- /analysis/segregation_analysis/find_shared_var_relatives.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | #Aug 2017 - Kuan-Lin Huang @ WashU - 3 | # find_shared variants in relatives identified in TCGA samples 4 | 5 | import sys 6 | import getopt 7 | import gzip 8 | 9 | def main(): 10 | def usage(): 11 | print """ 12 | find_shared_var_relatives.py : why do I exist? 13 | 14 | USAGE: find_shared_var_relatives.py [-h] 15 | -h print this message 16 | input file 17 | """ 18 | 19 | #use getopt to get inputs 20 | try: 21 | opts, args = getopt.getopt(sys.argv[1:], 'h') #:after option meaning required arguments 22 | except getopt.GetoptError: 23 | print "find_shared_var_relatives.py " 24 | 25 | for opt, arg in opts: #store the input options 26 | if opt == '-h': # h means user needs help 27 | usage(); sys.exit() 28 | 29 | args = sys.argv[1:] 30 | if len(args) < 1: 31 | usage(); sys.exit("input file missing") 32 | 33 | #open input file 34 | try: 35 | fn = args[0] 36 | relativeF = open(fn ,"r") 37 | except IOError: 38 | print("File , args[0], does not exist!") 39 | 40 | # dictionaries 41 | relative2relative = {} 42 | existing_relatives = {} 43 | sample2colID = {} 44 | 45 | # output file 46 | outFstring = args[0] + "." 
+ args[1] 47 | #outF = outFstring.replace("vcf","segragatingVar.tsv") 48 | outF = outFstring.replace("vcf.gz","segragatingVar.tsv") 49 | outFH = open(outF, "w") 50 | 51 | #read input file 52 | for line in relativeF: 53 | line=line.strip() 54 | F = line.split("\t") 55 | #print str(len(F)) + "\n" 56 | if len(F)==2: 57 | print "Searching for segregating variants in relative pairs: "+ F[0] + ":" + F[1] + "\n" 58 | relative2relative[F[0]] = F[1] 59 | relativeF.close() 60 | 61 | 62 | try: 63 | fn = args[1] 64 | if fn.endswith(".gz"): 65 | vcfF = gzip.open(fn,"r") 66 | elif fn.endswith(".vcf"): 67 | vcfF = open(fn,"r") 68 | except IOError: 69 | print("File , args[1], does not exist or is not a valid vcf!") 70 | 71 | colnames = "sample1\tsample2\tsample1GENO\tsample2GENO\tCHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n" 72 | outFH.write(colnames) 73 | #read input file 74 | for line in vcfF: 75 | line=line.strip() 76 | 77 | # print headers 78 | if line.startswith("##"): 79 | #print line 80 | continue 81 | 82 | F = line.split("\t") 83 | colNum = len(F) 84 | 85 | if line.startswith("#CHR"): # row with column names 86 | #print line 87 | for i in range(0,colNum): 88 | TCGA_barcode = F[i][0:12] 89 | sample2colID[TCGA_barcode] = i 90 | for sample in relative2relative: 91 | relative = relative2relative[sample] 92 | # only keep the existing pairs to save iteration time later on 93 | if sample in sample2colID and relative in sample2colID: 94 | existing_relatives[sample] = relative 95 | 96 | else: 97 | INFO = F[7] 98 | INFOsplit = INFO.split(";AC=") 99 | AC = int( INFOsplit[1] ) 100 | if AC > 20: 101 | continue 102 | # loop through the relatives; if one has the var, check the other; only print if both has it 103 | for sample in existing_relatives: 104 | relative = relative2relative[sample] 105 | sampleColID = sample2colID[sample] 106 | relativeColID = sample2colID[relative] 107 | sampleGeno = F[sampleColID] 108 | relativeGeno = F[relativeColID] 109 | # print line if 
segragating var found! ( BOTH NOT WT ) 110 | if not sampleGeno.startswith("./.") and not relativeGeno.startswith("./."): 111 | outLine = sample + "\t" + relative + "\t" + sampleGeno + "\t" + relativeGeno + "\t".join(F[0:8]) + "\n" 112 | outFH.write(outLine) 113 | #print outLine 114 | 115 | 116 | vcfF.close() 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /analysis/segregation_analysis/segregation.log.sh: -------------------------------------------------------------------------------- 1 | # 201708 Kuan-lin Huang @ WashU 2 | 3 | # transfer needed files to VM 4 | gcloud compute scp find_shared_var_relatives.py huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c 5 | gcloud compute scp out/TCGA_z1_z2_relatives.tsv huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c 6 | gcloud compute scp *.sh huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c 7 | 8 | # run script to find segregating variants -------------------------------------------------------------------------------- /analysis/variant_QC/batch_run_pseq_stats.sh: -------------------------------------------------------------------------------- 1 | # not in ExAC 2 | #VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz) 3 | #for file in $VCFs; do 4 | for i in {17..22}; do 5 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr${i}.anno.whitelist.vcf.gz 6 | bash run_pseq_stats.sh $file & 7 | 8 | NPROC=$(($NPROC+1)) 9 | if [ "$NPROC" -ge 8 ]; then 10 | wait 11 | NPROC=0 12 | fi 13 | done 14 | 15 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz* 16 | 17 | 
#file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz 18 | #bash run_pseq_stats.sh $file 19 | -------------------------------------------------------------------------------- /analysis/variant_QC/batch_run_pseq_vcfstats.sh: -------------------------------------------------------------------------------- 1 | # not in ExAC 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz) 3 | for file in $VCFs; do 4 | gsutil cp $file* . 5 | vcfName=${file##*/} 6 | date 7 | echo "Examining vcf-stats for ${vcfName}" 8 | ../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv 9 | ../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv 10 | #vcf-stats $vcfName > ${vcfName}.vcfstats.json 11 | 12 | rm -f $vcfName 13 | rm -f ${vcfName}.tbi 14 | done 15 | 16 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz* 17 | 18 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz 19 | gsutil cp $file* . 
20 | vcfName=${file##*/} 21 | date 22 | echo "Examining vcf-stats for ${vcfName}" 23 | ../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv 24 | ../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv 25 | #vcf-stats $vcfName > ${vcfName}.vcfstats.json 26 | 27 | rm -f $vcfName 28 | rm -f ${vcfName}.tbi 29 | -------------------------------------------------------------------------------- /analysis/variant_QC/batch_run_vcfstats.sh: -------------------------------------------------------------------------------- 1 | # not in ExAC 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz) 3 | for file in $VCFs; do 4 | gsutil cp $file* . 5 | vcfName=${file##*/} 6 | date 7 | echo "Examining vcf-stats for ${vcfName}" 8 | vcf-stats $vcfName > ${vcfName}.vcfstats.json 9 | 10 | rm -f $vcfName 11 | rm -f ${vcfName}.tbi 12 | done 13 | 14 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz* 15 | 16 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz 17 | gsutil cp $file* . 18 | vcfName=${file##*/} 19 | date 20 | echo "Examining vcf-stats for ${vcfName}" 21 | vcf-stats $vcfName > ${vcfName}.vcfstats.json 22 | 23 | rm -f $vcfName 24 | rm -f ${vcfName}.tbi 25 | -------------------------------------------------------------------------------- /analysis/variant_QC/plot_concordance.R: -------------------------------------------------------------------------------- 1 | ##### plot_pseq_stats.R ##### 2 | # Kuan-lin Huang @ WashU 2017 Aug./Sep. 
3 | 4 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/variant_QC/") 5 | source("../global_aes_out.R") 6 | system("mkdir out") 7 | 8 | ##### individual level stats ##### 9 | fileName = "out/run_calc_vcf_concordance.chr22only.sampleID.GT.txt" 10 | concordance = read.table(header=F, sep=" ", file=fileName) 11 | colnames(concordance) = c("sample","validated","unvalidated") 12 | concordance$validated[concordance$sample=="All"]/(concordance$validated[concordance$sample=="All"] + concordance$unvalidated[concordance$sample=="All"]) 13 | concordance =concordance[-c(9236:9240),] 14 | 15 | 16 | clin_f = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/clinical/all.clin.merged.picked.txt" 17 | clin = read.table(header=T, quote = "", sep="\t", row.names =NULL, file = clin_f, stringsAsFactors=FALSE) 18 | clin = clin[,c("sample","cancer","race")] 19 | colnames(clin) = c("sample","cancer","ethnicity") 20 | clin = clin[!(clin$cancer %in% c("GBMLGG","COAD","KIPAN")),] 21 | 22 | concordance_clin = merge(concordance,clin,by="sample",all.x=T) 23 | 24 | p = ggplot(data=concordance_clin,aes(x = validated, y=unvalidated, color = cancer)) 25 | p = p + geom_point(alpha=0.1,stroke=0) 26 | p = p + theme_bw() + theme(legend.position="bottom") 27 | p = p + geom_label(aes(label=ifelse(unvalidated > validated | unvalidated > 50, as.character(sample),NA)),size=1) 28 | p 29 | fn = paste('out/chr22_validated_vs_unvalidated.pdf',sep=".") 30 | ggsave(file=fn, useDingbats=FALSE,limitsize=FALSE) 31 | 32 | concordance_clin[concordance_clin$unvalidated > concordance_clin$validated | concordance_clin$unvalidated > 50,] 33 | 34 | # ##### compare to pan8000 ##### 35 | # miss = read.table(header=F, sep="\t", "out/PCA_pan8000_missense_concordance.txt") 36 | # trun = read.table(header=F, sep="\t", "out/PCA_pan8000_truncation_concordance.txt") 37 | # PCA_sample = as.vector(t(read.table(header=F, sep="\t", "out/PCA_samples.txt"))) 38 | # colnames(miss) = 
c("Var","Sample","Validated") 39 | # colnames(trun) = c("Var","Sample","Validated") 40 | # 41 | # PCA_sample_ID = substr(PCA_sample,1,12) 42 | # miss$inPCA = miss$Sample %in% PCA_sample_ID 43 | # trun$inPCA = trun$Sample %in% PCA_sample_ID 44 | # 45 | # table(miss$Validated,miss$inPCA) 46 | # table(trun$Validated,trun$inPCA) 47 | -------------------------------------------------------------------------------- /analysis/variant_QC/run_pseq_stats.sh: -------------------------------------------------------------------------------- 1 | file=$1 2 | gsutil cp $file* . 3 | vcfName=${file##*/} 4 | date 5 | echo "Examining vcf-stats for ${vcfName}" 6 | ../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv 7 | ../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv 8 | #vcf-stats $vcfName > ${vcfName}.vcfstats.json 9 | 10 | rm -f $vcfName 11 | rm -f ${vcfName}.tbi 12 | -------------------------------------------------------------------------------- /doc/20170118_TCGA_Germline_Abstract.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ding-lab/PanCanAtlasGermline/e57bfec6660bccf359154ae551ebffb06c42470b/doc/20170118_TCGA_Germline_Abstract.docx -------------------------------------------------------------------------------- /doc/notes.txt: -------------------------------------------------------------------------------- 1 | # Note on vcf merging and processing: 2 | #we can potentially use vcfanno for quick annotation of vcfs on ExAC vs. 
TCGA frequencies and such 3 | https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0973-5 4 | https://github.com/brentp/vcfanno/tree/master/docs/examples/exac_combine 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /germline_somatic_analysis/load_somatic.R: -------------------------------------------------------------------------------- 1 | ##### load_somatic.R ##### 2 | # Kuan-lin Huang 2018 3 | # load somatic mutation/driver/gene list files 4 | 5 | ### MAIN ### 6 | somaticDriver299_f = "../../TCGA_data/somatic/Driver_BaileyCell2018/299driverGene.txt" 7 | somaticDriver299 = as.vector(t(read.table(header=F, quote = "", sep="\t", file = somaticDriver299_f, stringsAsFactors=FALSE))) 8 | 9 | somatic_f = "../../TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz" 10 | somatic = read.table(header=T, quote = "", sep="\t", file = gzfile(somatic_f), stringsAsFactors=FALSE) 11 | somatic$bcr_patient_barcode = gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$Tumor_Sample_Barcode) 12 | 13 | clin_cmap = clin[,c("bcr_patient_barcode","type"),] 14 | colnames(clin_cmap)[2] = "cancer" 15 | somatic = merge(somatic,clin_cmap,by="bcr_patient_barcode") 16 | # somatic_mut_count = data.frame(table(somatic$bcr_patient_barcode)) 17 | # colnames(somatic_mut_count) = c("bcr_patient_barcode","MutationCount") 18 | 19 | table(somatic$Variant_Classification) 20 | likelyFunctionalTypes = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation", 21 | "Nonsense_Mutation","Splice_Site","Translation_Start_Site") 22 | somatic_likelyfunctional = somatic[somatic$Hugo_Symbol %in% c(pathVar$HUGO_Symbol,somaticDriver299) & somatic$Variant_Classification %in% likelyFunctionalTypes,] 23 | 24 | # driver mutation 25 | driver_f = "../../TCGA_data/somatic/Driver_BaileyCell2018/Mutation.CTAT.3D.Scores.txt.gz" 26 | driver = read.table(header=T, quote = "", sep="\t", file = 
gzfile(driver_f), stringsAsFactors=FALSE) 27 | colnames(driver) = gsub("\\.","_",colnames(driver)) 28 | driver$numOfEvidence = driver$New_Linear__cancer_focused__flag + driver$New_Linear__functional__flag + driver$New_3D_mutational_hotspot_flag 29 | table(driver$numOfEvidence) 30 | driver_func = driver[driver$numOfEvidence > 1,] 31 | somatic_likelyfunctional_driver = somatic_likelyfunctional[somatic_likelyfunctional$Variant_Classification != "Missense_Mutation" | 32 | paste(somatic_likelyfunctional$Hugo_Symbol,somatic_likelyfunctional$HGVSp_Short) %in% paste(driver_func$gene,driver_func$protein_change),] 33 | -------------------------------------------------------------------------------- /germline_somatic_analysis/mutation_signature/2_plotPathVarMutsigAssoc.R: -------------------------------------------------------------------------------- 1 | ##### plotPathVarMutsigAssoc.R ##### 2 | # Kuan-lin Huang 2018 3 | 4 | source("../global_aes_out.R") 5 | source("../dependency_files.R") 6 | 7 | tn = "out/pathVarMutsigAssoc.txt" 8 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 9 | 10 | ### plotting ### 11 | tt$signature = factor(tt$signature) 12 | tt$signature = factor(tt$signature,levels = c("Signature-1","Signature-2","Signature-3","Signature-4","Signature-5","Signature-6" 13 | ,"Signature-7","Signature-8","Signature-9","Signature-10","Signature-11","Signature-12" 14 | ,"Signature-13","Signature-14","Signature-15","Signature-16","Signature-17","Signature-18" 15 | ,"Signature-19","Signature-20","Signature-21","Signature-22","Signature-23","Signature-24" 16 | ,"Signature-25","Signature-26","Signature-27","Signature-28","Signature-29","Signature-30")) 17 | tt$association = "None" 18 | tt$association[tt$FDR<0.15] = "Suggestive" 19 | tt$association[tt$FDR<0.05] = "Significant" 20 | tt$gene = as.character(tt$gene) 21 | tt$FDR_plot = -log(tt$FDR) 22 | tt$FDR_plot[tt$FDR_plot > 5 ] = 5 23 | uniqG = unique(tt$gene[tt$FDR<0.05]) 24 | ttG = tt[tt$gene %in% 
uniqG,] 25 | 26 | getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c")) 27 | p = ggplot(data=ttG) 28 | p = p + facet_grid(gene~.,space="free",scale="free") 29 | p = p + geom_tile(data=ttG,aes(y=cancer, x=signature, fill= coefficient), linetype="blank") + scale_fill_gradientn(name= "Coefficient", colours=getPalette(100), na.value=NA, limit=c(0,NA)) 30 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3) 31 | p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 32 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=cancer, x=signature), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 33 | p = p + theme_bw() + theme_nogrid() + 34 | theme(axis.title = element_blank(), axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 35 | p + labs(x="Signature",y = "Cancer") 36 | 37 | fn = 'out/pan10389_germlineAssocWithmutSignatureHeatmap.pdf' 38 | ggsave(fn,h=8,useDingbat=F) 39 | 40 | # # using GLM test result 41 | # tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6) 42 | # p = ggplot(data=tt) 43 | # p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5) 44 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3) 45 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer)) 46 | # p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA))) 47 | # p = p + getPCACancerColor() 48 | # p = p + labs(x="Coefficient",y= "-log10(FDR)") 49 | # p = p + geom_vline(xintercept = 0, alpha=0.5) 50 | # p = p + theme_bw() + 51 | # theme(axis.text.x = element_text(colour="black", size=12), 
axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 52 | # p 53 | # fn = 'out/geneExpressAssocVolcanoGLM.pdf' 54 | # ggsave(fn,w = 5, h = 5, useDingbat=F) 55 | 56 | # # using the Wilcox test result 57 | # p = ggplot(data=tt) 58 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5) 59 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3) 60 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer)) 61 | # p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA))) 62 | # p = p + getPCACancerColor() 63 | # p = p + labs(x="Coefficient",y= "-log10(FDR)") 64 | # p = p + geom_vline(xintercept = 0, alpha=0.5) 65 | # p = p + theme_bw() + 66 | # theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 67 | # p 68 | # fn = 'out/geneExpressAssocVolcanoWCOX.pdf' 69 | # ggsave(fn,w = 5, h = 5, useDingbat=F) 70 | 71 | #tt$association = factor(tt$association,level=c("None","Suggestive","Significant")) 72 | 73 | # plot by gene 74 | p = ggplot(data=ttG,aes(x=coefficient,y=cancer,color = cancer)) 75 | p = p + facet_grid(gene~.,space="free",scale="free") 76 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5) 77 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2) 78 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.05,as.character(signature),NA))) 79 | p = p + getPCACancerColor() 80 | p = p + labs(x="Cancer",y= "-log10(FDR)") 81 | p = p + geom_vline(xintercept = 0, alpha=0.5) #+ xlim(-3.1,3.1) 82 | p = p + theme_bw() + 83 | theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = 
element_blank())#element_text(colour="black", size=14)) 84 | p + labs(x = "coefficient",y="cancer") 85 | fn = 'out/pan10389_germlineAssocWithmutSignatureByGene.pdf' 86 | ggsave(fn,h=8,useDingbat=F) 87 | -------------------------------------------------------------------------------- /germline_somatic_analysis/mutation_signature/4_plotSomaticMutsigAssoc.R: -------------------------------------------------------------------------------- 1 | ##### plotPathVarMutsigAssoc.R ##### 2 | # Kuan-lin Huang 2018 3 | 4 | source("../global_aes_out.R") 5 | source("../dependency_files.R") 6 | 7 | g_tn = "out/pathVarMutsigAssoc.txt" 8 | g_tt = read.table(sep="\t",header=T,file=g_tn, stringsAsFactors=FALSE) 9 | 10 | tn = "out/somaticMutMutsigAssoc.txt" 11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 12 | 13 | ### plotting ### 14 | tt$signature = factor(tt$signature) 15 | tt$signature = factor(tt$signature,levels = c("Signature-1","Signature-2","Signature-3","Signature-4","Signature-5","Signature-6" 16 | ,"Signature-7","Signature-8","Signature-9","Signature-10","Signature-11","Signature-12" 17 | ,"Signature-13","Signature-14","Signature-15","Signature-16","Signature-17","Signature-18" 18 | ,"Signature-19","Signature-20","Signature-21","Signature-22","Signature-23","Signature-24" 19 | ,"Signature-25","Signature-26","Signature-27","Signature-28","Signature-29","Signature-30")) 20 | tt$association = "None" 21 | tt$association[tt$FDR<0.15] = "Suggestive" 22 | tt$association[tt$FDR<0.05] = "Significant" 23 | tt$gene = as.character(tt$gene) 24 | tt$FDR_plot = -log(tt$FDR) 25 | tt$FDR_plot[tt$FDR_plot > 5 ] = 5 26 | #uniqG = unique(tt$gene[tt$FDR<0.05]) 27 | uniqG = unique(g_tt$gene[g_tt$FDR<0.05]) # plot just the germline genes for now 28 | ttG = tt[tt$gene %in% uniqG,] 29 | 30 | # getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c")) 31 | # p = ggplot(data=ttG) 32 | # p = p + facet_grid(gene~.,space="free",scale="free") 33 | # p = p + 
geom_tile(data=ttG,aes(y=cancer, x=signature, fill= coefficient), linetype="blank") + scale_fill_gradientn(name= "Coefficient", colours=getPalette(100), na.value=NA, limit=c(0,NA)) 34 | # #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3) 35 | # #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 36 | # p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=cancer, x=signature), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 37 | # p = p + theme_bw() + theme_nogrid() + 38 | # theme(axis.title = element_blank(), axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 39 | # p + labs(x="Signature",y = "Cancer") 40 | # 41 | # fn = 'out/SomaticWithmutSignatureHeatmap.pdf' 42 | # ggsave(fn,h=25,useDingbat=F) 43 | 44 | # plot by gene 45 | p = ggplot(data=ttG,aes(x=coefficient,y=cancer,color = cancer)) 46 | p = p + facet_grid(gene~.,space="free",scale="free") 47 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5) 48 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2) 49 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.0005,as.character(signature),NA))) # signature is a factor: as.character() keeps ifelse() from returning its integer codes as labels 50 | p = p + getPCACancerColor() 51 | p = p + labs(x="Cancer",y= "-log10(FDR)") 52 | p = p + geom_vline(xintercept = 0, alpha=0.5) #+ xlim(-3.1,3.1) 53 | p = p + theme_bw() + 54 | theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 55 | p + labs(x = "coefficient",y="cancer") 56 | fn = 'out/SomaticWithmutSignatureByGene.pdf' 57 | ggsave(fn,h=28,useDingbat=F) 58 | 
-------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline/plotSomaticGermline.R: -------------------------------------------------------------------------------- 1 | ##### plotSomaticGermline.R ##### 2 | # Kuan-lin Huang 2018 3 | 4 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline/") 5 | 6 | source("../global_aes_out.R") 7 | source("../dependency_files.R") 8 | 9 | tn = "out/germline_somatic_driver_fisher.tsv" 10 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 11 | 12 | germlineG = unique(tt$GermlineGene[tt$P<0.05]) 13 | somaticG = unique(tt$SomaticGene[tt$P<0.01]) 14 | ttG = tt[tt$GermlineGene %in% germlineG & tt$SomaticGene %in% somaticG,] 15 | 16 | # pre-plotting 17 | ttG$minusLogP = -log10(ttG$P) 18 | ttG$plotP = ttG$minusLogP 19 | ttG$plotP[ttG$OR < 1] = -ttG$plotP[ttG$OR < 1] # opposite effect size (mutual exclusivity) 20 | ttG$plotP[ttG$plotP > 5] = 5 21 | ttG$plotP[ttG$plotP < -5] = -5 22 | 23 | ### plotting ### 24 | 25 | #getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c")) 26 | p = ggplot(data=ttG) 27 | p = p + geom_tile(data=ttG,aes(y=SomaticGene, x=GermlineGene, fill= plotP), linetype="blank") + scale_fill_gradientn(name= "-log10(P)", colours=getPalette(100), na.value=NA, limit=c(-5,5)) 28 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3) 29 | #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 30 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=SomaticGene, x=GermlineGene, fill= plotP), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 31 | p = p + theme_bw() + theme_nogrid() + 32 | theme(axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", 
size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 33 | p = p + labs(title="Germline-somatic Interaction: Pan-cancer",x="Germline variant carrier",y = "Somatic mutation carrier") 34 | p 35 | fn = 'out/pan10389_germlineAssocWithSomaticHeatmap.pdf' 36 | ggsave(fn,useDingbat=F) -------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline/plotSomaticGermlineByCancer.R: -------------------------------------------------------------------------------- 1 | ##### plotSomaticGermline.R ##### 2 | # Kuan-lin Huang 2018 3 | 4 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline/") 5 | 6 | source("../global_aes_out.R") 7 | source("../dependency_files.R") 8 | 9 | tn = "out/germline_somatic_driver_fisher_byCancer.tsv" 10 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE) 11 | 12 | germlineG = unique(tt$GermlineGene[tt$P<0.05]) 13 | somaticG = unique(tt$SomaticGene[tt$P<0.05]) 14 | cancer = unique(tt$Cancer[tt$P<0.05]) 15 | ttG = tt[tt$GermlineGene %in% germlineG & tt$SomaticGene %in% somaticG & tt$Cancer %in% cancer,] 16 | 17 | # pre-plotting 18 | ttG$minusLogP = -log10(ttG$P) 19 | ttG$plotP = ttG$minusLogP 20 | ttG$plotP[ttG$OR < 1] = -ttG$plotP[ttG$OR < 1] # opposite effect size (mutual exclusivity) 21 | ttG$plotP[ttG$plotP > 5] = 5 22 | ttG$plotP[ttG$plotP < -5] = -5 23 | 24 | ### plotting ### 25 | 26 | #getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c")) 27 | p = ggplot(data=ttG) 28 | p = p + facet_grid(.~Cancer, scale="free", space="free") 29 | p = p + geom_tile(data=ttG,aes(y=SomaticGene, x=GermlineGene, fill= plotP), linetype="blank") + scale_fill_gradientn(name= "-log10(P)", colours=getPalette(100), na.value=NA, limit=c(-5,5)) 30 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3) 31 | #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), 
color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 32 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=SomaticGene, x=GermlineGene, fill= plotP), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors) 33 | p = p + theme_bw() + theme_nogrid() + 34 | theme( axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14)) 35 | p = p + labs(title="Germline-somatic Interaction: Single-cancer",x="Germline variant carrier",y = "Somatic mutation carrier") 36 | p 37 | fn = 'out/pan10389_germlineAssocWithSomaticHeatmapByCancer.pdf' 38 | ggsave(fn,useDingbat=F) 39 | -------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline/somaticDriver_germline_fisher.R: -------------------------------------------------------------------------------- 1 | ##### somaticDriver_germline_fisher.R ##### 2 | # Kuan-lin Huang @ WashU 2016 June 3 | # updated 2018 4 | # Conduct fisher's exact test to find mutual occurence/mutual exclusivity of germline/somatic variants 5 | # somatic driver only 6 | 7 | ### dependencies ### 8 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline") 9 | source("../global_aes_out.R") 10 | source("../dependency_files.R") 11 | source("../load_somatic.R") 12 | 13 | ### function ### 14 | sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){ 15 | p = NA; OR = NA 16 | 17 | fisher_elements = c(sum(!all_samples %in% c(germline_carriers,somatic_carriers)),sum((all_samples %in% germline_carriers) & !(all_samples %in% somatic_carriers)), 18 | sum(!(all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers)),sum((all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers))) 19 | if (fisher_elements[1] > 0 && 
fisher_elements[3] > 0 && fisher_elements[2] >= 0 && fisher_elements[4] >= 0){ 20 | test.table = matrix(as.numeric(fisher_elements), nrow=2) 21 | f.test = fisher.test(test.table) 22 | OR = f.test$estimate 23 | p = f.test$p.value 24 | 25 | count00 = test.table[1,1] 26 | count10 = test.table[2,1] 27 | count01 = test.table[1,2] 28 | count11 = test.table[2,2] 29 | return(list("p"=p, "OR"=OR, "count00" = count00 30 | , "count10" = count10, "count01" = count01, "count11" = count11)) 31 | } else { return(list("p"=p, "OR"=OR, "count00" = fisher_elements[1], "count10" = fisher_elements[2], "count01" = fisher_elements[3], "count11" = fisher_elements[4])) } # table not testable: p and OR keep their initial NA, counts are still reported so callers never receive NULL 32 | } 33 | 34 | ### get input data and files ### 35 | 36 | ### germline ### 37 | germ_list = names(table(pathVarP$HUGO_Symbol)[table(pathVarP$HUGO_Symbol)>9]) # limit analyses to germline genes with equal to or more than 10 variants 38 | 39 | ### germline somatic interaction ### 40 | # samples being tested 41 | test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode) 42 | cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n") 43 | 44 | # run through function 45 | out_table=character(0); 46 | for (g_gene in germ_list){ 47 | for (s_gene in somaticDriver299){ 48 | germline_carriers = unique(pathVarP$bcr_patient_barcode[pathVarP$HUGO_Symbol==g_gene]) 49 | somatic_carriers = unique(somatic_likelyfunctional_driver$bcr_patient_barcode[somatic_likelyfunctional_driver$Hugo_Symbol==s_gene]) 50 | if(length(germline_carriers) > 9 & length(somatic_carriers) > 19){ 51 | t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers) 52 | p = t_result$p 53 | OR = t_result$OR 54 | count00 = t_result$count00 55 | count10 = t_result$count10 56 | count01 = t_result$count01 57 | count11 = t_result$count11 58 | 59 | 60 | out_row = c(g_gene, s_gene, count00, count10, count01, count11, OR, p) 61 | out_table = rbind(out_table,out_row) 62 | } 63 | } 64 | } 65 | rownames(out_table)=NULL 66 | colnames(out_table) = c("GermlineGene", "SomaticGene", "germline0somatic0", "germline1somatic0", 
"germline0somatic1", "germline1somatic1","OR", "P") 67 | out_table = data.frame(out_table) 68 | out_table[,"P"] = as.numeric(as.character(out_table[,"P"])) 69 | 70 | FDR = p.adjust(out_table[,"P"],method="fdr") 71 | out_table = cbind(out_table,FDR) 72 | out_table = out_table[order(out_table[,"P"]),] 73 | 74 | fn = paste("out/germline_somatic_driver_fisher.tsv") 75 | write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t") -------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline/somaticDriver_germline_fisher_byCancer.R: -------------------------------------------------------------------------------- 1 | ##### somaticDriver_germline_fisher.R ##### 2 | # Kuan-lin Huang @ WashU 2016 June 3 | # updated 2018 4 | # Conduct fisher's exact test to find mutual occurence/mutual exclusivity of germline/somatic variants 5 | # somatic driver only 6 | 7 | ### dependencies ### 8 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline") 9 | source("../global_aes_out.R") 10 | source("../dependency_files.R") 11 | source("../load_somatic.R") 12 | 13 | ### function ### 14 | sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){ 15 | p = NA; OR = NA 16 | 17 | fisher_elements = c(sum(!all_samples %in% c(germline_carriers,somatic_carriers)),sum((all_samples %in% germline_carriers) & !(all_samples %in% somatic_carriers)), 18 | sum(!(all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers)),sum((all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers))) 19 | if (fisher_elements[1] > 0 && fisher_elements[3] > 0 && fisher_elements[2] >= 0 && fisher_elements[4] >= 0){ 20 | test.table = matrix(as.numeric(fisher_elements), nrow=2) 21 | f.test = fisher.test(test.table) 22 | OR = f.test$estimate 23 | p = f.test$p.value 24 | 25 | count00 = test.table[1,1] 26 | count10 = test.table[2,1] 27 | count01 = test.table[1,2] 28 | count11 
= test.table[2,2] 29 | return(list("p"=p, "OR"=OR, "count00" = count00 30 | , "count10" = count10, "count01" = count01, "count11" = count11)) 31 | } else { return(list("p"=p, "OR"=OR, "count00" = fisher_elements[1], "count10" = fisher_elements[2], "count01" = fisher_elements[3], "count11" = fisher_elements[4])) } # table not testable: p and OR keep their initial NA, counts are still reported so callers never receive NULL 32 | } 33 | 34 | ### get input data and files ### 35 | 36 | ### germline ### 37 | germ_list = names(table(pathVarP$HUGO_Symbol)[table(pathVarP$HUGO_Symbol)>9]) # limit analyses to germline genes with equal to or more than 10 variants 38 | 39 | ### germline somatic interaction ### 40 | # samples being tested 41 | all_test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode) 42 | 43 | # run through function 44 | out_table=character(0); 45 | for (cancer in unique(pathVarP$cancer)){ 46 | test_samples = intersect(all_test_samples, clin$bcr_patient_barcode[clin$type==cancer]) 47 | cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n") 48 | 49 | pathVarPcancer =pathVarP[pathVarP$cancer==cancer,] 50 | somatic_likelyfunctional_driver_cancer = somatic_likelyfunctional_driver[somatic_likelyfunctional_driver$cancer==cancer,] 51 | 52 | for (g_gene in germ_list){ 53 | for (s_gene in somaticDriver299){ 54 | germline_carriers = unique(pathVarPcancer$bcr_patient_barcode[pathVarPcancer$HUGO_Symbol==g_gene]) 55 | somatic_carriers = unique(somatic_likelyfunctional_driver_cancer$bcr_patient_barcode[somatic_likelyfunctional_driver_cancer$Hugo_Symbol==s_gene]) 56 | if(length(germline_carriers) > 4 & length(somatic_carriers) > 9){ 57 | t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers) 58 | p = t_result$p 59 | OR = t_result$OR 60 | count00 = t_result$count00 61 | count10 = t_result$count10 62 | count01 = t_result$count01 63 | count11 = t_result$count11 64 | 65 | 66 | out_row = c(cancer,g_gene, s_gene, count00, count10, count01, count11, OR, p) 67 | out_table = rbind(out_table,out_row) 68 | } 69 | } 70 | } 71 | } 72 | rownames(out_table)=NULL 73 | colnames(out_table) = c("Cancer","GermlineGene", "SomaticGene", 
"germline0somatic0", "germline1somatic0", "germline0somatic1", "germline1somatic1","OR", "P") 74 | out_table = data.frame(out_table) 75 | out_table[,"P"] = as.numeric(as.character(out_table[,"P"])) 76 | 77 | FDR = p.adjust(out_table[,"P"],method="fdr") 78 | out_table = cbind(out_table,FDR) 79 | out_table = out_table[order(out_table[,"P"]),] 80 | 81 | fn = paste("out/germline_somatic_driver_fisher_byCancer.tsv") 82 | write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t") -------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline/somatic_germline_fisher.R: -------------------------------------------------------------------------------- 1 | ##### somatic_germline_fisher.R ##### 2 | # Kuan-lin Huang @ WashU 2016 June 3 | # updated 2018 4 | # Conduct fisher's exact test to find mutual occurence/mutual exclusivity of germline/somatic variants 5 | 6 | ### dependencies ### 7 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline") 8 | source("../global_aes_out.R") 9 | source("../dependency_files.R") 10 | 11 | ### function ### 12 | sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){ 13 | p = NA; OR = NA 14 | 15 | fisher_elements = c(sum(!all_samples %in% c(germline_carriers,somatic_carriers)),sum((all_samples %in% germline_carriers) & !(all_samples %in% somatic_carriers)), 16 | sum(!(all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers)),sum((all_samples %in% germline_carriers) & (all_samples %in% somatic_carriers))) 17 | if (fisher_elements[1] > 0 && fisher_elements[3] > 0 && fisher_elements[2] >= 0 && fisher_elements[4] >= 0){ 18 | test.table = matrix(as.numeric(fisher_elements), nrow=2) 19 | f.test = fisher.test(test.table) 20 | OR = f.test$estimate 21 | p = f.test$p.value 22 | 23 | count00 = test.table[1,1] 24 | count10 = test.table[2,1] 25 | count01 = test.table[1,2] 26 | count11 = test.table[2,2] 27 | 
return(list("p"=p, "OR"=OR, "count00" = count00 28 | , "count10" = count10, "count01" = count01, "count11" = count11)) 29 | } else { return(list("p"=p, "OR"=OR, "count00" = fisher_elements[1], "count10" = fisher_elements[2], "count01" = fisher_elements[3], "count11" = fisher_elements[4])) } # table not testable: p and OR keep their initial NA, counts are still reported so the count check in the loop below never sees NULL 30 | } 31 | 32 | ### get input data and files ### 33 | 34 | ### germline ### 35 | germ_list = names(table(pathVarP$HUGO_Symbol)[table(pathVarP$HUGO_Symbol)>9]) 36 | 37 | ### somatic ### 38 | # somatic gene list 39 | somaticDriver299_f = "../../TCGA_data/somatic/Driver_BaileyCell2018/299driverGene.txt" 40 | somaticDriver299 = as.vector(t(read.table(header=F, quote = "", sep="\t", file = somaticDriver299_f, stringsAsFactors=FALSE))) 41 | somatic_list = somaticDriver299 42 | 43 | # mutations 44 | somatic_f = "../../TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz" 45 | somatic = read.table(header=T, quote = "", sep="\t", file = gzfile(somatic_f), stringsAsFactors=FALSE) 46 | somatic$bcr_patient_barcode = gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$Tumor_Sample_Barcode) 47 | somatic_mut_count = data.frame(table(somatic$bcr_patient_barcode)) 48 | colnames(somatic_mut_count) = c("bcr_patient_barcode","MutationCount") 49 | 50 | table(somatic$Variant_Classification) 51 | likelyFunctionalTypes = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation", 52 | "Nonsense_Mutation","Splice_Site","Translation_Start_Site") 53 | somatic_likelyfunctional = somatic[somatic$Hugo_Symbol %in% c(pathVar$HUGO_Symbol,somaticDriver299) & somatic$Variant_Classification %in% likelyFunctionalTypes,] 54 | 55 | 56 | 57 | ### germline somatic interaction ### 58 | # samples being tested 59 | test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode) 60 | cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n") 61 | 62 | # run through function 63 | out_table=character(0); 64 | for (g_gene in germ_list){ 65 | for (s_gene in somatic_list){ 66 | germline_carriers = 
unique(pathVarP$bcr_patient_barcode[pathVarP$HUGO_Symbol==g_gene]) 67 | somatic_carriers = unique(somatic_likelyfunctional$bcr_patient_barcode[somatic_likelyfunctional$Hugo_Symbol==s_gene]) 68 | t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers) 69 | p = t_result$p 70 | OR = t_result$OR 71 | count00 = t_result$count00 72 | count10 = t_result$count10 73 | count01 = t_result$count01 74 | count11 = t_result$count11 75 | 76 | if(count10 + count11 >= 8){ 77 | out_row = c(g_gene, s_gene, count00, count10, count01, count11, OR, p) 78 | out_table = rbind(out_table,out_row) 79 | } 80 | } 81 | } 82 | rownames(out_table)=NULL 83 | colnames(out_table) = c("GermlineGene", "SomaticGene", "germline0somatic0", "germline1somatic0", "germline0somatic1", "germline1somatic1","OR", "P") 84 | out_table = data.frame(out_table) 85 | out_table[,"P"] = as.numeric(as.character(out_table[,"P"])) 86 | 87 | FDR = p.adjust(out_table[,"P"],method="fdr") 88 | out_table = cbind(out_table,FDR) 89 | out_table = out_table[order(out_table[,"P"]),] 90 | 91 | fn = paste("out/germline_all_somatic_fisher.tsv") 92 | write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t") -------------------------------------------------------------------------------- /germline_somatic_analysis/somatic_germline_overlap/somatic_germline_overlap_genes.R: -------------------------------------------------------------------------------- 1 | ##### somatic_germline_overlap.R ##### 2 | # Kuan-lin Huang @ WashU 2018 3 | # Find overlap of genes/variants for somatic/germline variants 4 | 5 | ### dependencies ### 6 | source("../global_aes_out.R") 7 | source("../dependency_files.R") 8 | source("../load_somatic.R") 9 | 10 | # counts of somatic functional mutation by gene 11 | somatic_gene_count = data.frame(table(somatic_likelyfunctional_driver$Hugo_Symbol)) 12 | germline_gene_count = data.frame(table(pathVarP$HUGO_Symbol)) 13 | colnames(somatic_gene_count) = 
c("Gene","PredictedFunctionalSomaticMutationCount") 14 | colnames(germline_gene_count) = c("Gene","PathogenicGermlineVariantCount") 15 | gene_count = merge(somatic_gene_count,germline_gene_count,by="Gene",all=T) 16 | gene_count[is.na(gene_count)] = 0 17 | highlight_g = as.character(gene_count$Gene[gene_count$PredictedFunctionalSomaticMutationCount > 400 | gene_count$PathogenicGermlineVariantCount > 20 18 | | (gene_count$PredictedFunctionalSomaticMutationCount > 140 & gene_count$PathogenicGermlineVariantCount > 3)]) 19 | gene_count$GeneClass = "Others" 20 | gene_count$GeneClass[gene_count$Gene %in% all_oncogenes] = "Oncogene" 21 | gene_count$GeneClass[gene_count$Gene %in% all_TSGs] = "TSG" 22 | 23 | p = ggplot(gene_count,aes(y=PredictedFunctionalSomaticMutationCount, x =PathogenicGermlineVariantCount, color = GeneClass)) 24 | p = p + geom_point(stroke=0,alpha = 0.2) + theme_bw() #+ guides(color=FALSE) 25 | #p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5) 26 | p = p + geom_text_repel(aes(label=ifelse(as.character(Gene) %in% highlight_g,as.character(Gene), NA))) 27 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))#element_text(colour="black", size=14)) 28 | #p = p + scale_x_log10() + scale_y_log10() 29 | p = p + expand_limits(x = 0,y=0) + ylim(0,1100) 30 | #p = p + coord_equal() + getLOHColorScale() 31 | p 32 | fn = "out/somatic_vs_germline_var_counts_by_gene.pdf" 33 | ggsave(file=fn, width=5, h =5, useDingbats=FALSE) 34 | 35 | 36 | # by cancer 37 | somatic_cancer_gene_count = data.frame(table(somatic_likelyfunctional_driver$Hugo_Symbol, somatic_likelyfunctional_driver$cancer)) 38 | germline_cancer_gene_count = data.frame(table(pathVarP$HUGO_Symbol,pathVarP$cancer)) 39 | colnames(somatic_cancer_gene_count) = c("Gene","Cancer","PredictedFunctionalSomaticMutationCount") 40 | 
colnames(germline_cancer_gene_count) = c("Gene","Cancer","PathogenicGermlineVariantCount") 41 | cancer_gene_count = merge(somatic_cancer_gene_count,germline_cancer_gene_count,by=c("Gene","Cancer"),all=T) 42 | cancer_gene_count[is.na(cancer_gene_count)] = 0 43 | highlight_g = as.character(cancer_gene_count$Gene[cancer_gene_count$PredictedFunctionalSomaticMutationCount > 400 | cancer_gene_count$PathogenicGermlineVariantCount > 20 44 | | (cancer_gene_count$PredictedFunctionalSomaticMutationCount > 140 & cancer_gene_count$PathogenicGermlineVariantCount > 3)]) 45 | cancer_gene_count$GeneClass = "Others" 46 | cancer_gene_count$GeneClass[cancer_gene_count$Gene %in% all_oncogenes] = "Oncogene" 47 | cancer_gene_count$GeneClass[cancer_gene_count$Gene %in% all_TSGs] = "TSG" 48 | 49 | p = ggplot(cancer_gene_count,aes(y=PredictedFunctionalSomaticMutationCount, x =PathogenicGermlineVariantCount, color = GeneClass)) 50 | p = p + facet_wrap(~Cancer) 51 | p = p + geom_point(stroke=0,alpha = 0.2) + theme_bw() #+ guides(color=FALSE) 52 | #p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5) 53 | p = p + geom_text_repel(aes(label=ifelse(as.character(Gene) %in% highlight_g,as.character(Gene), NA)),size=1) 54 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))#element_text(colour="black", size=14)) 55 | p = p + scale_x_log10() + scale_y_log10() 56 | p = p + expand_limits(x = 0,y=0) #+ ylim(0,1100) 57 | #p = p + coord_equal() + getLOHColorScale() 58 | p 59 | fn = "out/somatic_vs_germline_var_counts_by_gene_by_cancer.pdf" 60 | ggsave(file=fn, width=10, h =10, useDingbats=FALSE) 61 | -------------------------------------------------------------------------------- /util/edit_vcf_samplenames/list_vcf_source-sample_pairs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | 
# (rjm), July 2017 4 | # Syntax: uncompressed vcf | $thisScript 5 | 6 | 7 | use strict; 8 | use warnings; 9 | 10 | my @myList=(); 11 | my @a; 12 | 13 | while(<>) { # read the piped VCF line by line; only the leading header lines are consumed 14 | chomp; 15 | if( /^#/ ) { 16 | # get list of input filenames given to merge 17 | if( /vcf-merge/ ) { 18 | @a = split /\s+/; 19 | for(my $i = 1; $i < scalar @a; $i++) { 20 | my %data = ('inputfile' => $a[ $i ], 'samplename' => ""); 21 | push @myList, \%data; 22 | } 23 | } 24 | if( /^#CHROM/ ) { 25 | @a = split /\t/; 26 | for(my $i = 9 ; $i < scalar @a; $i++) { 27 | $myList[ $i - 9 ]{'samplename'} = $a[ $i ]; 28 | } 29 | } 30 | 31 | } else { 32 | last; 33 | } 34 | } 35 | 36 | for(my $j=0 ; $j < scalar @myList; $j++) { 37 | print $j,"\t", $myList[$j]{'inputfile'},"\t", $myList[$j]{'samplename'}, "\n"; 38 | } 39 | -------------------------------------------------------------------------------- /util/edit_vcf_samplenames/replace_vcf_header_sample_with_source.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Jay Mashl, July 2017 4 | # Syntax: uncompressed vcf | $thisScript 5 | 6 | 7 | use strict; 8 | use warnings; 9 | 10 | my @myList=(); 11 | my @a; 12 | 13 | while(<>) { # read the piped VCF line by line; only the leading header lines are consumed 14 | chomp; 15 | if( /^#/ ) { 16 | # get list of input filenames given to merge 17 | if( /vcf-merge/ ) { 18 | @a = split /\s+/; 19 | for(my $i = 1; $i < scalar @a; $i++) { 20 | my %data = ('inputfile' => $a[ $i ], 'samplename' => ""); 21 | push @myList, \%data; 22 | } 23 | } 24 | if( /^#CHROM/ ) { 25 | @a = split /\t/; 26 | for(my $i = 9 ; $i < scalar @a; $i++) { 27 | # $myList[ $i - 9 ]{'samplename'} = $a[ $i ]; 28 | 29 | # in this application, extract unique identifier from first field 30 | my @b = split /\./, $myList[ $i - 9 ]{'inputfile'}; 31 | $a[ $i ] = $b[0]; 32 | } 33 | } 34 | 35 | #Print 36 | if( /^#CHROM/ ) { 37 | print join("\t", @a),"\n"; 38 | } else { 39 | print $_,"\n"; 40 | } 41 | 42 | } else { 43 | last; 44 | } 45 | } 46 | 47 | #for(my $j=0 ; $j < scalar 
@myList; $j++) { 48 | # print $j,"\t", $myList[$j]{'inputfile'},"\t", $myList[$j]{'samplename'}, "\n"; 49 | #} 50 | -------------------------------------------------------------------------------- /util/edit_vcf_samplenames/uniqueify_merged_samplenames.driver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Jay Mashl, July 2017 4 | 5 | for ct in ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LAML LGG LIHC LUAD LUSC MESO OV PAAD PCA PCPG PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM ; do 6 | echo '------' 7 | echo $ct 8 | echo '------' 9 | gsutil -m cp gs://dinglab/isb-cgc/tcga/germline/production/merge/$ct.merge.vcf.gz gs://dinglab/isb-cgc/tcga/germline/production/merge/$ct.merge.vcf.gz.tbi . 10 | gunzip -dc $ct.merge.vcf.gz | ./replace_vcf_header_sample_with_source.pl > $ct.merge.newheader.txt 11 | tabix -r $ct.merge.newheader.txt $ct.merge.vcf.gz > $ct.merge.fixedHeader.vcf.gz 12 | tabix -p vcf $ct.merge.fixedHeader.vcf.gz 13 | 14 | rm -f $ct.merge.vcf.gz $ct.merge.vcf.gz.tbi 15 | 16 | done 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /util/edit_vcf_samplenames/uniqueify_merged_samplenames.template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Jay Mashl, July 2017 3 | 4 | SOURCE=variants.vcf.gz 5 | DEST=${SOURCE/variants/variants.fixedHeader} 6 | 7 | gunzip -dc $SOURCE | ./replace_vcf_header_sample_with_source.pl > newheader.txt 8 | tabix -r newheader.txt $SOURCE > $DEST 9 | tabix -p vcf $DEST 10 | --------------------------------------------------------------------------------