├── README.md
├── analysis
    ├── CNV
    │   ├── analyzeCNV.R
    │   └── cnvExpressAssoc.R
    ├── CharGer_analysis
    │   ├── PCGP
    │   │   ├── liftover_CharGer_result_PCGP.py
    │   │   ├── plot_Charged_PCGP.R
    │   │   ├── run_charger.sh
    │   │   └── work.log.sh
    │   └── append_readcounts
    │   │   └── append_readcounts.pl
    ├── LOH
    │   ├── plotPathVarLOH.R
    │   ├── prepFileForAllelicImbalance.pl
    │   └── pvalue_per_site_unified_background.pl
    ├── PCA_IBD_dist
    │   ├── plot_relatedness.R
    │   └── run_plink_pca.ibd.rel.sh
    ├── README.md
    ├── RPPA_effect
    │   ├── PathVarRPPAAssoc.R
    │   ├── RPPA_effect.R
    │   ├── plotPathVarRPPA.R
    │   └── plotPathVarRPPAassoc.R
    ├── association_test
    │   ├── TFT_functions.R
    │   ├── analyzeAssocTFT.DDR.R
    │   ├── analyzeAssocTFT.R
    │   ├── plotPathVarAssoc.R
    │   ├── postProcessAssoc.R
    │   ├── qqman.R
    │   ├── single_var_association.py
    │   ├── single_var_association_ethni.py
    │   └── work.log.sh
    ├── burden_assoc
    │   ├── TFT.R
    │   ├── analyze_path_burden.R
    │   └── label_onco_var_ExAC.R
    ├── clinical_association
    │   ├── PathVarAAOassoc.R
    │   ├── PathVarEthnicStats.R
    │   └── plotPathVarEthnic.R
    ├── data_integration
    │   ├── combine_data.R
    │   ├── integrative_analysis.R
    │   └── pathVar_integrative_analysis.R
    ├── dependency_files.R
    ├── expression_effect
    │   ├── PathVarExpressAssoc.R
    │   ├── expression_effect.R
    │   ├── plotPathVarExpressAssoc.R
    │   └── plotPathVarExpression.R
    ├── family_history
    │   └── fam_history.R
    ├── functional_assay
    │   └── plot_result.R
    ├── gene_list
    │   └── examine_gene_list.R
    ├── global_aes_out.R
    ├── hotspot3d
    │   ├── cluster_analysis.R
    │   ├── spotlightVar.R
    │   └── work.log.sh
    ├── mutation_signature
    │   ├── 1_germlineVsMutationSiganture.R
    │   ├── 2_plotPathVarMutsigAssoc.R
    │   ├── 3_somaticVsMutationSiganture.R
    │   └── 4_plotSomaticMutsigAssoc.R
    ├── nominate_variants
    │   └── nominateVars.R
    ├── pathogenic_variants
    │   ├── label_onco_var.R
    │   └── plot_path_var_dist.R
    ├── pleiotropy
    │   └── pleiotropy.R
    ├── process_files
    │   ├── genotype
    │   │   ├── cancer_type.txt
    │   │   ├── merge_genotype_by_cancer.sh
    │   │   └── merge_log_gcloud.sh
    │   └── germline
    │   │   ├── google-cloud-ISB
    │   │       ├── README.md
    │   │       ├── analysisID_lists
    │   │       │   ├── ACC.ids
    │   │       │   ├── BLCA.ids
    │   │       │   ├── BRCA.ids
    │   │       │   ├── CESC.ids
    │   │       │   ├── CHOL.ids
    │   │       │   ├── COAD.ids
    │   │       │   ├── DLBC.ids
    │   │       │   ├── ESCA.ids
    │   │       │   ├── GBM.ids
    │   │       │   ├── HNSC.ids
    │   │       │   ├── KICH.ids
    │   │       │   ├── KIRC.ids
    │   │       │   ├── KIRP.ids
    │   │       │   ├── LGG.ids
    │   │       │   ├── LIHC.ids
    │   │       │   ├── LUAD.ids
    │   │       │   ├── LUSC.ids
    │   │       │   ├── MESO.ids
    │   │       │   ├── OV.ids
    │   │       │   ├── PAAD.ids
    │   │       │   ├── PCPG.ids
    │   │       │   ├── PRAD.ids
    │   │       │   ├── READ.ids
    │   │       │   ├── SARC.ids
    │   │       │   ├── SKCM.ids
    │   │       │   ├── STAD.ids
    │   │       │   ├── TGCT.ids
    │   │       │   ├── THCA.ids
    │   │       │   ├── THYM.ids
    │   │       │   ├── UCEC.ids
    │   │       │   ├── UCS.ids
    │   │       │   └── UVM.ids
    │   │       ├── annotate.not-in-exac.sh
    │   │       ├── docker
    │   │       │   ├── Dockerfile
    │   │       │   ├── ExAC_config.toml
    │   │       │   ├── filter_VCF_AF_AD.py
    │   │       │   └── variant_QC_annotation.sh
    │   │       ├── make_lists.sh
    │   │       └── unused
    │   │       │   └── Dockerfile
    │   │   ├── local
    │   │       ├── ExAC_config.toml
    │   │       ├── calc_vcf_concordance.py
    │   │       ├── combine_CharGer2VCF.py
    │   │       ├── create_ROI_genotype_VCF.sh
    │   │       ├── expand_csq.py
    │   │       ├── filter_VCF_AD.py
    │   │       ├── filter_VCF_AF_AD.py
    │   │       ├── filter_VCF_AF_AD_keepExAConly.py
    │   │       ├── filter_merge_germline_by_cancer.sh
    │   │       ├── filter_merge_germline_by_sample.sh
    │   │       ├── make.bsub.commands.sh
    │   │       ├── post_CharGer.sh
    │   │       ├── recalc_AF_PM2.py
    │   │       ├── replace_vcf_header_sample_with_source_TCGA.pl
    │   │       ├── run_VEP.v85.sh
    │   │       ├── run_calc_vcf_concordance.sh
    │   │       ├── run_charger_on_vep_VCF.sh
    │   │       ├── update_vcfHeader_to_TCGA.sh
    │   │       ├── variant_QC_annotation.sh
    │   │       └── work.log.sh
    │   │   ├── merge_germline_cloud.sh
    │   │   ├── readme.txt
    │   │   └── var_freq
    │   │       ├── batch_run_vcf_var_freq_filter.sh
    │   │       ├── run_vcf_var_freq_filter.sh
    │   │       └── vcf_var_freq_filter.pl
    ├── sample_listing
    │   ├── compile_compare_samples.R
    │   └── make_clin_summary_table.R
    ├── segregation_analysis
    │   ├── batch_run_segregation.sh
    │   ├── find_relatives.R
    │   ├── find_segregating_var.sh
    │   ├── find_shared_var_relatives.py
    │   └── segregation.log.sh
    └── variant_QC
    │   ├── batch_run_pseq_stats.sh
    │   ├── batch_run_pseq_vcfstats.sh
    │   ├── batch_run_vcfstats.sh
    │   ├── plot_concordance.R
    │   ├── plot_pseq_stats.R
    │   ├── run_pseq_stats.sh
    │   └── separate_batch_ethnicity_effects.R
├── doc
    ├── 20170118_TCGA_Germline_Abstract.docx
    └── notes.txt
├── germline_somatic_analysis
    ├── dependency_files.R
    ├── global_aes_out.R
    ├── load_somatic.R
    ├── mutation_signature
    │   ├── 1_germlineVsMutationSiganture.R
    │   ├── 2_plotPathVarMutsigAssoc.R
    │   ├── 3_somaticVsMutationSiganture.R
    │   └── 4_plotSomaticMutsigAssoc.R
    ├── somatic_germline
    │   ├── plotSomaticGermline.R
    │   ├── plotSomaticGermlineByCancer.R
    │   ├── somaticDriver_germline_fisher.R
    │   ├── somaticDriver_germline_fisher_byCancer.R
    │   └── somatic_germline_fisher.R
    └── somatic_germline_overlap
    │   └── somatic_germline_overlap_genes.R
└── util
    └── edit_vcf_samplenames
        ├── list_vcf_source-sample_pairs.pl
        ├── replace_vcf_header_sample_with_source.pl
        ├── uniqueify_merged_samplenames.driver.sh
        └── uniqueify_merged_samplenames.template.sh


/README.md:
--------------------------------------------------------------------------------
 1 | # PanCanAtlasGermline #
 2 | 
 3 | ## Pathogenic Germline Variants in 10,389 Adult Cancers ## 
 4 | >The TCGA PanCanAtlas germline analysis working group is investigating germline variants in the largest sequencing cohort of cancer to date: 10,389 cases in 33 cancer types.     
 5 | ```Pathogenic Germline Variants in 10,389 Adult Cancers.
 6 | 
 7 | Huang KL, Mashl RJ, Wu Y, Ritter DI, Wang J, Oh C, Paczkowska M, Reynolds S, Wyczalkowski MA, Oak N, Scott AD, Krassowski M, Cherniack AD, Houlahan KE, Jayasinghe R, Wang LB, Zhou DC, Liu D, Cao S, Kim YW, Koire A, McMichael JF, Hucthagowder V, Kim TB, Hahn A, Wang C, McLellan MD, Al-Mulla F, Johnson KJ; Cancer Genome Atlas Research Network, Lichtarge O, Boutros PC, Raphael B, Lazar AJ, Zhang W, Wendl MC, Govindan R, Jain S, Wheeler D, Kulkarni S, Dipersio JF, Reimand J, Meric-Bernstam F, Chen K, Shmulevich I, Plon SE, Chen F, Ding L.
 8 | 
 9 | Cell. 2018 Apr 5;173(2):355-370.e14. doi: 10.1016/j.cell.2018.03.039.
10 | 
11 | PMID: 29625052
12 | ```
13 | 
14 | ## Data Access ##
15 | >De-identified variant-level data for prioritized VUS and pathogenic variants: Table S2 of the publication.
16 | 
17 | >The protected variants+sample ID and the full callset (Authorized User only):
18 | 
19 | >GDC link: https://gdc.cancer.gov/about-data/publications/PanCanAtlas-Germline-AWG
20 | 
21 | >Compressed VCF file of the combined, filtered variant calls using GATK, VarScan2, and Pindel on WES data of the 10,389 final passed-QC samples. - PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.vcf.gz
22 | 
23 | >Tabix file of the compressed VCF file of the combined, filtered variant calls using GATK, VarScan2, and Pindel on WES data of the 10,389 final passed-QC samples. - PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.vcf.gz.tbi
24 | 
25 | >Prioritized, cancer-related variants discovered in 10,389 cases. Please use the "Overall_Classification" column to distinguish between Pathogenic, Likely Pathogenic and Prioritized VUSs. - PCA_pathVar_integrated_filtered_adjusted.tsv
26 | 
27 | ## PanCanAtlas Germline Working Group info (for members) ##
28 | >Wiki: https://wiki.nci.nih.gov/display/TCGAM/PanCanAtlas+Germline+AWG 
29 | >Synapse: syn4602499  
30 | 
31 | ## Directory set-up ##
32 | > The analysis directory should sit alongside (at the same level as) the TCGA_data directory so that scripts can source the data by relative path. Because some of those files are protected, they are not shared publicly. 
33 | > Analysis scripts are in the "analysis" folder
34 | 
35 | ## Getting started on ISB-CGC [For TCGA PanCanAtlas Germline AWG members Only]: ##  
36 | >1) Install Google Cloud SDK (https://cloud.google.com/sdk/docs/quickstarts) and read through at least the basic gsutil command 
37 | >2) Make sure you can access the project on Google cloud (https://console.cloud.google.com/home/dashboard?project=isb-cgc-06-0004); if not here are some relevant steps (http://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/webapp/Gaining-Access-To-Contolled-Access-Data.html)  
38 | >3) Read this one page quick start to google cloud compute engine (https://cloud.google.com/compute/docs/quickstarts) . 
39 | >4) If you want to download the data to your local computer/cluster, confirm with your system administrator that the environment you plan to work in has the appropriate access authorization and security.  
40 | >5) Read Jay's short getting started guide (https://drive.google.com/file/d/0B0aS3CDIQ_RAd01ld0tKX3JHa00/view). And if you are up to it here is a more detailed guide by Sheila (https://docs.google.com/document/d/1f1YBVG1dAhpF-Un5lp70kMI8Yo2rD3NNQdpl0FDAu-c/edit#heading=h.gv4f8tqq5731)  
41 | 
42 | >The PCAGermline AWG google doc of shared data (https://docs.google.com/document/d/1ymdfAnRR4o4-20bwHI3vPaRPRuoqtqc0pNUVYO2oiPc/edit) 
43 | 
44 | 
45 | > Contact: Kuan-lin Huang [kuan-lin.huang@wustl.edu]
46 | 


--------------------------------------------------------------------------------
/analysis/CharGer_analysis/PCGP/liftover_CharGer_result_PCGP.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 
  3 |  
  4 | import sys
  5 | import getopt
  6 | 
  7 | def main():
  8 |     def usage():
  9 |         print """
 10 |     liftover_CharGer_result.py : why do I exist?
 11 | 
 12 |     USAGE: liftover_CharGer_result.py [-h] <CharGer results file> <PCGP input file>
 13 |      -h    print this message
 14 |      <filename>    input file
 15 |         """
 16 | 
 17 |     #use getopt to get inputs
 18 |     # try:
 19 |     #     opts, args = getopt.getopt(sys.argv[1:], 'h') #:after option meaning required arguments
 20 |     # except getopt.GetoptError:
 21 |     #     print "liftover_CharGer_result.py <CharGerSummaryFile> <inputFile>"
 22 | 
 23 |     # for opt, arg in opts: #store the input options
 24 |     #     if opt == '-h': # h means user needs help
 25 |     #         usage(); sys.exit()
 26 | 
 27 |     args = sys.argv[1:]
 28 |     if len(args) < 1:
 29 |         usage(); sys.exit("input file missing")
 30 | 
 31 |     #open input file
 32 |     try:
 33 |         charGerF = open(args[0],"r")
 34 |     except IOError:
 35 |         print("File , args[0], does not exist!")
 36 |     
 37 |     CharGerHeader = charGerF.readline().strip()
 38 |     varCharGer = {}
 39 |     #read input file
 40 |     for line in charGerF:
 41 |         line=line.strip()
 42 |         F = line.split("\t")
 43 |         # chrom = F[1]
 44 |         # start = F[2]
 45 |         # stop = F[3]
 46 |         # ref = F[4]
 47 |         # alt = F[5]
 48 |         if len(F) > 4:
 49 |             # if F[4] == "-":
 50 |             #     F[4] = "0"
 51 |             # if F[5] == "-":
 52 |             #     F[5] = "0"
 53 |             var = "_".join(F[1:3]+F[4:6])
 54 |             varCharGer[var]=line
 55 |     charGerF.close()
 56 | 
 57 | 
 58 |     try:
 59 |         inputF = open(args[1],"r")
 60 |     except IOError:
 61 |         print("File , args[1], does not exist!")
 62 | 
 63 |     header = inputF.readline().strip()
 64 |     headerF = header.split("\t") #arrays mutable, I'm farily sure
 65 |     i = 1
 66 |     #for headerItem in headerF:
 67 |     for k in range(0, len(headerF)):
 68 |         if headerF[k] == "":
 69 |              headerF[k] = "Missing" + str(i)
 70 |              i+=1
 71 |         headerF[k] = headerF[k].replace("#","").strip() #this is the same problem as self.userVariant or something
 72 |         # print ":" + headerF[k] + ":"
 73 |         # print i
 74 |         # i +=1
 75 |          
 76 | 
 77 |     print "\t".join(headerF[0:35]) + "\t" + CharGerHeader
 78 | 
 79 |     #read input file
 80 |     for line in inputF:
 81 |         line=line.strip()
 82 |         F = line.split("\t")
 83 |         if len(F) > 20:
 84 |             F[16] = F[16].upper().replace("CHR", "")
 85 |             start = F[17]
 86 |             #stop = F[2]
 87 |             ref = F[18]
 88 |             alt = F[19]
 89 |             #sample = F[21]
 90 |             var = "_".join(F[16:20])
 91 |         
 92 |             CharGerAnno = ""
 93 |             if var in varCharGer:
 94 |                 CharGerAnno = varCharGer[var]
 95 |         
 96 |         print "\t".join(F[0:35]) + "\t" + CharGerAnno  
 97 |         #print line + "\t" + CharGerAnno
 98 |     
 99 |     inputF.close()
100 | 
101 | if __name__ == "__main__":
102 |     main()
103 | 


--------------------------------------------------------------------------------
/analysis/CharGer_analysis/PCGP/run_charger.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/pan8000_germline_clinical/variant_files/201604_PCGP_variants/"
 4 | mmGenes="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/20160301_Rahman_KJ_KH_gene_table_CharGer.txt"
 5 | mmVariants="/gscmnt/gc2737/ding/Analysis/VariantLists/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP.vcf"
 6 | hotspot="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/MC3.noHypers.mericUnspecified.d10.r20.v114.clusters"
 7 | clinvar="/gscmnt/gc2706/dinglab/medseq/ClinVar/MacArthurLab/clinvar/output/b37/single/clinvar_alleles.single.b37.tsv.gz"
 8 | rareThreshold="0.0005"
 9 | results="Charged/"
10 | if [ ! -d ${results} ]; then
11 |         mkdir ${results}
12 | fi
13 | 
14 | queue="ding-lab"
15 | #queue="long"
16 | group="/khuang"
17 | 
18 | 	sample="/gscmnt/gc3020/dinglab/medseq/Germline/projects/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AD_varOnly.vep.vcf"
19 | 	output="${results}charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv"
20 | 	command="python CharGer/bin/charger -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} > ${results}charger.PCGP_AD.out"
21 | 	log="${results}charger.PCGP_AD.log"
22 | 	echo "bsub -g ${group} -q ${queue} -oo ${log} \"${command}\""
23 | 
24 |         sample="/gscmnt/gc3020/dinglab/medseq/Germline/projects/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AR_varOnly.vep.vcf"
25 |         output="${results}charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv"
26 |         command="python CharGer/bin/charger -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} > ${results}charger.PCGP_AR.out"
27 |         log="${results}charger.PCGP_AR.log"
28 |         echo "bsub -g ${group} -q ${queue} -oo ${log} \"${command}\""
29 | 


--------------------------------------------------------------------------------
/analysis/CharGer_analysis/PCGP/work.log.sh:
--------------------------------------------------------------------------------
 1 | #0. run charger (run_charger.sh echoes the bsub commands rather than submitting them)
 2 | bash run_charger.sh
 3 | 
 4 | #1. append results: join each CharGer result onto the original PCGP table
 5 | python liftover_CharGer_result_PCGP.py charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv /Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AD.txt > 2015_stJude_germline_nejm_S4_AD_charger.txt
 6 | python liftover_CharGer_result_PCGP.py charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv /Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/variant_files/201604_PCGP_variants/2015_stJude_germline_nejm_S4_AR.txt > 2015_stJude_germline_nejm_S4_AR_charger.txt
 7 | 
 8 | #2. Run analysis and plotting (the plotting script in this directory is plot_Charged_PCGP.R)
 9 | Rscript plot_Charged_PCGP.R
10 | 


--------------------------------------------------------------------------------
/analysis/LOH/plotPathVarLOH.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarLOH.R #####
 2 | # Kuan-lin Huang @ WashU 201711
 3 | # plot assoc results for pathogenic variants
 4 | 
 5 | # The space in "Box Sync" is written plainly: R rejects "\ " as an unrecognized escape.
 6 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/LOH"
 7 | setwd(bdir)
 8 | source("../global_aes_out.R")
 9 | source("../dependency_files.R")
10 | # NOTE(review): pathVarOT and theme_nogrid() are not defined in this file; presumably the
11 | # two sourced scripts provide them (and load ggplot2) -- confirm.
12 | 
13 | # Colour for each LOH category, shared by the colour and fill scales.
14 | getLOHColors = function() {
15 |   c("Significant"="#e31a1c", "Suggestive"="#b2df8a", "None"="#1f78b4")
16 | }
17 | 
18 | # Manual colour scale for the LOH categories.
19 | getLOHColorScale = function() {
20 |   scale_color_manual(name="Loss of Heterozygosity", values=getLOHColors())
21 | }
22 | 
23 | # Manual fill scale for the LOH categories (same palette as the colour scale).
24 | getLOHFillScale = function() {
25 |   scale_fill_manual(name="Loss of Heterozygosity", values=getLOHColors())
26 | }
27 | 
28 | # Axis text styling shared by the three per-gene plots.
29 | gene_axis_theme = theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12))
30 | 
31 | #pathVarOT$LOH_Sig = factor(pathVarOT$LOH_Sig, levels=c("None","Suggestive","Significant"))
32 | 
33 | # 1. normal vs tumor VAF scatter, one panel per gene classification
34 | p = ggplot(pathVarOT,aes(x=normalVAF, y =tumorVAF, color = LOH_Sig))
35 | p = p + facet_grid(.~Gene_Classification,drop=T)
36 | p = p + geom_point(alpha=0.3, stroke=0) + theme_bw() + theme_nogrid() #+ guides(color=FALSE)
37 | p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) # y = x reference line
38 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))
39 | p = p + expand_limits(x = 0, y = 0)
40 | p = p + coord_equal() + getLOHColorScale()
41 | p = p + labs(x = "Normal VAF", y = "Tumor VAF")
42 | p
43 | fn = "out/pathVarLOH.pdf"
44 | ggsave(filename=fn, width=6, useDingbats=FALSE)
45 | 
46 | # 2. variant counts per gene and LOH category
47 | geneLOH = data.frame(table(pathVarOT$HUGO_Symbol,pathVarOT$LOH_Sig))
48 | colnames(geneLOH) = c("Gene","LOH_category","Count")
49 | 
50 | geneLOHsig = geneLOH[geneLOH$LOH_category=="Significant",]
51 | sig_gene = geneLOHsig[geneLOHsig$Count>2,]$Gene # genes with more than 2 "Significant" variants
52 | sig_gene_order = geneLOHsig$Gene[order(geneLOHsig$Count,decreasing = T)] # most "Significant" variants first
53 | geneLOH_g = geneLOH[geneLOH$Gene %in% sig_gene,]
54 | 
55 | geneLOH_g$Gene = factor(geneLOH_g$Gene, levels=sig_gene_order)
56 | geneLOH_g$LOH_category = factor(geneLOH_g$LOH_category, levels=c("None","Suggestive","Significant"))
57 | 
58 | p = ggplot(geneLOH_g,aes(x = Gene, y = Count, fill = LOH_category))
59 | p = p + geom_bar(stat = "identity") + theme_bw() + theme_nogrid()
60 | p = p + labs(x = "Gene", y="Count of variants")
61 | p = p + getLOHFillScale()
62 | p = p + gene_axis_theme
63 | p = p + theme(legend.position = "top")
64 | p
65 | fn = 'out/LOH_var_count_by_gene.pdf'
66 | ggsave(filename=fn, height = 6, width = 6, useDingbats=FALSE)
67 | 
68 | # 3. tumor/normal VAF ratio per gene; ratios are clamped to [0.5, 2] for plotting only
69 | pathVarOT$TumorByNormalVAFPlot = pathVarOT$TumorByNormalVAF
70 | pathVarOT$TumorByNormalVAFPlot[pathVarOT$TumorByNormalVAFPlot> 2 ] = 2
71 | pathVarOT$TumorByNormalVAFPlot[pathVarOT$TumorByNormalVAFPlot< 0.5 ] = 0.5
72 | pathVarOT_sig = pathVarOT[pathVarOT$HUGO_Symbol %in% sig_gene,]
73 | 
74 | p = ggplot(pathVarOT_sig,aes(x = HUGO_Symbol, y = TumorByNormalVAFPlot, color = LOH_Sig, fill = HUGO_Symbol))
75 | p = p + geom_point(position = position_jitter(w = 0.2, h = 0), alpha = 0.3)
76 | #p = p + geom_violin(alpha = 0.3)
77 | p = p + labs(x = "Gene", y="Tumor VAF / Normal VAF") + theme_bw()
78 | #p = p + getVarFillScale()
79 | p = p + gene_axis_theme
80 | p = p + theme(legend.position = "none")
81 | p
82 | fn = 'out/LOH_var_VAFratio_by_gene.pdf'
83 | ggsave(filename=fn, height = 6, width = 6, useDingbats=FALSE)
84 | 
85 | # 4. same ratio as a stacked dot plot
86 | p = ggplot(pathVarOT_sig,aes(x=HUGO_Symbol,y = TumorByNormalVAFPlot, color = LOH_Sig, fill = LOH_Sig))
87 | #p = p + facet_grid(.~Gene_Classification, scale = "free", space = "free", drop=T)
88 | p = p + geom_dotplot(dotsize=0.6,binwidth=.015, binaxis= "y",stackdir ="centerwhole",alpha=0.5)
89 | p = p + theme_bw()
90 | p = p + labs(x = "Gene", y="Tumor VAF / Normal VAF")
91 | p = p + scale_y_continuous(breaks = seq(0.5,2.0, by= 0.5))
92 | #p = p + getVarColorScale()
93 | p = p + gene_axis_theme
94 | p = p + theme(legend.position = "none")
95 | p
96 | fn = 'out/LOH_var_VAFratio_by_gene_dotplot.pdf'
97 | ggsave(filename=fn, height = 6, width = 6, useDingbats=FALSE)


--------------------------------------------------------------------------------
/analysis/LOH/prepFileForAllelicImbalance.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | #03 August 2017 - Adam D Scott - 
 3 | # adopted by Kuan @ WashU 11/1/2017 for PCA file
 4 | 
 5 | use strict;
 6 | use warnings;
 7 | 
 8 | use IO::File;
 9 | use FileHandle;
10 | 
11 | my $usage = 'perl prepFileForAllelicImbalance.pl <charger> <preAI>
12 | ';
13 | 
14 | die $usage unless @ARGV == 2;
15 | my ( $charger , $preAI ) = @ARGV;
16 | 
17 | # Read the gzipped CharGer table through gunzip. The list form of open runs
18 | # gunzip without a shell, so spaces or metacharacters in the path are safe.
19 | open( my $IN1 , "-|" , "gunzip" , "-c" , $charger ) or die "can't open pipe to $charger: $!\n";
20 | 
21 | # Missense variants go to one file, every other retained variant to the other.
22 | my $missenseFile = "ready.missense.".$preAI;
23 | my $OUTmissense = FileHandle->new( $missenseFile , "w" );
24 | if ( not defined $OUTmissense ) { die "ADSERROR: Could not open/write $missenseFile\n"; }
25 | 
26 | my $truncationFile = "ready.truncation.".$preAI;
27 | my $OUTtruncation = FileHandle->new( $truncationFile , "w" );
28 | if ( not defined $OUTtruncation ) { die "ADSERROR: Could not open/write $truncationFile\n"; }
29 | 
30 | my $header = "HUGO_Symbol\tCHR\tSTART\tSTOP\tREF\tALT\tTYPE\tNormalRef\tNormalVar\tNormalVAF\tTumorRef\tTumorVar\tTumorVAF\tSample\n";
31 | $OUTmissense->print( $header );
32 | $OUTtruncation->print( $header );
33 | 
34 | # 0-based input columns, as implied by the output header and the join below:
35 | # 1 sample, 3 gene, 4 chr, 5 start, 6 stop, 7 ref, 8 alt, 9 consequence/type,
36 | # 99/100 normal ref/var counts, 104 normal VAF, 102/103 tumor ref/var counts, 105 tumor VAF.
37 | while ( my $line = <$IN1> ) {
38 | 	next if ( $line =~ /HGVSg/ ); # presumably the header line
39 | 	chomp( $line );
40 | 	# limit -1 keeps trailing empty fields so missing read counts stay empty strings
41 | 	my @line = split( "\t" , $line , -1 );
42 | 	next unless @line > 9; # no consequence column: nothing to classify
43 | 	next if ( $line[9] =~ /synonymous_variant/ );
44 | 	next if ( $line[9] =~ /stop_retained_variant/ );
45 | 	next if ( $line[9] =~ /start_lost/ );
46 | 	next if ( $line[9] =~ /non_coding_transcript_exon_variant/ );
47 | 	next if ( $line[9] =~ /UTR/ );
48 | 	# Collapse the consequence string into one class label. Order matters: once a
49 | 	# label is assigned, later patterns are tested against the new label.
50 | 	if ( $line[9] =~ /missense/ ) { $line[9] = "missense";}
51 | 	# ref is column 7 and alt is column 8 (the same columns the splice tests use);
52 | 	# columns 4 and 5 are chr and start and are never "-".
53 | 	if ( $line[9] =~ /frameshift/ and $line[7] eq "-" ) { $line[9] = "frame_shift_ins"; }
54 | 	if ( $line[9] =~ /frameshift/ and $line[8] eq "-" ) { $line[9] = "frame_shift_del"; }
55 | 	if ( $line[9] =~ /splice/ and $line[7] eq "-" ) { $line[9] = "splice_site_ins"; }
56 | 	if ( $line[9] =~ /splice/ and $line[8] eq "-" ) { $line[9] = "splice_site_del"; }
57 | 	if ( $line[9] =~ /splice/ and $line[7] ne "-" and $line[8] ne "-" ) { $line[9] = "splice_site"; }
58 | 	if ( $line[9] =~ /stop_gained/ ) { $line[9] = "nonsense"; }
59 | 	if ( $line[9] =~ /stop_lost/ ) { $line[9] = "nonstop"; }
60 | 
61 | 	# expected fields: $gene, $chr, $strt, $stop, $ref, $var, $type,
62 | 	#                  $num_norm_refs, $num_norm_vars, $nvaf,
63 | 	#                  $num_tumr_refs, $num_tumr_vars, $tvaf,
64 | 	#                  $sample_name
65 | 	my $outString = join( "\t" , ( @line[3..9] , @line[99,100] , $line[104] , @line[102,103], $line[105], $line[1] ) );
66 | 	if ( $line[9] =~ /missense/) { $OUTmissense->print( $outString."\n" );}
67 | 	else{$OUTtruncation->print( $outString."\n" );}
68 | }
69 | # close on a pipe returns false when the child (gunzip) exited non-zero
70 | close( $IN1 ) or warn "gunzip reported a problem reading $charger\n";
71 | 
72 | $OUTmissense->close();
73 | $OUTtruncation->close();
74 | 


--------------------------------------------------------------------------------
/analysis/PCA_IBD_dist/plot_relatedness.R:
--------------------------------------------------------------------------------
 1 | ##### plot_relatedness.R #####
 2 | # Kuan-lin Huang @ WashU 2017 March
 3 | # plot histograms of the plink relationship (.rel) and distance (.dist) matrices
 4 | # only the first 10 rows of each matrix are plotted (quick look)
 5 | 
 6 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/PCA_IBD_dist")
 7 | source("../global_aes_out.R") # NOTE(review): presumably provides ggplot2, melt() and pd -- confirm
 8 | 
 9 | # plink relation file
10 | # pca_f = "plink_out/all.normal.merge.vcf.pca.eigenvec"
11 | # relationship based on Yang J, Lee SH, Goddard ME, Visscher PM (2011) GCTA: A Tool for Genome-wide Complex Trait Analysis 
12 | rel_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.rel.rel"
13 | rel = read.table(header=F, quote = "", sep="\t", file = rel_f)
14 | 
15 | # label rows and columns with the sample IDs from the matching .id file (first column)
16 | sample_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.rel.rel.id"
17 | sample = read.table(header=F, quote = "", sep="\t", row.names =NULL, file = sample_f, stringsAsFactors=FALSE)
18 | samples = sample[,1]
19 | row.names(rel) = samples
20 | colnames(rel) = samples
21 | rel$sample = samples # kept as a column so melt() can use it as the id variable
22 | 
23 | # make a small version of the table (first 10 rows, long format) and take a quick look
24 | rel_m = melt(rel[1:10,],id.var="sample")
25 | rel_m[rel_m$sample ==rel_m$variable,] # prints the self-vs-self (diagonal) entries
26 | 
27 | # plotting: histogram of the values, 100 bins, log10 count axis
28 | p = ggplot(data=rel_m)
29 | p = p + geom_bar(aes(x=value),stat="bin",bins=100)  
30 | #p = p + scale_colour_gradientn(na.value="grey", colours=getPalette(100))#, limits=c(-4.2,4.2))
31 | p = p + theme_bw() + scale_y_log10() #+ expand_limits(y=1)#+ guides(fill=FALSE)
32 | p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 1,alpha=.7) # reference lines at 0 and 1
33 | p
34 | fn = paste(pd, "PanCanAtlas_rel_10samples_hist.pdf", sep="_") # pd is not defined in this file
35 | ggsave(file=fn, useDingbats=FALSE)
36 | 
37 | 
38 | ### distance matrix (original label: IBD; the input is the .dist file)
39 | dist_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.dist.dist"
40 | dist = read.table(header=F, quote = "", sep="\t", file = dist_f) # note: masks stats::dist for the rest of the session
41 | 
42 | # label rows and columns with the sample IDs from the matching .id file (first column)
43 | sample_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.dist.dist.id"
44 | sample = read.table(header=F, quote = "", sep="\t", row.names =NULL, file = sample_f, stringsAsFactors=FALSE)
45 | samples = sample[,1]
46 | row.names(dist) = samples
47 | colnames(dist) = samples
48 | dist$sample = samples
49 | 
50 | # make a small version of the table (first 10 rows, long format) and take a quick look
51 | dist_m = melt(dist[1:10,],id.var="sample")
52 | dist_m[dist_m$sample ==dist_m$variable,] # prints the self-vs-self (diagonal) entries
53 | 
54 | # plotting: same histogram layout as for the relationship matrix
55 | p = ggplot(data=dist_m)
56 | p = p + geom_bar(aes(x=value),stat="bin",bins=100)  
57 | #p = p + scale_colour_gradientn(na.value="grey", colours=getPalette(100))#, limits=c(-4.2,4.2))
58 | p = p + theme_bw() + scale_y_log10() #+ expand_limits(y=1)#+ guides(fill=FALSE)
59 | p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 1,alpha=.7)
60 | p
61 | fn = paste(pd, "PanCanAtlas_dist_10samples_hist.pdf", sep="_")
62 | ggsave(file=fn, useDingbats=FALSE)


--------------------------------------------------------------------------------
/analysis/PCA_IBD_dist/run_plink_pca.ibd.rel.sh:
--------------------------------------------------------------------------------
 1 | # use plink 1.9 to:
 2 | # calculate PCA/IBS/relatedness, before and after pruning
 3 | plink --vcf all.normal.merge.vcf.gz --pca --out all.normal.merge.vcf.pca
 4 | # the MAF-filtered PCA runs in the background while distance/relatedness are computed
 5 | plink --vcf all.normal.merge.vcf.gz --pca --maf 0.15 --out all.normal.merge.vcf.MAF0.15.pca &
 6 | 
 7 | # NOTE(review): in plink 1.9 --indep appears to only write the prune.in/prune.out lists;
 8 | # confirm whether the matrices below are actually computed on the pruned variant set.
 9 | plink --vcf all.normal.merge.vcf.gz --distance square --indep 50 5 2 --maf 0.05 --out all.normal.merge.indep_50_5_2.vcf.dist
10 | 
11 | plink --vcf all.normal.merge.vcf.gz --make-rel square --indep 50 5 2 --maf 0.05 --out all.normal.merge.indep_50_5_2.vcf.rel
12 | 
13 | # the background PCA must finish before temporary files are removed and its outputs archived
14 | wait
15 | 
16 | rm -f *temporary*
17 | 
18 | # make tar balls (-z: gzip-compressed; the .tar names are kept because the copy steps match plink*tar)
19 | tar -cvzf plink_dist_rel.tar *indep*
20 | tar -cvzf plink_pca.tar *pca*
21 | 
22 | # transfer to local 
23 | gcloud compute copy-files --zone us-central1-f huangkuanlin@kuan-merge-genotype-bigmem:~/plink*tar plink_out
24 | # transfer to storage
25 | gsutil cp plink*tar gs://dinglab/isb-cgc/tcga/analysis_files


--------------------------------------------------------------------------------
/analysis/README.md:
--------------------------------------------------------------------------------
 1 | Store each analyses-specific script and outputs
 2 | ===============================================
 3 | # Directory set-up #
 4 | > This analysis directory should sit alongside (at the same level as) the TCGA_data directory so that scripts can source the data by relative path. Because some of those files are protected, they are not shared publicly. 
 5 | 
 6 | # Dependency files #
 7 | >global_aes_out.R: used to define plotting functions and color codes for other analysis R codes
 8 | 
 9 | >dependency_files.R: load and prep files used for downstream analysis
10 | 
11 | # Analysis workflow #
12 | 
13 | # Genotype data analysis #
14 | ## 1. process_files/:
15 | > Scripts in process_files/genotype/ are used to preprocess germline variants calls to classified variants.
16 | 
17 | ## 2. PCA_IBD_dist/:
18 | > Scripts used for PCA and relatedness analysis
19 | 
20 | # Exomic germline variants analysis #
21 | ## 1. process_files/:
22 | > Scripts in process_files/germline/ are used to preprocess germline variants calls to classified variants. Steps downstream of GATK/VarScan/Pindel merged calls are described in process_files/germline/local/work.log.sh. 
23 | 
24 | ## 2. sample_listing/:
25 | > Compare PCA germline sample list (based on ISB-CGC manifest) against PCA clinical sample and MC3 sample. Generate population level summary for the 10,467 final samples.
26 | 
27 | ## 3. pathogenic_variants/: 
28 | > Scripts in pathogenic_variants/ are used to further filter variants based on readcount and cancer relevance. 
29 | 
30 | ## 4. association_test/: 
31 | > Conduct association test using ExAC and ExAC-nonTCGA data.
32 | 
33 | ## 5. LOH/:
34 | > Conduct LOH analysis using readcount data. 
35 | 
36 | ## 6. expression_effect/:
37 | > Calculate cohort level expression quantile and retrieve RSEM expression value for each sample within cancer types.
38 | 
39 | ## 7. data_integration/:
40 | > Combine multi-level data for pathogenic variants. 
41 | 
42 | ## 8. hotspot3d/: 
43 | > Co-clustering with somatic mutations using HotSpot3D. 
44 | 
45 | ## 9. data_integration/:
46 | > Integrate pathogenic variant data with other molecular and omics data
47 | > plot results from data_integration
48 | 
49 | ## All subsequent analyses require the integrated file ##
50 | > source dependency_files.R to read in these additional data
51 | 
52 | ## 10. integrated_analysis/:
53 | > Generate plots and stats for pathogenic variants that require integrated data
54 | 
55 | ## 


--------------------------------------------------------------------------------
/analysis/RPPA_effect/RPPA_effect.R:
--------------------------------------------------------------------------------
 1 | ##### RPPA_effect.R #####
 2 | # Kuan-lin Huang @ WashU 2016 May , updated 2017 Nov.
 3 | # analyze cohort level RPPA data and convert to different matrices in a sample-gene format
 4 | 
 5 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/RPPA_effect"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | ecdf_fun = function(x,perc) ecdf(x)(perc) # empirical CDF of x, evaluated at perc
10 | 
11 | ### preprocess RPPA files: per cancer, a long table of raw expression + per-marker quantile ###
12 | fileNames = Sys.glob("/Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/oncogene_signaling/data/RPPA/*.rppa.txt")
13 | all_tables = vector("list")
14 | 
15 | for (fileName in fileNames) {
16 |   cancer = gsub("\\..*","",basename(fileName)) # cancer code = file name up to its first "."
17 | 
18 |   exp_table = read.table(header=TRUE, sep="\t", file=fileName)
19 |   exp_table_q = exp_table
20 |   for (i in seq_len(nrow(exp_table_q))){ # seq_len: a header-only file yields no iterations (1:0 would run twice)
21 |     marker_values = unlist(exp_table_q[i,-1]) # one marker (row) across all samples
22 |     min_RPPA = min(marker_values,na.rm=T)
23 |     if (min_RPPA<0){ marker_values = marker_values - min_RPPA}
24 |     exp_table_q[i,-1] = ecdf_fun(marker_values,marker_values) # each sample's quantile within this file
25 |   }
26 |   exp_table.m = melt(exp_table, id.var="Composite.Element.REF")
27 |   exp_table_q.m = melt(exp_table_q, id.var="Composite.Element.REF")
28 |   colnames(exp_table.m) = c("marker","sample","expression")
29 |   colnames(exp_table_q.m) = c("marker","sample","quantile")
30 |   exp_table_c = merge(exp_table.m,exp_table_q.m,by=c("marker","sample"))
31 |   exp_table_c$cancer = cancer
32 |   all_tables[[cancer]] = exp_table_c
33 | }
34 | RPPA = do.call(rbind,all_tables)
35 | # sample_l = first 16 characters of the sample name with "." replaced by "-".
36 | # The original applied gsub() to RPPA$sample again, which silently discarded the truncation.
37 | RPPA$sample_l = gsub("\\.","-",substr(RPPA$sample, start=1, stop=16))
38 | RPPA$bcr_patient_barcode = substr(RPPA$sample_l, start=0, stop=12) 
39 | RPPA$status = "tumor"
40 | status_n = substr(RPPA$sample_l, start=14, stop=14)
41 | RPPA$status[status_n=="1"] = "normal" # 14th character "1" is treated as a normal sample
42 | RPPA = RPPA[RPPA$status != "normal",]# exclude normal from BRCA for now
43 | 
44 | RPPA$genes = gsub("\\|.*","",RPPA$marker) # text before the first "|"
45 | # map multi-gene / alias marker prefixes onto a single gene symbol
46 | gene_alias = c("MAPK1 MAPK3"="MAPK3","PIK3R1 PIK3R2"="PIK3R1","PIK3R1/2"="PIK3R1","PIK3CA "="PIK3CA","PDK1"="PDPK1")
47 | to_rename = RPPA$genes %in% names(gene_alias)
48 | RPPA$genes[to_rename] = unname(gene_alias[RPPA$genes[to_rename]])
49 | 
50 | # write the combined long-format table used by the RPPA plotting/association scripts
51 | fn = "out/pancan_RPPA_quantile_all.tsv"
52 | write.table(RPPA, file=fn, quote=F, sep="\t", col.names=T, row.names=F)


--------------------------------------------------------------------------------
/analysis/RPPA_effect/plotPathVarRPPA.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarRPPA.R #####
 2 | # Kuan-lin Huang @ WashU 201711
 3 | # plot RPPA expression quantiles for carriers of pathogenic variants
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/RPPA_effect"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | RPPA = read.table("out/pancan_RPPA_quantile_all.tsv",header=T, stringsAsFactors = F, quote = "", sep = "\t")
11 | # keep markers of genes that carry pathogenic variants, then attach them to carriers by gene + patient
12 | RPPA_g = RPPA[RPPA$genes %in% pathVarP$HUGO_Symbol,]
13 | RPPA_g_m = RPPA_g[,c("marker","genes","sample_l","bcr_patient_barcode","expression","quantile")]
14 | RPPA_g_m$marker = gsub(".*\\|","",RPPA_g_m$marker) # keep the text after the last "|"
15 | colnames(RPPA_g_m)[2] = "HUGO_Symbol" 
16 | pathVarP_RPPA = merge(pathVarP,RPPA_g_m,by=c("HUGO_Symbol","bcr_patient_barcode"))
17 | pathVarP_RPPA_fg = pathVarP_RPPA[pathVarP_RPPA$HUGO_Symbol %in% featGenes,]
18 | pathVarP_RPPA_fg = pathVarP_RPPA_fg[!is.na(pathVarP_RPPA_fg$binary_type),]
19 | # these two markers are left out of the figure
20 | pathVarP_RPPA_fg_p = pathVarP_RPPA_fg[!(pathVarP_RPPA_fg$marker %in% c("c-Met","Ret_pY905")),]
21 | # dot plot of RPPA quantile per marker, faceted by gene classification
22 | p = ggplot(pathVarP_RPPA_fg_p,aes(x=marker,y=quantile, fill=binary_type))
23 | p = p + facet_grid(.~Gene_Classification, scales = "free", space = "free", drop=T)
24 | p = p + geom_dotplot(dotsize=1.2,binwidth=.01, binaxis= "y",colour=NA,stackdir ="centerwhole")
25 | p = p + geom_text(aes(label=ifelse(Gene_Classification=="Oncogene" & quantile>0.75, sub("^p\\.","",HGVSp_short),NA)),size=2.5) # strip only the leading literal "p."; the regex "p." also removed "p"+any character inside the label
26 | p = p + theme_bw() 
27 | p = p + ylab("RPPA Expression Quantile") + xlab("Protein with Germline Variant")
28 | p = p + scale_y_continuous(breaks = seq(0,1, by= 0.25))
29 | #p = p + getVarColorScale()
30 | p = p + theme(axis.title = element_text(size=12), axis.text.x = element_text(colour="black", size=10, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=12))
31 | p
32 | fn = "out/pathVarRPPAExpression_byGene.pdf"
33 | ggsave(filename=fn, plot=p, width=6, height=6,useDingbats=FALSE) # save p explicitly instead of whatever last_plot() holds
34 | 
35 | # ### somatic information ###
36 | # somatic_f = "/Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/somatic_germline/pancan.merged.v0.2.4.filtered.maf.gene_vclass_HGVSp_sample.txt"
37 | # somatic = read.table(header=T, quote = "", sep="\t", file = somatic_f, stringsAsFactors=FALSE)
38 | # colnames(somatic) = c("HUGO_Symbol","Somatic_Variant_Classification","sample","Somatic_HGVSp")
39 | # somatic$sample = gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$sample)
40 | # # one entry per sample
41 | # somatic_class_agg = aggregate(somatic[c('Somatic_Variant_Classification','Somatic_HGVSp')], by=somatic[c('sample',"HUGO_Symbol")], paste, collapse = ",")
42 | # germ_so = merge(germ_clin_abb,somatic_class_agg, by =c("sample","HUGO_Symbol"), all=T)
43 | # 
44 | # ### clustering information###
45 | # cluster_var_f = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/germline_hotspot/20161010_germline_ARD_ASD_run/pan8000_somatic_germline_combined.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.p0.05.r10.clusters_wcounts.tsv"
46 | # cluster_var = read.table(header=T, quote = "", sep="\t", stringsAsFactors = F, fill =T, file = cluster_var_f)
47 | # 
48 | # cluster_germsoma_f = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/germline_hotspot/20161010_germline_ARD_ASD_run/pan8000_somatic_germline_combined.maf.3D_Proximity.pairwise.singleprotein.collapsed.l0.p0.05.r10.clusters.summary_wcounts_cc10.3_wgermsoma.tsv"
49 | # cluster_germsoma = read.table(header=T, quote = "", sep="\t", fill =T, file = cluster_germsoma_f, stringsAsFactors=FALSE)
50 | # 
51 | # cluster_var_in_hybrid = cluster_var[cluster_var$Cluster %in% cluster_germsoma$Cluster_ID,]
52 | # germ_clin_var = germ_clin[paste(germ_clin$chromosome_name, germ_clin$start, germ_clin$stop) %in% 
53 | #                             paste(cluster_var_in_hybrid$Chromosome, cluster_var_in_hybrid$Start, cluster_var_in_hybrid$Stop),]
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/analysis/RPPA_effect/plotPathVarRPPAassoc.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarRPPAassoc.R #####
 2 | # Kuan-lin Huang @ WashU 2016 August updated 2017
 3 | # plot RPPA association results (out/pathVarRPPAAssoc.txt) for pathogenic variants
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/RPPA_effect"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | tn = "out/pathVarRPPAAssoc.txt"
11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
12 | 
13 | ### plotting ###
14 | tt$gene = as.character(tt$gene)
15 | tt$marker = as.character(tt$marker)
16 | tt$FDR_plot = tt$FDR
17 | 
18 | # # using GLM test result
19 | # tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6)
20 | # p = ggplot(data=tt)
21 | # p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5)
22 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
23 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
24 | # p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, marker,NA)))
25 | # p = p + getPCACancerColor()
26 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
27 | # p = p + geom_vline(xintercept = 0, alpha=0.5)
28 | # p = p  + theme_bw() +
29 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
30 | # p
31 | # fn = 'out/RPPAAssocVolcanoGLM.pdf'
32 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
33 | 
34 | # # using the Wilcox test result
35 | # p = ggplot(data=tt)
36 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
37 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
38 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
39 | # p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, marker,NA)))
40 | # p = p + getPCACancerColor()
41 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
42 | # p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-1.6,1.6)
43 | # p = p  + theme_bw() +
44 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
45 | # p
46 | # fn = 'out/rppaExpressAssocVolcanoWCOX.pdf'
47 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
48 | 
49 | tt$association = "None" # FDR tiers: < 0.05 Significant, < 0.15 Suggestive, otherwise None
50 | tt$association[tt$FDR<0.15] = "Suggestive"
51 | tt$association[tt$FDR<0.05] = "Significant"
52 | #tt$association = factor(tt$association,level=c("None","Suggestive","Significant"))
53 | 
54 | # plot by cancer: coefficient on x, cancer on y, point shape = FDR tier; genes with FDR < 0.05 are labelled
55 | p = ggplot(data=tt,aes(x=coefficient,y=cancer,color = cancer))
56 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
57 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2)
58 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.05,gene,NA)))
59 | p = p + getPCACancerColor()
60 | p = p + labs(x="Coefficient",y= "Cancer") # the y aesthetic is cancer; the old "-log10(FDR)" label was left over from the volcano plots
61 | p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-1.7,1.7)
62 | p = p  + theme_bw() +
63 |   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
64 | p
65 | fn = 'out/rppaExpressAssocByGene.pdf'
66 | ggsave(fn, plot=p, width = 5, height = 5, useDingbats=F) # explicit plot; full argument name instead of the partial "useDingbat"


--------------------------------------------------------------------------------
/analysis/association_test/TFT_functions.R:
--------------------------------------------------------------------------------
 1 | ##### burden test functions #####
 2 | TFT = function(data){
 3 |   # Pool one gene's variant rows into four counts and Fisher-test TCGA against ExAC non-TCGA.
 4 |   # Returns c(nonTCGA_AN, nonTCGA_AC, TCGA_AN, TCGA_AC, p, OR); p and OR stay NA when the test is skipped.
 5 |   counts = as.numeric(c(
 6 |     max(data$ExAC_nonTCGA_AN, na.rm=T), # allele number: largest value over the rows
 7 |     sum(data$ExAC_nonTCGA_AC, na.rm=T), # allele count: summed over the rows
 8 |     max(data$TCGA_AN, na.rm=T),
 9 |     sum(data$TCGA_AC, na.rm=T)))
10 |   
11 |   pval = NA; odds = NA
12 |   # NOTE(review): the 2x2 table rows are AN and AC (not AN-AC and AC) - confirm this approximation is intended
13 |   testable = counts[1] > 0 && counts[3] > 0 && counts[2] >= 0 && counts[4] >= 0
14 |   if (testable){
15 |     fit = fisher.test(matrix(counts, nrow=2)) # filled column-wise: column 1 = non-TCGA, column 2 = TCGA
16 |     odds = fit$estimate
17 |     pval = fit$p.value
18 |   }
19 |   
20 |   return(c(counts,pval,odds))
21 | }
22 | 
23 | run_TFT = function(data, AF_thres = 0.01){
24 |   # Gene-level burden test (TFT) over rare, well-covered variants.
25 |   #   data:     per-variant table with gene_symbol, ExAC_AC, ExAC_AN and the columns read by TFT()
26 |   #   AF_thres: keep variants whose ExAC_AC/ExAC_AN is below this value
27 |   # Returns a data.frame with one row per gene (counts, P, OR, BH-adjusted FDR), ordered by P.
28 |   # which() drops rows with NA frequency/AN instead of propagating all-NA rows
29 |   data = data[which(data$ExAC_AC/data$ExAC_AN < AF_thres),]
30 |   first_Q = quantile(data$ExAC_AN, 0.25, na.rm=TRUE) # first quartile, computed directly rather than read from summary()[2]
31 |   data = data[which(data$ExAC_AN > first_Q),] # require enough samples with observed data
32 |   genes = unique(as.character(data$gene_symbol)) # as.character: factor symbols must not turn into integer codes
33 |   # burden test: TFT: http://slideplayer.com/slide/8660600/
34 |   stats = matrix(,nrow=length(genes),ncol=7)
35 |   colnames(stats) = c("gene","nonTCGA_AN","total_nonTCGA_AC","TCGA_AN","total_TCGA_AC","P","OR")
36 |   for (i in seq_along(genes)){ # seq_along: 1:0 would still iterate when no gene passes the filters
37 |     data_g = data[which(data$gene_symbol==genes[i]),]
38 |     stats[i,] = c(genes[i], TFT(data_g))
39 |   }
40 |   stats = data.frame(stats,stringsAsFactors = F)
41 |   for (j in 2:7) stats[[j]] = as.numeric(stats[[j]]) # the matrix was character; restore numeric columns
42 |   
43 |   stats$FDR = p.adjust(stats$P, method="BH")
44 |   stats = stats[order(stats$P),]
45 |   return(stats)
46 | }
47 | 
48 | plot_burden_result = function(stats){
49 |   # Scatter of per-gene total variant counts (non-TCGA on x, TCGA on y); point size = -log(FDR), colour = OR.
50 |   burden_plot = ggplot(data=stats, aes(x=total_nonTCGA_AC,y=total_TCGA_AC, color=OR)) +
51 |     geom_point(aes(size=-log(FDR)),alpha=0.5) +
52 |     geom_text(aes(label=ifelse(FDR<0.05,gene, NA))) + # label genes with FDR < 0.05
53 |     labs(x = "Total nonTCGA variant counts", y = "Total TCGA variant counts") + theme_bw() +
54 |     theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14), axis.text.y = element_text(colour="black", size=14)) +
55 |     geom_abline(slope=15202/106210, alpha=0.8) # fixed reference line; NOTE(review): presumably the TCGA/non-TCGA allele-number ratio - confirm
56 |   return(burden_plot)
57 | }
58 | 
59 | run_plot_burden = function(data,AF_thres=0.01){
60 |   # Run run_TFT() on `data`, then write its stats table and the burden scatter plot under out/.
61 |   # Output file names embed the caller's variable name for `data` and the AF threshold.
62 |   data_name = deparse(substitute(data))
63 |   data_stats = run_TFT(data,AF_thres = AF_thres)
64 |   tn = paste("out/burden",data_name, "AF", AF_thres, "TFT_stats.tsv", sep="_")
65 |   write.table(data_stats, file=tn, quote=F, sep = '\t', col.names=NA)
66 |   # hand the plot to ggsave explicitly; the original discarded it and relied on ggplot2's last_plot() state
67 |   fn = paste("out/burden",data_name, "AF", AF_thres, "point.pdf", sep="_")
68 |   ggsave(filename=fn, plot=plot_burden_result(data_stats), height=6, width=6, useDingbats=FALSE)
69 | }


--------------------------------------------------------------------------------
/analysis/association_test/analyzeAssocTFT.DDR.R:
--------------------------------------------------------------------------------
 1 | ##### analyzeAssocTFT.DDR.R #####
 2 | # Kuan-lin Huang @ WashU 2017 Oct. 
 3 | 
 4 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test")
 5 | source("../global_aes_out.R")
 6 | source("TFT_functions.R")
 7 | 
 8 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.DDRgene.tsv"  
 9 | data = read.table( fn, sep="\t",  head=T, fill=T,stringsAsFactors = F)
10 | 
11 | ##### data annotating #####
12 | # multi-allelic sites: position keys (first two ":"-separated fields of Var) that occur more than once
13 | #pos = gsub("(^[0-9]+:[0-9]+):.*","\\1",data$Var)
14 | data$pos = sapply(as.character(data$Var),function(x) paste(strsplit(x,":")[[1]][1:2],collapse = ":"))
15 | duplicated_pos = data$pos[duplicated(data$pos)]
16 | data$multi_allele = data$pos %in% duplicated_pos
17 | 
18 | # TRUE for rows whose impact matches at least one of the given regex patterns
19 | impact_matches = function(patterns){
20 |   hits = lapply(patterns, grepl, data$impact)
21 |   Reduce(`|`, hits, rep(FALSE, nrow(data)))
22 | }
23 | 
24 | data$missense = impact_matches("missense")
25 | data$truncation = impact_matches(vep_truncations)
26 | data$inframe = impact_matches(vep_inframe)
27 | 
28 | # one label per variant; precedence truncation > inframe > missense > other
29 | data$variant_type = ifelse(data$truncation, "truncation",
30 |                     ifelse(data$inframe, "inframe",
31 |                     ifelse(data$missense, "missense", "other")))
32 | 
33 | # HGVSp with everything up to the last ":" removed
34 | data$HGVSpAbbre = gsub(".*:","",data$HGVSp)
35 | 
36 | 
37 | ##### some exploration into top candidates #####
38 | # which() drops rows whose P or allele frequency is NA instead of returning them as all-NA rows
39 | data_sig = data[which(data$P < 0.00001 & data$ExAC_AC/data$ExAC_AN < 0.01 & !data$multi_allele),]
40 | plot_top_counts(data_sig, n=30,x_string="gene_symbol",fill_string="variant_type")
41 | fn = "out/DDR_277gene.p0.00001.dis.top30gene.pdf"
42 | ggsave(file=fn, height=5,w=10, useDingbats=FALSE)
43 | 
44 | # sele_genes = c("BRCA1","BRCA2","ATM","PALB2","BRIP1","MSH6","FANCI","FANCM")
45 | # data_sig_g = data_sig[data_sig$gene_symbol %in% sele_genes,]
46 | # p = ggplot(data=data_sig_g, aes(x=gene_symbol,y=-log10(P), color=variant_type))
47 | # p = p + geom_point(alpha=0.5)
48 | # p = p + geom_text(aes(label=ifelse(variant_type=="other",as.character(impact), as.character(HGVSpAbbre))))
49 | # p = p + labs(x = "Gene", y = "-log10(P)") + theme_bw()
50 | # p = p + theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14),
51 | #               axis.text.y = element_text(colour="black", size=14))
52 | # p
53 | # fn = "out/DDR_seleGene.var.sig.pdf"
54 | # ggsave(file=fn, height=10,w=10,useDingbats=FALSE)
55 | 
56 | ##### burden test #####
57 | DDR_truncation = data[data$variant_type=="truncation",] # all sites; was identical to the _noMulti subset below, so both outputs were duplicates
58 | run_plot_burden(DDR_truncation, AF_thres=0.01)
59 | run_plot_burden(DDR_truncation, AF_thres=0.0001)
60 | 
61 | DDR_truncation_noMulti = data[data$variant_type=="truncation" & !data$multi_allele,]
62 | run_plot_burden(DDR_truncation_noMulti, AF_thres=0.01)
63 | run_plot_burden(DDR_truncation_noMulti, AF_thres=0.0001)
64 | 
65 | DDR_missense = data[data$variant_type=="missense",] # all sites; the multi-allelic filter belongs only to the _noMulti subset
66 | run_plot_burden(DDR_missense, AF_thres=0.01)
67 | run_plot_burden(DDR_missense, AF_thres=0.001)
68 | run_plot_burden(DDR_missense, AF_thres=0.0001)
69 | 
70 | DDR_missense_noMulti = data[data$variant_type=="missense" & !data$multi_allele,]
71 | run_plot_burden(DDR_missense_noMulti, AF_thres=0.01)
72 | run_plot_burden(DDR_missense_noMulti, AF_thres=0.0001)
73 | 
74 | DDR_FivePrimeUTR_noMulti = data[data$impact=="5_prime_UTR_variant" & !data$multi_allele,]
75 | run_plot_burden(DDR_FivePrimeUTR_noMulti, AF_thres=0.01)
76 | run_plot_burden(DDR_FivePrimeUTR_noMulti, AF_thres=0.0001)
77 | 
78 | DDR_ThreePrimeUTR_noMulti = data[data$impact=="3_prime_UTR_variant" & !data$multi_allele,]
79 | run_plot_burden(DDR_ThreePrimeUTR_noMulti, AF_thres=0.01)
80 | run_plot_burden(DDR_ThreePrimeUTR_noMulti, AF_thres=0.0001)


--------------------------------------------------------------------------------
/analysis/association_test/analyzeAssocTFT.R:
--------------------------------------------------------------------------------
 1 | ##### analyzeAssocTFT.R #####
 2 | # Kuan-lin Huang @ WashU 2017 Oct. 
 3 | 
 4 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test")
 5 | source("../global_aes_out.R")
 6 | source("TFT_functions.R")
 7 | 
 8 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.NFE.152gene.tsv"  
 9 | data = read.table( fn, sep="\t",  head=T, fill=T,stringsAsFactors = F)
10 | 
11 | ##### data annotating #####
12 | # multi-allelic sites: position keys (first two ":"-separated fields of Var) that occur more than once
13 | #pos = gsub("(^[0-9]+:[0-9]+):.*","\\1",data$Var)
14 | data$pos = sapply(as.character(data$Var),function(x) paste(strsplit(x,":")[[1]][1:2],collapse = ":"))
15 | duplicated_pos = data$pos[duplicated(data$pos)]
16 | data$multi_allele = data$pos %in% duplicated_pos
17 | 
18 | # TRUE for rows whose impact matches at least one of the given regex patterns
19 | impact_matches = function(patterns){
20 |   hits = lapply(patterns, grepl, data$impact)
21 |   Reduce(`|`, hits, rep(FALSE, nrow(data)))
22 | }
23 | 
24 | data$missense = impact_matches("missense")
25 | data$truncation = impact_matches(vep_truncations)
26 | data$inframe = impact_matches(vep_inframe)
27 | 
28 | # one label per variant; precedence truncation > inframe > missense > other
29 | data$variant_type = ifelse(data$truncation, "truncation",
30 |                     ifelse(data$inframe, "inframe",
31 |                     ifelse(data$missense, "missense", "other")))
32 | 
33 | # HGVSp with everything up to the last ":" removed
34 | data$HGVSpAbbre = gsub(".*:","",data$HGVSp)
35 | 
36 | 
37 | ##### some exploration into top candidates #####
38 | # which() drops rows whose P or allele frequency is NA instead of returning them as all-NA rows
39 | data_sig = data[which(data$P < 0.00001 & data$ExAC_AC/data$ExAC_AN < 0.01 & !data$multi_allele),]
40 | plot_top_counts(data_sig, n=30,x_string="gene_symbol",fill_string="variant_type")
41 | fn = "out/NFE_152gene.p0.00001.dis.top30gene.pdf"
42 | ggsave(file=fn, height=5,w=10, useDingbats=FALSE)
43 | 
44 | # per-variant -log10(P) for selected genes; label = HGVSpAbbre, or impact for variants typed "other"
45 | sele_genes = c("BRCA1","BRCA2","ATM","PALB2","BRIP1","MSH6","FANCI","FANCM")
46 | data_sig_g = data_sig[data_sig$gene_symbol %in% sele_genes,]
47 | p = ggplot(data=data_sig_g, aes(x=gene_symbol,y=-log10(P), color=variant_type))
48 | p = p + geom_point(alpha=0.5)
49 | p = p + geom_text(aes(label=ifelse(variant_type=="other",as.character(impact), as.character(HGVSpAbbre))))
50 | p = p + labs(x = "Gene", y = "-log10(P)") + theme_bw()
51 | p = p + theme(text = element_text(colour="black", size=16), axis.text.x = element_text(colour="black", size=14), axis.text.y = element_text(colour="black", size=14))
52 | p
53 | fn = "out/NFE_seleGene.var.sig.pdf"
54 | ggsave(filename=fn, plot=p, height=10, width=10,useDingbats=FALSE) # save p explicitly rather than last_plot()
55 | 
56 | ##### burden test #####
57 | NFE_truncation = data[data$variant_type=="truncation",] # all sites; was identical to the _noMulti subset below, so both outputs were duplicates
58 | run_plot_burden(NFE_truncation, AF_thres=0.01)
59 | run_plot_burden(NFE_truncation, AF_thres=0.0001)
60 | 
61 | NFE_truncation_noMulti = data[data$variant_type=="truncation" & !data$multi_allele,]
62 | run_plot_burden(NFE_truncation_noMulti, AF_thres=0.01)
63 | run_plot_burden(NFE_truncation_noMulti, AF_thres=0.0001)
64 | 
65 | NFE_missense = data[data$variant_type=="missense",] # all sites; the multi-allelic filter belongs only to the _noMulti subset
66 | run_plot_burden(NFE_missense, AF_thres=0.01)
67 | run_plot_burden(NFE_missense, AF_thres=0.001)
68 | run_plot_burden(NFE_missense, AF_thres=0.0001)
69 | 
70 | NFE_missense_noMulti = data[data$variant_type=="missense" & !data$multi_allele,]
71 | run_plot_burden(NFE_missense_noMulti, AF_thres=0.01)
72 | run_plot_burden(NFE_missense_noMulti, AF_thres=0.0001)
73 | 
74 | NFE_FivePrimeUTR_noMulti = data[data$impact=="5_prime_UTR_variant" & !data$multi_allele,]
75 | run_plot_burden(NFE_FivePrimeUTR_noMulti, AF_thres=0.01)
76 | run_plot_burden(NFE_FivePrimeUTR_noMulti, AF_thres=0.0001)
77 | 
78 | NFE_ThreePrimeUTR_noMulti = data[data$impact=="3_prime_UTR_variant" & !data$multi_allele,]
79 | run_plot_burden(NFE_ThreePrimeUTR_noMulti, AF_thres=0.01)
80 | run_plot_burden(NFE_ThreePrimeUTR_noMulti, AF_thres=0.0001)
81 | 


--------------------------------------------------------------------------------
/analysis/association_test/postProcessAssoc.R:
--------------------------------------------------------------------------------
 1 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/association_test")
 2 | #fn = "test.fisher.tsv"  #"ExAC.r1.sites.vep.biallelic.combine.fisher.tsv"
 3 | # fn = "ExAC.r1.sites.vep.biallelic.combine.fisher.tsv"
 4 | fn = "assoc_results/ExAC.r1.sites.vep.biallelic.combine.fisher.NFE.152gene.tsv"
 5 | data = read.table( fn, sep="\t", fill=T, head=T)
 6 | title = "ExAC.r1.fisher.NFE.152g"
 7 | 
 8 | data_MAF0.01 = data[data$ExAC_AC/data$ExAC_AN < 0.01,]
 9 | # fn = "ExAC.r1.sites.vep.biallelic.combine.fisher.maf0.01.tsv"
10 | # write.table(file=fn, data_MAF0.01, quote=F, sep="\t", row.names = F, col.names=F)
11 | 
12 | #Genomic correction: lambda = median observed 1-df chi-square / median of the 1-df null
13 | data2=data_MAF0.01[!is.na(data_MAF0.01$P),]# & -log10(data$P)<20,]
14 | ch = qchisq( data2$P, 1, lower.tail=F)
15 | data2$ch=ch
16 | theMedian = median(ch)
17 | theLambda = theMedian/qchisq(0.5, 1) # exact null median (~0.4549) instead of the rounded constant 0.456
18 | # cat(file=) writes the one-line report directly, so no sink() is left open if a step above fails
19 | lambda_fn = sprintf( 'out/lambda.%s.MAF0.01.txt', title)
20 | cat( "median=", theMedian, "lambda=", theLambda, "\n", file=lambda_fn)
21 | 
22 | source( 'qqman.R' ) # presumably defines qq() and manhattan() used below - confirm
23 | # QQ plot of the retained P values
24 | jpeg( sprintf( 'out/qqplot.%s.MAF0.01.jpg', title), width=1200, height=1200)
25 | qq( data2$P )
26 | dev.off()
27 | 
28 | library(reshape2)
29 | # split Var on ":" into CHR/BP/ID/REF/ALT, drop chrX/chrY, and make CHR numeric for the Manhattan plot
30 | split_var = colsplit(string=data2$Var, pattern=":", names=c("CHR", "BP","ID","REF","ALT"))
31 | data3 = cbind(data2,split_var)
32 | data4 = transform(data3[!(data3$CHR %in% c("X","Y")),], CHR = as.numeric(CHR))
33 | jpeg( sprintf( 'out/manhattan.%s.MAF0.01.jpg', title) , width=1200, height=1200)
34 | manhattan( data4[,c("CHR","BP","P")], main=title)
35 | dev.off()


--------------------------------------------------------------------------------
/analysis/association_test/work.log.sh:
--------------------------------------------------------------------------------
 1 | # data location: /gscmnt/gc3014/dinglab/ExAC/VCF
 2 | 
 3 | # normalize ExAC VCFs: split multi-allelic sites into biallelic records (bcftools norm -m -), then index with tabix
 4 | bsubl -oo ExAC.r1.multi2biallelic.log '~/bin/bcftools-1.5/bcftools norm -m - ExAC.r1.sites.vep.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.vcf.gz'
 5 | bsubl -oo ExAC_nonTCGA.r1.multi2biallelic.log '~/bin/bcftools-1.5/bcftools norm -m - ExAC_nonTCGA.r1.sites.vep.vcf.gz | bgzip -c > ExAC_nonTCGA.r1.sites.vep.biallelic.vcf.gz'
 6 | bsubl -oo tabix.ExAC.r1.sites.vep.biallelic.log 'tabix -p vcf ExAC.r1.sites.vep.biallelic.vcf.gz'
 7 | bsubl -oo tabix.ExAC_nonTCGA.r1.sites.vep.biallelic.log 'tabix -p vcf ExAC_nonTCGA.r1.sites.vep.biallelic.vcf.gz'
 8 | # annotate nonTCGA frequency using vcfanno
 9 | bsub -q bigmem -R"select[mem>80000] rusage[mem=80000]" -M 80000000 -oo ExAC.r1.vcfanno.log '~/bin/vcfanno_linux64.1 -p 8 ExAC_nonTCGA_config.toml ExAC.r1.sites.vep.biallelic.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.combine.vcf.gz'
10 | bsubl -oo tabix.ExAC.r1.sites.vep.biallelic.combine.log 'tabix -p vcf ExAC.r1.sites.vep.biallelic.combine.vcf.gz'
11 | # get rare variants (max AF 0.0005); the output is first written under a maxAF0.01 name and renamed on the next line
12 | bsubl -oo ExAC.r1.getRareAllele.log '~/bin/bcftools-1.5/bcftools view --max-af 0.0005 ExAC.r1.sites.vep.biallelic.combine.vcf.gz | bgzip -c > ExAC.r1.sites.vep.biallelic.combine.maxAF0.01.vcf.gz'
13 | mv ExAC.r1.sites.vep.biallelic.combine.maxAF0.01.vcf.gz ExAC.r1.sites.vep.biallelic.combine.maxAF0.0005.vcf.gz
14 | # Note: the per-ethnicity AN values sum to AN_Adj, not to AN
15 | 
16 | # his workflow:
17 | # use vcf anno to combine nonTCGA AC/AN to ExAC vcf
18 | vcfanno -p 8 ExAC_nonTCGA_config.toml ExAC.r0.3.1.sites.vep.vcf.gz | bgzip -c > ExAC.r0.3.1.combine_all.vep.vcf.gz
19 | 
20 | # conduct single variant assoc test
21 | bsubl -oo single_var_association.log 'python2.7 single_var_association.py VCF/ExAC.r1.sites.vep.biallelic.combine.vcf.gz ExAC.r1.sites.vep.biallelic.combine.fisher.anno.tsv'
22 | 
23 | ### from here:
24 | # filter out the non-rare variants; convert to tab delimited
25 | # may need to limit only to regions with sufficient coverage
26 | # Do burden test on variants with specific consequence and AF bins
27 | 
28 | gzcat ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.tsv.gz | grepList /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/reference_files/20160713_Rahman_KJ_KH_152_gene_table_list.txt 9 > ExAC.r1.sites.vep.biallelic.combine.fisher.anno.v2.152gene.tsv
29 | 


--------------------------------------------------------------------------------
/analysis/burden_assoc/TFT.R:
--------------------------------------------------------------------------------
 1 | # TFT function for testing associations
 2 | 
 3 | TFT = function(data){
 4 |   # One-sided (alternative = "greater") Fisher test on the four counts given in `data`.
 5 |   # `data` is coerced to numeric and filled column-wise into a 2-row matrix.
 6 |   # Returns c(the counts, p, OR); p and OR stay NA when the count checks fail.
 7 |   counts = as.numeric(data)
 8 |   pval = NA; odds = NA
 9 |   
10 |   counts_ok = counts[1] > 0 && counts[3] > 0 && counts[2] >= 0 && counts[4] >= 0
11 |   if (counts_ok){
12 |     fit = fisher.test(matrix(counts, nrow=2), alternative = "greater")
13 |     odds = fit$estimate
14 |     pval = fit$p.value
15 |   }
16 |   
17 |   return(c(counts,pval,odds))
18 |   
19 | }
20 | 
21 | run_TFT = function(data, AF_thres = 0.01, ExAC_nonTCGA_size = 53105){
22 |   # Per-gene one-sided Fisher test of cohort counts against ExAC non-TCGA counts.
23 |   #   data:              columns Gene, Freq, Count, ExAC_Count, Sample_size (assumes one row per gene - TODO confirm)
24 |   #   AF_thres:          only rows with Freq above this value are tested
25 |   #   ExAC_nonTCGA_size: first count handed to TFT(); default is the previously hard-coded 53105
26 |   # Returns a data.frame with one row per gene: the four counts, P and OR (no multiple-testing correction).
27 |   data=data[which(data$Freq > AF_thres),] # which(): rows with NA Freq are dropped
28 |   genes = unique(as.character(data$Gene))
29 |   # burden test: TFT: http://slideplayer.com/slide/8660600/
30 |   stats = matrix(,nrow=length(genes),ncol=7)
31 |   colnames(stats) = c("Gene","ExAC_nonTCGA_AN","ExAC_nonTCGA_AC","cohort_AN","cohort_AC","P","OR")
32 |   for (i in seq_along(genes)){ # seq_along: 1:0 would still iterate when nothing passes the filter
33 |     in_gene = which(data$Gene == genes[i])
34 |     #data_g = data[data$gene_symbol==gene,]
35 |     data_g = c(ExAC_nonTCGA_size,data$ExAC_Count[in_gene],data$Sample_size[1],data$Count[in_gene])
36 |     stats[i,] = c(genes[i], TFT(data_g))
37 |   }
38 |   stats = data.frame(stats,stringsAsFactors = F)
39 |   for (j in 2:7) stats[[j]] = as.numeric(stats[[j]]) # the matrix was character; restore numeric columns
40 |   #stats$FDR = p.adjust(stats$P, method="BH")
41 |   return(stats)
42 | }
43 | 
44 | run_TFT_against_others = function(data_c, PCA_count_byCancer, all_cancer_stat_m_suggest, AF_thres = 0.01){
45 |   # Per-gene one-sided Fisher test of one cancer's counts against the pooled counts of other cancers.
46 |   #   data_c:                    rows for one cancer (Cancer, Gene, Freq, Count, Sample_size); Cancer is read from row 1
47 |   #   PCA_count_byCancer:        Cancer, Gene, Count, Sample_size rows for all cancers
48 |   #   all_cancer_stat_m_suggest: Cancer/Gene pairs whose cancer is left out of the pool for that gene
49 |   data_c=data_c[which(data_c$Freq > AF_thres),] # which(): rows with NA Freq are dropped
50 |   cancer = data_c$Cancer[1]
51 |   genes = unique(as.character(data_c$Gene))
52 |   # burden test: TFT: http://slideplayer.com/slide/8660600/
53 |   stats = matrix(,nrow=length(genes),ncol=7)
54 |   colnames(stats) = c("Gene","other_cancers_AN","other_cancers_AC","cohort_AN","cohort_AC","P","OR")
55 |   for (i in seq_along(genes)){ # seq_along: 1:0 would still iterate when nothing passes the filter
56 |     gene = genes[i]
57 |     in_gene = which(data_c$Gene == gene)
58 |     # comparison pool: every cancer except this one and those listed for this gene (as.character guards factor columns)
59 |     sig_cancers = as.character(all_cancer_stat_m_suggest$Cancer[all_cancer_stat_m_suggest$Gene==gene])
60 |     data_other_c = PCA_count_byCancer[!(PCA_count_byCancer$Cancer %in% c(sig_cancers,as.character(cancer))),]
61 |     data_other_c_g = data_other_c[data_other_c$Gene == gene,]
62 |     # cohort size comes from this gene's own row; the loop index i counts unique genes and is not a row index
63 |     data_c_g = c(sum(data_other_c_g$Sample_size),sum(data_other_c_g$Count),data_c$Sample_size[in_gene[1]],data_c$Count[in_gene])
64 |     stats[i,] = c(gene, TFT(data_c_g))
65 |   }
66 |   stats = data.frame(stats,stringsAsFactors = F)
67 |   for (j in 2:7) stats[[j]] = as.numeric(stats[[j]]) # the matrix was character; restore numeric columns
68 |   #stats$FDR = p.adjust(stats$P, method="BH")
69 |   #stats = stats[order(stats$P),]
70 |   return(stats)
71 | }
72 | 


--------------------------------------------------------------------------------
/analysis/burden_assoc/label_onco_var_ExAC.R:
--------------------------------------------------------------------------------
 1 | ##### label_onco_var_ExAC.R #####
 2 | # Kuan-lin Huang @ WashU 2017 Oct
 3 | # find non-cancer pathogenic variant in the ExAC cohort
 4 | 
 5 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/burden_assoc"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | fn = "charged.ExAC.r1.sites.vep.biallelic.combine.exon.all.patho.expanded.tsv"
11 | variants = read.table(sep="\t",header=T,file=fn, stringsAsFactors=FALSE, quote = "",fill=TRUE)
12 | 
13 | gene_fn = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/reference_files/20160713_Rahman_KJ_KH_152_gene_table_list.txt"
14 | predisposition_genes = as.vector(t(read.table(sep="\t",header=F,file=gene_fn, stringsAsFactors=FALSE, quote = "")))
15 | 
16 | cat("Original count of variants: ",sum(variants$ExAC_nonTCGA_AC_Adj),"\n")
17 | 
18 | ##### classify whether a variant is cancer-relevant #####
19 | # cancer-relevant = in a predisposition gene, or the ClinVar trait contains a cancer term / ends in "oma"
20 | cancer_terms = c("tumor","cancer","neoplasia")
21 | traits_lower = tolower(variants$ClinVar_Traits)
22 | 
23 | variants$predisposition_gene = variants$HUGO_Symbol %in% predisposition_genes
24 | # OR together one grepl() per pattern; the FALSE seed keeps the result a full-length logical vector
25 | variants$cancer_term_trait = Reduce(`|`, lapply(c(cancer_terms,"oma$"), grepl, traits_lower), rep(FALSE, nrow(variants)))
26 | 
27 | table(variants$predisposition_gene,variants$cancer_term_trait)
28 | variants$cancer_related = variants$predisposition_gene | variants$cancer_term_trait
29 | 
30 | #table(variants$ClinVar_Traits[variants$cancer_term_trait])[table(variants$ClinVar_Traits[variants$cancer_term_trait])>3]
31 | 
32 | # variant frequency annotation
33 | 
34 | #### rare frequency filter #####
35 | # which(): variants with a missing ExAC_AF are dropped rather than kept as all-NA rows (which made the sums below NA)
36 | variants = variants[which(variants$ExAC_AF < 0.0005),] # not less
37 | 
38 | variants_cancer = variants[variants$cancer_related,]
39 | cat("Number of these variants that are cancer-relevant:",sum(variants_cancer$ExAC_nonTCGA_AC_Adj),"\n")
40 | 
41 | # later assignments override earlier ones: CharGer "Pathogenic" -> Likely Pathogenic,
42 | # then ClinVar "Pathogenic" or PS1 evidence -> Pathogenic
43 | variants_cancer$Overall_Classification = "Uncertain Significance"
44 | variants_cancer$Overall_Classification[variants_cancer$CharGer_Classification=="Pathogenic"] = "Likely Pathogenic"
45 | variants_cancer$Overall_Classification[variants_cancer$ClinVar_Pathogenicity=="Pathogenic"] = "Pathogenic"
46 | variants_cancer$Overall_Classification[grep("PS1",variants_cancer$Positive_Evidence)] = "Pathogenic"
47 | variants_cancerP = variants_cancer[variants_cancer$Overall_Classification %in% c("Likely Pathogenic", "Pathogenic"),]
48 | 
49 | cat("Number of these variants that are cancer-relevant and pathogenic:",sum(variants_cancerP$ExAC_nonTCGA_AC_Adj),"\n")
50 | 
51 | # tn = "out/ExAC_pathogenic_variants.tsv"
52 | # write.table(variants_cancerP, quote=F, sep="\t", file = tn, row.names = F)
53 | 


--------------------------------------------------------------------------------
/analysis/clinical_association/PathVarEthnicStats.R:
--------------------------------------------------------------------------------
 1 | ##### PathVarEthnicStats.R #####
 2 | # Kuan-lin Huang @ WashU 201711
 3 | # Fisher's exact tests of pathogenic variant carrier counts between the reference
 4 | # ethnicity and each other ethnicity, for every cancer type and gene
 5 | 
 6 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/clinical_association"
 7 | setwd(bdir)
 8 | source("../global_aes_out.R")
 9 | source("../dependency_files.R")
10 | source("plotPathVarEthnic.R") # NOTE(review): count.F_25 is used below but not defined in this file; presumably it comes from this script - confirm
11 | 
12 | # labels must match the values of count.F_25$Ethnicity exactly, including the "Europian" spelling
13 | reference_ethnicity = "Europian"
14 | comparison_ethnicities = c("Asian","African American","American")
15 | 
16 | # logical row selector for one ethnicity; rows with a missing ethnicity never match
17 | is_ethnicity = function(df, ethnicity){
18 |   !is.na(df$Ethnicity) & df$Ethnicity==ethnicity
19 | }
20 | 
21 | # CarrierCount of one ethnicity within a cancer/gene subset; 0 when that ethnicity has no row
22 | carrier_count = function(df, ethnicity){
23 |   n_carrier = df[is_ethnicity(df, ethnicity),"CarrierCount"]
24 |   if (!length(n_carrier)){n_carrier=0}
25 |   return(n_carrier)
26 | }
27 | 
28 | out_table=NULL
29 | 
30 | for (cancer in unique(count.F_25$Cancer)){
31 |   count_c = count.F_25[count.F_25$Cancer==cancer,]
32 |   ethnicities = unique(count_c$Ethnicity)
33 |   reference_all = count_c[is_ethnicity(count_c, reference_ethnicity),"CohortCount"][1]
34 |   
35 |   for (ethnicity in comparison_ethnicities){
36 |     # only test ethnicities that are present in this cancer type
37 |     if (!(ethnicity %in% ethnicities)){next}
38 |     comparison_all = count_c[is_ethnicity(count_c, ethnicity),"CohortCount"][1]
39 |     for (gene in unique(count_c$Gene)){
40 |       count_c_g = count_c[count_c$Gene==gene,]
41 |       reference_carrier = carrier_count(count_c_g, reference_ethnicity)
42 |       comparison_carrier = carrier_count(count_c_g, ethnicity)
43 |       # 2x2 table filled column-wise: reference non-carriers/carriers, then comparison non-carriers/carriers
44 |       dist_nums = c(reference_all-reference_carrier,reference_carrier,comparison_all-comparison_carrier,comparison_carrier)
45 |       test.t = matrix(dist_nums,nrow=2)
46 |       f.test = fisher.test(test.t)
47 |       OR = unname(f.test$estimate) # drop the "odds ratio" name so rows do not share a row name
48 |       p = f.test$p.value
49 |       
50 |       row_stat = cbind(cancer, gene, reference_ethnicity, ethnicity, reference_all-reference_carrier,reference_carrier,comparison_all-comparison_carrier,comparison_carrier, OR, p)
51 |       out_table = rbind(out_table, row_stat)
52 |     }
53 |   }
54 | }
55 | out_table = as.data.frame(out_table, stringsAsFactors=FALSE)
56 | colnames(out_table) = c("cancer","gene","EthnicityA","EthnicityB","EthnicityA_noncarrier","EthnicityA_carrier"
57 |                         ,"EthnicityB_noncarrier","EthnicityB_carrier","OR","P")
58 | # cbind() stored every value as text; restore numbers so the filter, FDR and sort below are numeric
59 | for (col in c("EthnicityA_noncarrier","EthnicityA_carrier","EthnicityB_noncarrier","EthnicityB_carrier","OR","P")){
60 |   out_table[[col]] = as.numeric(out_table[[col]])
61 | }
62 | # keep tests where the comparison ethnicity has more than one carrier
63 | out_table = out_table[out_table$EthnicityB_carrier>1,]
64 | out_table$FDR = p.adjust(out_table$P, method="fdr") # FDR adjustment over the retained tests
65 | out_table=out_table[order(out_table$P, decreasing=FALSE),]
66 | tn = "out/pathVar_2ethni_TFT_assoc.txt"
67 | write.table(out_table, quote=F, sep="\t", file = tn, row.names = F)
68 | 


--------------------------------------------------------------------------------
/analysis/expression_effect/expression_effect.R:
--------------------------------------------------------------------------------
  1 | ##### expression_effect.R #####
  2 | # Kuan-lin Huang @ WashU 2016 May , updated 2017 Nov.
  3 | # analyze cohort level RNA-Seq data and convert to different matrices in a sample-gene format
  4 | 
  5 | bdir = "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/expression_effect"
  6 | setwd(bdir)
  7 | # the shared scripts sit one level up (analysis/), the same relative path the sibling scripts use
  8 | source("../global_aes_out.R")
  9 | source("../dependency_files.R")
 10 | 
 11 | 
 12 | ## function ##
 13 | # convert every factor column of a data frame to numeric through its labels (not its level codes)
 14 | unfactorize = function(df){
 15 |   for(i in which(sapply(df, class) == "factor")) df[[i]] = as.numeric(as.character(df[[i]]))
 16 |   return(df)
 17 | }
 18 | 
 19 | # empirical CDF of x evaluated at the values in perc
 20 | ecdf_fun = function(x,perc) ecdf(x)(perc)
 21 | 
 22 | # Build per-gene expression matrices from a gene x sample table m.
 23 | # Genes (rows) with fewer than minNum non-missing samples are dropped.
 24 | # Returns a list with
 25 | #   exp_score:    the retained values of m, unchanged (the median/IQR scaling is disabled)
 26 | #   exp_quantile: the empirical quantile of each value within its own gene (row)
 27 | expression_effect = function(m){ 
 28 |   cat("##### EXPRESSION ANALYSIS #####\n")
 29 |   minNum = 5
 30 |   m = as.matrix(m)
 31 |   num = nrow(m)
 32 |   # drop=FALSE keeps a gene x sample matrix even when a single gene passes the filter
 33 |   m2 = m[rowSums(!is.na(m)) >= minNum, , drop=FALSE]
 34 |   num_NA= nrow(m2)
 35 |   cat(paste("Original number of markers:", num, "; NA filtered:", num_NA, "\n", sep=" "))
 36 |   
 37 |   # initiate tables
 38 |   outlier = matrix(,nrow=dim(m2)[1],ncol=dim(m2)[2])
 39 |   row.names(outlier) = row.names(m2)
 40 |   colnames(outlier) = colnames(m2)
 41 |   exp_score = outlier
 42 |   exp_quantile = outlier
 43 |   
 44 |   # gene-wise expression score and quantile score
 45 |   # seq_len() skips the loop when no gene is left (1:0 would iterate over 1 and 0)
 46 |   for (i in seq_len(nrow(m2))){
 47 |     #IQR = quantile(m2[i,], probs=0.75, na.rm=T) - quantile(m2[i,], probs=0.25, na.rm=T) 
 48 |     exp_score[i,] = m2[i,]#(m2[i,] - quantile(m2[i,], probs=0.50, na.rm=T))/IQR
 49 |     exp_quantile[i,] = ecdf_fun(m2[i,],m2[i,])
 50 |   }
 51 |   
 52 |   return(list("exp_score"=exp_score, "exp_quantile"=exp_quantile))
 53 | }
 54 | 
 55 | glist_f = read.table(header=FALSE, stringsAsFactors = F, file = "/Users/khuang/Box Sync/PhD/germline/pan8000_germline_clinical/reference_files/CancerGeneListV5-2014-04-18.add-Rahman.add-Fanconi-Gene.txt")
 56 | glist = as.vector(t(glist_f))
 57 | 
 58 | fileNames = Sys.glob("/Users/khuang/Box\ Sync/PhD/collaborations/premed_2015/data/All_gene_RNASeq/raw_output/*RSEM_hugo.txt")
 59 | # # get rid of COADREAD
 60 | # CR = "/Users/khuang/Box Sync/PhD/collaborations/premed_2015/data/All_gene_RNASeq/raw_output/COADREAD_RSEM_hugo.txt"
 61 | # fileNames = fileNames[-which(fileNames == CR)]
 62 | 
 63 | exp_score_tables = vector("list")
 64 | exp_quantile_tables = vector("list")
 65 | exp_tables = vector("list")
 66 | 
 67 | for (fileName in fileNames) {
 68 |   # cancer type = file name up to the first underscore (e.g. "COADREAD" from "COADREAD_RSEM_hugo.txt");
 69 |   # basename() does not depend on how deep the input directory is
 70 |   cancer = gsub("_.*","",basename(fileName))
 71 |   #cat(paste(cancer,"\n"))
 72 |   #exp_table = read.table(row.names=1,header=TRUE, sep="\t", file=fileName)
 73 |   exp_table = read.table(header=TRUE, sep="\t", file=fileName)
 74 |   exp_table = exp_table[exp_table$Hybridization.REF %in% glist,]
 75 |   
 76 |   row.names(exp_table) = make.names(exp_table[,1],unique=T)
 77 |   exp_table = exp_table[,-c(1,2), drop=FALSE]
 78 |   
 79 |   # get tumor only: keep columns whose 14th name character is "0"
 80 |   normal = substr(colnames(exp_table), 14, 14)
 81 |   exp_table = exp_table[,normal=="0", drop=FALSE]
 82 |   
 83 |   # nothing to analyze for this file
 84 |   if (nrow(exp_table) == 0 || ncol(exp_table) == 0){next;}
 85 |   
 86 |   exp_table = unfactorize(exp_table)
 87 |   exp_table = log2(exp_table+1)
 88 |   # shorten each column name to its first three dot-separated fields
 89 |   colnames(exp_table) = vapply(strsplit(colnames(exp_table), split = "\\."),
 90 |                                function(parts) paste(parts[1],parts[2],parts[3],sep="."),
 91 |                                character(1))
 92 |   
 93 |   exp_results = expression_effect(exp_table)
 94 |   
 95 |   exp_score_table_m = melt(exp_results$exp_score)
 96 |   exp_score_table_m$cancer = cancer
 97 |   exp_score_tables[[cancer]] = exp_score_table_m
 98 |   
 99 |   exp_quantile_table_m = melt(exp_results$exp_quantile)
100 |   exp_quantile_table_m$cancer = cancer
101 |   exp_quantile_tables[[cancer]] = exp_quantile_table_m
102 |   
103 |   # keep the log2 table per cancer (the original statement had no assignment and did nothing)
104 |   exp_tables[[cancer]] = exp_table
105 | }
106 | 
107 | exp_score_tables_all = do.call(rbind,exp_score_tables)
108 | colnames(exp_score_tables_all) = c("gene_name","sample","log2RSEM","cancer")
109 | exp_quantile_tables_all = do.call(rbind,exp_quantile_tables)
110 | colnames(exp_quantile_tables_all) = c("gene_name","sample","expression_quantile","cancer")
111 | # sample names use "-" instead of "." in the output
112 | exp_score_tables_all$sample = gsub("\\.","-",exp_score_tables_all$sample)
113 | exp_quantile_tables_all$sample = gsub("\\.","-",exp_quantile_tables_all$sample)
114 | 
115 | fn = "out/pancan_exp_log2RSEM_all.tsv"
116 | write.table(exp_score_tables_all, file=fn, quote=F, sep="\t", col.names=T, row.names=F)
117 | fn = "out/pancan_exp_quantile_all.tsv"
118 | write.table(exp_quantile_tables_all, file=fn, quote=F, sep="\t", col.names=T, row.names=F)
119 | 


--------------------------------------------------------------------------------
/analysis/expression_effect/plotPathVarExpressAssoc.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarExpressAssoc.R #####
 2 | # Kuan-lin Huang @ WashU 2016 August updated 2017
 3 | # plot the gene expression association results stored in out/pathVarExpressAssoc.txt
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/expression_effect"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | tn = "out/pathVarExpressAssoc.txt"
11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
12 | 
13 | ### plotting ###
14 | tt$gene = as.character(tt$gene)
15 | tt$FDR_plot = tt$FDR
16 | 
17 | # # using GLM test result
18 | # tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6)
19 | # p = ggplot(data=tt)
20 | # p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5)
21 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
22 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
23 | # p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
24 | # p = p + getPCACancerColor()
25 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
26 | # p = p + geom_vline(xintercept = 0, alpha=0.5)
27 | # p = p  + theme_bw() +
28 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
29 | # p
30 | # fn = 'out/geneExpressAssocVolcanoGLM.pdf'
31 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
32 | 
33 | # # using the Wilcox test result
34 | # p = ggplot(data=tt)
35 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
36 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
37 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
38 | # p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
39 | # p = p + getPCACancerColor()
40 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
41 | # p = p + geom_vline(xintercept = 0, alpha=0.5)
42 | # p = p  + theme_bw() +
43 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
44 | # p
45 | # fn = 'out/geneExpressAssocVolcanoWCOX.pdf'
46 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
47 | 
48 | # association tier from the FDR column: < 0.15 suggestive, < 0.05 significant
49 | tt$association = "None"
50 | tt$association[tt$FDR<0.15] = "Suggestive"
51 | tt$association[tt$FDR<0.05] = "Significant"
52 | #tt$association = factor(tt$association,level=c("None","Suggestive","Significant"))
53 | 
54 | # coefficient of each gene per cancer type; point shape shows the association tier
55 | # and genes with FDR < 0.05 are labelled
56 | p = ggplot(data=tt,aes(x=coefficient,y=cancer,color = cancer))
57 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
58 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2)
59 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.05,gene,NA)))
60 | p = p + getPCACancerColor()
61 | # axis titles follow the mapped aesthetics: x is the coefficient, y is the cancer type
62 | p = p + labs(x="Coefficient",y= "Cancer")
63 | p = p + geom_vline(xintercept = 0, alpha=0.5) + xlim(-3.1,3.1)
64 | p = p  + theme_bw() +
65 |   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
66 | p
67 | fn = 'out/geneExpressAssocByGene.pdf'
68 | ggsave(fn,width = 5, height = 5, useDingbats=FALSE)
69 | 


--------------------------------------------------------------------------------
/analysis/family_history/fam_history.R:
--------------------------------------------------------------------------------
  1 | ##### fam_history.R #####
  2 | # Kuan-lin Huang @ WashU 2017 updated Nov.
  3 | # tabulate and plot pathogenic variant carriers that have a reported family history of cancer
  4 | 
  5 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/family_history/")
  6 | source("../global_aes_out.R")
  7 | source("../dependency_files.R")
  8 | 
  9 | ##### individual level stats #####
 10 | 
 11 | fileName = "clinical_PANCAN_patient_cancerhistory.111817.tsv"
 12 | famHist = read.table(header=TRUE, sep="\t", file=fileName, fill=T, quote="",stringsAsFactors = F)
 13 | str(famHist)
 14 | 
 15 | fileNameB = "clinical_PANCAN_patient_cancerhistory.2col.111817.tsv"
 16 | famHistB = read.table(header=TRUE, sep="\t", file=fileNameB, fill=T, quote="",stringsAsFactors = F)
 17 | str(famHistB)
 18 | 
 19 | # patients whose second column in the two-column table is "Yes"
 20 | fam_hist_samples = famHistB$bcr_patient_barcode[famHistB[,2]=="Yes"]
 21 | 
 22 | famHistPos = famHist[famHist$bcr_patient_barcode %in% fam_hist_samples,]
 23 | 
 24 | for (col in colnames(famHistPos)[2:14]){
 25 |   print(col)
 26 |   print(table(famHistPos[,col]))
 27 | }
 28 | 
 29 | fam_var_merge = merge(pathVarP, famHistPos, by="bcr_patient_barcode")
 30 | 
 31 | # NOTE(review): columns 182:194 are presumably the 13 family history columns tabulated above;
 32 | # their position depends on the number of columns in pathVarP - confirm
 33 | for (col in colnames(fam_var_merge)[182:194]){
 34 |   print(col)
 35 |   print(table(fam_var_merge[,col]))
 36 | }
 37 | 
 38 | # one tag per carrier; later assignments override earlier ones
 39 | fam_var_merge$fam_tag = "Other"
 40 | fam_var_merge$fam_tag[grep("reast",fam_var_merge$relative_family_cancer_hx_text)] = "Breast cancer"
 41 | fam_var_merge$fam_tag[grep("rostate",fam_var_merge$relative_family_cancer_hx_text)] = "Prostate cancer"
 42 | fam_var_merge$fam_tag[fam_var_merge$family_history_of_stomach_cancer=="YES"] = "Stomach cancer"
 43 | fam_var_merge$fam_tag[fam_var_merge$number_of_first_degree_relatives_with_cancer_diagnosis==1] = "First degree relatives"
 44 | table(fam_var_merge$fam_tag)
 45 | table(fam_var_merge$HUGO_Symbol)
 46 | table(fam_var_merge$HUGO_Symbol,fam_var_merge$fam_tag)
 47 | table(fam_var_merge$cancer)
 48 | 
 49 | # carriers per variant, one facet per gene, filled by cancer type
 50 | fam_var_merge_noNA = fam_var_merge[!is.na(fam_var_merge$HGVSp_short),]
 51 | p = ggplot(fam_var_merge_noNA,aes(x = HGVSp_short, fill = cancer))
 52 | p = p + facet_grid(.~HUGO_Symbol, scale="free", space="free")
 53 | p = p + geom_bar() + theme_bw() + theme_nogrid() + getPCACancerFill()
 54 | p = p + labs(x = "Variant", y="Number of carriers with family history")
 55 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=14))#element_text(colour="black", size=14))
 56 | p = p + theme(legend.position = "top")
 57 | p
 58 | fn = 'out/family_var_count.pdf'
 59 | ggsave(fn, width=15, height=6, useDingbats=FALSE)
 60 | 
 61 | dup_var = fam_var_merge[duplicated(fam_var_merge$HGVSp_short),c("HUGO_Symbol","HGVSp_short","cancer")]
 62 | cat("Duplicated variants","\n")
 63 | dup_var 
 64 | # genes that occur in more than one merged row
 65 | dup_genes = fam_var_merge$HUGO_Symbol[duplicated(fam_var_merge$HUGO_Symbol)]
 66 | fam_var_merge_g = fam_var_merge[fam_var_merge$HUGO_Symbol %in% dup_genes,]
 67 | 
 68 | # gene by cancer type, every variant labelled
 69 | p = ggplot(fam_var_merge_g,aes(y = HUGO_Symbol, x = cancer, color=fam_tag))
 70 | #p = p + facet_grid(Gene_Classification~., scale="free", space="free")
 71 | p = p + geom_jitter(alpha=0.5,size=1.5, height = 0.2,width = 0.22,shape=16, stroke=0)
 72 | # constants are set outside aes(): inside aes() they are treated as mapped data, which adds
 73 | # a size legend entry and rescales the text size
 74 | p = p + geom_text_repel(aes(label=as.character(HGVSp_short)),size=3,angle=0,vjust=1.5)
 75 | p = p + theme_bw()
 76 | #p = p + geom_hline(yintercept = -log10(0.05),alpha=0.3)
 77 | #p = p + xlim(0,6)
 78 | #p = p + getVarColorScale()
 79 | p = p + labs( x="TCGA case cancer type", y = "Gene") + scale_colour_discrete(name = "Case family history")
 80 | p = p + theme(axis.text.x = element_text(colour="black", size=14, angle=90, vjust = 0.5),axis.text.y = element_text(colour="black", size=14))
 81 | #p = p + coord_equal()
 82 | p
 83 | fn = "out/fam_history_var.pdf"
 84 | ggsave(fn, height=6, width=8, useDingbats=FALSE)
 85 | 
 86 | # same layout, labelling only the variants listed in dup_var
 87 | p = ggplot(fam_var_merge_g,aes(y = HUGO_Symbol, x = cancer, color=fam_tag))
 88 | #p = p + facet_grid(Gene_Classification~., scale="free", space="free")
 89 | p = p + geom_jitter(alpha=0.4,size=3, height = 0.1,width = 0.1,shape=16, stroke=0)
 90 | p = p + geom_text(aes(label=ifelse(as.character(HGVSp_short) %in% dup_var$HGVSp_short,as.character(HGVSp_short),NA)),size=3,angle=0,vjust=1.5)
 91 | p = p + theme_bw()
 92 | #p = p + geom_hline(yintercept = -log10(0.05),alpha=0.3)
 93 | #p = p + xlim(0,6)
 94 | #p = p + getVarColorScale()
 95 | p = p + labs( x="TCGA case cancer type", y = "Gene") + scale_colour_discrete(name = "Case family history")
 96 | p = p + theme(axis.text.x = element_text(colour="black", size=14, angle=90, vjust = 0.5),axis.text.y = element_text(colour="black", size=14))
 97 | #p = p + coord_equal()
 98 | p
 99 | fn = "out/fam_history_var_dupVarOnly.pdf"
100 | ggsave(fn, height=6, width=8, useDingbats=FALSE)
101 | 
102 | # carriers per gene, one facet per cancer type, filled by family history tag
103 | p = ggplot(fam_var_merge_g,aes(x = HUGO_Symbol, fill = fam_tag))
104 | p = p + facet_grid(.~cancer, scale="free", space="free")
105 | p = p + geom_bar() + theme_bw() + theme_nogrid() #+ getPCACancerFill()
106 | p = p + labs(x = "Gene", y="Number of carriers with family history") + scale_fill_discrete(name = "Case family history")
107 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle = 90, vjust=0.5), axis.text.y = element_text(colour="black", size=14))#element_text(colour="black", size=14))
108 | p = p + theme(legend.position = "top")
109 | p
110 | fn = "out/fam_history_var_famtag.pdf"
111 | ggsave(fn, height=4, width=8, useDingbats=FALSE)
112 | 


--------------------------------------------------------------------------------
/analysis/functional_assay/plot_result.R:
--------------------------------------------------------------------------------
 1 | ##### plot_result.R #####
 2 | # Kuan-lin Huang @ WashU 2017 July
 3 | # plot RET functional experiment results
 4 | # usage: pass the tab-separated result file as the first command-line argument;
 5 | # two bar charts are written to output/<input file name>.pdf and output/<input file name>2.pdf
 6 | library(ggplot2)
 7 | setwd("/Users/khuang/Box\ Sync/PhD/germline/pan8000_germline_clinical/functional_assay/")
 8 | 
 9 | args = commandArgs(trailingOnly=TRUE)
10 | # test if there is at least one argument: if not, return an error
11 | if (length(args)==0) {
12 |   stop("At least one argument must be supplied (input file).\n", call.=FALSE)
13 | }
14 | # the first argument is the input file; any further arguments are ignored
15 | fn = args[1]
16 | #fn = "20170717_CO/All_Results_RET_Gel1_75_4_071717.txt"
17 | outFile = paste("output/",gsub(".*/","",fn),".pdf",sep="")
18 | outFile2 = paste("output/",gsub(".*/","",fn),"2.pdf",sep="")
19 |   
20 | results = read.table(fn, sep="\t", header=T)
21 | 
22 | colnames(results) = gsub("\\.","_",colnames(results))
23 | #colnames(results)
24 | # Sample is split at underscores: text after the last one is the ligand, text before the first one the mutation
25 | results$Ligand = gsub(".*_","",results$Sample)
26 | results$Mut = gsub("_.*","",results$Sample)
27 | 
28 | # unique sample labels, kept in file order on the x axis
29 | sample_labels = make.names(results$Sample,unique = T)
30 | results$Sample = factor(sample_labels, levels=sample_labels)
31 | 
32 | p = ggplot(results,aes(x=Sample, y=MAPK_RET_GAPDH_WT, fill=Ligand))
33 | p = p + facet_grid(.~Ligand, drop=T, space="free",scale="free")
34 | p = p + geom_bar(stat="identity") + theme_bw() #+ theme_nogrid()
35 | p = p + labs(x = "Sample", y="MAPK/RET/GAPDH (normalized to WT)")
36 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=12, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=12))#element_text(colour="black", size=14))
37 | p = p + theme(legend.position="bottom")
38 | p = p + geom_hline(yintercept = 1, alpha=0.3)
39 | #p = p + geom_text(aes(label=Number_of_Phosphosites), vjust=-0.25)
40 | p
41 | #fn = "output/All_Results_RET_Gel1_75_4_071717.pdf"
42 | ggsave(file=outFile, useDingbats=FALSE)
43 | 
44 | p = ggplot(results,aes(x=Sample, y=MAPK_GAPDH_WT, fill=Ligand))
45 | p = p + facet_grid(.~Ligand, drop=T, space="free",scale="free")
46 | p = p + geom_bar(stat="identity") + theme_bw() #+ theme_nogrid()
47 | # this panel plots the MAPK_GAPDH_WT column, so the axis title names that ratio
48 | p = p + labs(x = "Sample", y="MAPK/GAPDH (normalized to WT)")
49 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=12, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=12))#element_text(colour="black", size=14))
50 | p = p + theme(legend.position="bottom")
51 | p = p + geom_hline(yintercept = 1, alpha=0.3)
52 | #p = p + geom_text(aes(label=Number_of_Phosphosites), vjust=-0.25)
53 | p
54 | 
55 | ggsave(file=outFile2, useDingbats=FALSE)


--------------------------------------------------------------------------------
/analysis/gene_list/examine_gene_list.R:
--------------------------------------------------------------------------------
 1 | ##### examine_gene_list.R #####
 2 | # Kuan-lin Huang @ WashU 201802
 3 | # examine the curated gene list
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/gene_list"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | library(readxl)
11 | 
12 | ### annotate 152 gene table with oncogene/TSG
13 | CPG_fn = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table.xlsx"
14 | CPG = data.frame(readxl::read_excel(CPG_fn))
15 | colnames(CPG)[1]="Gene"
16 | # a gene present in both lists ends up labelled as an oncogene (the later assignment wins)
17 | CPG$Gene_Classification = "Not classified"
18 | CPG$Gene_Classification[CPG$Gene %in% all_TSGs] = "Tumor Suppressor Gene"
19 | CPG$Gene_Classification[CPG$Gene %in% all_oncogenes] = "Oncogene"
20 | # write.table(CPG[c("Gene","Gene_Classification")], file="/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table_gClass.tsv", quote=F, sep="\t", col.names=T, row.names=F)
21 | 
22 | ### examine the number of pathogenic variants found in each category of curated genes
23 | cat("Source of 152 CPGs","\n")
24 | table(CPG$Source)
25 | 
26 | # variant count per gene and classification (only needed for the optional export below)
27 | gene_var_count = data.frame(table(pathVarP$HUGO_Symbol,pathVarP$Overall_Classification))
28 | colnames(gene_var_count) = c("Gene","Classification","Count")
29 | # write.table(gene_var_count[c("Gene","Classification", "Count")], file="/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/doc/reference/CPG152_table_gVarCount.tsv", quote=F, sep="\t", col.names=T, row.names=F)
30 | 
31 | # variant count per gene; genes without any variant get 0
32 | gene_var_count2= data.frame(table(pathVarP$HUGO_Symbol))
33 | colnames(gene_var_count2) = c("Gene","Count")
34 | CPG_count = merge(CPG,gene_var_count2, by="Gene", all.x=T)
35 | CPG_count$Count[is.na(CPG_count$Count)] = 0
36 | # shorten the source labels for plotting
37 | CPG_count$Source[CPG_count$Source=="Cancer Gene Census Germline download 1/5/2016 (http://cancer.sanger.ac.uk/census/ )"] = "Cancer Gene Census Germline"
38 | CPG_count$Source[CPG_count$Source=="Reference (see PMID)"] = "Curated from literature"
39 | CPG_count$Source[CPG_count$Source=="personal communication; related to DICER1"] = "Personal communication"
40 | 
41 | p = ggplot(CPG_count,aes(x=Source, y = Count ))
42 | #p = p + facet_grid(.~Classification)
43 | p = p + geom_jitter(height =0, width = 0.3, alpha=0.5,aes(fill=Source, color=Source))
44 | # stroke is not a geom_violin parameter, so it is no longer passed
45 | p = p + geom_violin(alpha=0.5,aes(fill=Source, color=Source))
46 | p = p + geom_label_repel(aes(label=ifelse(Source != "Rahman 114 CPG" & Count > 5, paste(Gene,Count,sep="-"),NA)))
47 | # "none" (lower case) is the documented value that hides the legend
48 | p = p  + theme_bw() + labs(y="Count of likely pathogenic and pathogenic variant", x = "Source of gene") +
49 |   theme(legend.position = "none", axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
50 | p 
51 | 
52 | fn = 'out/source_var_count.pdf'
53 | ggsave(fn,height=7,width=4,useDingbats=FALSE)


--------------------------------------------------------------------------------
/analysis/hotspot3d/cluster_analysis.R:
--------------------------------------------------------------------------------
 1 | ##### cluster_analysis.R #####
 2 | # Kuan-lin Huang @ WashU 201711
 3 | # label the sites of each 3D proximity cluster as somatic, germline or colocalized and
 4 | # keep the clusters that contain at least one germline or colocalized site
 5 | 
 6 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/hotspot3d"
 7 | setwd(bdir)
 8 | source("../global_aes_out.R")
 9 | source("../dependency_files.R")
10 | 
11 | s_fn = "mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf"
12 | somatic = read.table(file=s_fn, header=T, sep="\t", quote="", stringsAsFactors = F)
13 | 
14 | fn = "PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters"
15 | cluster = read.table(file=fn, header=T, sep="\t", quote="", stringsAsFactors = F)
16 | 
17 | colnames(cluster)[2:3] = c("HUGO_Symbol","HGVSp_short")
18 | 
19 | # a site is identified by "<gene> <protein change>"; membership in each variant set is computed once
20 | cluster_site = paste(cluster$HUGO_Symbol,cluster$HGVSp_short)
21 | in_somatic = cluster_site %in% paste(somatic$Hugo_Symbol,somatic$HGVSp_Short)
22 | in_pathVarP = cluster_site %in% paste(pathVarP$HUGO_Symbol,pathVarP$HGVSp_short)
23 | # NOTE(review): the colocalized label checks pathVar while the germline label checks pathVarP - confirm this is intended
24 | in_pathVar = cluster_site %in% paste(pathVar$HUGO_Symbol,pathVar$HGVSp_short)
25 | 
26 | # somatic sites become "Colocalized" when they are also in pathVar; non-somatic sites
27 | # become "Germline" when they are in pathVarP and stay NA otherwise
28 | cluster$type = ifelse(in_somatic,
29 |                       ifelse(in_pathVar,"Colocalized","Somatic"),
30 |                       ifelse(in_pathVarP,"Germline",NA))
31 | 
32 | cluster_w_germ = cluster$Cluster[cluster$type %in% c("Germline","Colocalized")]
33 | cluster_germ = cluster[cluster$Cluster %in% cluster_w_germ,]
34 | 
35 | cat("Number of sites co-clustered: ","\n")
36 | table(cluster_germ$type)
37 | 
38 | cat("Number of unique clusters: ",length(unique(cluster_germ$Cluster)),"\n")
39 | 
40 | # gene of the first listed site of every retained cluster
41 | table(cluster_germ$HUGO_Symbol[!duplicated(cluster_germ$Cluster)])
42 | 
43 | tn = "PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters_annotated.tsv"
44 | write.table(cluster_germ, file = tn, quote=F, sep="\t", row.names = F)
45 | 


--------------------------------------------------------------------------------
/analysis/hotspot3d/spotlightVar.R:
--------------------------------------------------------------------------------
 1 | ##### spotlightVar.R #####
 2 | # Kuan-lin Huang @ WashU 201711
 3 | # plot special variants
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/hotspot3d"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | # check if there is statistical enrichment of overlaps
11 | # $ awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic_merged
12 | # 49586385
13 | exonSize = 49586385
14 | nPath = nrow(pathVarP)
15 | 
16 | # row selectors, computed once; counting with sum(..., na.rm=TRUE) and selecting with which()
17 | # leaves out rows with a missing value, which a bare logical row index would return as all-NA rows
18 | somatic_hot = pathVarP$colocalized_somatic_mutation_count > 2
19 | pcgp_hit = as.logical(pathVarP$PCGP)
20 | 
21 | ### somatic mutation
22 | # $ gzcat /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz | cut -f1,4 | sort | uniq -c | awk '$1 >2 && $3 != "."' | wc -l
23 | # 68537
24 | numSomaticOverlap = sum(somatic_hot, na.rm=TRUE)
25 | somaticMutRate = 68537/exonSize  
26 | poisson.test(numSomaticOverlap, T = nPath, r = somaticMutRate, conf.level = 0.95, alternative = "greater")
27 | 
28 | ### PCGP germ var
29 | # /charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv 
30 | # 551 /Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AD_varOnly.vep.tsv
31 | # vpn-10-1-24-5:analysis khuang$ wc -l /Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv 
32 | # 239 /Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/pathogenic_variants/PCGP/charged.2015_stJude_germline_nejm_S4_AR_varOnly.vep.tsv
33 | 
34 | numPCGPOverlap = sum(pcgp_hit, na.rm=TRUE)
35 | pcgpMutRate = (551 + 239)/exonSize
36 | poisson.test(numPCGPOverlap, T = nPath, r = pcgpMutRate, conf.level = 0.95, alternative = "greater")
37 | 
38 | # write file
39 | pathVarP_hot = pathVarP[which(somatic_hot | pcgp_hit),]
40 | write.table(file = "out/colocalize_var.tsv", pathVarP_hot,quote=F, sep = '\t',row.names = F)
41 | 
42 | # plot
43 | pathVarPOT_hot = pathVarPOT[which(pathVarPOT$colocalized_somatic_mutation_count > 2 | as.logical(pathVarPOT$PCGP)),]
44 | table(pathVarPOT_hot$HUGO_Symbol)
45 | pathVarPOT_hot$somatic_count_plot = pathVarPOT_hot$colocalized_somatic_mutation_count
46 | # strip only the leading literal "p."; the unanchored pattern "p." also removed every other
47 | # "p" together with the character following it
48 | pathVarPOT_hot$HGVSp_short_plot = sub("^p\\.","",pathVarPOT_hot$HGVSp_short)
49 | #pathVarPOT_hot$somatic_count_plot[pathVarPOT_hot$somatic_count_plot> 100 ]  = 100
50 | 
51 | p = ggplot(pathVarPOT_hot,aes(y=HUGO_Symbol, x =somatic_count_plot, color = PCGP))
52 | #p = p + facet_grid(PCGP~Gene_Classification,drop=T,scale="free",space="free")
53 | p = p + facet_grid(Gene_Classification~ .,drop=T,scale="free_y",space="free_y")
54 | p = p + geom_point(stroke=0) + theme_bw()  #+ guides(color=FALSE)
55 | #p = p + geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5)
56 | # label each protein change once
57 | p = p + geom_text_repel(aes(label=ifelse(duplicated(HGVSp_short),NA,HGVSp_short_plot)))
58 | p = p + theme(axis.title = element_text(size=16), axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5), axis.text.y = element_text(colour="black", size=14))#element_text(colour="black", size=14))
59 | p = p + scale_x_log10()
60 | # NOTE(review): a limit of 0 has no finite position on a log10 axis - confirm this call has the intended effect
61 | p = p + expand_limits(x = 0)
62 | #p = p + coord_equal() + getLOHColorScale()
63 | p = p + labs(x = "Co-localizing somatic mutation count", y = "Gene")
64 | p
65 | fn = "out/pathVarP_spotlight.pdf"
66 | ggsave(file=fn, width=7, h =5, useDingbats=FALSE)
67 | 


--------------------------------------------------------------------------------
/analysis/hotspot3d/work.log.sh:
--------------------------------------------------------------------------------
 1 | #1. get variants
 2 | #get somatic mafs with genes of interest
 3 | cut -f1,5-7,9,11,13,16,37,38 /gscmnt/gc2741/ding/Drivers/Data/mc3.v0.2.8.PUBLIC.code.filtered.maf | grepList PCA.all.genes.txt 0 > mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf
 4 | # my @mafcols = ( $mafcols{"Hugo_Symbol"},
 5 | # 					$mafcols{"Chromosome"},
 6 | # 					$mafcols{"Start_Position"},
 7 | # 					$mafcols{"End_Position"},
 8 | # 					$mafcols{"Variant_Classification"},
 9 | # 					$mafcols{"Reference_Allele"},
10 | # 					$mafcols{"Tumor_Seq_Allele2"},
11 | # 					$mafcols{"Tumor_Sample_Barcode"},
12 | # 					$mafcols{$this->{"transcript_id_header"}},
13 | #Hugo_Symbol     Chromosome      Start_Position  End_Position    Variant_Classification  Reference_Allele        Tumor_Seq_Allele2       Tumor_Sample_Barcode    HGVSp_Short     Transcript_ID
14 | 
15 | # get germline file
16 | awkt '$13 == "missense_variant" {print $2,$9,$5,$10,"Missense_Mutation",$11,$12,$1,$116,$6}' PCA_pathVar_integrated_filtered.tsv > PCA_pathVar_integrated_filtered_hotspot3d.tsv
17 | 
18 | #2. combine the somatic file with germline file
19 | cat mc3.v0.2.8.PUBLIC.code.filtered.PCAgenes.hotspot.maf PCA_pathVar_integrated_filtered_hotspot3d.tsv > PCA_somatic_germline_combined.maf
20 | # keep the header plus rows whose column 9 (HGVSp_Short) contains a literal "p."; the dot is escaped because an unescaped "p." matches "p" followed by ANY character
21 | awk 'NR==1 || $9 ~ /p\./' PCA_somatic_germline_combined.maf > PCA_somatic_germline_combined_filtered.maf
22 | # keep the header plus missense rows; print column 6 (Reference_Allele) twice and swap the last two columns so Transcript_ID precedes HGVSp_Short
23 | awk 'NR==1 || $5=="Missense_Mutation" {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$6"\t"$7"\t"$8"\t"$10"\t"$9}' PCA_somatic_germline_combined_filtered.maf > PCA_somatic_germline_combined_missense.maf
24 | 
25 | #3. hotspot run; following steps on the github
26 | bsubl -oo proximity_search.log 'hotspot3d search --maf-file=PCA_somatic_germline_combined_missense.maf --prep-dir=/gscmnt/gc2706/dinglab/medseq/Structure_Projects/Preprocessing_Output_20141023/'
27 | 
28 | bsubl -oo post.log 'hotspot3d post --maf-file=PCA_somatic_germline_combined_missense.maf'
29 | 
30 | bsubl -oo cluster.log 'hotspot3d cluster --pairwise-file=3D_Proximity.pairwise --maf-file=PCA_somatic_germline_combined_missense.maf --vertex-type=recurrence'
31 | 
32 | bsubl -oo summary.log 'hotspot3d summary --clusters-file=PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters'
33 | 
34 | #4. post processing: plotting
35 | # get presence
36 | perl ~/bin/hotspot3d_KH/scripts/clusterPDBPresence.pl 3D_Proximity.pairwise PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters PCA
37 | 
38 | # plot some of the previous top candidates
39 | musite=canonical.combined.mc3.musites
40 | cluster=PCA_somatic_germline_combined_missense.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters
41 | 
42 | # limit cluster file to clusters of interest first
43 | 
44 | # visualize
45 | grep1 245.0 ${cluster} > tmp.clusters
46 | hotspot3d visual --pairwise-file=3D_Proximity.pairwise --clusters-file=tmp.clusters --pdb=2IVT --output-file=pml_scripts/PCA.2IVT.RET.pml --script-only 
47 | grep1 245.0 ${cluster} > tmp.clusters
48 | hotspot3d visual --pairwise-file=3D_Proximity.pairwise --clusters-file=tmp.clusters --pdb=2X2M --output-file=pml_scripts/PCA.2X2M.RET.pml --script-only 
49 | 


--------------------------------------------------------------------------------
/analysis/mutation_signature/4_plotSomaticMutsigAssoc.R:
--------------------------------------------------------------------------------
 1 | ##### 4_plotSomaticMutsigAssoc.R #####
 2 | # Kuan-lin Huang 2018
 3 | # Plot the somatic mutation vs. mutational signature association results (heatmap + per-gene scatter),
 4 | # restricted to the genes with FDR < 0.05 in the germline association table.
 5 | source("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/global_aes_out.R")
 6 | source("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/dependency_files.R")
 7 | 
 8 | g_tn = "out/pathVarMutsigAssoc.txt"
 9 | g_tt = read.table(sep="\t",header=T,file=g_tn, stringsAsFactors=FALSE)
10 | 
11 | tn = "out/somaticMutMutsigAssoc.txt"
12 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
13 | 
14 | ### plotting ###
15 | # order the signatures numerically (Signature-1 ... Signature-30) rather than alphabetically
16 | tt$signature = factor(tt$signature, levels = paste0("Signature-", 1:30))
17 | tt$association = "None"
18 | tt$association[tt$FDR<0.15] = "Suggestive"
19 | tt$association[tt$FDR<0.05] = "Significant"
20 | tt$gene = as.character(tt$gene)
21 | # NOTE: natural log (not log10), capped at 5; FDR_plot is not referenced by the plots below
22 | tt$FDR_plot = -log(tt$FDR)
23 | tt$FDR_plot[tt$FDR_plot > 5 ] = 5
24 | #uniqG = unique(tt$gene[tt$FDR<0.05])
25 | uniqG = unique(g_tt$gene[g_tt$FDR<0.05]) #  plot just the germline genes for now
26 | ttG = tt[tt$gene %in% uniqG,]
27 | 
28 | # heatmap of coefficients (cancer x signature, one facet per gene); FDR < 0.05 tiles get a black outline
29 | getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
30 | p = ggplot(data=ttG)
31 | p = p + facet_grid(gene~.,space="free",scales="free")
32 | p = p + geom_tile(data=ttG,aes(y=cancer, x=signature, fill= coefficient), linetype="blank") + scale_fill_gradientn(name= "Coefficient", colours=getPalette(100), na.value=NA, limits=c(0,NA))
33 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
34 | #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
35 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=cancer, x=signature), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
36 | p = p  + theme_bw() + theme_nogrid() +
37 |   theme(axis.title = element_blank(), axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
38 | p = p + labs(x="Signature",y = "Cancer") # keep the final plot object so ggsave() does not depend on last_plot()
39 | p
40 | fn = 'out/SomaticWithmutSignatureHeatmap.pdf'
41 | ggsave(fn,plot=p,h=25,useDingbats=FALSE)
42 | 
43 | # plot by gene
44 | p = ggplot(data=ttG,aes(x=coefficient,y=cancer,color = cancer))
45 | p = p + facet_grid(gene~.,space="free",scales="free")
46 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
47 | p = p + geom_point(aes(shape=association),alpha=0.3,size=2)
48 | # signature is a factor, so ifelse() returns its integer level code, which equals the signature number given the level order above
49 | p = p + geom_text_repel(aes(label=ifelse(FDR<0.0005,signature,NA)))
50 | p = p + getPCACancerColor()
51 | p = p + geom_vline(xintercept = 0, alpha=0.5) #+ xlim(-3.1,3.1)
52 | p = p  + theme_bw() +
53 |   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
54 | p = p + labs(x = "coefficient",y="cancer") # the earlier labs(x="Cancer", y="-log10(FDR)") was always overridden by this call and has been dropped
55 | p
56 | fn = 'out/SomaticWithmutSignatureByGene.pdf'
57 | ggsave(fn,plot=p,h=28,useDingbats=FALSE)
58 | 


--------------------------------------------------------------------------------
/analysis/pleiotropy/pleiotropy.R:
--------------------------------------------------------------------------------
 1 | ##### pleiotropy.R #####
 2 | # Kuan-lin Huang @ WashU 201802
 3 | # write out the unique ClinVar "Pathogenic" missense variants in pathVarP whose cancer_term_trait flag is FALSE
 4 | 
 5 | bdir = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/pleiotropy"
 6 | setwd(bdir)
 7 | source("../global_aes_out.R")
 8 | source("../dependency_files.R")
 9 | 
10 | pathVarP_otherSymptoms = pathVarP[!pathVarP$cancer_term_trait & pathVarP$binary_type=="Missense" & pathVarP$ClinVar_Pathogenicity == "Pathogenic",]
11 | pathVarP_otherSymptoms_sele = pathVarP_otherSymptoms[,c(2,15,23,27,108,109)] # columns picked by position; NOTE(review): confirm the indices still match the pathVarP layout
12 | pathVarP_otherSymptoms_sele_uni = pathVarP_otherSymptoms_sele[!duplicated(pathVarP_otherSymptoms_sele$HGVSp),] # keep the first row per HGVSp value
13 | 
14 | dim(pathVarP_otherSymptoms_sele_uni)
15 | 
16 | fn = "out/pleitropy_vars_inCPG.tsv"
17 | write.table(pathVarP_otherSymptoms_sele_uni, file=fn, quote=F, sep="\t", col.names=T, row.names=F)
18 | 


--------------------------------------------------------------------------------
/analysis/process_files/genotype/cancer_type.txt:
--------------------------------------------------------------------------------
 1 | ACC
 2 | BLCA
 3 | BRCA
 4 | CESC
 5 | CHOL
 6 | COAD
 7 | DLBC
 8 | ESCA
 9 | GBM
10 | HNSC
11 | KICH
12 | KIRC
13 | KIRP
14 | LAML
15 | LGG
16 | LIHC
17 | LUAD
18 | LUSC
19 | MESO
20 | OV
21 | PAAD
22 | PCPG
23 | PRAD
24 | READ
25 | SARC
26 | SKCM
27 | STAD
28 | TGCT
29 | THCA
30 | THYM
31 | UCEC
32 | UCS
33 | UVM


--------------------------------------------------------------------------------
/analysis/process_files/genotype/merge_genotype_by_cancer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | while IFS='' read -r line || [[ -n "$line" ]]; do
 3 |     echo "Processing cancer type: $line"
 4 |     cancer="${line%\\n}"
 5 |     #get file
 6 |     gsutil ls gs://dinglab/isb-cgc/tcga/genotyping/tarballs/genotyping.${cancer}.tar
 7 |     gsutil cp gs://dinglab/isb-cgc/tcga/genotyping/tarballs/genotyping.${cancer}.tar .
 8 |     tar -xvf genotyping.${cancer}.tar 
 9 |     
10 |     #merge
11 |     vcf-merge $(ls -1 ${cancer}/*.vcf.gz | grep TCGA-..-....-1.* |  perl -pe 's/\n/ /g') | bgzip -c > ${cancer}.normal.merge.vcf.gz
12 |     #index
13 |     tabix -p vcf ${cancer}.normal.merge.vcf.gz
14 |     #upload
15 |     gsutil cp ${cancer}.normal.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/genotyping/merge
16 |     # delete files
17 |     rm -rf ${cancer}/*
18 |     rm -rf genotyping.${cancer}.tar
19 | done < "$1"
20 | 
21 | # set unlimit file number higher so it works for breast cancer
22 | ulimit -n 2500


--------------------------------------------------------------------------------
/analysis/process_files/genotype/merge_log_gcloud.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Log of the commands used to merge the per-cancer normal genotype VCFs into one VCF.
 4 | # Every background job (nohup ... &) is followed by `wait`, so a step only starts
 5 | # once the files it reads have been completely written.
 6 | 
 7 | # 2 vCPU; 60GiB <- not enough memory; maybe because vcf-merge open up lots of vcf to do re-indexing?
 8 | # 8vCPU; 52GiB mem, 60GiB
 9 | 
10 | # set unlimit file number higher so it works for breast cancer
11 | ulimit -n 2500
12 | 
13 | nohup bash merge_genotype_by_cancer.sh cancer_type.txt > merge_genotype_by_cancer.log &
14 | wait # the per-cancer *.normal.merge.vcf.gz files must exist before the merge below
15 | 
16 | # merge
17 | vcf-merge $(ls -1 *.normal.merge.vcf.gz | perl -pe 's/\n/ /g') > all.normal.merge.vcf
18 | # job ended because disk run out of space (60G, zip into partial vcf)
19 | bgzip -c all.normal.merge.vcf > all.normal.merge_partial.vcf.gz
20 | tabix -p vcf all.normal.merge_partial.vcf.gz
21 | 
22 | # get the remaining SNPs
23 | nohup vcftools --gzvcf BRCA.normal.merge.vcf.gz --gzdiff all.normal.merge_partial.vcf.gz --diff-site --out inBRCA_v_inAllPartial &
24 | wait # the awk below reads the .diff.sites_in_files output of vcftools
25 | awk -F '\t' '$4=="1"{print $1"\t"$2}' inBRCA_v_inAllPartial.diff.sites_in_files > leftover.sites_in_files.positions.txt
26 | # get each cancer types leftover vcf
27 | for file in *.normal.merge.vcf.gz; do
28 | 	echo $file
29 | 	echo leftover.$file
30 | 	vcftools --gzvcf $file --positions leftover.sites_in_files.positions.txt --recode --stdout | bgzip -c > leftover.$file
31 | done
32 | # tabix
33 | for file in leftover*.normal.merge.vcf.gz; do
34 | 	tabix -p vcf $file
35 | done
36 | 
37 | # merge the remaining sites
38 | nohup vcf-merge $(ls -1 leftover*.normal.merge.vcf.gz | perl -pe 's/\n/ /g') > leftover.all.normal.merge.vcf &
39 | wait # bgzip must not compress a VCF that is still being written
40 | bgzip -c leftover.all.normal.merge.vcf > leftover.all.normal.merge.vcf.gz
41 | tabix -p vcf leftover.all.normal.merge.vcf.gz
42 | 
43 | # the last line is broken; clean it up
44 | zcat all.normal.merge_partial.vcf.gz | head -n -1 | bgzip -c > all.normal.merge_partial_cleaned.vcf.gz
45 | tabix -p vcf all.normal.merge_partial_cleaned.vcf.gz
46 | 
47 | # merge both and upload (run in the foreground: tabix and the upload need the finished file)
48 | vcf-concat all.normal.merge_partial_cleaned.vcf.gz leftover.all.normal.merge.vcf.gz | bgzip -c > all.normal.merge.vcf.gz
49 | tabix -p vcf all.normal.merge.vcf.gz
50 | # $ zcat all.normal.merge.vcf.gz | wc -l
51 | # 522763
52 | 
53 | gsutil cp all.normal.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/genotyping/merge


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/README.md:
--------------------------------------------------------------------------------
 1 | Preliminary steps (any order):
 2 | 
 3 | * Build dockerfile and push to repository of choice
 4 | 
 5 | * Install dsub
 6 | 
 7 |   see [https://github.com/googlegenomics/dsub](https://github.com/googlegenomics/dsub)
 8 | 
 9 | * Make lists of sample IDs to process (if needed)
10 | 
11 |    ./make_lists.sh
12 | 
13 |     This action creates the directory analysisID_lists/ and files therein
14 | 
15 | 
16 | 
17 | Main steps
18 | 
19 | 1. In script annotate.sh, check for the appropriate paths and filenames, dsub location, docker image repository location, as well as cloud project information and container parameters, etc.
20 | 
21 | 2. Activate dsub.
22 | 
23 | 3. Run
24 | 
25 |    ./annotate.sh  $cancerType
26 | 
27 |    This action typically launches several hundred jobs, which is the recommended approach.
28 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/analysisID_lists/CHOL.ids:
--------------------------------------------------------------------------------
  1 | 00b0d456-bc7c-43cc-9c23-e9bdd89af52f
  2 | 024e4bf7-2391-478b-9de5-89cd17b89fa1
  3 | 08f1a00b-9ed6-4cc6-bb35-68bf3510700e
  4 | 0af33967-eaf5-4b31-8a3e-474950c128cd
  5 | 0c177782-f4b7-46e9-9f84-d50ec1aa9308
  6 | 0ed3bf36-969d-4e5c-b30d-240d463da44b
  7 | 10a331fa-b9a8-4dea-b5a8-f65d36c8c7cd
  8 | 1746e80c-3a23-4277-8977-ae5cf486e52a
  9 | 174c632d-f035-483a-9f79-d9b5e18aaa03
 10 | 1b8e6945-69d9-4015-b178-af2e4e54f4d5
 11 | 1c4f3554-4788-4de1-aada-e95960974373
 12 | 1efacff6-9331-4bfd-9680-ce62df20af39
 13 | 1f76a26a-b2eb-4cf5-926a-ba39129f68a9
 14 | 21c32d9a-e76f-4f35-941d-45c15158dede
 15 | 23ae3329-9c10-4d83-b64b-c5c578cb2bf6
 16 | 23e27f4e-1037-43aa-b764-4fb311f3ffdb
 17 | 26f66a14-b7de-4072-9b9b-5f5f67103b4b
 18 | 27f90585-a7ed-492a-88e9-b52d7e4dd3bf
 19 | 280d5745-fde7-46e0-ae7e-e454b0033659
 20 | 3118c963-8446-4d4a-8146-6d46f1465780
 21 | 343bc2ac-6a8f-4c5a-80b5-2c94bf32dd02
 22 | 3b6a787b-33c9-4119-8f3b-94b96029f2d7
 23 | 3c0257cf-12bf-4ec7-aacf-ac52d68dda71
 24 | 3c5e8db6-8f6b-4d5e-aa27-53ec9fb57214
 25 | 3de7df2c-2375-4c9c-aa17-f1437f2c9489
 26 | 3e1b05dc-b62c-46d7-a20a-bf2781992cc4
 27 | 3fbcfb4e-4034-4ab1-911e-28d4aea07791
 28 | 44280092-b8cc-47b2-a561-2e5957f20713
 29 | 47632a94-5413-43df-a26a-8bd7292824f7
 30 | 4b60d271-7397-46bd-aaee-e3c9b5375a79
 31 | 4d85b3fd-53f3-4a52-9322-4a68de4ef7ad
 32 | 4f1bc424-598a-445f-8566-c135d2f806f4
 33 | 548ddecc-f07c-4b7b-a3fd-08a8466f7c35
 34 | 549afc0b-a2a8-4653-bc72-141eaf63ec8c
 35 | 59919629-a114-478c-9d48-52a861fedf40
 36 | 5ce81c93-c432-4db9-aae9-08f2ca5c5cd1
 37 | 5dd3429f-feb7-43d4-ad64-ff30b53c041f
 38 | 5f068ddf-6dec-4559-acf3-637730b9e004
 39 | 5fddf1ad-bf26-48e6-baef-09c8ee208b53
 40 | 61468067-f567-4119-870d-48b930e68c43
 41 | 61f9d026-7b71-47b3-a78f-39edd32ad99c
 42 | 657a9e5d-4e7d-4568-9e25-a72901cbfc12
 43 | 6587bc6d-9f8d-421e-999b-8d88e409c3f8
 44 | 68d5f8e2-0050-4caf-b12d-1b08f033bfc7
 45 | 6d064547-7125-4072-ac1d-1bcf1948597d
 46 | 6debc21c-b1e2-47fe-bd66-28bc99c613ed
 47 | 6e8ecd99-5aa1-4614-87b7-c374787e75fc
 48 | 71b53b01-d6ec-4410-8692-f3a4317db5d9
 49 | 722842ff-fd56-4965-9f31-c12bd81de159
 50 | 722d4179-a064-4e83-b39a-75b5ef361d13
 51 | 723ab076-7381-4364-bd04-6a04d8011e8a
 52 | 72b856e7-8ce1-4ae8-8d69-a7f4b76b97f9
 53 | 73ffa0ba-a122-4be8-ab74-8116c432d361
 54 | 75634c49-7875-4c65-870b-8983b185547c
 55 | 790e85fa-b916-4b51-8512-37ac7d36f6de
 56 | 79c7d7fb-8ac8-4872-b0f5-fdfab8091391
 57 | 7d134ea2-8519-4a35-8b00-593a6e73d881
 58 | 84e45fce-d12f-4b6e-ba7a-8c6f5b2eadfb
 59 | 88312e9f-a42d-45f8-b720-043b7783cd96
 60 | 899cb43c-9ef0-4997-a5cc-a90bdbbb6f17
 61 | 89e67c77-33a8-4cd0-b866-a9dbfe09e26d
 62 | 8de6d5d1-bbb0-4027-bd49-c42e6fb7ef01
 63 | 914c8d2d-051f-4a34-8c44-679d10c9c0a8
 64 | 92d0abec-0f49-46f5-83ee-1ef2c2b29cd1
 65 | 93838731-fec0-4d4a-9b7e-dcadcf083dc6
 66 | 9414c05a-029c-430c-96c3-fa87405e2b1e
 67 | 995516f3-253f-4c05-ad4b-b857a9135a8e
 68 | 996b5802-eb44-4b6e-b6a0-5eaf64a29178
 69 | 9c70ced1-7eda-41e5-a59b-91a9c7c55b15
 70 | 9e75ab9c-a7dd-4d12-8f7e-2aa855b0061a
 71 | 9f1fe80d-3319-4fa4-a0ec-7c622ccea401
 72 | a0953af0-0593-4e93-82b7-5d850685a397
 73 | a0d82814-b3df-42e0-9a85-dc7d0c5d00fc
 74 | a1c00a3b-5164-4898-b7e8-832e8a78622f
 75 | a67a55d6-39f6-44d5-a90e-7af3f925434f
 76 | a6c27c5e-0346-4d36-919e-0ccd80a76c20
 77 | aa58211e-379f-463b-b370-a9adad976ba0
 78 | ab45bb9d-a326-4143-b6a4-c6e4b6bc1b40
 79 | b06c9621-e956-49a7-b8b5-3a44695ae9e7
 80 | b6b5dcfa-d692-4940-83cb-3a343aef28e6
 81 | b6e63155-cd48-45d3-a787-c0c3ca330c16
 82 | ba3f3a2d-2ffd-461c-a9c6-9cbfe967ff0e
 83 | bac40ece-2ec0-4807-a35d-748fdb0d45f7
 84 | bcb1ea92-64b0-4d2d-a751-9034aaa0748c
 85 | bef9ef3c-b6c8-4cf0-aab4-1facf08a04a7
 86 | bf95def0-ed0a-40b0-9e6b-824464c9d037
 87 | c8319529-1328-4372-ac3e-b38faf22b122
 88 | cc44026d-7056-4627-a7b7-4eb7e42ae7a2
 89 | cc96310f-aad1-4fa2-ae04-a7239432f338
 90 | cd64550f-4cbd-480f-afb8-3cb6940ffa49
 91 | ce5363ef-2de7-451f-9718-3a2f130d955a
 92 | d0ff754e-7757-426d-834e-759e43b41a85
 93 | d5049a47-27de-4463-b213-9123199fe8ba
 94 | d5610dde-7d1e-4917-b700-3cba9158041b
 95 | d7a5d40b-b2f3-4b46-a613-b2ff5fa24480
 96 | dbeafaed-2acb-485f-9e88-92ee7e94b168
 97 | dc084a43-a6ee-49fe-b704-780af79e687d
 98 | dd344f2e-aadd-49ab-a644-f18fcc343ef5
 99 | e4785c4d-bfaa-485b-b124-d3ab0a19aef8
100 | e49f3ac3-4d27-4fc2-86e9-7adb7a91e208
101 | e894afd0-a924-4951-ac22-b6841231e0a2
102 | e962ce93-c3b8-4524-8955-1f826e27756d
103 | ea95e3b2-e3a2-4493-8a60-eaf9bb0b5363
104 | ec0f44d5-e3f7-4b09-b44a-f1dc9731ce31
105 | f0359e15-f01f-425d-b231-fb53ea4ccd71
106 | f2c9cbc1-9383-4266-8dba-4167eb127e50
107 | f34b80b1-0a0f-4e37-8ecc-2e13b4c1553b
108 | f53f8656-b74f-47d6-8bae-d86c6da684d1
109 | f73c413b-a6cf-4da6-a833-14893ebf39ae
110 | fd5f514c-bc98-4042-a13f-40915c6f715d
111 | fe83094b-a94c-46be-a74d-faf572d73b10
112 | feaeac67-aee8-4b27-a10d-42f3ddfb6266
113 | feeb857a-30b1-41ac-96dc-482f85b04a2d
114 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/analysisID_lists/DLBC.ids:
--------------------------------------------------------------------------------
 1 | 0706b153-fb32-445d-8ab5-b4ff0d75a7ff
 2 | 0b2ff723-4788-497a-be41-789db913ac9d
 3 | 1283e672-5e91-4e41-bdfa-46ee5e17b255
 4 | 15d7b6b0-2e09-4bd5-bbc8-98af6ee83fe0
 5 | 17da368c-d609-4792-97f6-c454ba732c71
 6 | 1e727a47-150a-40e7-8bcb-c7b3897d5276
 7 | 1fdae439-6285-496c-ae97-f64265e8110e
 8 | 26d61be3-88d4-4714-a25e-09a347d1645e
 9 | 29e221f7-34bb-4811-b943-2d1f186a1c29
10 | 2c10b72d-e10b-4284-a0d3-c40f0af522f7
11 | 2db07d8b-22f4-4799-9bb9-d1bb58fcf1ec
12 | 2f749f44-fe34-4882-ab91-e231ba6a67f9
13 | 30b796bd-7020-4a64-800a-7b39d624cc7c
14 | 335fa833-2125-480b-b009-e89ba8a610cf
15 | 355d4cae-8061-4d6c-b737-7209d721e8a9
16 | 3dd0c890-265b-4bd1-ae0d-22ced50c24fe
17 | 3e0f1eaf-b928-4d5d-9997-7e9a366f4cf4
18 | 48b53f33-2a09-47b4-becc-a063f3c7bfc6
19 | 498edf5e-1cab-4579-9170-bf6ce90a59df
20 | 5352e2a7-8b20-4909-8bae-5467c710bd8f
21 | 54050a4a-8577-4eed-902b-c54c1f519a65
22 | 570a8c41-9433-46bb-8283-1870307ed2cb
23 | 58308fe4-a9a3-40e7-a417-44732826c70f
24 | 5a2510f2-0d30-41d3-956c-9dccace0fbb3
25 | 5adc8875-a9a9-441b-995a-2f62f45d0577
26 | 5b6aae67-7801-4842-9996-502d5d872b1b
27 | 5bced86b-7bed-4a51-8c9e-3c0b4ebcdf3c
28 | 668491ad-c0e1-4055-8b4d-5368111b7f72
29 | 676bcdca-e088-418a-8079-435c6e14519c
30 | 69d5e60c-8dd9-4927-8888-14d182da9c65
31 | 6aa7bb0c-0929-4332-abeb-a9140691a8d1
32 | 6c89c9aa-9d84-42dc-bcc2-66a8a7bbd3e5
33 | 78885547-bfe9-4252-814a-ee5ca91369c0
34 | 799f0ecb-99d2-4bd8-a05e-9542f5c0d960
35 | 7bf895e1-2184-449c-a256-058a52d0d540
36 | 7d0c1dda-ef6f-4e74-9e19-0b793c972d12
37 | 7f3492d5-1cd6-41cb-aee3-32f670add6fc
38 | 80c661c8-78d0-4ff5-8cf3-360ff312e36a
39 | 82dad9bd-3e63-4e4c-af18-e394b4d0f17a
40 | 8b7408a3-ce15-40d8-9d3e-b969d373d8f9
41 | 92762a6f-4d78-457d-843e-7a470df7c827
42 | 95c41569-cbc5-46fd-a39c-3680c41a3f03
43 | 9754651c-7be9-4140-8d83-2ee34fcebb4e
44 | 9863378d-5320-4e88-b0db-cca0145b6ccc
45 | 9b256771-6d08-48c7-9696-49ce822961a0
46 | 9c02043f-9daf-437c-8d81-6c214ca5e098
47 | 9e24e779-e612-4256-a41c-667dcacc0728
48 | 9f3f4b94-4adc-4724-b449-cf6c63bdd5d1
49 | a4aa6fdc-4136-4d5d-8858-e918bdbbb3cc
50 | a6aa1829-cbf5-4775-b21d-7d63bfa7a1a4
51 | a78918b5-8182-4eaf-99d7-443e68db17c2
52 | a87753ea-3962-4fa8-a335-7e7ca1197a8e
53 | a8abdd19-9a22-4e82-966f-cf1c44d475d0
54 | abe975a5-4cf2-4d0a-82d7-5b1474673bf7
55 | ac923ff6-77a9-43ef-b7b1-298e5b9c0e1f
56 | aec76bf9-f389-4744-997d-90157466f650
57 | b1f6abd2-b661-4790-a55c-3df2dbd3b105
58 | b27c2a2d-fdf1-4e70-a78a-d18be45f54c4
59 | b3e9db29-8551-4e73-b65d-3c7c3d7d4619
60 | b703c2d0-10a6-4988-b5bb-45c746364ebc
61 | b70b0300-0a53-4d60-b6fc-edef9ba05bc8
62 | bf7633ac-eb47-48d9-95d5-84bee621f230
63 | c12d7063-94fd-4b29-a52d-81581ac47fbc
64 | c5e0c935-5018-47ce-abbc-e200ae808d3b
65 | cd223dab-94b1-4b87-9ff3-9af759ba1d21
66 | ce46292f-25b6-4b84-9631-1198f75de4b4
67 | d273ceec-6525-4a8e-b71a-eef9ea7025cb
68 | d7901b09-70bc-472e-a790-9a3134d28d86
69 | d7916479-6e6b-4926-83e3-002682cbd0bd
70 | d8a691e9-1b11-4666-87f4-188a12e716c5
71 | dd3e9cb7-ef09-4f4b-b940-a5959953d11c
72 | ddf60239-37d2-4d30-9523-3a359ece1b3c
73 | de117caa-acbe-4fe8-bd89-6e1b2743881d
74 | de5f55b5-dec3-465b-be46-a073f89fbe0d
75 | deebc37c-0f59-4de3-8727-072b3c60fdda
76 | e1d1e2ce-5eaa-4487-921f-a6c791fb5e7a
77 | e3b54861-5f84-4341-84a8-3a74f1f42e51
78 | e820d45d-cba3-453f-b618-063236575f78
79 | ea038362-67b8-4c50-bb49-45731ea0c90a
80 | ede52a3c-ea6c-4f85-9dc2-6e2efaa8eab3
81 | f1fdc4ed-bc79-4670-99ae-cd1e20795d36
82 | f44c54ed-40cb-4a51-a526-ec7c211a008c
83 | f7d459b9-3353-4fcd-b477-32e5964d9217
84 | f811068c-70c8-4825-9567-16cc0037cf03
85 | f9ba0957-2226-4c24-a1d3-dd5f693e8c80
86 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/analysisID_lists/UCS.ids:
--------------------------------------------------------------------------------
 1 | 08e12a8e-5220-4770-a50d-00ee4046ccaf
 2 | 0b8434ac-f295-437d-acdd-08e6d428936d
 3 | 0e1c1fa0-ae6f-4f34-aea9-657a801f04d9
 4 | 103a746b-f2df-4f06-a5a0-90b92d823458
 5 | 10fd317c-6a26-4da3-9694-373e7b2066cd
 6 | 1222db1c-c99c-411c-9f51-6b0ec158a49d
 7 | 17cea4ac-0b5f-42fe-a1b7-362311349965
 8 | 184000f8-0963-4ce3-87dd-5acdab06c56e
 9 | 18e815cd-6225-489a-ac9b-37132b635b53
10 | 1b65b29b-22bd-4706-867f-0d953e818453
11 | 1c7b7652-0311-45bf-9cb8-18b3b7caced2
12 | 1e515b00-58c4-4ce4-8c19-73a27eaaa707
13 | 2299c370-db0f-4d36-abce-894a039d6895
14 | 22d1b470-710d-415c-bdae-786e14ff68e3
15 | 22dfbcbd-2b8d-49ac-9a50-02dffe45263e
16 | 25c94f94-2154-4024-a9cb-b3c8c09eec5d
17 | 26fd12f9-0c59-470d-8ef2-31ad48ded9d8
18 | 29a60b4a-8b42-4394-ad99-a6704da8274c
19 | 30085e74-ced4-43d2-9eb9-cc0ff4402d7d
20 | 556d0725-a072-4f94-9467-2f7a148d9e40
21 | 585388db-c4fc-4b1d-b161-ebe19a4acd4c
22 | 5b1a7d51-92f7-4003-aee2-0fa68764e400
23 | 5b969f8b-2484-4cdf-ae77-fd77d40a2ac5
24 | 5c29cdfc-5120-423b-98ad-ba2fbb91833d
25 | 5d582226-0942-447b-a834-7c7d578b652a
26 | 7c116e6b-566e-4e20-aafc-aa68ea775fb8
27 | 7e82d505-51cf-472f-b850-519e50e4152f
28 | 823e8f5e-34ff-4751-a696-6e1270e57ba9
29 | 8eef9582-d240-4df8-b8e9-b5e521060653
30 | 9170a2b3-fd9f-4849-84a1-eeb1f89f2d02
31 | 948e4ecc-676b-44d4-90ad-4539f82cafc3
32 | 994d9165-bf0f-4412-a4af-16e4a0193cfe
33 | 9957321a-c3d4-4a6b-a78b-a29f843c36ba
34 | 9c8afe1b-1690-4f8d-8771-9f277a6ea1c9
35 | a0cd9f50-fbc7-4d76-badb-204a58de275f
36 | a4603f50-94d0-49f5-8686-b3579e5c393a
37 | a4edf06b-db69-448e-9317-18629e102b91
38 | aa25fc96-c66b-4629-9e23-70ca0661cb6c
39 | b61f6646-0e70-4a54-ae8b-87c4ba21880d
40 | be3f17b4-65d7-45a4-8bc9-686c298184fa
41 | c0a5553c-b623-45ae-a369-42fefff00109
42 | c704a3b7-05ee-4340-854a-dcdd7f8168a1
43 | c7716f57-e9ed-4acc-a3d6-d8ff7c15a801
44 | d2c491bf-7ce4-44cb-b735-dde9cd6bffc3
45 | d7736ff5-ab09-40b0-b634-70367b17fb42
46 | db042e45-e59b-42ab-8031-2da836a8cd65
47 | de931de2-38c0-4b8e-b21e-35cebbe9d2b4
48 | e07b0cb2-d492-4dad-94ed-81d53cd1f4aa
49 | e0974afe-5847-4fd3-b78d-43dce6851a2d
50 | e2d04a29-694d-446a-80d1-b957b1f0f9e4
51 | e2dc9062-a426-402c-a0f0-f147580d8bcd
52 | e58159cb-ff8c-4b7c-b136-f044a6424c46
53 | eccb6457-4629-49d7-91d3-0329a9943bc9
54 | ed64e08c-40c2-4712-aaf7-6625dcad4aaf
55 | f77313cf-8249-4b8b-a230-ab42afdd8bc5
56 | f8238d45-38a4-4b43-941f-fe5500ee4d40
57 | faaae401-c7b6-40f3-9491-28adcfb1550a
58 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/analysisID_lists/UVM.ids:
--------------------------------------------------------------------------------
 1 | 06b67b89-65ff-43d5-ada7-5e8e6413815f
 2 | 0a06f8b7-c37e-4b21-aa9a-2e7acce85904
 3 | 0b5c74aa-2ae3-4f6c-a65a-f6c2debad86d
 4 | 0b6c8bcb-be31-4e16-8359-2b4080fb403d
 5 | 0cf5dd7c-1d8d-4f8e-a66d-be3556515c8e
 6 | 0da5e271-e403-4b84-a5ea-6a2148472aef
 7 | 0e41d859-bc51-4c6a-b880-5789a8ee83ac
 8 | 0e9e81c5-9391-4adb-945c-8eaf5859ef20
 9 | 10f63d37-d925-496b-bf16-2c4fa71fdc05
10 | 127793c7-a726-4216-b9a7-ad9641e03d15
11 | 12dd6b53-4000-46be-a442-2486f6fe257d
12 | 13725399-7ec0-4620-975d-69e0b9d50002
13 | 14a68f08-85b3-451d-b4bb-30bf6a6df3c5
14 | 151daea0-6553-437e-bcc7-c7d68c155b95
15 | 196e5009-adcf-47ec-ad6a-0c0d399c45cc
16 | 1ae6cc79-178e-4693-b340-3290d2086782
17 | 1dd3d2dd-cf62-4760-8560-ca03622b2f6a
18 | 1e55a54d-8ae3-4d1b-92f1-e5f6e8cc54da
19 | 21b3e8bd-76ce-4957-8de7-bff48af5e05c
20 | 245edb11-fa42-4dc6-abb9-00f3ddda65fe
21 | 266f9b4a-f427-4131-928e-2ccb53a6de7d
22 | 2a17109e-4b46-4ac2-8bc9-89549a0a65e5
23 | 2a8318e1-6565-4fcd-8764-ecb2f208d671
24 | 2b97651a-1306-462a-97fe-7bf22054da81
25 | 2d9cb3b8-5ed2-438b-873f-fa8e46a857af
26 | 2ec8feb1-e07e-4803-9c65-555c1f4b1415
27 | 301e9730-5c5e-4934-a951-eca76df91b95
28 | 3da21087-0c6f-4972-a8d1-15847c5ab2b8
29 | 3e08b7f7-abae-43d3-b996-a85cf07f443f
30 | 4685b14f-1153-42a2-ae0a-06c9d833b2a5
31 | 512df98e-abd6-487a-8955-46f73ed9eb75
32 | 521e8b6c-287f-45ba-a08c-dfb8ab2f5d6c
33 | 56e324f4-24cc-49ce-bb7b-8f0cc8881a1d
34 | 608584ee-2611-4312-8a21-a53ce20f58ac
35 | 61ae63c7-6581-4bb5-9856-476f9077ad7b
36 | 6b38ee33-5bd2-455e-b0a3-c82968e7fde7
37 | 71e03403-d24e-4039-9940-f88fce01ade5
38 | 73e00332-2716-47c6-8f73-61b0d4730454
39 | 7432dc4c-5af1-4aa5-b22a-6f2d45e11219
40 | 74eb2254-3398-4680-93e3-cbc60b83e01b
41 | 79ee7ade-d019-454a-b53a-fc8fec750f19
42 | 7b869ca3-86df-461e-b1f2-bd43960da45c
43 | 7b8db7d1-9978-4038-b321-8ec7e7a02210
44 | 7e18cd39-cf37-43c4-9fd2-0aa522bdbd10
45 | 7fe12666-cba9-44b2-a6f9-1c078b88121b
46 | 84a97aa6-3c93-48b1-b7e9-e063f80606cd
47 | 852bc893-6939-4558-bb15-cebb997a9797
48 | 869dc9d7-a30a-400b-80d5-596924a73a63
49 | 8be24848-b089-4cdb-be6e-0d617b1fe16c
50 | 8ed4629c-1e6c-43f1-8275-f1494f94859b
51 | 9347d4b7-88d5-45d1-96dc-5e781185bfdf
52 | 960d692d-c144-4aa0-b0a1-acfc3a43ee2f
53 | 96ae97a7-7631-4bd3-a7c0-d42761042f1f
54 | a5aa583d-bbb8-4129-a7a4-8542a647398f
55 | a6bc4b41-a8ec-47cd-ab2d-a07e2b70f2e3
56 | a9f94e7b-32b5-48f3-a4a6-ef76d5408463
57 | ac91dae2-608c-4a0b-bf22-282d6274fcf8
58 | b11c7d83-d013-4a95-ad65-86a421c6c0c9
59 | b3a011d8-11ae-4d39-90cd-c5f12d7fb963
60 | b677aba6-a979-49cd-9de4-dedc5ff775e6
61 | bcfd1131-c6b1-4692-acf9-5000afe79d89
62 | c4430ed7-4733-4b2d-9606-63ee509abf09
63 | c6208775-c4bc-4869-a7fb-d0d820b19a99
64 | c7e4a802-a691-48dc-b447-1e4181feb5eb
65 | ced4f609-8f81-4323-b739-bc41e7304b44
66 | d0035037-a84c-4819-b379-a76f2c455224
67 | d436eb15-6638-472c-a011-196fd98606c8
68 | d539f558-316e-4e83-a4d8-3ef06c3df63e
69 | d9d24327-ee9f-462c-8672-b48c31bc4656
70 | dcc7f7a7-bac8-4932-b1f8-ee1ef1327c94
71 | df51e932-1fe3-45d1-ae96-d6de6d8e592a
72 | e113d903-5773-47ce-855f-fe43446783c8
73 | e3a6ef74-634e-4c97-8ead-dcacdfea4258
74 | ef9a88a0-106c-42e0-8c22-ec86bed608b8
75 | f42dfc88-d12b-490b-aadd-8fb7cd08da3b
76 | f440c9d3-d62e-4ffc-afc1-7b444d191c99
77 | f50234f7-56a7-4b50-8249-bb19940d0f70
78 | fbad6458-9565-4454-a03a-8ec83e6fb60c
79 | fc50c7f1-e67d-43d4-a211-543491d05f44
80 | fcf2b05a-7fdd-43fc-bf51-9e219e39526d
81 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/annotate.not-in-exac.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # by Jay Mashl, June 2017
 4 | # PanCanAtlas Germline
 5 | 
 6 | #for cancerType in  ACC BLCA  BRCA CESC CHOL  COAD DLBC  ESCA  GBM HNSC  KICH   KIRC KIRP  LGG LIHC LUAD   LUSC MESO OV  PAAD PCPG  PRAD  READ SARC  SKCM  STAD TGCT  THCA THYM UCEC  UCS UVM  ; do
 7 | cancerType=$1
 8 | 
 9 | analysisList=analysisID_lists/$cancerType.ids
10 | 
11 | for analysisId in $(cat $analysisList) ; do
12 | 
13 |     # variable input
14 |     inputPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine
15 |     VCFIN=prefilter.snp_indel.vcf.gz
16 | 
17 |     outputPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine
18 |     #VCFOUT=${VCFIN/%vcf.gz/annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz}
19 |     VCFOUT=${VCFIN/%vcf.gz/annotated.ExAC_AF.0.01.ExAConly.AD.3.ROI.vcf.gz} # updated for the updated run with only ExAC rare
20 | 
21 |     logsPath=gs://dinglab/isb-cgc/tcga/germline/production/${cancerType}/${analysisId}/combine/annotate_logs
22 | 
23 |     # fixed input
24 |     EXAC=gs://dinglab/jay/annotation/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz
25 |     BEDFILE=gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed
26 | 
27 |     ~/dsub/dsub/dsub \
28 | 	--project isb-cgc-06-0004 \
29 | 	--zones "us-central1-*" \
30 | 	--logging $logsPath \
31 | 	--input VCFIN=${inputPath}/${VCFIN}  VCFINIDX=${inputPath}/${VCFIN}.tbi  EXAC=${EXAC} EXACIDX=${EXAC}.tbi  BEDFILE=${BEDFILE} \
32 | 	--output  VCFOUT=${outputPath}/${VCFOUT}  VCFOUTIDX=${outputPath}/${VCFOUT}.tbi   \
33 | 	--command 'cd $(dirname ${VCFIN}) && mv $EXAC $EXACIDX $BEDFILE $(dirname ${VCFIN})  &&  /usr/local/bin/variant_QC_annotation.sh  $(basename ${VCFIN}) &&  mv $(basename ${VCFOUT}) $(basename ${VCFOUTIDX}) $(dirname ${VCFOUT})' \
34 | 	--disk-size 20 \
35 | 	--min-ram 4 \
36 | 	--min-cores 1 \
37 | 	--name annotate \
38 | 	--image gcr.io/isb-cgc-06-0004/dinglab_pca_analysis:0.1 \
39 | 	--scopes  https://www.googleapis.com/auth/compute https://www.googleapis.com/auth/devstorage.full_control https://www.googleapis.com/auth/genomics https://www.googleapis.com/auth/logging.write https://www.googleapis.com/auth/monitoring.write
40 | 
41 | done
42 | 
43 | #done
44 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | FROM google/cloud-sdk
 3 | 
 4 | LABEL maintainer="R. Jay Mashl <rmashl@wustl.edu>"
 5 | LABEL program="PanCanAtlas analysis"
 6 | LABEL version="0.1"
 7 | 
 8 | RUN apt-get update && apt-get -y install \
 9 |     autoconf \
10 |     build-essential \
11 |     libncurses-dev \
12 |     perl \    		   
13 |     pkg-config \
14 |     unzip \
15 |     wget \
16 |     zlib1g-dev \
17 | && rm -rf /var/lib/apt/lists/*
18 | 		 
19 | # install vcfanno v0.2.6 (prebuilt linux64 binary) into /usr/local/bin
20 | WORKDIR /usr/local/bin
21 | RUN    wget -O vcfanno https://github.com/brentp/vcfanno/releases/download/v0.2.6/vcfanno_linux64
22 | RUN    chmod +x ./vcfanno
23 | 
24 | # install vcftools v0.1.14 from source (prefix /usr/local)
25 | WORKDIR /usr/local/src
26 | RUN    wget -O v0.1.14.zip https://github.com/vcftools/vcftools/archive/v0.1.14.zip && unzip v0.1.14.zip && rm -f v0.1.14.zip  && cd vcftools-0.1.14 && export ZLIB_LIBS=-lz && export ZLIB_CFLAGS=-I/usr/include && ./autogen.sh && ./configure --prefix=/usr/local && make && make install
27 | 
28 | # install samtools 1.2 plus bgzip, htsfile and tabix from its bundled htslib 1.2.1
29 | WORKDIR /usr/local/src
30 | RUN     wget -O samtools-1.2.tar.bz2 https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 && tar xjf samtools-1.2.tar.bz2  && rm -f samtools-1.2.tar.bz2
31 | RUN     cd samtools-1.2/htslib-1.2.1 && ./configure && make  && cp bgzip htsfile tabix /usr/local/bin/ && cp libhts.so.1 /usr/local/lib/ && /sbin/ldconfig
32 | WORKDIR /usr/local/src
33 | RUN     cd samtools-1.2 && make  && cp samtools /usr/local/bin/
34 | 
35 | # copy the annotation script, the AF/AD filter and the vcfanno ExAC config into /usr/local/bin
36 | WORKDIR /usr/local/bin
37 | COPY     variant_QC_annotation.sh  filter_VCF_AF_AD.py ExAC_config.toml  ./
38 | RUN      chmod +x ./variant_QC_annotation.sh
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/docker/ExAC_config.toml:
--------------------------------------------------------------------------------
 1 | # vcfanno configuration.
 2 | # Step 1: copy the adjusted allele count (AC_Adj) and allele number (AN_Adj)
 3 | # from the ExAC non-TCGA sites file into the query VCF. The path is relative,
 4 | # so the file has to be reachable from the directory vcfanno is started in.
 5 | [[annotation]]
 6 | # earlier local copy:
 7 | #file="/Users/khuang/Box Sync/PhD/germline/ExAC/ExAC.r1.sites.vep.vcf.gz"
 8 | file="ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz"
 9 | fields=["AC_Adj","AN_Adj"]
10 | ops=["self","self"]
11 | names=["ExAC_AC_Adj","ExAC_AN_Adj"]
12 | 
13 | # Step 2: derive ExAC_AF_Adj from the two fields added in step 1
14 | # (op "div2" applied to ExAC_AC_Adj and ExAC_AN_Adj, stored as a Float).
15 | [[postannotation]]
16 | fields=["ExAC_AC_Adj", "ExAC_AN_Adj"]
17 | name="ExAC_AF_Adj"
18 | op="div2"
19 | type="Float"
20 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/docker/filter_VCF_AF_AD.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 
  3 |  
  4 | import sys
  5 | import getopt
  6 | import gzip
  7 | 
  8 | class autovivification(dict):
  9 |     '''Implementation of perl's autovivification feature.'''
 10 |     def __init__( self , *args , **kwargs ):
 11 |         super( autovivification , self ).__init__( *args , **kwargs )
 12 |         self.itemlist = super( autovivification , self ).keys()
 13 |     def __getitem__(self, item):
 14 |         try:
 15 |             return dict.__getitem__(self, item)
 16 |         except KeyError:
 17 |             value = self[item] = type(self)()
 18 |             return value
 19 | 
 20 | def main():
 21 |     def usage():
 22 |         print """
 23 |     filter_AD_VCF.py : why do I exist?
 24 | 
 25 |     USAGE: filter_AD_VCF.py  [-h] <VCF filename> <ExAC Allelic Frequency (AF) threshold> <Allelic Depth (AD) threshold> 
 26 |      -h    print this message
 27 |      <VCF filename>    input file
 28 |         """
 29 | 
 30 |     if len(sys.argv) == 4:
 31 |         vcfFH= sys.argv[1]
 32 |         MAF_thres = float(sys.argv[2])
 33 |         AD_thres = int(sys.argv[3])
 34 |     else:
 35 |         usage()
 36 |         sys.exit()
 37 | 
 38 |     try:
 39 |         vcfF = open(vcfFH,"r")
 40 |     except IOError:
 41 |         print("VCF file does not exist!")  
 42 | 
 43 |     outFstring = "ExAC_AF." + str(MAF_thres) + ".AD." + str(AD_thres) + ".vcf"
 44 |     outF = vcfFH.replace("vcf",outFstring)
 45 |     outFH = open(outF, "w")
 46 | 
 47 |     all_var = 0
 48 |     nonpass_MAF_var = 0
 49 |     nonpass_AD_var = 0 
 50 |     pass_var = 0 
 51 | 
 52 |     for line in vcfF:
 53 |         line=line.strip()
 54 |         # print the info lines
 55 |         if line.startswith("#"):
 56 |             outFH.write(line + "\n")
 57 |         else:
 58 |             F = line.split("\t")
 59 |             all_var = all_var + 1
 60 | 
 61 |             info_f = str(F[7]).split(";")
 62 |             format_f = str(F[8]).split(":")
 63 |             geno_f = str(F[9]).split(":")
 64 |             AD_index = -1
 65 | 
 66 |             ### MAF filter
 67 |             nonpass_MAF = False
 68 |             for info in info_f:
 69 |                 # find the cases with annotated ExAC frequency
 70 |                 if info.startswith("ExAC_AF_Adj"):
 71 |                     ExAC_AF = info.replace("ExAC_AC_Adj=","")
 72 |                     if "," in ExAC_AF:
 73 |                         ExAC_AFs = ExAC_AF.split(",")
 74 |                         if ExAC_AFs[0] > MAF_thres: # need to assume it's the first allele
 75 |                             nonpass_MAF = True
 76 |                     else:
 77 |                         if ExAC_AF > MAF_thres:
 78 |                             nonpass_MAF = True
 79 |             if nonpass_MAF:
 80 |                 nonpass_MAF_var = nonpass_MAF_var + 1
 81 |                 continue
 82 | 
 83 | 
 84 |             ### AD filter
 85 |             nonpass_AD = False
 86 |             for i in range(0,len(format_f)):
 87 |                 if str(format_f[i]) == "AD":
 88 |                     AD_index = i
 89 | 
 90 |             genotype = str(geno_f[AD_index])
 91 |             # GATK and Pindel calls
 92 |             # second int for alt allele
 93 |             if "," in genotype: 
 94 |                 genotypes = genotype.split(",")
 95 |                 if int(genotypes[1]) < AD_thres:
 96 |                     nonpass_AD = True
 97 |                     nonpass_AD_var = nonpass_AD_var + 1
 98 |             # varscan calls
 99 |             else:
100 |                 if int(genotype) < AD_thres:
101 |                     nonpass_AD = True
102 |                     nonpass_AD_var = nonpass_AD_var + 1
103 |             
104 |             if not nonpass_MAF and not nonpass_AD:
105 |                 pass_var = pass_var + 1
106 |                 outFH.write(line + "\n")
107 | 
108 |     # filter summary
109 |     print "number of total variants:", all_var
110 |     print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var
111 |     print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var
112 |     print "number of total passed variants:", pass_var
113 |             # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres:
114 |             #     print line
115 |             # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres:
116 |             #     print line
117 |     outFH.close()
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/docker/variant_QC_annotation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # by Kuan-lin Huang 2017 May @ WashU
 3 | echo "start time"
 4 | date
 5 | # requires: vcftools & vcfanno (https://github.com/brentp/vcfanno)
 6 | 
 7 | LOCAL=/usr/local/bin
 8 | 
 9 | ### AF and AD frequency filter ###
10 | vcfFile=$1
11 | 
12 | # gsutil cp  gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed
13 | bedFile=all_CDS_ncRNA_ENCODE_multicell_ROI.bed
14 | 
15 | vcfannoConfigFile=$LOCAL/ExAC_config.toml
16 | AF_thres=0.01
17 | AD_thres=3
18 | 
19 | # annotate with ExAC frequency
20 | annotated_VCF=${vcfFile/vcf.gz/annotated.vcf}
21 | echo "using vcfanno to annotate" ${vcfFile} "into" ${annotated_VCF}
22 | vcfanno ${vcfannoConfigFile} ${vcfFile} > ${annotated_VCF}
23 | 
24 | filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.AD.${AD_thres}.vcf}
25 | echo "filtering" ${annotated_VCF} "into" ${filtered_VCF}
26 | python $LOCAL/filter_VCF_AF_AD.py ${annotated_VCF} $AF_thres $AD_thres
27 | date
28 | 
29 | ### extract ROI (region of interest) ###
30 | extracted_VCF=${filtered_VCF/vcf/ROI.vcf.gz}
31 | echo "extracting" ${filtered_VCF} "based on" $bedFile "into" ${extracted_VCF}
32 | vcftools --vcf $filtered_VCF \
33 | --bed $bedFile \
34 | --keep-INFO-all --recode -c  | bgzip -c  > ${extracted_VCF}
35 | 
36 | # tabix 
37 | echo "Indexing extracted VCF"
38 | tabix -p vcf ${extracted_VCF}
39 | # remove intermediate VCF files
40 | rm -f $annotated_VCF
41 | rm -f $filtered_VCF
42 | date
43 | 
44 | # possible option: calculate concordance with genotype file here [in this case you may want to batch the jobs by cancer types as I have already extracted those to ENCODE and exon regions in the VM: kuan-merge-genotype-bigmem]
45 | 
46 | # [alternative: merge first across samples in a separate workflow] 
47 | # [alternative: do a 5% MAF filter for the merged VCF within cohort to remove potential pipeline artifacts]
48 | 
49 | # annotate the resulting VCF using VEP
50 | 
51 | # run CharGer
52 | 
53 | # move the resulting VCF and files back to storage
54 | 
55 | echo "end time"
56 | date
57 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/make_lists.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # by Jay Mashl, June 2017
 4 | # PanCanAtlas Germline
 5 | 
 6 | # Get analysisId
 7 | LISTS_DIR=analysisID_lists
 8 | mkdir -p $LISTS_DIR
 9 | for cancerType in  ACC BLCA  BRCA CESC CHOL  COAD DLBC  ESCA  GBM HNSC  KICH   KIRC KIRP  LGG LIHC LUAD   LUSC MESO OV  PAAD PCPG  PRAD  READ SARC  SKCM  STAD TGCT  THCA THYM UCEC  UCS UVM  ; do
10 |     echo $cancerType
11 |     listFile=$LISTS_DIR/$cancerType.ids
12 |     if [ ! -e $listFile ] ; then
13 | 	    gsutil ls -d gs://dinglab/isb-cgc/tcga/germline/production/$cancerType/* | sed -e 's/\// /g' | awk '{print $NF}' > $listFile
14 |     fi
15 | done
16 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/google-cloud-ISB/unused/Dockerfile:
--------------------------------------------------------------------------------
  1 | # Collection of three Dockerfiles kept for reference (bioconda builder,
  2 | # vcftools + vcfanno, vcflib). Each FROM starts a new build stage; with
  3 | # multi-stage build semantics only the last stage ends up in the resulting image.
  4 | 
  5 | ##### bioconda: https://hub.docker.com/r/bioconda/bioconda-builder/~/dockerfile/ #####
  6 | 
  7 | FROM centos:centos5
  8 | 
  9 | # add tools useful for compilation
 10 | RUN rpm -Uvh http://dl.fedoraproject.org/pub/epel/5/x86_64/epel-release-5-4.noarch.rpm
 11 | # Install wget first so we can download devtools-2 and autotools repos
 12 | RUN yum install -y wget && \
 13 |     yum clean all
 14 | RUN wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
 15 | RUN yum install -y \
 16 |     bzip2 \
 17 |     git \
 18 |     gcc \
 19 |     gcc-c++ \
 20 |     patch \
 21 |     make \
 22 |     gcc44 \
 23 |     gcc44-c++ \
 24 |     cmake \
 25 |     unzip \
 26 |     byacc \
 27 |     devtoolset-2-gcc \
 28 |     devtoolset-2-binutils \
 29 |     devtoolset-2-gcc-c++ \
 30 |     devtoolset-2-gcc-gfortran \
 31 |     autotools-latest \
 32 |     pkgconfig \
 33 |     which \
 34 |     file \
 35 |     gpg \
 36 |     # Needed for perl-db-file
 37 |     db4-devel \
 38 |     # install X11 dependencies and openGL/mesa
 39 |     xorg-x11-apps \
 40 |     mesa-libGLU-devel \
 41 |     && yum clean all
 42 | 
 43 | # install conda
 44 | RUN mkdir -p /tmp/conda-build && wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b -p /anaconda
 45 | ENV PATH=/opt/rh/devtoolset-2/root/usr/bin:/opt/rh/autotools-latest/root/usr/bin:/anaconda/bin:$PATH
 46 | ENV LANG en_US.UTF-8
 47 | ENV LC_ALL en_US.UTF-8
 48 | RUN mkdir -p /anaconda/conda-bld/linux-64 /anaconda/conda-bld/osx-64 # workaround for bug in current conda (conda issue #466)
 49 | 
 50 | # setup conda
 51 | ADD requirements.txt requirements.txt
 52 | RUN conda update conda
 53 | RUN conda install -y --file requirements.txt
 54 | RUN conda update conda-build 
 55 | RUN conda index /anaconda/conda-bld/linux-64 /anaconda/conda-bld/osx-64
 56 | RUN conda config --add channels bioconda
 57 | RUN conda config --add channels r
 58 | RUN conda config --add channels file://anaconda/conda-bld
 59 | RUN conda install -y toposort
 60 | 
 61 | # setup entrypoint (assuming that repo is mounted under /bioconda-recipes)
 62 | ENTRYPOINT ["/bioconda-recipes/scripts/build-packages.py"]
 63 | CMD []
 64 | 
 65 | ##### vcftools: https://hub.docker.com/r/biocontainers/vcftools/~/dockerfile/ #####
 66 | # Base Image
 67 | FROM biocontainers/biocontainers:latest
 68 | 
 69 | # Metadata
 70 | LABEL base.image="biocontainers:latest"
 71 | LABEL version="1"
 72 | LABEL software="vcftools"
 73 | LABEL software.version="0.1.14"
 74 | LABEL description="A set of tools written in Perl and C++ for working with VCF files, such as those generated by the 1000 Genomes Project"
 75 | LABEL website="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html"
 76 | LABEL documentation="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html"
 77 | LABEL license="https://github.com/vcftools/vcftools|https://vcftools.github.io/index.html"
 78 | LABEL tags="Genomics"
 79 | 
 80 | # Maintainer
 81 | MAINTAINER Saulo Alves Aflitos <sauloal@gmail.com>
 82 | 
 83 | USER root
 84 | 
 85 | ENV ZIP=vcftools-0.1.14.tar.gz
 86 | ENV URL=https://github.com/vcftools/vcftools/releases/download/v0.1.14/
 87 | ENV FOLDER=vcftools-0.1.14
 88 | ENV DST=/tmp
 89 | 
 90 | RUN wget $URL/$ZIP -O $DST/$ZIP && \
 91 |   tar xvf $DST/$ZIP -C $DST && \
 92 |   rm $DST/$ZIP && \
 93 |   cd $DST/$FOLDER && \
 94 |   ./configure && \
 95 |   make && \
 96 |   make install && \
 97 |   cd / && \
 98 |   rm -rf $DST/$FOLDER
 99 | 
100 | ##### vcfanno: http://brentp.github.io/vcfanno/#installation #####
101 | 
102 | #RUN conda install -c bioconda vcfanno
103 | 
104 | # Installed while still root: writing into /usr/local/bin after the switch to
105 | # the biodocker user below would run unprivileged. The downloaded binary also
106 | # has to be made executable.
107 | RUN wget -O /usr/local/bin/vcfanno_linux64 https://github.com/brentp/vcfanno/releases/download/v0.2.6/vcfanno_linux64 && \
108 | 	chmod +x /usr/local/bin/vcfanno_linux64
109 | 
110 | USER biodocker
111 | 
112 | WORKDIR /data/
113 | 
114 | ##### vcflib: https://hub.docker.com/r/itsjeffreyy/vcflib/~/dockerfile/ #####
115 | # Base image ubuntu:16.04
116 | FROM ubuntu:16.04
117 | 
118 | # Author
119 | MAINTAINER Jeffreyy Chun-Hui Yu
120 | 
121 | # install the system requirement
122 | # (package index removed in the same layer; removing it in a later RUN would
123 | # not make the image any smaller)
124 | RUN \
125 | 	apt-get update --fix-missing -yq \
126 | 	&& apt-get install -q -y \
127 | 		autoconf automake build-essential bzip2 curl g++ gcc git \
128 | 		libgsl0-dev libncurses5-dev make tabix unzip wget zlib1g-dev \
129 | 	&& apt-get autoremove -y \
130 | 	&& rm -rf /var/lib/apt/lists/*
131 | 
132 | WORKDIR /opt
133 | 
134 | # install vcflib (current upstream head; no release is pinned)
135 | RUN \
136 | 	git clone --recursive https://github.com/vcflib/vcflib.git \
137 | 	&& cd vcflib \
138 | 	&& make 
139 | 
140 | ENV PATH=/opt/vcflib/bin:$PATH
141 | 
142 | # set path
143 | WORKDIR /root 
144 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/ExAC_config.toml:
--------------------------------------------------------------------------------
 1 | # vcfanno configuration.
 2 | # Step 1: copy the adjusted allele count (AC_Adj) and allele number (AN_Adj)
 3 | # from the ExAC non-TCGA sites file into the query VCF. The path is relative,
 4 | # so the file has to be reachable from the directory vcfanno is started in.
 5 | [[annotation]]
 6 | # earlier local copy:
 7 | #file="/Users/khuang/Box Sync/PhD/germline/ExAC/ExAC.r1.sites.vep.vcf.gz"
 8 | file="ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz"
 9 | fields=["AC_Adj","AN_Adj"]
10 | ops=["self","self"]
11 | names=["ExAC_AC_Adj","ExAC_AN_Adj"]
12 | 
13 | # Step 2: derive ExAC_AF_Adj from the two fields added in step 1
14 | # (op "div2" applied to ExAC_AC_Adj and ExAC_AN_Adj, stored as a Float).
15 | [[postannotation]]
16 | fields=["ExAC_AC_Adj", "ExAC_AN_Adj"]
17 | name="ExAC_AF_Adj"
18 | op="div2"
19 | type="Float"
20 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/create_ROI_genotype_VCF.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # For every cancer type listed in cancer_type.txt, extract the coding/ncRNA and
 3 | # the ENCODE regions from <cancer>.normal.merge.vcf.gz into two bgzipped,
 4 | # tabix-indexed VCFs. The two region files are first copied to the merge VM.
 5 | gcloud compute copy-files --zone us-central1-f ../../../../TCGA_data/reference_files/all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic huangkuanlin@kuan-merge-genotype-bigmem:~/
 6 | gcloud compute copy-files --zone us-central1-f ../../../../TCGA_data/reference_files/ROI_MultiCell_perid.txt huangkuanlin@kuan-merge-genotype-bigmem:~/
 7 | 
 8 | while IFS='' read -r line || [[ -n "$line" ]]; do
 9 |     # skip empty lines in the list
10 |     [[ -n "$line" ]] || continue
11 |     echo "Processing cancer type: $line"
12 |     cancer="${line%\\n}"
13 | 
14 |     # extract exonic region (runs in the background, alongside the ENCODE extraction)
15 |     vcftools --gzvcf "${cancer}.normal.merge.vcf.gz" \
16 |     --bed all_CDS_and_ncRNA_24Chroms_Contigs_1BasedStart_2bpFlanks_ForMusic \
17 |     --keep-INFO-all --recode -c | bgzip -c  > "${cancer}.normal.merge.allCDS.vcf.gz" &
18 | 
19 |     # extract encode region 
20 |     vcftools --gzvcf "${cancer}.normal.merge.vcf.gz" \
21 |     --bed ROI_MultiCell_perid.txt \
22 |     --keep-INFO-all --recode -c | bgzip -c  > "${cancer}.normal.merge.ENCODE.vcf.gz"
23 | 
24 |     # the background extraction must be complete before its output is indexed
25 |     wait
26 | 
27 |     #index
28 |     tabix -p vcf "${cancer}.normal.merge.allCDS.vcf.gz" &
29 |     tabix -p vcf "${cancer}.normal.merge.ENCODE.vcf.gz"
30 |     # do not move on (or exit) while the background indexing is still running
31 |     wait
32 | 
33 | done < cancer_type.txt


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/expand_csq.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | #Oct 2017 - Kuan-Lin Huang @ WashU - 
 3 |  
 4 | import sys
 5 | import getopt
 6 | import gzip
 7 | 
 8 | 
 9 | def main():
10 |     def usage():
11 |         print """
12 |     combine_CharGer2VCF.py : combine CharGer output to its originating VCF file
13 | 
14 |     USAGE: liftover_CharGer_result.py [-h] <CharGer results file> <VCF>
15 |      -h    print this message
16 |      <filename>    input file
17 |         """
18 | 
19 |     if len(sys.argv) >= 3:
20 | 	vcfheadFH = sys.argv[1]
21 |         CharGerFH= sys.argv[2]
22 |     else:
23 |         usage()
24 |         sys.exit()
25 |     
26 |     try:
27 |         vcfheadF = open(vcfheadFH,"r")
28 |     except IOError:
29 |         print("File , CharGerFH, does not exist!")
30 |     csq_header = ""
31 |     for line in vcfheadF:
32 | 	line = line.strip()
33 | 	if line.startswith("##INFO=<ID=CSQ"):
34 | 		F = line.split("|")
35 | 		F[0] = "Allele"
36 | 		csq_header = "\t".join(F)
37 |     
38 |     #open CharGer file
39 |     try:
40 |         charGerF = open(CharGerFH,"r")
41 |     except IOError:
42 |         print("File , CharGerFH, does not exist!")
43 |     
44 |     CharGerHeader = charGerF.readline().strip()
45 |     print CharGerHeader + "\t" + csq_header
46 |     #read input file
47 |     for line in charGerF:
48 |         line=line.strip()
49 |         F = line.split("\t")
50 | 	csq = F[32].split(",")
51 | 	csqF = csq[0].split("|")
52 | 	for i in range(len(csqF)):
53 | 		if csqF[i] == "":
54 | 			csqF[i] = "NA"
55 | 	csqFields = "\t".join(csqF)
56 | 	
57 | 	print line + "\t" + csqFields
58 | 
59 |     charGerF.close()
60 | 
61 | if __name__ == "__main__":
62 |     main()
63 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/filter_VCF_AD.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 
 3 |  
 4 | import sys
 5 | import getopt
 6 | import gzip
 7 | 
 8 | class autovivification(dict):
 9 |     '''Implementation of perl's autovivification feature.'''
10 |     def __init__( self , *args , **kwargs ):
11 |         super( autovivification , self ).__init__( *args , **kwargs )
12 |         self.itemlist = super( autovivification , self ).keys()
13 |     def __getitem__(self, item):
14 |         try:
15 |             return dict.__getitem__(self, item)
16 |         except KeyError:
17 |             value = self[item] = type(self)()
18 |             return value
19 | 
20 | def main():
21 |     def usage():
22 |         print """
23 |     filter_AD_VCF.py : why do I exist?
24 | 
25 |     USAGE: filter_AD_VCF.py  [-h] <VCF filename> <Allelic Depth (AD) threshold> 
26 |      -h    print this message
27 |      <VCF filename>    input file
28 |         """
29 | 
30 |     if len(sys.argv) == 3:
31 |         vcfFH= sys.argv[1]
32 |         AD_thres = int(sys.argv[2])
33 |     else:
34 |         usage()
35 |         sys.exit()
36 | 
37 |     try:
38 |         vcfF = gzip.open(vcfFH,"r")
39 |     except IOError:
40 |         print("VCF file does not exist!")  
41 | 
42 | 
43 |     for line in vcfF:
44 |         line=line.strip()
45 |         # print the info lines
46 |         if line.startswith("#"):
47 |             print line
48 |         else:
49 |             F = line.split("\t")
50 | 
51 |             ref = str(F[3])
52 |             info_f = str(F[7]).split(";")
53 |             format_f = str(F[8]).split(":")
54 |             geno_f = str(F[9]).split(":")
55 |             AD_index = -1
56 | 
57 |             ### reference filter
58 |             nonpass_ref = False
59 |             if ref == "N":
60 |                 nonpass_ref = True
61 | 
62 |             ### AD filter
63 |             nonpass_AD = False
64 |             for i in range(0,len(format_f)):
65 |                 if str(format_f[i]) == "AD":
66 |                     AD_index = i
67 | 
68 |             genotype = str(geno_f[AD_index])
69 |             # GATK and Pindel calls
70 |             # second int for alt allele
71 |             if "," in genotype: 
72 |                 genotypes = genotype.split(",")
73 |                 if int(genotypes[1]) < AD_thres:
74 |                     nonpass_AD = True
75 |             # varscan calls
76 |             else:
77 |                 if int(genotype) < AD_thres:
78 |                     nonpass_AD = True
79 |             
80 |             if not nonpass_ref and not nonpass_AD:
81 |                 #outFH.write(line + "\n")
82 |                 print line
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()
87 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/filter_VCF_AF_AD.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | #03 February 2016 - Kuan-Lin Huang @ WashU - 
  3 |  
  4 | import sys
  5 | import getopt
  6 | import gzip
  7 | 
  8 | class autovivification(dict):
  9 |     '''Implementation of perl's autovivification feature.'''
 10 |     def __init__( self , *args , **kwargs ):
 11 |         super( autovivification , self ).__init__( *args , **kwargs )
 12 |         self.itemlist = super( autovivification , self ).keys()
 13 |     def __getitem__(self, item):
 14 |         try:
 15 |             return dict.__getitem__(self, item)
 16 |         except KeyError:
 17 |             value = self[item] = type(self)()
 18 |             return value
 19 | 
 20 | def main():
 21 |     def usage():
 22 |         print """
 23 |     filter_AD_VCF.py : why do I exist?
 24 | 
 25 |     USAGE: filter_AD_VCF.py  [-h] <VCF filename> <ExAC Allelic Frequency (AF) threshold> <Allelic Depth (AD) threshold> 
 26 |      -h    print this message
 27 |      <VCF filename>    input file
 28 |         """
 29 | 
 30 |     if len(sys.argv) == 4:
 31 |         vcfFH= sys.argv[1]
 32 |         MAF_thres = float(sys.argv[2])
 33 |         AD_thres = int(sys.argv[3])
 34 |     else:
 35 |         usage()
 36 |         sys.exit()
 37 | 
 38 |     try:
 39 |         vcfF = open(vcfFH,"r")
 40 |     except IOError:
 41 |         print("VCF file does not exist!")  
 42 | 
 43 |     outFstring = "ExAC_AF." + str(MAF_thres) + ".AD." + str(AD_thres) + ".vcf"
 44 |     outF = vcfFH.replace("vcf",outFstring)
 45 |     outFH = open(outF, "w")
 46 | 
 47 |     all_var = 0
 48 |     nonpass_MAF_var = 0
 49 |     nonpass_AD_var = 0 
 50 |     pass_var = 0 
 51 | 
 52 |     for line in vcfF:
 53 |         line=line.strip()
 54 |         # print the info lines
 55 |         if line.startswith("#"):
 56 |             outFH.write(line + "\n")
 57 |         else:
 58 |             F = line.split("\t")
 59 |             all_var = all_var + 1
 60 | 
 61 |             info_f = str(F[7]).split(";")
 62 |             format_f = str(F[8]).split(":")
 63 |             geno_f = str(F[9]).split(":")
 64 |             AD_index = -1
 65 | 
 66 |             ### MAF filter
 67 |             nonpass_MAF = False
 68 |             for info in info_f:
 69 |                 # find the cases with annotated ExAC frequency
 70 |                 if info.startswith("ExAC_AF_Adj"):
 71 |                     ExAC_AF = info.replace("ExAC_AC_Adj=","")
 72 |                     if "," in ExAC_AF:
 73 |                         ExAC_AFs = ExAC_AF.split(",")
 74 |                         if ExAC_AFs[0] > MAF_thres: # need to assume it's the first allele
 75 |                             nonpass_MAF = True
 76 |                     else:
 77 |                         if ExAC_AF > MAF_thres:
 78 |                             nonpass_MAF = True
 79 |             if nonpass_MAF:
 80 |                 nonpass_MAF_var = nonpass_MAF_var + 1
 81 |                 continue
 82 | 
 83 | 
 84 |             ### AD filter
 85 |             nonpass_AD = False
 86 |             for i in range(0,len(format_f)):
 87 |                 if str(format_f[i]) == "AD":
 88 |                     AD_index = i
 89 | 
 90 |             genotype = str(geno_f[AD_index])
 91 |             # GATK and Pindel calls
 92 |             # second int for alt allele
 93 |             if "," in genotype: 
 94 |                 genotypes = genotype.split(",")
 95 |                 if int(genotypes[1]) < AD_thres:
 96 |                     nonpass_AD = True
 97 |                     nonpass_AD_var = nonpass_AD_var + 1
 98 |             # varscan calls
 99 |             else:
100 |                 if int(genotype) < AD_thres:
101 |                     nonpass_AD = True
102 |                     nonpass_AD_var = nonpass_AD_var + 1
103 |             
104 |             if not nonpass_MAF and not nonpass_AD:
105 |                 pass_var = pass_var + 1
106 |                 outFH.write(line + "\n")
107 | 
108 |     # filter summary
109 |     print "number of total variants:", all_var
110 |     print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var
111 |     print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var
112 |     print "number of total passed variants:", pass_var
113 |             # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres:
114 |             #     print line
115 |             # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres:
116 |             #     print line
117 |     outFH.close()
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/filter_VCF_AF_AD_keepExAConly.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | #Aug 2017 - Kuan-Lin Huang @ WashU - 
  3 |  
  4 | import sys
  5 | import getopt
  6 | import gzip
  7 | 
  8 | class autovivification(dict):
  9 |     '''Implementation of perl's autovivification feature.'''
 10 |     def __init__( self , *args , **kwargs ):
 11 |         super( autovivification , self ).__init__( *args , **kwargs )
 12 |         self.itemlist = super( autovivification , self ).keys()
 13 |     def __getitem__(self, item):
 14 |         try:
 15 |             return dict.__getitem__(self, item)
 16 |         except KeyError:
 17 |             value = self[item] = type(self)()
 18 |             return value
 19 | 
 20 | def main():
 21 |     def usage():
 22 |         print """
 23 |     filter_AD_VCF.py : why do I exist?
 24 | 
 25 |     USAGE: filter_AD_VCF.py  [-h] <VCF filename> <ExAC Allelic Frequency (AF) threshold> <Allelic Depth (AD) threshold> 
 26 |      -h    print this message
 27 |      <VCF filename>    input file
 28 |         """
 29 | 
 30 |     if len(sys.argv) == 4:
 31 |         vcfFH= sys.argv[1]
 32 |         MAF_thres = float(sys.argv[2])
 33 |         AD_thres = int(sys.argv[3])
 34 |     else:
 35 |         usage()
 36 |         sys.exit()
 37 | 
 38 |     try:
 39 |         vcfF = open(vcfFH,"r")
 40 |     except IOError:
 41 |         print("VCF file does not exist!")  
 42 | 
 43 |     outFstring = "ExAC_AF." + str(MAF_thres) + ".ExAConly.AD." + str(AD_thres) + ".vcf"
 44 |     outF = vcfFH.replace("vcf",outFstring)
 45 |     outFH = open(outF, "w")
 46 | 
 47 |     all_var = 0
 48 |     nonpass_MAF_var = 0
 49 |     nonpass_AD_var = 0 
 50 |     pass_var = 0 
 51 | 
 52 |     for line in vcfF:
 53 |         line=line.strip()
 54 |         # print the info lines
 55 |         if line.startswith("#"):
 56 |             outFH.write(line + "\n")
 57 |         else:
 58 |             F = line.split("\t")
 59 |             all_var = all_var + 1
 60 | 
 61 |             info_f = str(F[7]).split(";")
 62 |             format_f = str(F[8]).split(":")
 63 |             geno_f = str(F[9]).split(":")
 64 |             AD_index = -1
 65 | 
 66 |             # only keep rare, ExAC variants
 67 |             ### MAF filter
 68 |             nonpass_MAF = True
 69 |             for info in info_f:
 70 |                 # find the cases with annotated ExAC frequency
 71 |                 if info.startswith("ExAC_AF_Adj"):
 72 |                     ExAC_AF = info.replace("ExAC_AF_Adj=","")
 73 |                     if "," in ExAC_AF:
 74 |                         ExAC_AFs = ExAC_AF.split(",")
 75 |                         if float(ExAC_AFs[0]) < MAF_thres: # need to assume it's the first allele
 76 |                             nonpass_MAF = False
 77 |                     else:
 78 |                         if float(ExAC_AF) < MAF_thres:
 79 |                             nonpass_MAF = False
 80 |             if nonpass_MAF:
 81 |                 nonpass_MAF_var = nonpass_MAF_var + 1
 82 |                 continue
 83 | 
 84 | 
 85 |             ### AD filter
 86 |             nonpass_AD = False
 87 |             for i in range(0,len(format_f)):
 88 |                 if str(format_f[i]) == "AD":
 89 |                     AD_index = i
 90 | 
 91 |             genotype = str(geno_f[AD_index])
 92 |             # GATK and Pindel calls
 93 |             # second int for alt allele
 94 |             if "," in genotype: 
 95 |                 genotypes = genotype.split(",")
 96 |                 if int(genotypes[1]) < AD_thres:
 97 |                     nonpass_AD = True
 98 |                     nonpass_AD_var = nonpass_AD_var + 1
 99 |             # varscan calls
100 |             else:
101 |                 if int(genotype) < AD_thres:
102 |                     nonpass_AD = True
103 |                     nonpass_AD_var = nonpass_AD_var + 1
104 |             
105 |             if not nonpass_MAF and not nonpass_AD:
106 |                 pass_var = pass_var + 1
107 |                 outFH.write(line + "\n")
108 | 
109 |     # filter summary
110 |     print "number of total variants:", all_var
111 |     print "number of variants failing MAF filter of", MAF_thres,":", nonpass_MAF_var
112 |     print "number of variants failing AD filter of", AD_thres,":", nonpass_AD_var
113 |     print "number of total passed variants:", pass_var
114 |             # if len(genotypes) == 1 & int(genotypes[0])>=AD_thres:
115 |             #     print line
116 |             # elif len(genotypes) == 2 & int(genotypes[1])>=AD_thres:
117 |             #     print line
118 |     outFH.close()
119 | 
120 | if __name__ == "__main__":
121 |     main()
122 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/filter_merge_germline_by_cancer.sh:
--------------------------------------------------------------------------------
 1 |  #!/bin/bash
 2 | cancer=$1
 3 | 
 4 |     # set limit to more files
 5 |     ulimit -n 2500
 6 |     echo "Processing cancer type: $line"
 7 |     echo "Start time"
 8 |     date
 9 |         mkdir $cancer
10 |         samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/)
11 |     for sample in $samples; do 
12 |         sampleNamePre=${sample##*$cancer/}
13 |         sampleName=${sampleNamePre%/}
14 |         echo "Sample "$sample
15 |         echo "Copying vcf for "$sampleName
16 |         gsutil cp ${sample}combine/prefilter.snp_indel.vcf.gz ${cancer}/${sampleName}.prefilter.snp_indel.vcf.gz
17 |         python filter_VCF_AD.py ${cancer}/${sampleName}.prefilter.snp_indel.vcf.gz 5 | bgzip -c > ${cancer}/${sampleName}.AD.5.vcf.gz
18 |         tabix -p vcf ${cancer}/${sampleName}.AD.5.vcf.gz
19 |         # copy the filtered VCF back to storage
20 |         gsutil cp ${cancer}/${sampleName}.AD.5.vcf.gz* ${sample}combine/
21 |     done
22 |     
23 |     # here we may need to limit to the best BAM
24 | 
25 |     #merge
26 |     ~/bin/bcftools-1.5/bcftools merge --output-type z --output ${cancer}.merge.vcf.gz $(ls -1 ${cancer}/*.AD.5.vcf.gz | perl -pe 's/\n/ /g')
27 |     #index
28 |     tabix -p vcf ${cancer}.merge.vcf.gz
29 |     #upload
30 |     #gsutil cp ${cancer}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge
31 |     # delete files
32 |     #rm -rf ${cancer}/*.vcf.gz
33 |     #rm -rf ${cancer}/*.vcf.gz.tbi
34 |     echo "End time"
35 |     date
36 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/filter_merge_germline_by_sample.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Filter and merge the germline VCFs listed in one sample-list chunk.
 3 | #
 4 | # $1: text file (under pca_table_split/) with one gs:// VCF URL per line.
 5 | # The chunk's file name serves as batch name, working directory and prefix of
 6 | # the merged VCF.
 7 | 
 8 | listFile=$1
 9 | batch=${listFile##*pca_table_split/}
10 | echo "Start time"
11 | date
12 | echo "Processing batch "$batch
13 | echo ""
14 | mkdir ${batch}
15 | 
16 | # the "|| [[ -n ... ]]" part also processes a last line that has no newline
17 | while IFS='' read -r entry || [[ -n "$entry" ]]; do
18 |     vcfUrl="${entry%\\n}"
19 | 
20 |     # sample name = path component between "production/<folder>/" and
21 |     # "/combine/prefilter.snp_indel.vcf.gz"
22 |     afterProduction=${vcfUrl##*production/}
23 |     withoutFile=${afterProduction%/combine/prefilter.snp_indel.vcf.gz}
24 |     sampleName=${withoutFile#*/}
25 |     rawVcf=${batch}/${sampleName}.prefilter.snp_indel.vcf.gz
26 |     filteredVcf=${batch}/${sampleName}.AD.5.vcf.gz
27 | 
28 |     echo "Sample "$vcfUrl
29 |     echo "Copying vcf for "$sampleName
30 |     gsutil cp ${vcfUrl} $rawVcf
31 |     python filter_VCF_AD.py $rawVcf 5 | bgzip -c > $filteredVcf
32 |     tabix -p vcf $filteredVcf
33 |     # copy the filtered VCF back to storage
34 |     gsutil cp ${filteredVcf}* gs://dinglab/isb-cgc/tcga/germline/release1.0/individualVCF/
35 |     # remove original VCF
36 |     rm -f $rawVcf
37 | done < "$listFile"
38 | 
39 | # merge all filtered VCFs of the batch
40 | # NOTE(review): keep the argument order; the header-renaming Perl script
41 | # presumably reads the input file names from the recorded merge command
42 | filteredVcfs=$(ls -1 ${batch}/*.AD.5.vcf.gz | perl -pe 's/\n/ /g')
43 | ~/bin/bcftools-1.5/bcftools merge --output-type z --output ${batch}.merge.vcf.gz $filteredVcfs
44 | #index
45 | tabix -p vcf ${batch}.merge.vcf.gz
46 | #upload
47 | #gsutil cp ${batch}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge
48 | # delete files
49 | #rm -rf ${batch}/*.vcf.gz
50 | #rm -rf ${batch}/*.vcf.gz.tbi
51 | echo "End time"
52 | date
53 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/make.bsub.commands.sh:
--------------------------------------------------------------------------------
1 | for file in pca_table_split/*; do echo "bsubl -oo filter.merge."${file##*pca_table_split/}".log 'bash filter_merge_germline_by_sample.sh "$file"'"; done
2 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/post_CharGer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Print one bsub command per chromosome (1-22, X, Y). Each command runs
 3 | # combine_CharGer2VCF.py on that chromosome's CharGer TSV and annotated VCF
 4 | # and redirects its output into ChargedSample/. The commands are only echoed;
 5 | # nothing is submitted by this script.
 6 | 
 7 | input="Charged_VEP/"
 8 | results="ChargedSample/"
 9 | variantBuffer=2000  # not referenced below
10 | queue="long"
11 | group="/khuang"
12 | forks=4
13 | mem=20000000
14 | 
15 | # not referenced below; the loop uses the relative ${input} instead
16 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/Charged_VEP/"
17 | 
18 | if [ ! -d ${results} ]; then
19 | 	mkdir ${results}
20 | fi
21 | 
22 | for i in {1..22} X Y
23 | do
24 | 	tsv=${input}charged.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.tsv
25 | 	vcf=AnnotatedVCFs/anno.PCA.r1.TCGAbarcode.merge.exon.chr${i}.norm.vcf.gz
26 | 	out=${results}charged.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.samples.tsv
27 | 	# the redirection is part of the quoted job command, so it happens inside the job
28 | 	runCMD="python combine_CharGer2VCF.py ${tsv} ${vcf} > ${out}"
29 | 	log="${out}.log"
30 | 	echo "bsub -g ${group} -q ${queue} -n ${forks} -M ${mem} -oo ${log} \"${runCMD}\""
31 | done
32 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/recalc_AF_PM2.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | #Oct 2017 - Kuan-Lin Huang @ WashU - 
 3 |  
 4 | import sys
 5 | import getopt
 6 | import gzip
 7 | 
 8 | 
 9 | def main():
10 |     def usage():
11 |         print """
12 |     combine_CharGer2VCF.py : combine CharGer output to its originating VCF file
13 | 
14 |     USAGE: liftover_CharGer_result.py [-h] <CharGer results file> <VCF>
15 |      -h    print this message
16 |      <filename>    input file
17 |         """
18 | 
19 |     if len(sys.argv) >= 2:
20 |         CharGerFH= sys.argv[1]
21 |     else:
22 |         usage()
23 |         sys.exit()
24 |     
25 |     #open CharGer file
26 |     try:
27 |         charGerF = open(CharGerFH,"r")
28 |     except IOError:
29 |         print("File , CharGerFH, does not exist!")
30 |     
31 |     CharGerHeader = charGerF.readline().strip()
32 |     print CharGerHeader + "\tExAC_adj_AF"
33 |     #read input file
34 |     for line in charGerF:
35 |         line=line.strip()
36 |         F = line.split("\t")
37 | 	AF = 0
38 | 	AFstr = F[80]
39 |         AFstr1 = AFstr.split("&")
40 |         AFrelevantStr = ""
41 |         for AFstrings in AFstr1:
42 |         	if AFstrings.startswith(F[7]):
43 |                 	AFrelevantStr = AFstrings
44 |         if len(AFrelevantStr.split(":")) >1:
45 |                 AF = float( AFrelevantStr.split(":")[1] )
46 | 	if len(F) > 14 and "PM2" in F[14]:
47 | 		score = F[18]
48 | 		if AF > 0.0005: 
49 | 			F[18] = int(F[18])-2 # score
50 | 			F[14] = F[14].replace("PM2,","")
51 | 			if int(F[18]) < 5:
52 | 				continue
53 | 	F[18] = str(F[18])
54 | 	AF = str(AF)
55 | 	F.append(AF)
56 | 	print "\t".join(F)
57 | 
58 |     charGerF.close()
59 | 
60 | if __name__ == "__main__":
61 |     main()
62 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/replace_vcf_header_sample_with_source_TCGA.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Jay Mashl, July 2017
 4 | # Syntax:   uncompressed vcf |  $thisScript
 5 | # adopted by Kuan Oct 2017 for bcftools output and update directly to TCGA ID
 6 | #
 7 | # Prints the header of the VCF read from STDIN with the sample columns of the
 8 | # #CHROM line replaced by the IDs found in the pca_table (column 1 -> column 2,
 9 | # keyed by the sample name taken from the merge input file names). Reading
10 | # stops at the first non-header line, so no records are printed.
11 | 
12 | 
13 | use strict;
14 | use warnings;
15 | 
16 | my @myList=();
17 | my @a;
18 | my $samples={};
19 | 
20 | my $fn = "/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/TCGA_data/sampleQC/pca_table.20171017.tsv";
21 | #my $fn = "../../../../TCGA_data/sampleQC/pca_table.20171017.tsv";
22 | open ( my $in , "<" , $fn ) or die "Cannot open $fn: $!";
23 | while ( <$in> )
24 | {
25 |     chomp;
26 |     my @line = split "\t" , $_;
27 |     # lines without a second column cannot provide a mapping
28 |     next unless defined $line[1];
29 |     $samples->{$line[0]}=$line[1];
30 | }
31 | close $in;
32 | 
33 | while(<STDIN>) {
34 |     chomp;
35 |     # header only: stop at the first record
36 |     last unless /^#/;
37 | 
38 |     # get list of input filenames given to merge
39 |     #if( /vcf-merge/ ) {
40 |     if( /bcftools_mergeCommand/ ){
41 |         @a = split /\s+/;
42 |         # input files start at token 5; presumably tokens 0-4 are
43 |         # "merge --output-type z --output <out>" - confirm if the merge call changes
44 |         #for(my $i = 1; $i < scalar @a; $i++) {
45 |         for(my $i = 5; $i < scalar @a; $i++) {
46 |             my %data = ('inputfile' => $a[ $i ], 'samplename' => "");
47 |             push @myList, \%data;
48 |         }
49 |     }
50 | 
51 |     if( /^#CHROM/ ) {
52 |         @a = split /\t/;
53 |         for(my $i = 9 ; $i < scalar @a; $i++) {
54 |             # in this application, extract unique identifier from first field:
55 |             # "<dir>/<sample>.<rest>" -> "<sample>"
56 |             my $entry = $myList[ $i - 9 ];
57 |             my $key;
58 |             if( defined $entry ) {
59 |                 my @b = split /\./, $entry->{'inputfile'};
60 |                 my @c = split /\//, $b[0];
61 |                 $key = $c[1];
62 |             }
63 |             my $TCGA = defined $key ? $samples->{$key} : undef;
64 |             if( defined $TCGA && length $TCGA ) {
65 |                 $a[ $i ] = $TCGA;
66 |             } else {
67 |                 # keep the original column name rather than writing an empty one
68 |                 warn "No TCGA ID found for sample column $a[$i]; name left unchanged\n";
69 |             }
70 |         }
71 |         print join("\t", @a),"\n";
72 |     } else {
73 |         print $_,"\n";
74 |     }
75 | }
76 | 
77 | #for(my $j=0 ; $j < scalar @myList; $j++) {
78 | #    print $j,"\t", $myList[$j]{'inputfile'},"\t", $myList[$j]{'samplename'}, "\n";
79 | #}
80 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/run_VEP.v85.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #(ads&rjm) 2016-09-22
 4 | #
 5 | # Print one bsub command per chromosome (1-22, X, Y). Each command runs
 6 | # format.pl with the input directory and VCF name, then VEP v85 (offline
 7 | # cache, GRCh37) on new_run/preVEP/anno.<vcf> (presumably written by
 8 | # format.pl). The commands are only echoed; nothing is submitted here.
 9 | 
10 | # use new perl
11 | . /gscmnt/gc2525/dinglab/rmashl/Software/perl/set_envvars
12 | #which ${PERL_BIN}
13 | #exit
14 | #ads_vep="/gscmnt/gc2706/dinglab/medseq/LabCode/AdamDS/ensembl-vep/vep"
15 | #ads_cachevep="/gscmnt/gc2706/dinglab/medseq/LabCode/AdamDS/VEP/.vep/"
16 | vep_cmd="/gscmnt/gc2525/dinglab/rmashl/Software/perl/perl-5.22.0/bin/perl /gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/ensembl-tools-release-85/scripts/variant_effect_predictor/variant_effect_predictor.pl"
17 | cachedir="/gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/"
18 | reffasta="/gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/homo_sapiens/85_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
19 | assembly="GRCh37"
20 | #opts="--plugin ExAC,/gscmnt/gc2706/dinglab/medseq/ExAC/VCF/ExAC.r0.3.1.sites.vep.vcf.gz" #"--everything"
21 | opts="--everything"
22 | results="new_run/AnnotatedVCFs/"
23 | variantBuffer=2000
24 | queue="long"
25 | group="/khuang"
26 | forks=4
27 | mem=20000000
28 | export SAMTOOLSDIR="/gscmnt/gc2525/dinglab/rmashl/Software/bin/samtools/1.2/bin"
29 | # was "expoSAMTOOLS=...", which only set an unrelated, unexported variable
30 | export SAMTOOLS="$SAMTOOLSDIR/samtools"
31 | 
32 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/VCF"
33 | 
34 | # -p: ${results} is a nested path, so create missing parents as well
35 | if [ ! -d ${results} ]; then
36 | 	mkdir -p ${results}
37 | fi
38 | 
39 | for i in {1..22} X Y
40 | do
41 | 	vcf=PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf.gz
42 | 	out=PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf
43 | 	runVEP="perl format.pl $inputDir $vcf; ${vep_cmd} ${opts} --offline --cache --dir ${cachedir} --assembly ${assembly} --format vcf --vcf -i new_run/preVEP/anno.$out -o ${results}anno.${out} --force_overwrite --fasta ${reffasta} --fork ${forks} --buffer_size ${variantBuffer};"
44 | 	log="${results}anno.${vcf}.log"
45 | 	echo "bsub -g ${group} -q ${queue} -n ${forks} -M ${mem} -oo ${log} \"${runVEP}\""
46 | done
47 | 
48 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/run_calc_vcf_concordance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | date
 3 | 
 4 | cancer=$1
 5 | outFileExon=${cancer}.exon.QCstat.tsv
 6 | outFileEncode=${cancer}.encodeROI.QCstat.tsv
 7 | touch $outFileExon
 8 | touch $outFileEncode
 9 | 
10 | samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/)
11 | for sample in $samples; do 
12 | 	sampleNamePre=${sample##*$cancer/}
13 | 	sampleName=${sampleNamePre%/}
14 |     	echo "Copying vcf for "$sampleName
15 | 
16 |     	gsutil cp ${sample}combine/*gz ${sampleName}.prefilter.snp_indel.vcf.gz
17 | 	gsutil cp ${sample}combine/*gz.tbi ${sampleName}.prefilter.snp_indel.vcf.gz.tbi
18 | 
19 | 	#echo "running" ${cancer}.normal.merge.vcf.gz ${sampleName}.prefilter.snp_indel.vcf.gz $outFile
20 |         python calc_vcf_concordance.py ${sampleName}.prefilter.snp_indel.vcf.gz ${cancer}.normal.merge.allCDS.vcf.gz $outFileExon
21 |         python calc_vcf_concordance.py ${sampleName}.prefilter.snp_indel.vcf.gz ${cancer}.normal.merge.ENCODE.vcf.gz $outFileEncode
22 | 	rm -f ${sampleName}.prefilter.snp_indel.vcf.gz*
23 | 
24 | done
25 | date
26 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/run_charger_on_vep_VCF.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Print one bsub command per chromosome (1-22, X, Y) that runs charger on the
 3 | # annotated VCF of that chromosome. The commands are only echoed; nothing is
 4 | # submitted by this script.
 5 | 
 6 | #setEnvironment=". /gscuser/ascott/python2_7_13_env"
 7 | 
 8 | inputDir="/gscmnt/gc3020/dinglab/medseq/Germline/projects/PanCanAtlasGermline/analysis/process_files/germline/local/new_run/AnnotatedVCFs"
 9 | 
10 | #clinvar="/gscmnt/gc2706/dinglab/medseq/ClinVar/MacArthurLab/DataSnapshots/201701_ead5de/clinvar_alleles.tsv.gz"
11 | clinvar="/gscmnt/gc2706/dinglab/medseq/ClinVar/MacArthurLab/clinvar/output/b37/single/clinvar_alleles.single.b37.tsv.gz"
12 | mmGenes="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/20160301_Rahman_KJ_KH_gene_table_CharGer.txt"
13 | mmVariants="/gscmnt/gc2737/ding/Analysis/VariantLists/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP.vcf"
14 | hotspot="/gscmnt/gc2737/ding/Analysis/Germline/CharGer/MC3.noHypers.mericUnspecified.d10.r20.v114.clusters"
15 | rareThreshold="0.0005"
16 | 
17 | results="new_run/Charged_VEP/"
18 | if [ ! -d ${results} ]; then
19 |         mkdir ${results}
20 | fi
21 | 
22 | #queue="bigmem"
23 | # the second assignment wins: the echoed commands use the ding-lab queue
24 | queue="long"
25 | queue="ding-lab"
26 | group="/khuang"
27 | 
28 | 
29 | for i in {1..22} X Y;
30 | do
31 | 	sample="$inputDir/anno.PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf"
32 | 	vcf="PCA.r1.TCGAbarcode.merge.exon.chr${i}.vcf"
33 | 	output="${results}charged.${vcf}.tsv"
34 | 	# charger's stdout is redirected to charger.<vcf>.out; the bsub log goes to charger.<vcf>.log
35 | 	command="charger --include-vcf-details -f ${sample} -o ${output} -O -D -g ${mmGenes} -z ${mmVariants} -H ${hotspot} -l --mac-clinvar-tsv ${clinvar} --rare-threshold ${rareThreshold} > ${results}charger.${vcf}.out"
36 | 	log="${results}charger.${vcf}.log"
37 | 	echo "bsub -R\"select[type==LINUX64 && mem>80000] rusage[mem=80000]\" -M 60000000 -g ${group} -q ${queue} -oo ${log} \"${command}\""
38 | done
39 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/update_vcfHeader_to_TCGA.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Kuan, Oct 2017 adapted from Jay's script
 3 | #
 4 | # For every PCA_*.merge.vcf.gz in the working directory: build a new header
 5 | # with replace_vcf_header_sample_with_source_TCGA.pl, write a reheadered copy
 6 | # <prefix>.merge.TCGAbarcode.vcf.gz and index it.
 7 | 
 8 | for mergedVcf in PCA_*.merge.vcf.gz; do
 9 |     prefix=${mergedVcf%.merge.vcf.gz}
10 |     newHeader=$prefix.merge.newheader.txt
11 |     renamedVcf=$prefix.merge.TCGAbarcode.vcf.gz
12 | 
13 |     echo '------'
14 |     echo "Reheadering "$mergedVcf
15 |     echo '------'
16 | 
17 |     gunzip -dc $mergedVcf | perl replace_vcf_header_sample_with_source_TCGA.pl > $newHeader
18 |     tabix -r $newHeader $mergedVcf >  $renamedVcf
19 |     tabix -p vcf $renamedVcf
20 | 
21 |     # only the intermediate header is removed; the input VCF and its index stay
22 |     rm -f $newHeader #$file $file.tbi 
23 | done


--------------------------------------------------------------------------------
/analysis/process_files/germline/local/variant_QC_annotation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # by Kuan-lin Huang 2017 May @ WashU
 3 | #
 4 | # Variant QC for one VCF ($1): annotate with vcfanno, filter with
 5 | # filter_VCF_AF_AD_keepExAConly.py, restrict to the regions in the ROI bed
 6 | # file, then bgzip and index the result. The intermediate annotated and
 7 | # filtered VCFs are deleted at the end.
 8 | echo "start time"
 9 | date
10 | # requires: vcftools & vcfanno (https://github.com/brentp/vcfanno)
11 | 
12 | ### AF and AD frequency filter ###
13 | vcfFile=$1
14 | bedFile=../../../../TCGA_data/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed
15 | vcfannoConfigFile=ExAC_config.toml
16 | AF_thres=0.01
17 | AD_thres=3
18 | 
19 | # annotate with ExAC frequency
20 | # output name: the first "vcf.gz" in the input path becomes "annotated.vcf"
21 | annotated_VCF=${vcfFile/vcf.gz/annotated.vcf}
22 | echo "using vcfanno to annotate" ${vcfFile} "into" ${annotated_VCF}
23 | vcfanno ${vcfannoConfigFile} ${vcfFile} > ${annotated_VCF}
24 | 
25 | # # note the previous versions only takes variants not appearing in ExAC due to bug in filter_VCF_AF_AD.py
26 | # filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.AD.${AD_thres}.vcf}
27 | # echo "filtering" ${annotated_VCF} "into" ${filtered_VCF}
28 | # python filter_VCF_AF_AD.py ${annotated_VCF} $AF_thres $AD_thres
29 | 
30 | # now we only need variants that are rare in ExAC
31 | # NOTE(review): the filter script is not given an output name; this assumes it
32 | # writes exactly ${filtered_VCF} - confirm against filter_VCF_AF_AD_keepExAConly.py
33 | # the substitution replaces the first "vcf" in the path (fragile if a directory name contains "vcf")
34 | filtered_VCF=${annotated_VCF/vcf/ExAC_AF.${AF_thres}.ExAConly.AD.${AD_thres}.vcf}
35 | echo "filtering" ${annotated_VCF} "into" ${filtered_VCF}
36 | python filter_VCF_AF_AD_keepExAConly.py ${annotated_VCF} $AF_thres $AD_thres
37 | date
38 | 
39 | ### extract ROI (region of interest) ###
40 | extracted_VCF=${filtered_VCF/vcf/ROI.vcf.gz}
41 | echo "extracting" ${filtered_VCF} "based on" $bedFile "into" ${extracted_VCF}
42 | # -c sends the recoded VCF to stdout so it can be compressed directly
43 | vcftools --vcf $filtered_VCF \
44 | --bed $bedFile \
45 | --keep-INFO-all --recode -c  | bgzip -c  > ${extracted_VCF}
46 | 
47 | # tabix 
48 | echo "Indexing extracted VCF"
49 | tabix -p vcf ${extracted_VCF}
50 | # remove intermediate VCF files
51 | rm -f $annotated_VCF
52 | rm -f $filtered_VCF
53 | date
54 | 
55 | # possible option: calculate concordance with genotype file here [in this case you may want to batch the jobs by cancer types as I have already extracted those to ENCODE and exon regions in the VM: kuan-merge-genotype-bigmem]
56 | 
57 | # [alternative: merge first across samples in a separate workflow] 
58 | # [alternative: do a 5% MAF filter for the merged VCF within cohort to remove potential pipeline artifacts]
59 | 
60 | # annotate the resulting VCF using VEP
61 | 
62 | # run CharGer
63 | 
64 | # move the resulting VCF and files back to storage
65 | 
66 | echo "end time"
67 | date
68 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/merge_germline_cloud.sh:
--------------------------------------------------------------------------------
  1 | # merge one cancer type at a time to keep the merged files to a manageable size
  2 | $ cat cancer_type.txt 
  3 | ACC
  4 | BLCA
  5 | BRCA
  6 | CESC
  7 | CHOL
  8 | COAD
  9 | DLBC
 10 | ESCA
 11 | GBM
 12 | HNSC
 13 | KICH
 14 | KIRC
 15 | KIRP
 16 | LAML # there are no LAML samples so this was throwing errors earlier
 17 | LGG
 18 | LIHC
 19 | LUAD
 20 | LUSC
 21 | MESO
 22 | OV
 23 | PAAD
 24 | PCPG
 25 | PRAD
 26 | READ
 27 | SARC
 28 | SKCM
 29 | STAD
 30 | TGCT
 31 | THCA
 32 | THYM
 33 | UCEC
 34 | UCS
 35 | UVM
 36 | 
 37 | $ cat merge_germline_by_cancer.sh 
 38 | #!/bin/bash
 39 | while IFS='' read -r line || [[ -n "$line" ]]; do
 40 |     echo "Processing cancer type: $line"
 41 |     echo "Start time"
 42 |     date
 43 |     cancer="${line%\\n}"
 44 |         mkdir $cancer
 45 |         samples=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/${cancer}/)
 46 |     for sample in $samples; do 
 47 |         sampleNamePre=${sample##*$cancer/}
 48 |         sampleName=${sampleNamePre%/}
 49 |         echo "Copying vcf for "$sampleName
 50 |         #gsutil cp ${sample}combine/*gz ${sampleName}.prefilter.snp_indel.vcf.gz
 51 |         #gsutil cp ${sample}combine/*gz.tbi ${sampleName}.prefilter.snp_indel.vcf.gz.tbi
 52 |         gsutil cp ${sample}combine/prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz ${cancer}/${sampleName}.prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz
 53 |         gsutil cp ${sample}combine/prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz.tbi ${cancer}/${sampleName}.prefilter.snp_indel.annotated.ExAC_AF.0.01.AD.3.ROI.vcf.gz.tbi
 54 |     done
 55 |     
 56 |     #merge
 57 |     vcf-merge $(ls -1 ${cancer}/*.vcf.gz | perl -pe 's/\n/ /g') > ${cancer}.merge.vcf
 58 |     bgzip -c ${cancer}.merge.vcf > ${cancer}.merge.vcf.gz
 59 |     #index
 60 |     tabix -p vcf ${cancer}.merge.vcf.gz
 61 |     #upload
 62 |     gsutil cp ${cancer}.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge
 63 |     # delete files
 64 |     rm -rf ${cancer}.merge.vcf
 65 |     rm -rf ${cancer}/*ROI.vcf.gz
 66 |     rm -rf ${cancer}/*ROI.vcf.gz.tbi
 67 |     echo "End time"
 68 |     date
 69 | done < "$1"
 70 | 
 71 | # raise the limit on open files so the merge also works for breast cancer
 72 | 
 73 | ulimit -n 2500
 74 | 
 75 | nohup bash merge_germline_by_cancer.sh cancer_type.txt > merge_germline_by_cancer.log &
 76 | nohup bash merge_germline_by_cancer2.sh cancer_type2.txt > merge_germline_by_cancer2.log &
 77 | nohup bash merge_germline_by_cancer2.sh cancer_type3.txt > merge_germline_by_cancer3.log &
 78 | nohup bash merge_germline_by_cancer2.sh cancer_type4.txt > merge_germline_by_cancer4.log &
 79 | nohup bash merge_germline_by_cancer2.sh CESC.txt > merge_germline_by_cancerCESC.log &
 80 | 
 81 | # there is no LAML? if so delete
 82 | rm -rf LAML*
 83 | 
 84 | ### merge everything
 85 | # copy file to big VM
 86 | echo "Start time"
 87 | date
 88 | gsutil cp gs://dinglab/isb-cgc/tcga/germline/production/merge/* .
 89 | #merge
 90 | echo "Merging"
 91 | #vcf-merge $(ls -1 *.vcf.gz | perl -pe 's/\n/ /g') > PCA.merge.vcf
 92 | bcftools merge --output-type z --output PCA.merge.vcf.gz $(ls -1 *.vcf.gz | perl -pe 's/\n/ /g') 
 93 | #echo "Zipping"
 94 | #bgzip -c PCA.merge.vcf > PCA.merge.vcf.gz
 95 | #index
 96 | echo "Indexing"
 97 | tabix -p vcf PCA.merge.vcf.gz
 98 | #upload
 99 | gsutil cp PCA.merge.vcf.gz* gs://dinglab/isb-cgc/tcga/germline/production/merge
100 | echo "End time"
101 | date
102 | 
103 | # # note: bcftools turned out much more efficient and can directly generate zipped file
104 | 
105 | # # note: other tools 
106 | # # copy local files to VM
107 | # gcloud compute copy-files --zone us-central1-f /Users/khuang/Downloads/GenomeAnalysisTK-3.7.tar.bz2 huangkuanlin@kuan-merge-germline-bigmem:~/
108 | # gcloud compute copy-files --zone us-central1-f /Users/khuang/Downloads/picard-2.9.0.zip  huangkuanlin@kuan-merge-germline-bigmem:~/
109 | 
110 | # # set up
111 | # wget ftp://genome.wustl.edu/pub/reference/GRCh37-lite/GRCh37-lite.fa.gz
112 | # gunzip GRCh37-lite.fa.gz 
113 | # samtools faidx GRCh37-lite.fa


--------------------------------------------------------------------------------
/analysis/process_files/germline/readme.txt:
--------------------------------------------------------------------------------
 1 | # describe workflow to conduct variant QC, concordance calculation and annotation
 2 | # by Kuan-lin Huang and Jay Mashl 2017 May @ WashU
 3 | 
 4 | variant_QC_annotation.sh which runs script: 
 5 | filter_VCF_AF_AD_keepExAConly.py (previously filter_VCF_AF_AD.py)
 6 | 
 7 | and other dependencies: 
 8 | vcfanno
 9 | ExAC_config.toml [vcfanno configuration file]
10 | ExAC_nonTCGA frequency file (ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/subsets/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz and ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/subsets/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz.tbi)
11 | vcftools
12 | bed file: gs://dinglab/isb-cgc/tcga/reference_files/all_CDS_ncRNA_ENCODE_multicell_ROI.bed
13 | 
14 | 
15 | Note: the variant counts below are rough ballpark numbers for sample H_LS-E2-A10B-10A-01D-A10M-09; check the most recent ppt for the exact numbers.
16 | Variant QC and filter: 
17 | 1. Annotate AF, filter with AF and AD [3 min]
18 | 226K -> 102K variants
19 | 2. Extract variants in ROI (exon + encode all cell regulatory region), index [28 min]
20 | 102K -> 21K variants 
21 | 
22 | 
23 | Concordance [may be added in to the last variant QC and filter step before annotation]
24 | run_calc_vcf_concordance.sh which runs:
25 | calc_vcf_concordance.py
26 | 
27 | The VM kuan-merge-genotype-bigmem has the genotype VCFs already trimmed down to the regions of interest:
28 | ${cancer}.normal.merge.allCDS.vcf.gz
29 | ${cancer}.normal.merge.ENCODE.vcf.gz


--------------------------------------------------------------------------------
/analysis/process_files/germline/var_freq/batch_run_vcf_var_freq_filter.sh:
--------------------------------------------------------------------------------
 1 | # Run run_vcf_var_freq_filter.sh on every per-chromosome "not in ExAC" VCF,
 2 | # at most 8 at a time, and then on the single "in ExAC" VCF.
 3 | 
 4 | # not in ExAC
 5 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz)
 6 | # start from a known count instead of whatever NPROC the environment holds
 7 | NPROC=0
 8 | for file in $VCFs; do 
 9 | 	bash run_vcf_var_freq_filter.sh $file &
10 | 
11 | 	NPROC=$(($NPROC+1))
12 | 	# a full batch of 8 is running: let it finish before starting more
13 | 	if [ "$NPROC" -ge 8 ]; then
14 | 		wait
15 | 		NPROC=0
16 | 	fi
17 | done
18 | 
19 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz*
20 | 
21 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz
22 | bash run_vcf_var_freq_filter.sh $file
23 | 
24 | # do not exit while jobs of the last (partial) batch are still running
25 | wait
26 | 


--------------------------------------------------------------------------------
/analysis/process_files/germline/var_freq/run_vcf_var_freq_filter.sh:
--------------------------------------------------------------------------------
 1 | # run vcf frequency filter to rule out (1) alleles with greater than 5% AF in the PCA cohort and without any variant in the final sample list
 2 | # update AN/AC fields to the final cohort of 9401 samples
 3 | #
 4 | # Usage: bash run_vcf_var_freq_filter.sh <gs:// URL of a bgzipped VCF>
 5 | # Downloads the VCF and its index, filters it with vcf_var_freq_filter.pl,
 6 | # indexes and uploads the result, then removes the local copies.
 7 | 
 8 | # not in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz
 9 | 
10 | 
11 | 	file=$1
12 | 	# the trailing * also fetches the index next to the VCF
13 | 	gsutil cp $file* .
14 | 	vcfName=${file##*/}
15 | 	outVCF=${vcfName/.vcf.gz/.cohortAF0.05.vcf.gz}
16 | 	echo "Filtering ${vcfName} into $outVCF"
17 | 	# frequency check, recalculate AC, AN, and AF based on the cohort
18 | 	perl vcf_var_freq_filter.pl --vcf $vcfName | bgzip -c > $outVCF
19 | 	# -p takes the preset name; without "vcf" the file name was read as the preset
20 | 	tabix -p vcf $outVCF
21 | 	ls -klh ${outVCF}*
22 | 
23 | 	# NOTE(review): the destination is the not-in-exac folder for every input,
24 | 	# including the ExAC-only VCF - confirm this is intended
25 | 	gsutil cp ${outVCF}* gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/cohort_AF_filtered
26 | 	# remove the downloaded VCF together with its index
27 | 	rm -f ${vcfName}*
28 | 	rm -f ${outVCF}*


--------------------------------------------------------------------------------
/analysis/process_files/germline/var_freq/vcf_var_freq_filter.pl:
--------------------------------------------------------------------------------
  1 | # Kuan Huang @ WashU 2017 Aug1
  2 | # reference: # https://github.com/zhuochenbioinfo/VcfStat
  3 | # filter variants based on a specific frequency using the cohort freq
  4 | # update AN/AC fields
  5 | #
  6 | # Reads --vcf (plain or .gz) and prints the filtered VCF to STDOUT:
  7 | #   * ALT alleles never seen in a genotype are dropped;
  8 | #   * ALT alleles whose count / $sample_size reaches $AF_threshold are dropped;
  9 | #   * records left without any ALT allele are not printed;
 10 | #   * genotypes are renumbered to the remaining ALT alleles, and calls of a
 11 | #     dropped allele become "." (missing);
 12 | #   * AF, AC and AN in INFO are rewritten for the remaining alleles.
 13 | # Filter counts are written to <vcf>cohortAF.filter.log.
 14 | 
 15 | use strict;
 16 | use warnings;
 17 | use Getopt::Long;
 18 | 
 19 | my($vcf,$CHROM);
 20 | 
 21 | my $usage = "USAGE:\nperl $0 --vcf <vcf> > <out>\n";
 22 | $usage .= "<vcf> is the input vcf file (plain or .gz). [Necessary].\n";
 23 | 
 24 | GetOptions(
 25 | 	"vcf=s" => \$vcf,
 26 | 	"chr=s" => \$CHROM,   # accepted, but not used below
 27 | ) or die $usage;
 28 | 
 29 | die $usage unless(defined $vcf);
 30 | my $log = $vcf."cohortAF.filter.log";
 31 | 
 32 | # open both vcf and vcf.gz
 33 | my $in;
 34 | if ($vcf =~ /\.gz$/) {
 35 | 	# list form: the file name is never interpreted by a shell
 36 | 	open($in, "-|", "gunzip", "-c", $vcf) or die "can't open pipe to $vcf: $!";
 37 | }
 38 | else {
 39 | 	open($in, "<", $vcf) or die "can't open $vcf: $!";
 40 | }
 41 | open(my $logfh, ">", $log) or die "can't open $log: $!";
 42 | 
 43 | # arbitrary cut-off for now
 44 | my $sample_size = 9401;
 45 | my $AF_threshold = 0.05;
 46 | # NOTE(review): frequencies below are allele count / number of samples, not
 47 | # / number of chromosomes (2 x samples); confirm this is the intended scale.
 48 | 
 49 | # shape of a GT value: allele indices or "." joined by "/" or "|"
 50 | my $gt_re = qr/^(?:\d+|\.)(?:[\/|](?:\d+|\.))*$/;
 51 | 
 52 | my $num_pass_alleles = 0;
 53 | my $filtered_alleles = 0;
 54 | my $filtered_vars = 0;
 55 | my $nonexisting_alleles = 0;
 56 | 
 57 | while(<$in>){
 58 | 	chomp;
 59 | 	# meta lines and the #CHROM line are passed through unchanged
 60 | 	if(/^#/){
 61 | 		print $_."\n";
 62 | 		next;
 63 | 	}
 64 | 	my($chr,$pos,$id,$ref,$alts_join,$qual,$filter,$info,$format,@datas) = split/\t/;
 65 | 	my @alts = split/,/,$alts_join;
 66 | 	my @alleles = ($ref,@alts);
 67 | 	# one counter per allele, index 0 being REF
 68 | 	my @alleles_count = (0) x @alleles;
 69 | 	# genotypes are only interpreted when FORMAT lists GT first
 70 | 	my $has_gt = (defined $format && $format =~ /^GT(?::|$)/);
 71 | 
 72 | 	# add count for each allele here
 73 | 	if($has_gt){
 74 | 		foreach my $spot (@datas){
 75 | 			my ($gt) = split /:/, $spot, 2;
 76 | 			next unless defined $gt && $gt =~ $gt_re;
 77 | 			while($gt =~ /(\d+)/g){
 78 | 				$alleles_count[$1]++ if $1 < @alleles;
 79 | 			}
 80 | 		}
 81 | 	}
 82 | 
 83 | 	# keep only alleles that are rare in the cohort
 84 | 	my (@pass_alleles, @pass_alleles_count, @pass_alleles_freq);
 85 | 	my %new_index_of = (0 => 0); # old allele index -> new one; REF stays 0
 86 | 	for(my $i = 1; $i < @alleles; $i++){
 87 | 		my $count = $alleles_count[$i];
 88 | 		if ($count == 0){
 89 | 			$nonexisting_alleles++;
 90 | 			next;
 91 | 		}
 92 | 
 93 | 		my $AF = $count/$sample_size;
 94 | 		if ( $AF < $AF_threshold ){
 95 | 			$num_pass_alleles++;
 96 | 			push @pass_alleles, $alleles[$i];
 97 | 			push @pass_alleles_count, $count;
 98 | 			push @pass_alleles_freq, $AF;
 99 | 			$new_index_of{$i} = scalar @pass_alleles;
100 | 		} else {
101 | 			$filtered_alleles++;
102 | 		}
103 | 	}
104 | 
105 | 	# only print the line if at least one allele passed
106 | 	if(@pass_alleles == 0){
107 | 		$filtered_vars++;
108 | 		next;
109 | 	}
110 | 
111 | 	# modify the geno field, in one pass per sample so that no index is
112 | 	# rewritten twice; needed only when an ALT allele was dropped
113 | 	if($has_gt && @pass_alleles < @alts){
114 | 		foreach my $spot (@datas){
115 | 			my ($gt, $rest) = split /:/, $spot, 2;
116 | 			next unless defined $gt && $gt =~ $gt_re;
117 | 			# a call of a dropped allele becomes missing instead of
118 | 			# silently pointing at another allele
119 | 			$gt =~ s/(\d+)/exists $new_index_of{$1} ? $new_index_of{$1} : "."/ge;
120 | 			$spot = defined $rest ? "$gt:$rest" : $gt;
121 | 		}
122 | 	}
123 | 
124 | 	# update AC, AN, AF in info field of vcf; AF=0.5,0.5;MLEAC=1,1;MLEAF=0.5,0.5;AN=326;AC=169,23
125 | 	# "(^|;)" keeps keys such as MLEAF / MLEAC from being matched
126 | 	my $AC = join(",",@pass_alleles_count);
127 | 	my $AF = join(",",@pass_alleles_freq);
128 | 	$info =~ s/(^|;)AF=[^;]*/${1}AF=$AF/;
129 | 	$info =~ s/(^|;)AC=[^;]*/${1}AC=$AC/;
130 | 	$info =~ s/(^|;)AN=[^;]*/${1}AN=$sample_size/;
131 | 	print join("\t",($chr,$pos,$id,$ref,join(",",@pass_alleles),$qual,$filter,$info,$format,@datas))."\n";
132 | }
133 | close $in;
134 | 
135 | print $logfh "Pass alleles: $num_pass_alleles\n\n";
136 | print $logfh "Nonexisting alleles: $nonexisting_alleles\n";
137 | print $logfh "Filtered alleles: $filtered_alleles\n";
138 | print $logfh "Total filtered variants: $filtered_vars\n";
139 | my $total_filtered_alleles = $nonexisting_alleles + $filtered_alleles;
140 | print $logfh "Total filtered alleles: $total_filtered_alleles\n";
141 | close $logfh;
142 | 


--------------------------------------------------------------------------------
/analysis/sample_listing/make_clin_summary_table.R:
--------------------------------------------------------------------------------
 1 | ##### compile_compare_samples.R #####
 2 | # Kuan-lin Huang @ WashU 2017 Oct.
 3 | # Compile the per-cancer clinical summary (sample size, female fraction,
 4 | # ethnicity counts, age at onset) used as a clinical supplementary table
 5 | # for the pan-germline manuscript.
 6 | 
 7 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/sample_listing")
 8 | source("../global_aes_out.R")
 9 | 
10 | ### clinical annotation ###
11 | clin_f <- "/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/clinical/PanCan_ClinicalData_V4_wAIM_filtered10389.txt"
12 | clin_full <- read.table(file = clin_f, header = TRUE, sep = "\t", quote = "", fill = TRUE, stringsAsFactors = FALSE)
13 | clin <- clin_full[, c("bcr_patient_barcode", "type", "age_at_initial_pathologic_diagnosis", "gender", "race")]
14 | colnames(clin) <- c("sample", "cancer", "age_at_onset", "gender", "ethnicity")
15 | 
16 | # placeholder race strings are recoded as missing
17 | clin$ethnicity[clin$ethnicity %in% c("", "[Not Available]", "[Not Evaluated]", "[Unknown]")] <- NA
18 | 
19 | ### sample list, reduced to the (sample, cancer) pair ###
20 | s_c_list_f <- "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/sampleQC/pca_table.20171118.filtered.wclin.tsv"
21 | sample_cancer <- read.table(file = s_c_list_f, header = TRUE, sep = "\t", quote = "", stringsAsFactors = FALSE)
22 | sample_cancer <- sample_cancer[, c("bcr_patient_barcode", "cancer")]
23 | colnames(sample_cancer) <- c("sample", "cancer")
24 | 
25 | # left join: every listed sample is kept, with NA clinical fields when unmatched
26 | sample_cancer_clin <- merge(sample_cancer, clin, by = c("sample", "cancer"), all.x = TRUE)
27 | mean(sample_cancer_clin$age_at_onset, na.rm = TRUE)
28 | sample_cancer_clin$gender[sample_cancer_clin$gender == ""] <- NA
29 | 
30 | ### per-cancer summaries, each keyed by a "Cancer" column ###
31 | # sample size
32 | cancer_count <- data.frame(table(data = sample_cancer_clin$cancer))
33 | colnames(cancer_count) <- c("Cancer", "Sample size")
34 | 
35 | # ethnicity counts, one column per ethnicity value
36 | cancer_ethni_count <- as.data.frame(table(sample_cancer_clin$cancer, sample_cancer_clin$ethnicity))
37 | cancer_ethni_count_d <- dcast(cancer_ethni_count, formula = Var1 ~ Var2, value.var = "Freq")
38 | colnames(cancer_ethni_count_d)[1] <- c("Cancer")
39 | 
40 | # female fraction (stored as a 0-1 proportion despite the column name)
41 | cancer_gender_count <- as.data.frame(table(sample_cancer_clin$cancer, sample_cancer_clin$gender))
42 | cancer_gender_count_d <- dcast(cancer_gender_count, formula = Var1 ~ Var2, value.var = "Freq")
43 | cancer_gender_count_d$FemalePercent <- cancer_gender_count_d$FEMALE / (cancer_gender_count_d$FEMALE + cancer_gender_count_d$MALE)
44 | colnames(cancer_gender_count_d)[1] <- "Cancer"
45 | cancer_gender_count_d_sum <- cancer_gender_count_d[, c("Cancer", "FemalePercent")]
46 | 
47 | # age at onset: mean and standard deviation
48 | cancer_aao <- data.frame(aggregate(age_at_onset ~ cancer, data = sample_cancer_clin, FUN = "mean"))
49 | colnames(cancer_aao) <- c("Cancer", "Average_AAO")
50 | cancer_aao_sd <- data.frame(aggregate(age_at_onset ~ cancer, data = sample_cancer_clin, FUN = "sd"))
51 | colnames(cancer_aao_sd) <- c("Cancer", "AAO_SD")
52 | 
53 | ### assemble the table ###
54 | cancer_count_gender <- merge(cancer_count, cancer_gender_count_d_sum, by = "Cancer")
55 | cancer_count_wethni <- merge(cancer_count_gender, cancer_ethni_count_d, by = "Cancer")
56 | cancer_count_wethni_waao <- merge(cancer_count_wethni, cancer_aao, by = "Cancer")
57 | cancer_count_wethni_waao_sd <- merge(cancer_count_wethni_waao, cancer_aao_sd, by = "Cancer")
58 | cancer_count_wethni_waao_sd$AAO <- paste(round(cancer_count_wethni_waao_sd$Average_AAO, 1), "+/-", round(cancer_count_wethni_waao_sd$AAO_SD, 1))
59 | 
60 | # headers become lowercase with a capital first letter (e.g. Average_AAO -> Average_aao)
61 | lower_names <- tolower(colnames(cancer_count_wethni_waao_sd))
62 | colnames(cancer_count_wethni_waao_sd) <- paste(toupper(substring(lower_names, 1, 1)), substring(lower_names, 2), sep = "")
63 | 
64 | ### cohort-wide "All" row ###
65 | all_sum <- sum(as.numeric(cancer_count_wethni_waao_sd[, 2]))
66 | # columns 4-8 are selected by position; presumably the five ethnicity count columns -- verify if the input changes
67 | all_sum_ethni <- sapply(cancer_count_wethni_waao_sd[, 4:8], sum)
68 | n_female <- sum(sample_cancer_clin$gender == "FEMALE", na.rm = TRUE)
69 | n_male <- sum(sample_cancer_clin$gender == "MALE", na.rm = TRUE)
70 | gender_ratio <- n_female / (n_female + n_male)
71 | all_aao <- round(mean(sample_cancer_clin$age_at_onset, na.rm = TRUE), 1)
72 | all_aao_sd <- round(sd(sample_cancer_clin$age_at_onset, na.rm = TRUE), 1)
73 | all_row <- c("All", all_sum, gender_ratio, all_sum_ethni, all_aao, all_aao_sd, paste(all_aao, "+/-", all_aao_sd))
74 | # Cancer must be character so that "All" can be appended
75 | cancer_count_wethni_waao_sd$Cancer <- as.character(cancer_count_wethni_waao_sd$Cancer)
76 | cancer_count_wethni_waao_sd <- rbind(cancer_count_wethni_waao_sd, all_row)
77 | 
78 | # the separate mean/sd columns are dropped; the combined "Aao" string column stays
79 | keep_col <- !(colnames(cancer_count_wethni_waao_sd) %in% c("Average_aao", "Aao_sd"))
80 | cancer_count_wethni_waao_sd_p <- cancer_count_wethni_waao_sd[, keep_col]
81 | #colnames(cancer_count_wethni_waao_sd_p) = c("Cancer", "Sample size", "American indian", "Asian", "African american", "Pacific islander", "White", "Age at onset")
82 | 
83 | tn <- "out/cancer_count_wethni_waao.txt"
84 | write.table(cancer_count_wethni_waao_sd_p, file = tn, quote = FALSE, sep = "\t", row.names = FALSE)
85 | 


--------------------------------------------------------------------------------
/analysis/segregation_analysis/batch_run_segregation.sh:
--------------------------------------------------------------------------------
 1 | # Run find_segregating_var.sh on every whitelisted VCF, at most 8 at a time.
 2 | 
 3 | # not in ExAC
 4 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz)
 5 | # start the job counter explicitly so a value inherited from the environment cannot skew the throttle
 6 | NPROC=0
 7 | for file in $VCFs; do 
 8 | 	bash find_segregating_var.sh "$file" &
 9 | 
10 | 	NPROC=$((NPROC+1))
11 | 	# after 8 background jobs have been launched, block until all of them finish
12 | 	if [ "$NPROC" -ge 8 ]; then
13 | 		wait
14 | 		NPROC=0
15 | 	fi
16 | done
17 | # collect the last, partially filled batch before moving on
18 | wait
19 | 
20 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz*
21 | 
22 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz
23 | bash find_segregating_var.sh "$file"
24 | 


--------------------------------------------------------------------------------
/analysis/segregation_analysis/find_relatives.R:
--------------------------------------------------------------------------------
 1 | ##### find_relatives.R #####
 2 | # Kuan-lin Huang @ WashU 2017 July
 3 | # Select sample pairs with elevated IBD sharing (PLINK Z1/Z2) within the same
 4 | # assigned ethnicity, plot them, and write the candidate relative pairs to out/.
 5 | 
 6 | setwd("/Users/khuang/Box Sync/PhD/germline/PanCanAtlasGermline/analysis/segregation_analysis")
 7 | source("../global_aes_out.R")
 8 | # create the output directory if needed; silent when it already exists
 9 | dir.create("out", showWarnings = FALSE)
10 | # ethnicity assignment
11 | ethni_fn = "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/PCA_IBD_dist/out/2017-04-24/2017-04-24_GDAN_AIM_PCA_ethnicity_assigned_WashU.tsv"
12 | ethni = read.table(header=T, sep = '\t',file=ethni_fn)
13 | ethni_short = ethni[,c("Case","cancer","washu_assigned_ethnicity")]
14 | 
15 | 
16 | #genome file for all the pi HAT
17 | rel_f = "plink_out/all.normal.merge.indep_50_5_2.vcf.ibd.genome.PI_HAT0.05.tsv"
18 | rel = read.table(header=T, quote = "", sep="\t", row.names =NULL, file = rel_f, stringsAsFactors=FALSE)
19 | # which() drops comparisons that evaluate to NA; plain logical indexing would
20 | # instead insert an all-NA row for each of them
21 | rel_short = rel[which(rel$Z1 + rel$Z2 > 0.2),]
22 | # the first 12 characters of each ID are used as the sample key for the ethnicity merge
23 | rel_short$Sample1 = gsub("(.{12}).*","\\1",as.character(rel_short$FID1))
24 | rel_short$Sample2 = gsub("(.{12}).*","\\1",as.character(rel_short$FID2))
25 | 
26 | # annotate both members of each pair (left joins: unmatched samples get NA)
27 | colnames(ethni_short) = c("Sample1","cancer1","assigned_ethnicity1")
28 | rel_short_m1 = merge(rel_short,ethni_short, by="Sample1",all.x=T)
29 | colnames(ethni_short) = c("Sample2","cancer2","assigned_ethnicity2")
30 | rel_short_m2 = merge(rel_short_m1,ethni_short, by="Sample2",all.x=T)
31 | rel_short_m2$same_sample = (rel_short_m2$Sample1 == rel_short_m2$Sample2)
32 | 
33 | # keep pairs whose two members share a known ethnicity; pairs with a missing
34 | # assignment are excluded rather than turned into NA rows
35 | same_ethni = which(as.character(rel_short_m2$assigned_ethnicity2) == as.character(rel_short_m2$assigned_ethnicity1))
36 | rel_short_m2_same = rel_short_m2[same_ethni,]
37 | rel_short_m2_same_withethni = rel_short_m2_same[which(rel_short_m2_same$assigned_ethnicity1!="unknown"),]
38 | rel_short_m2_same_withethni$assigned_ethnicity1 = as.character(rel_short_m2_same_withethni$assigned_ethnicity1)
39 | 
40 | p = ggplot(data=rel_short_m2_same_withethni,aes(x = Z1, y=Z2, color=same_sample))
41 | p = p + facet_grid(.~assigned_ethnicity1, drop=T, scales="free",space="free")
42 | p = p + geom_point(alpha=0.2)  
43 | p = p + theme_bw()  #+ expand_limits(y=1)#+ guides(fill=FALSE)
44 | p = p + xlim(0,1) + ylim(0,1)
45 | #p = p + geom_vline(xintercept = 0,alpha=.7) + geom_vline(xintercept = 1,alpha=.7)
46 | p
47 | # pd is not defined in this script; presumably an output prefix set by global_aes_out.R
48 | fn = paste(pd, "PanCanAtlas_rel_z1.z2_withinEthni.pdf", sep="_")
49 | ggsave(file=fn, plot=p, height=5, width=12, useDingbats=FALSE)
50 | 
51 | # candidate relatives: pairs of two different samples passing the Z1/Z2 thresholds
52 | # (the first clause requires Z1 > 0.25, so it is already covered by Z1 > 0.20)
53 | rel_short_m2_same_withethni_ofinterest = rel_short_m2_same_withethni[which(!rel_short_m2_same_withethni$same_sample),]
54 | rel_short_m2_same_withethni_ofinterest = rel_short_m2_same_withethni_ofinterest[which((rel_short_m2_same_withethni_ofinterest$Z2 > 0.125 & rel_short_m2_same_withethni_ofinterest$Z1 > 0.25) |
55 |                                                                                   (rel_short_m2_same_withethni_ofinterest$Z1 > 0.20)) ,]
56 | 
57 | tn = "out/TCGA_z1_z2_relatives.tsv"
58 | write.table(rel_short_m2_same_withethni_ofinterest,quote=F, sep = '\t', row.names = FALSE,file=tn)
59 | 
60 | # stricter subset of the candidates above
61 | rel_short_m2_same_withethni_ofinterest2 = rel_short_m2_same_withethni_ofinterest[which((rel_short_m2_same_withethni_ofinterest$Z2 > 0.125 & rel_short_m2_same_withethni_ofinterest$Z1 > 0.25) |
62 |                                                                                    (rel_short_m2_same_withethni_ofinterest$Z1 > 0.625)) ,]
63 | #rel_short_m2_same_withethni_ofinterest2[,c("Sample1","Sample2","cancer1","cancer2")]
64 | tn = "out/TCGA_z1_z2_relatives_strict.tsv"
65 | write.table(rel_short_m2_same_withethni_ofinterest2,quote=F, sep = '\t', row.names = FALSE,file=tn)
66 | 


--------------------------------------------------------------------------------
/analysis/segregation_analysis/find_segregating_var.sh:
--------------------------------------------------------------------------------
 1 | # run segregation python script
 2 | # usage: bash find_segregating_var.sh <gs:// URL of a vcf.gz>
 3 | file=$1
 4 | # the trailing * also copies files sharing the prefix (the .tbi removed below)
 5 | gsutil cp "$file"* .
 6 | vcfName=${file##*/}
 7 | relFile=TCGA_z1_z2_relatives.tsv
 8 | # find_shared_var_relatives.py writes to "<relatives file>.<vcf name>" with
 9 | # every "vcf.gz" replaced by "segragatingVar.tsv"; mirror that name here
10 | outTSV="${relFile}.${vcfName}"
11 | outTSV="${outTSV//vcf.gz/segragatingVar.tsv}"
12 | echo "Searching ${vcfName} for segregating variants; output in ${outTSV}"
13 | python find_shared_var_relatives.py "$relFile" "$vcfName"
14 | ls -klh "$outTSV"
15 | 
16 | # remove the local copies of the inputs
17 | rm -f "$vcfName" "${vcfName}.tbi"


--------------------------------------------------------------------------------
/analysis/segregation_analysis/find_shared_var_relatives.py:
--------------------------------------------------------------------------------
  1 | #!/bin/python
  2 | #Aug 2017 - Kuan-Lin Huang @ WashU - 
  3 | # find_shared variants in relatives identified in TCGA samples
  4 | 
  5 | import sys
  6 | import getopt
  7 | import gzip
  8 | 
  9 | def main():
 10 |     def usage():
 11 |         print """
 12 |     find_shared_var_relatives.py : why do I exist?
 13 | 
 14 |     USAGE: find_shared_var_relatives.py [-h] <file with relative pairs> <vcf.gz file to be scanned>
 15 |      -h    print this message
 16 |      <filename>    input file
 17 |         """
 18 | 
 19 |     #use getopt to get inputs
 20 |     try:
 21 |         opts, args = getopt.getopt(sys.argv[1:], 'h') #:after option meaning required arguments
 22 |     except getopt.GetoptError:
 23 |         print "find_shared_var_relatives.py <file with relative pairs> <vcf.gz file to be scanned>"
 24 | 
 25 |     for opt, arg in opts: #store the input options
 26 |         if opt == '-h': # h means user needs help
 27 |             usage(); sys.exit()
 28 | 
 29 |     args = sys.argv[1:]
 30 |     if len(args) < 1:
 31 |         usage(); sys.exit("input file missing")
 32 | 
 33 |     #open input file
 34 |     try:
 35 |         fn = args[0]
 36 |         relativeF = open(fn ,"r")
 37 |     except IOError:
 38 |         print("File , args[0], does not exist!")
 39 | 
 40 |     # dictionaries
 41 |     relative2relative = {}
 42 |     existing_relatives = {}
 43 |     sample2colID = {}
 44 | 
 45 |     # output file
 46 |     outFstring = args[0] + "." + args[1]
 47 |     #outF = outFstring.replace("vcf","segragatingVar.tsv")
 48 |     outF = outFstring.replace("vcf.gz","segragatingVar.tsv")
 49 |     outFH = open(outF, "w")
 50 | 
 51 |     #read input file
 52 |     for line in relativeF:
 53 |         line=line.strip()
 54 |         F = line.split("\t")
 55 |         #print str(len(F)) + "\n"
 56 |         if len(F)==2:
 57 |             print "Searching for segregating variants in relative pairs: "+ F[0] + ":" + F[1] + "\n"
 58 |             relative2relative[F[0]] = F[1]
 59 |     relativeF.close()
 60 | 
 61 | 
 62 |     try:
 63 |         fn = args[1]
 64 |         if fn.endswith(".gz"):
 65 |             vcfF = gzip.open(fn,"r")
 66 |         elif fn.endswith(".vcf"): 
 67 |             vcfF = open(fn,"r")
 68 |     except IOError:
 69 |         print("File , args[1], does not exist or is not a valid vcf!")
 70 | 
 71 |     colnames = "sample1\tsample2\tsample1GENO\tsample2GENO\tCHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n"
 72 |     outFH.write(colnames)
 73 |     #read input file
 74 |     for line in vcfF:
 75 |         line=line.strip()
 76 | 
 77 |         # print headers
 78 |         if line.startswith("##"):
 79 |             #print line
 80 |             continue      
 81 |         
 82 |         F = line.split("\t")
 83 |         colNum = len(F)
 84 |         
 85 |         if line.startswith("#CHR"): # row with column names
 86 |             #print line          
 87 |             for i in range(0,colNum):
 88 |                 TCGA_barcode = F[i][0:12]
 89 |                 sample2colID[TCGA_barcode] = i
 90 |             for sample in relative2relative:
 91 |                 relative = relative2relative[sample]
 92 |                 # only keep the existing pairs to save iteration time later on
 93 |                 if sample in sample2colID and relative in sample2colID:
 94 |                     existing_relatives[sample] = relative
 95 | 
 96 |         else:
 97 |             INFO = F[7]
 98 |             INFOsplit = INFO.split(";AC=")
 99 |             AC = int( INFOsplit[1] )
100 |             if AC > 20:
101 |                 continue
102 |             # loop through the relatives; if one has the var, check the other; only print if both has it
103 |             for sample in existing_relatives:
104 |                 relative = relative2relative[sample]
105 |                 sampleColID = sample2colID[sample]
106 |                 relativeColID = sample2colID[relative]
107 |                 sampleGeno = F[sampleColID]
108 |                 relativeGeno = F[relativeColID]
109 |                 # print line if segragating var found! ( BOTH NOT WT )
110 |                 if not sampleGeno.startswith("./.") and not relativeGeno.startswith("./."): 
111 |                     outLine = sample + "\t" + relative + "\t" + sampleGeno + "\t" + relativeGeno + "\t".join(F[0:8]) + "\n"
112 |                     outFH.write(outLine)
113 |                     #print outLine
114 |                     
115 |     
116 |     vcfF.close()
117 | 
118 | if __name__ == "__main__":
119 |     main()
120 | 


--------------------------------------------------------------------------------
/analysis/segregation_analysis/segregation.log.sh:
--------------------------------------------------------------------------------
1 | # 201708 Kuan-lin Huang @ WashU
2 | 
3 | # transfer needed files to VM
4 | gcloud compute scp find_shared_var_relatives.py huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c
5 | gcloud compute scp out/TCGA_z1_z2_relatives.tsv huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c
6 | gcloud compute scp *.sh huangkuanlin@kuan-8cpu-30gb:~/ --zone us-central1-c
7 | 
8 | # run script to find segregating variants


--------------------------------------------------------------------------------
/analysis/variant_QC/batch_run_pseq_stats.sh:
--------------------------------------------------------------------------------
 1 | # Run run_pseq_stats.sh on per-chromosome VCFs (chr17-22 here), at most 8 at a time.
 2 | 
 3 | # not in ExAC
 4 | #VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz)
 5 | #for file in $VCFs; do
 6 | # start the job counter explicitly so a value inherited from the environment cannot skew the throttle
 7 | NPROC=0
 8 | for i in {17..22}; do 
 9 | 	file=gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr${i}.anno.whitelist.vcf.gz
10 | 	bash run_pseq_stats.sh "$file" &
11 | 
12 | 	NPROC=$((NPROC+1))
13 | 	# after 8 background jobs have been launched, block until all of them finish
14 | 	if [ "$NPROC" -ge 8 ]; then
15 | 		wait
16 | 		NPROC=0
17 | 	fi
18 | done
19 | # do not return while background jobs are still running
20 | wait
21 | 
22 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz*
23 | 
24 | #file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz
25 | #bash run_pseq_stats.sh $file
26 | 


--------------------------------------------------------------------------------
/analysis/variant_QC/batch_run_pseq_vcfstats.sh:
--------------------------------------------------------------------------------
 1 | # not in ExAC
 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz)
 3 | for file in $VCFs; do 
 4 | 	gsutil cp $file* .
 5 | 	vcfName=${file##*/}
 6 | 	date
 7 | 	echo "Examining vcf-stats for ${vcfName}"
 8 | 	../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv
 9 | 	../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv
10 | 	#vcf-stats $vcfName > ${vcfName}.vcfstats.json
11 | 	
12 | 	rm -f $vcfName
13 | 	rm -f ${vcfName}.tbi
14 | done
15 | 
16 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz*
17 | 
18 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz
19 | gsutil cp $file* .
20 |    	vcfName=${file##*/}
21 |         date
22 | 	echo "Examining vcf-stats for ${vcfName}"
23 |         ../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv
24 |         ../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv
25 | 	#vcf-stats $vcfName > ${vcfName}.vcfstats.json
26 |         
27 |         rm -f $vcfName
28 |         rm -f ${vcfName}.tbi
29 | 


--------------------------------------------------------------------------------
/analysis/variant_QC/batch_run_vcfstats.sh:
--------------------------------------------------------------------------------
 1 | # not in ExAC
 2 | VCFs=$(gsutil ls gs://dinglab/isb-cgc/tcga/germline/production/merge_v2/not-in-exac/3.annotated/whitelisted/by_tcgaBarcode/PCA.merge.tcgaBarcode.chr*.anno.whitelist.vcf.gz)
 3 | for file in $VCFs; do 
 4 | 	gsutil cp $file* .
 5 | 	vcfName=${file##*/}
 6 | 	date
 7 | 	echo "Examining vcf-stats for ${vcfName}"
 8 | 	vcf-stats $vcfName > ${vcfName}.vcfstats.json
 9 | 	
10 | 	rm -f $vcfName
11 | 	rm -f ${vcfName}.tbi
12 | done
13 | 
14 | # in ExAC: gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz*
15 | 
16 | file=gs://dinglab/isb-cgc/tcga/germline/production/merge.ExAConly/all/annotated/by_tcgaBarcode/PCA.merge.ExAConly.tcgaBarcode.anno.whitelist.vcf.gz
17 | gsutil cp $file* .
18 |    	vcfName=${file##*/}
19 |         date
20 | 	echo "Examining vcf-stats for ${vcfName}"
21 |         vcf-stats $vcfName > ${vcfName}.vcfstats.json
22 |         
23 |         rm -f $vcfName
24 |         rm -f ${vcfName}.tbi
25 | 


--------------------------------------------------------------------------------
/analysis/variant_QC/plot_concordance.R:
--------------------------------------------------------------------------------
 1 | ##### plot_concordance.R #####
 2 | # Kuan-lin Huang @ WashU 2017 Aug./Sep.
 3 | # Plot per-sample validated vs. unvalidated call counts (chr22 only) and list
 4 | # the samples where unvalidated calls dominate.
 5 | 
 6 | setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/variant_QC/")
 7 | source("../global_aes_out.R")
 8 | system("mkdir out")
 9 | 
10 | ##### individual level stats #####
11 | fileName <- "out/run_calc_vcf_concordance.chr22only.sampleID.GT.txt"
12 | concordance <- read.table(file = fileName, header = FALSE, sep = " ")
13 | colnames(concordance) <- c("sample", "validated", "unvalidated")
14 | 
15 | # overall validated fraction, taken from the summary row labelled "All"
16 | is_all <- concordance$sample == "All"
17 | concordance$validated[is_all] / (concordance$validated[is_all] + concordance$unvalidated[is_all])
18 | # rows 9236-9240 are dropped by position; presumably the summary rows at the end of this file -- verify if the input changes
19 | concordance <- concordance[-(9236:9240), ]
20 | 
21 | 
22 | clin_f <- "/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/TCGA_data/clinical/all.clin.merged.picked.txt"
23 | clin <- read.table(file = clin_f, header = TRUE, sep = "\t", quote = "", row.names = NULL, stringsAsFactors = FALSE)
24 | clin <- clin[, c("sample", "cancer", "race")]
25 | colnames(clin) <- c("sample", "cancer", "ethnicity")
26 | # rows from these three cohorts are excluded before merging
27 | clin <- clin[!(clin$cancer %in% c("GBMLGG", "COAD", "KIPAN")), ]
28 | 
29 | # left join: every sample with concordance counts is kept
30 | concordance_clin <- merge(concordance, clin, by = "sample", all.x = TRUE)
31 | 
32 | p <- ggplot(data = concordance_clin, aes(x = validated, y = unvalidated, color = cancer)) +
33 |   geom_point(alpha = 0.1, stroke = 0) +
34 |   theme_bw() + theme(legend.position = "bottom") +
35 |   # label only the outlying samples
36 |   geom_label(aes(label = ifelse(unvalidated > validated | unvalidated > 50, as.character(sample), NA)), size = 1)
37 | p
38 | fn <- 'out/chr22_validated_vs_unvalidated.pdf'
39 | ggsave(file = fn, useDingbats = FALSE, limitsize = FALSE)
40 | 
41 | # same outlier criterion as the labels above
42 | concordance_clin[concordance_clin$unvalidated > concordance_clin$validated | concordance_clin$unvalidated > 50, ]
43 | 
44 | # ##### compare to pan8000 #####
45 | # miss = read.table(header=F, sep="\t", "out/PCA_pan8000_missense_concordance.txt")
46 | # trun = read.table(header=F, sep="\t", "out/PCA_pan8000_truncation_concordance.txt")
47 | # PCA_sample = as.vector(t(read.table(header=F, sep="\t", "out/PCA_samples.txt")))
48 | # colnames(miss) = c("Var","Sample","Validated")
49 | # colnames(trun) = c("Var","Sample","Validated")
50 | # 
51 | # PCA_sample_ID = substr(PCA_sample,1,12)
52 | # miss$inPCA = miss$Sample %in% PCA_sample_ID
53 | # trun$inPCA = trun$Sample %in% PCA_sample_ID
54 | # 
55 | # table(miss$Validated,miss$inPCA)
56 | # table(trun$Validated,trun$inPCA)
57 | 


--------------------------------------------------------------------------------
/analysis/variant_QC/run_pseq_stats.sh:
--------------------------------------------------------------------------------
 1 | file=$1
 2 | gsutil cp $file* .
 3 |         vcfName=${file##*/}
 4 |         date
 5 |         echo "Examining vcf-stats for ${vcfName}"
 6 |         ../plinkseq/build/execs/pseq $vcfName v-stats > ${vcfName}.pseq.vstats.tsv
 7 |         ../plinkseq/build/execs/pseq $vcfName i-stats > ${vcfName}.pseq.istats.tsv
 8 |         #vcf-stats $vcfName > ${vcfName}.vcfstats.json
 9 | 
10 |         rm -f $vcfName
11 |         rm -f ${vcfName}.tbi
12 | 


--------------------------------------------------------------------------------
/doc/20170118_TCGA_Germline_Abstract.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ding-lab/PanCanAtlasGermline/e57bfec6660bccf359154ae551ebffb06c42470b/doc/20170118_TCGA_Germline_Abstract.docx


--------------------------------------------------------------------------------
/doc/notes.txt:
--------------------------------------------------------------------------------
1 | # Note on vcf merging and processing: 
2 | # We could potentially use vcfanno to quickly annotate VCFs with ExAC vs. TCGA frequencies and similar fields:
3 | https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0973-5
4 | https://github.com/brentp/vcfanno/tree/master/docs/examples/exac_combine
5 | 
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/germline_somatic_analysis/load_somatic.R:
--------------------------------------------------------------------------------
 1 | ##### load_somatic.R #####
 2 | # Kuan-lin Huang 2018
 3 | # load somatic mutation/driver/gene list files
 4 | # NOTE: `clin` and `pathVar` are used below but not defined in this file;
 5 | # they must already exist in the environment that sources it.
 6 | 
 7 | ### MAIN ###
 8 | # driver gene list, flattened to a plain character vector
 9 | somaticDriver299_f <- "../../TCGA_data/somatic/Driver_BaileyCell2018/299driverGene.txt"
10 | somaticDriver299_tab <- read.table(file = somaticDriver299_f, header = FALSE, sep = "\t", quote = "", stringsAsFactors = FALSE)
11 | somaticDriver299 <- as.vector(t(somaticDriver299_tab))
12 | 
13 | # somatic mutation calls; the patient barcode is the TCGA-XX-XXXX prefix of the tumor sample barcode
14 | somatic_f <- "../../TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz"
15 | somatic <- read.table(file = gzfile(somatic_f), header = TRUE, sep = "\t", quote = "", stringsAsFactors = FALSE)
16 | somatic$bcr_patient_barcode <- gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$Tumor_Sample_Barcode)
17 | 
18 | # attach the cancer type; inner join, so mutations without a clinical row are dropped
19 | clin_cmap <- clin[,c("bcr_patient_barcode","type"),]
20 | colnames(clin_cmap)[2] <- "cancer"
21 | somatic <- merge(somatic, clin_cmap, by = "bcr_patient_barcode")
22 | # somatic_mut_count = data.frame(table(somatic$bcr_patient_barcode))
23 | # colnames(somatic_mut_count) = c("bcr_patient_barcode","MutationCount")
24 | 
25 | table(somatic$Variant_Classification)
26 | likelyFunctionalTypes <- c("Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation",
27 |                            "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site")
28 | # restrict to genes of interest (pathVar genes or the driver list) and likely-functional classes
29 | in_gene_set <- somatic$Hugo_Symbol %in% c(pathVar$HUGO_Symbol, somaticDriver299)
30 | is_likely_functional <- somatic$Variant_Classification %in% likelyFunctionalTypes
31 | somatic_likelyfunctional <- somatic[in_gene_set & is_likely_functional, ]
32 | 
33 | # driver mutation
34 | driver_f <- "../../TCGA_data/somatic/Driver_BaileyCell2018/Mutation.CTAT.3D.Scores.txt.gz"
35 | driver <- read.table(file = gzfile(driver_f), header = TRUE, sep = "\t", quote = "", stringsAsFactors = FALSE)
36 | colnames(driver) <- gsub("\\.","_",colnames(driver))
37 | # number of evidence flags raised per mutation (sum of the three flag columns)
38 | driver$numOfEvidence <- driver$New_Linear__cancer_focused__flag + driver$New_Linear__functional__flag + driver$New_3D_mutational_hotspot_flag
39 | table(driver$numOfEvidence)
40 | driver_func <- driver[driver$numOfEvidence > 1, ]
41 | # keep every non-missense mutation; keep a missense only if its gene + protein change is in driver_func
42 | not_missense <- somatic_likelyfunctional$Variant_Classification != "Missense_Mutation"
43 | is_driver_change <- paste(somatic_likelyfunctional$Hugo_Symbol, somatic_likelyfunctional$HGVSp_Short) %in% paste(driver_func$gene, driver_func$protein_change)
44 | somatic_likelyfunctional_driver <- somatic_likelyfunctional[not_missense | is_driver_change, ]
45 | 


--------------------------------------------------------------------------------
/germline_somatic_analysis/mutation_signature/2_plotPathVarMutsigAssoc.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarMutsigAssoc.R #####
 2 | # Kuan-lin Huang 2018
 3 | # Plot the association results in out/pathVarMutsigAssoc.txt as a heatmap and as a per-gene dot plot.
 4 | 
 5 | source("../global_aes_out.R")
 6 | source("../dependency_files.R")
 7 | 
 8 | tn <- "out/pathVarMutsigAssoc.txt"
 9 | tt <- read.table(file = tn, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
10 | 
11 | ### plotting ###
12 | # order the signatures numerically rather than alphabetically
13 | tt$signature <- factor(tt$signature)
14 | tt$signature <- factor(tt$signature, levels = c(
15 |   "Signature-1", "Signature-2", "Signature-3", "Signature-4", "Signature-5", "Signature-6",
16 |   "Signature-7", "Signature-8", "Signature-9", "Signature-10", "Signature-11", "Signature-12",
17 |   "Signature-13", "Signature-14", "Signature-15", "Signature-16", "Signature-17", "Signature-18",
18 |   "Signature-19", "Signature-20", "Signature-21", "Signature-22", "Signature-23", "Signature-24",
19 |   "Signature-25", "Signature-26", "Signature-27", "Signature-28", "Signature-29", "Signature-30"))
20 | # three tiers by FDR; the later assignment overrides the earlier one
21 | tt$association <- "None"
22 | tt$association[tt$FDR < 0.15] <- "Suggestive"
23 | tt$association[tt$FDR < 0.05] <- "Significant"
24 | tt$gene <- as.character(tt$gene)
25 | # natural-log transformed FDR, capped at 5
26 | tt$FDR_plot <- -log(tt$FDR)
27 | tt$FDR_plot[tt$FDR_plot > 5] <- 5
28 | # keep every row of any gene that has at least one association with FDR < 0.05
29 | uniqG <- unique(tt$gene[tt$FDR < 0.05])
30 | ttG <- tt[tt$gene %in% uniqG, ]
31 | 
32 | # heatmap: fill = coefficient, grey outline for FDR < 0.15, black outline for FDR < 0.05
33 | getPalette <- colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
34 | p <- ggplot(data = ttG)
35 | p <- p + facet_grid(gene~., space = "free", scale = "free")
36 | p <- p + geom_tile(data = ttG, aes(y = cancer, x = signature, fill = coefficient), linetype = "blank")
37 | p <- p + scale_fill_gradientn(name = "Coefficient", colours = getPalette(100), na.value = NA, limit = c(0,NA))
38 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
39 | p <- p + geom_tile(data = ttG[ttG$FDR < 0.15, ], aes(y = cancer, x = signature), color = "grey", fill = NA, size = 1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
40 | p <- p + geom_tile(data = ttG[ttG$FDR < 0.05, ], aes(y = cancer, x = signature), color = "black", fill = NA, size = 1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
41 | p <- p + theme_bw() + theme_nogrid()
42 | p <- p + theme(axis.title = element_blank(),
43 |                axis.text.x = element_text(colour = "black", size = 12, angle = 90, vjust = 0.5),
44 |                axis.text.y = element_text(colour = "black", size = 12),
45 |                axis.ticks = element_blank())#element_text(colour="black", size=14))
46 | p + labs(x="Signature",y = "Cancer")
47 | 
48 | # ggsave() is given no plot argument, so it saves ggplot2's last plot
49 | fn <- 'out/pan10389_germlineAssocWithmutSignatureHeatmap.pdf'
50 | ggsave(fn,h=8,useDingbat=F)
51 | 
52 | # # using GLM test result
53 | # tt$FDR_plot[tt$FDR_plot<10^(-6)]= 0.95*10^(-6)
54 | # p = ggplot(data=tt)
55 | # p = p + geom_point(aes(y=-log10(FDR_plot),x= coefficient,color = cancer),alpha=0.5)
56 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
57 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
58 | # p = p + geom_text_repel(aes(y=-log10(FDR_plot),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
59 | # p = p + getPCACancerColor()
60 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
61 | # p = p + geom_vline(xintercept = 0, alpha=0.5)
62 | # p = p  + theme_bw() +
63 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
64 | # p
65 | # fn = 'out/geneExpressAssocVolcanoGLM.pdf'
66 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
67 | 
68 | # # using the Wilcox test result
69 | # p = ggplot(data=tt)
70 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
71 | # #p = p + geom_text_repel(aes(y=-log10(FDR),x= cohort_AF,label=ifelse(FDR<0.05, Gene,NA),color = Cancer))#,alpha=1.3)
72 | # #p = p + geom_point(aes(y=cohort_AF,x=Cancer,size=-log10(FDR),color = Cancer))
73 | # p = p + geom_text_repel(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer,label=ifelse(FDR<0.05, gene,NA)))
74 | # p = p + getPCACancerColor()
75 | # p = p + labs(x="Coefficient",y= "-log10(FDR)")
76 | # p = p + geom_vline(xintercept = 0, alpha=0.5)
77 | # p = p  + theme_bw() +
78 | #   theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
79 | # p
80 | # fn = 'out/geneExpressAssocVolcanoWCOX.pdf'
81 | # ggsave(fn,w = 5, h = 5, useDingbat=F)
82 | 
83 | #tt$association = factor(tt$association,level=c("None","Suggestive","Significant"))
84 | 
85 | # plot by gene
86 | p <- ggplot(data = ttG, aes(x = coefficient, y = cancer, color = cancer))
87 | p <- p + facet_grid(gene~., space = "free", scale = "free")
88 | # p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
89 | p <- p + geom_point(aes(shape = association), alpha = 0.3, size = 2)
90 | # name the signature only for associations with FDR < 0.05
91 | p <- p + geom_text_repel(aes(label = ifelse(FDR < 0.05, as.character(signature), NA)))
92 | p <- p + getPCACancerColor()
93 | p <- p + labs(x="Cancer",y= "-log10(FDR)")
94 | p <- p + geom_vline(xintercept = 0, alpha = 0.5) #+ xlim(-3.1,3.1)
95 | p <- p + theme_bw()
96 | p <- p + theme(axis.text.x = element_text(colour = "black", size = 12),
97 |                axis.text.y = element_text(colour = "black", size = 12),
98 |                axis.ticks = element_blank())#element_text(colour="black", size=14))
99 | # the axis titles set above are replaced here on the plot that gets shown
100 | p + labs(x = "coefficient",y="cancer")
101 | fn <- 'out/pan10389_germlineAssocWithmutSignatureByGene.pdf'
102 | ggsave(fn,h=8,useDingbat=F)
103 | 


--------------------------------------------------------------------------------
/germline_somatic_analysis/mutation_signature/4_plotSomaticMutsigAssoc.R:
--------------------------------------------------------------------------------
 1 | ##### plotPathVarMutsigAssoc.R #####
 2 | # Kuan-lin Huang 2018
 3 | 
 4 | source("../global_aes_out.R")
 5 | source("../dependency_files.R")
 6 | 
 7 | g_tn = "out/pathVarMutsigAssoc.txt"
 8 | g_tt = read.table(sep="\t",header=T,file=g_tn, stringsAsFactors=FALSE)
 9 | 
10 | tn = "out/somaticMutMutsigAssoc.txt"
11 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
12 | 
13 | ### plotting ###
14 | tt$signature = factor(tt$signature)
15 | tt$signature = factor(tt$signature,levels = c("Signature-1","Signature-2","Signature-3","Signature-4","Signature-5","Signature-6"
16 |                                               ,"Signature-7","Signature-8","Signature-9","Signature-10","Signature-11","Signature-12"
17 |                                               ,"Signature-13","Signature-14","Signature-15","Signature-16","Signature-17","Signature-18"
18 |                                               ,"Signature-19","Signature-20","Signature-21","Signature-22","Signature-23","Signature-24"
19 |                                               ,"Signature-25","Signature-26","Signature-27","Signature-28","Signature-29","Signature-30"))
20 | tt$association = "None"
21 | tt$association[tt$FDR<0.15] = "Suggestive"
22 | tt$association[tt$FDR<0.05] = "Significant"
23 | tt$gene = as.character(tt$gene)
24 | tt$FDR_plot = -log(tt$FDR)
25 | tt$FDR_plot[tt$FDR_plot > 5 ] = 5
26 | #uniqG = unique(tt$gene[tt$FDR<0.05])
27 | uniqG = unique(g_tt$gene[g_tt$FDR<0.05]) #  plot just the germline genes for now
28 | ttG = tt[tt$gene %in% uniqG,]
29 | 
30 | # getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
31 | # p = ggplot(data=ttG)
32 | # p = p + facet_grid(gene~.,space="free",scale="free")
33 | # p = p + geom_tile(data=ttG,aes(y=cancer, x=signature, fill= coefficient), linetype="blank") + scale_fill_gradientn(name= "Coefficient", colours=getPalette(100), na.value=NA, limit=c(0,NA))
34 | # #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
35 | # #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
36 | # p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=cancer, x=signature), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
37 | # p = p  + theme_bw() + theme_nogrid() +
38 | #   theme(axis.title = element_blank(), axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
39 | # p + labs(x="Signature",y = "Cancer")
40 | # 
41 | # fn = 'out/SomaticWithmutSignatureHeatmap.pdf'
42 | # ggsave(fn,h=25,useDingbat=F)
43 | 
# plot by gene: one facet per gene, association coefficient (x) per cancer type (y)
p = ggplot(data=ttG,aes(x=coefficient,y=cancer,color = cancer))
p = p + facet_grid(gene~.,space="free",scale="free")
# p = p + geom_point(aes(y=-log10(wilcoxFDR),x= coefficient,color = cancer),alpha=0.5)
p = p + geom_point(aes(shape=association),alpha=0.3,size=2)
# NOTE: signature is a factor here, so ifelse() yields its integer level codes;
# with the Signature-1..30 level order the label is the signature number.
p = p + geom_text_repel(aes(label=ifelse(FDR<0.0005,signature,NA)))
p = p + getPCACancerColor()
p = p + geom_vline(xintercept = 0, alpha=0.5) #+ xlim(-3.1,3.1)
p = p  + theme_bw() +
  theme(axis.text.x = element_text(colour="black", size=12), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
# Axis titles match the mapped aesthetics; they are stored on p so the saved
# figure no longer depends on an unassigned `p + labs()` expression.
p = p + labs(x = "coefficient",y="cancer")
p
fn = 'out/SomaticWithmutSignatureByGene.pdf'
# Pass the plot explicitly and spell the arguments out instead of relying on
# last_plot() and partial argument matching.
ggsave(fn,plot=p,height=28,useDingbats=FALSE)
58 | 


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline/plotSomaticGermline.R:
--------------------------------------------------------------------------------
 1 | ##### plotSomaticGermline.R #####
 2 | # Kuan-lin Huang 2018
 3 | 
 4 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline/")
 5 | 
 6 | source("../global_aes_out.R")
 7 | source("../dependency_files.R")
 8 | 
 9 | tn = "out/germline_somatic_driver_fisher.tsv"
10 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
11 | 
12 | germlineG = unique(tt$GermlineGene[tt$P<0.05])
13 | somaticG = unique(tt$SomaticGene[tt$P<0.01])
14 | ttG = tt[tt$GermlineGene %in% germlineG & tt$SomaticGene %in% somaticG,]
15 | 
16 | # pre-plotting
17 | ttG$minusLogP = -log10(ttG$P) 
18 | ttG$plotP = ttG$minusLogP
19 | ttG$plotP[ttG$OR < 1] = -ttG$plotP[ttG$OR < 1] # opposite effect size (mutual exclusivity)
20 | ttG$plotP[ttG$plotP > 5] = 5
21 | ttG$plotP[ttG$plotP < -5] = -5
22 | 
23 | ### plotting ###
24 | 
25 | #getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
26 | p = ggplot(data=ttG)
27 | p = p + geom_tile(data=ttG,aes(y=SomaticGene, x=GermlineGene, fill= plotP), linetype="blank") + scale_fill_gradientn(name= "-log10(P)", colours=getPalette(100), na.value=NA, limit=c(-5,5))
28 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
29 | #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
30 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=SomaticGene, x=GermlineGene, fill= plotP), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
31 | p = p  + theme_bw() + theme_nogrid() +
32 |   theme(axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
33 | p = p + labs(title="Germline-somatic Interaction: Pan-cancer",x="Germline variant carrier",y = "Somatic mutation carrier")
34 | p
35 | fn = 'out/pan10389_germlineAssocWithSomaticHeatmap.pdf'
36 | ggsave(fn,useDingbat=F)


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline/plotSomaticGermlineByCancer.R:
--------------------------------------------------------------------------------
##### plotSomaticGermlineByCancer.R #####
 2 | # Kuan-lin Huang 2018
 3 | 
 4 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline/")
 5 | 
 6 | source("../global_aes_out.R")
 7 | source("../dependency_files.R")
 8 | 
 9 | tn = "out/germline_somatic_driver_fisher_byCancer.tsv"
10 | tt = read.table(sep="\t",header=T,file=tn, stringsAsFactors=FALSE)
11 | 
12 | germlineG = unique(tt$GermlineGene[tt$P<0.05])
13 | somaticG = unique(tt$SomaticGene[tt$P<0.05])
14 | cancer = unique(tt$Cancer[tt$P<0.05])
15 | ttG = tt[tt$GermlineGene %in% germlineG & tt$SomaticGene %in% somaticG & tt$Cancer %in% cancer,]
16 | 
17 | # pre-plotting
18 | ttG$minusLogP = -log10(ttG$P) 
19 | ttG$plotP = ttG$minusLogP
20 | ttG$plotP[ttG$OR < 1] = -ttG$plotP[ttG$OR < 1] # opposite effect size (mutual exclusivity)
21 | ttG$plotP[ttG$plotP > 5] = 5
22 | ttG$plotP[ttG$plotP < -5] = -5
23 | 
24 | ### plotting ###
25 | 
26 | #getPalette = colorRampPalette(c("#FFFFFF","#fed976","#e31a1c"))
27 | p = ggplot(data=ttG)
28 | p = p + facet_grid(.~Cancer, scale="free", space="free")
29 | p = p + geom_tile(data=ttG,aes(y=SomaticGene, x=GermlineGene, fill= plotP), linetype="blank") + scale_fill_gradientn(name= "-log10(P)", colours=getPalette(100), na.value=NA, limit=c(-5,5))
30 | #p = p + geom_text(data=ttG,aes(x=cancer, y=signature, label = coefficient), color="black", size=3)
31 | #p = p + geom_tile(data=ttG[ttG$FDR < 0.15,],aes(y=cancer, x=signature), color="grey",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
32 | p = p + geom_tile(data=ttG[ttG$FDR < 0.05,],aes(y=SomaticGene, x=GermlineGene, fill= plotP), color="black",fill=NA, size=1.5) #+ scale_color_gradientn(name= "Sig", colours=sigColors)
33 | p = p  + theme_bw() + theme_nogrid() +
34 |   theme( axis.text.x = element_text(colour="black", size=12, angle=90, vjust = 0.5), axis.text.y = element_text(colour="black", size=12),axis.ticks = element_blank())#element_text(colour="black", size=14))
35 | p = p + labs(title="Germline-somatic Interaction: Single-cancer",x="Germline variant carrier",y = "Somatic mutation carrier")
36 | p
37 | fn = 'out/pan10389_germlineAssocWithSomaticHeatmapByCancer.pdf'
38 | ggsave(fn,useDingbat=F)
39 | 


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline/somaticDriver_germline_fisher.R:
--------------------------------------------------------------------------------
 1 | ##### somaticDriver_germline_fisher.R #####
 2 | # Kuan-lin Huang @ WashU 2016 June
 3 | # updated 2018
# Conduct Fisher's exact test to find co-occurrence/mutual exclusivity of germline/somatic variants
 5 | # somatic driver only
 6 | 
 7 | ### dependencies ###
 8 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline")
 9 | source("../global_aes_out.R")
10 | source("../dependency_files.R")
11 | source("../load_somatic.R")
12 | 
13 | ### function ###
# Fisher's exact test for co-occurrence / mutual exclusivity between germline
# and somatic carrier status within a cohort.
#
# Args:
#   all_samples:       vector of sample IDs making up the tested cohort
#   germline_carriers: IDs of samples carrying the germline variant(s)
#   somatic_carriers:  IDs of samples carrying the somatic mutation(s)
#
# Returns:
#   A list with p, OR and the 2x2 cell counts count00, count10, count01, count11,
#   where the first digit is germline status and the second is somatic status
#   (e.g. count10 = germline carrier without the somatic mutation).
#   The list is always returned; p and OR are NA when the test is not run
#   (no sample without either event, or no somatic-only sample), so callers
#   never receive NULL.
sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){
  p = NA; OR = NA

  has_germline = all_samples %in% germline_carriers
  has_somatic = all_samples %in% somatic_carriers

  # filled column-wise below: [1,1]=00, [2,1]=10, [1,2]=01, [2,2]=11
  fisher_elements = c(sum(!has_germline & !has_somatic), sum(has_germline & !has_somatic),
                      sum(!has_germline & has_somatic), sum(has_germline & has_somatic))
  test.table = matrix(as.numeric(fisher_elements), nrow=2)

  count00 = test.table[1,1]
  count10 = test.table[2,1]
  count01 = test.table[1,2]
  count11 = test.table[2,2]

  # only test when both somatic-negative/germline-negative and somatic-only cells are populated
  if (count00 > 0 && count01 > 0){
    f.test = fisher.test(test.table)
    OR = unname(f.test$estimate)
    p = f.test$p.value
  }

  return(list("p"=p, "OR"=OR, "count00" = count00
              , "count10" = count10, "count01" = count01, "count11" = count11))
}
33 | 
### get input data and files ###

### germline ###
# limit analyses to germline genes with equal to or more than 10 variants
germ_gene_counts = table(pathVarP$HUGO_Symbol)
germ_list = names(germ_gene_counts[germ_gene_counts > 9])

### germline somatic interaction ###
# samples being tested
test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode)
cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n")

# run through function
out_columns = c("GermlineGene", "SomaticGene", "germline0somatic0", "germline1somatic0", "germline0somatic1", "germline1somatic1","OR", "P")
out_rows = list() # collect rows in a list and bind once, instead of growing a matrix
for (g_gene in germ_list){
  # carriers depend only on the germline gene, so compute them once per gene
  germline_carriers = unique(pathVarP$bcr_patient_barcode[pathVarP$HUGO_Symbol==g_gene])
  if (length(germline_carriers) <= 9) next

  for (s_gene in somaticDriver299){
    somatic_carriers = unique(somatic_likelyfunctional_driver$bcr_patient_barcode[somatic_likelyfunctional_driver$Hugo_Symbol==s_gene])
    if (length(somatic_carriers) <= 19) next

    t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers)
    # skip pairs for which no test was run; a NULL/NA result would otherwise
    # yield a short row that rbind silently recycles into the table
    if (is.null(t_result) || is.na(t_result$p)) next

    out_rows[[length(out_rows) + 1]] = c(g_gene, s_gene, t_result$count00, t_result$count10,
                                         t_result$count01, t_result$count11, t_result$OR, t_result$p)
  }
}
if (length(out_rows) > 0){
  out_table = do.call(rbind, out_rows)
} else {
  # no testable pair: still write a header-only table instead of failing on colnames<-
  out_table = matrix(character(0), nrow=0, ncol=length(out_columns))
}
rownames(out_table)=NULL
colnames(out_table) = out_columns
out_table = data.frame(out_table)
out_table[,"P"] = as.numeric(as.character(out_table[,"P"]))

FDR = p.adjust(out_table[,"P"],method="fdr")
out_table = cbind(out_table,FDR)
out_table = out_table[order(out_table[,"P"]),]

fn = paste("out/germline_somatic_driver_fisher.tsv")
write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t")


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline/somaticDriver_germline_fisher_byCancer.R:
--------------------------------------------------------------------------------
##### somaticDriver_germline_fisher_byCancer.R #####
# Kuan-lin Huang @ WashU 2016 June
# updated 2018
# Conduct Fisher's exact test, per cancer type, to find co-occurrence/mutual exclusivity of germline/somatic variants
 5 | # somatic driver only
 6 | 
 7 | ### dependencies ###
 8 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline")
 9 | source("../global_aes_out.R")
10 | source("../dependency_files.R")
11 | source("../load_somatic.R")
12 | 
13 | ### function ###
# Fisher's exact test for co-occurrence / mutual exclusivity between germline
# and somatic carrier status within a cohort.
#
# Args:
#   all_samples:       vector of sample IDs making up the tested cohort
#   germline_carriers: IDs of samples carrying the germline variant(s)
#   somatic_carriers:  IDs of samples carrying the somatic mutation(s)
#
# Returns:
#   A list with p, OR and the 2x2 cell counts count00, count10, count01, count11,
#   where the first digit is germline status and the second is somatic status
#   (e.g. count10 = germline carrier without the somatic mutation).
#   The list is always returned; p and OR are NA when the test is not run
#   (no sample without either event, or no somatic-only sample), so callers
#   never receive NULL.
sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){
  p = NA; OR = NA

  has_germline = all_samples %in% germline_carriers
  has_somatic = all_samples %in% somatic_carriers

  # filled column-wise below: [1,1]=00, [2,1]=10, [1,2]=01, [2,2]=11
  fisher_elements = c(sum(!has_germline & !has_somatic), sum(has_germline & !has_somatic),
                      sum(!has_germline & has_somatic), sum(has_germline & has_somatic))
  test.table = matrix(as.numeric(fisher_elements), nrow=2)

  count00 = test.table[1,1]
  count10 = test.table[2,1]
  count01 = test.table[1,2]
  count11 = test.table[2,2]

  # only test when both somatic-negative/germline-negative and somatic-only cells are populated
  if (count00 > 0 && count01 > 0){
    f.test = fisher.test(test.table)
    OR = unname(f.test$estimate)
    p = f.test$p.value
  }

  return(list("p"=p, "OR"=OR, "count00" = count00
              , "count10" = count10, "count01" = count01, "count11" = count11))
}
33 | 
### get input data and files ###

### germline ###
# limit analyses to germline genes with equal to or more than 10 variants
germ_gene_counts = table(pathVarP$HUGO_Symbol)
germ_list = names(germ_gene_counts[germ_gene_counts > 9])

### germline somatic interaction ###
# samples being tested
all_test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode)

# run through function
out_columns = c("Cancer","GermlineGene", "SomaticGene", "germline0somatic0", "germline1somatic0", "germline0somatic1", "germline1somatic1","OR", "P")
out_rows = list() # collect rows in a list and bind once, instead of growing a matrix
for (cancer in unique(pathVarP$cancer)){
  # restrict the cohort and both variant tables to the current cancer type
  test_samples = intersect(all_test_samples, clin$bcr_patient_barcode[clin$type==cancer])
  cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n")

  pathVarPcancer =pathVarP[pathVarP$cancer==cancer,]
  somatic_likelyfunctional_driver_cancer = somatic_likelyfunctional_driver[somatic_likelyfunctional_driver$cancer==cancer,]

  for (g_gene in germ_list){
    # carriers depend only on the germline gene, so compute them once per gene
    germline_carriers = unique(pathVarPcancer$bcr_patient_barcode[pathVarPcancer$HUGO_Symbol==g_gene])
    if (length(germline_carriers) <= 4) next

    for (s_gene in somaticDriver299){
      somatic_carriers = unique(somatic_likelyfunctional_driver_cancer$bcr_patient_barcode[somatic_likelyfunctional_driver_cancer$Hugo_Symbol==s_gene])
      if (length(somatic_carriers) <= 9) next

      t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers)
      # skip pairs for which no test was run; a NULL/NA result would otherwise
      # yield a short row that rbind silently recycles into the table
      if (is.null(t_result) || is.na(t_result$p)) next

      out_rows[[length(out_rows) + 1]] = c(cancer, g_gene, s_gene, t_result$count00, t_result$count10,
                                           t_result$count01, t_result$count11, t_result$OR, t_result$p)
    }
  }
}
if (length(out_rows) > 0){
  out_table = do.call(rbind, out_rows)
} else {
  # no testable pair: still write a header-only table instead of failing on colnames<-
  out_table = matrix(character(0), nrow=0, ncol=length(out_columns))
}
rownames(out_table)=NULL
colnames(out_table) = out_columns
out_table = data.frame(out_table)
out_table[,"P"] = as.numeric(as.character(out_table[,"P"]))

FDR = p.adjust(out_table[,"P"],method="fdr")
out_table = cbind(out_table,FDR)
out_table = out_table[order(out_table[,"P"]),]

fn = paste("out/germline_somatic_driver_fisher_byCancer.tsv")
write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t")


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline/somatic_germline_fisher.R:
--------------------------------------------------------------------------------
 1 | ##### somatic_germline_fisher.R #####
 2 | # Kuan-lin Huang @ WashU 2016 June
 3 | # updated 2018
# Conduct Fisher's exact test to find co-occurrence/mutual exclusivity of germline/somatic variants
 5 | 
 6 | ### dependencies ###
 7 | #setwd("/Users/khuang/Box\ Sync/PhD/germline/PanCanAtlasGermline/analysis/somatic_germline")
 8 | source("../global_aes_out.R")
 9 | source("../dependency_files.R")
10 | 
11 | ### function ###
# Fisher's exact test for co-occurrence / mutual exclusivity between germline
# and somatic carrier status within a cohort.
#
# Args:
#   all_samples:       vector of sample IDs making up the tested cohort
#   germline_carriers: IDs of samples carrying the germline variant(s)
#   somatic_carriers:  IDs of samples carrying the somatic mutation(s)
#
# Returns:
#   A list with p, OR and the 2x2 cell counts count00, count10, count01, count11,
#   where the first digit is germline status and the second is somatic status
#   (e.g. count10 = germline carrier without the somatic mutation).
#   The list is always returned; p and OR are NA when the test is not run
#   (no sample without either event, or no somatic-only sample), so callers
#   never receive NULL.
sg_fisher_test = function(all_samples, germline_carriers, somatic_carriers){
  p = NA; OR = NA

  has_germline = all_samples %in% germline_carriers
  has_somatic = all_samples %in% somatic_carriers

  # filled column-wise below: [1,1]=00, [2,1]=10, [1,2]=01, [2,2]=11
  fisher_elements = c(sum(!has_germline & !has_somatic), sum(has_germline & !has_somatic),
                      sum(!has_germline & has_somatic), sum(has_germline & has_somatic))
  test.table = matrix(as.numeric(fisher_elements), nrow=2)

  count00 = test.table[1,1]
  count10 = test.table[2,1]
  count01 = test.table[1,2]
  count11 = test.table[2,2]

  # only test when both somatic-negative/germline-negative and somatic-only cells are populated
  if (count00 > 0 && count01 > 0){
    f.test = fisher.test(test.table)
    OR = unname(f.test$estimate)
    p = f.test$p.value
  }

  return(list("p"=p, "OR"=OR, "count00" = count00
              , "count10" = count10, "count01" = count01, "count11" = count11))
}
31 | 
### get input data and files ###

### germline ###
# germline genes with more than 9 entries in pathVarP
germ_gene_counts = table(pathVarP$HUGO_Symbol)
germ_list = names(germ_gene_counts[germ_gene_counts > 9])

### somatic ###
# somatic gene list
somaticDriver299_f = "../../TCGA_data/somatic/Driver_BaileyCell2018/299driverGene.txt"
somaticDriver299 = as.vector(t(read.table(header=F, quote = "", sep="\t", file = somaticDriver299_f, stringsAsFactors=FALSE)))
somatic_list = somaticDriver299

# mutations
somatic_f = "../../TCGA_data/somatic/mc3.v0.2.8.PUBLIC.maf.gene_vclass_HGVSp_sample.gz"
somatic = read.table(header=T, quote = "", sep="\t", file = gzfile(somatic_f), stringsAsFactors=FALSE)
# reduce the tumor sample barcode to its leading TCGA-XX-XXXX prefix
somatic$bcr_patient_barcode = gsub("(^TCGA-[A-Z0-9][A-Z0-9]-[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9])-.*","\\1",somatic$Tumor_Sample_Barcode)
somatic_mut_count = data.frame(table(somatic$bcr_patient_barcode))
colnames(somatic_mut_count) = c("bcr_patient_barcode","MutationCount")

table(somatic$Variant_Classification)
likelyFunctionalTypes = c("Frame_Shift_Del","Frame_Shift_Ins","In_Frame_Del","In_Frame_Ins","Missense_Mutation",
                          "Nonsense_Mutation","Splice_Site","Translation_Start_Site")
somatic_likelyfunctional = somatic[somatic$Hugo_Symbol %in% c(pathVar$HUGO_Symbol,somaticDriver299) & somatic$Variant_Classification %in% likelyFunctionalTypes,]



### germline somatic interaction ###
# samples being tested
test_samples = intersect(unique(somatic$bcr_patient_barcode), clin$bcr_patient_barcode)
cat("Running germline-somatic interaction in ",length(test_samples),"samples.\n")

# run through function
out_columns = c("GermlineGene", "SomaticGene", "germline0somatic0", "germline1somatic0", "germline0somatic1", "germline1somatic1","OR", "P")
out_rows = list() # collect rows in a list and bind once, instead of growing a matrix
for (g_gene in germ_list){
  # carriers depend only on the germline gene, so compute them once per gene
  germline_carriers = unique(pathVarP$bcr_patient_barcode[pathVarP$HUGO_Symbol==g_gene])
  # count10 + count11 equals the number of tested samples carrying the germline
  # variant, so the >= 8 reporting filter can be applied before any test is run
  if (sum(test_samples %in% germline_carriers) < 8) next

  for (s_gene in somatic_list){
    somatic_carriers = unique(somatic_likelyfunctional$bcr_patient_barcode[somatic_likelyfunctional$Hugo_Symbol==s_gene])
    t_result = sg_fisher_test(all_samples = test_samples, germline_carriers, somatic_carriers)
    # skip pairs for which no test was run; a NULL result previously made
    # `if(count10 + count11 >= 8)` fail with "argument is of length zero"
    if (is.null(t_result) || is.na(t_result$p)) next

    out_rows[[length(out_rows) + 1]] = c(g_gene, s_gene, t_result$count00, t_result$count10,
                                         t_result$count01, t_result$count11, t_result$OR, t_result$p)
  }
}
if (length(out_rows) > 0){
  out_table = do.call(rbind, out_rows)
} else {
  # no testable pair: still write a header-only table instead of failing on colnames<-
  out_table = matrix(character(0), nrow=0, ncol=length(out_columns))
}
rownames(out_table)=NULL
colnames(out_table) = out_columns
out_table = data.frame(out_table)
out_table[,"P"] = as.numeric(as.character(out_table[,"P"]))

FDR = p.adjust(out_table[,"P"],method="fdr")
out_table = cbind(out_table,FDR)
out_table = out_table[order(out_table[,"P"]),]

fn = paste("out/germline_all_somatic_fisher.tsv")
write.table(out_table,file=fn,quote=FALSE,row.names=FALSE,sep="\t")


--------------------------------------------------------------------------------
/germline_somatic_analysis/somatic_germline_overlap/somatic_germline_overlap_genes.R:
--------------------------------------------------------------------------------
##### somatic_germline_overlap_genes.R #####
 2 | # Kuan-lin Huang @ WashU 2018
 3 | # Find overlap of genes/variants for somatic/germline variants
 4 | 
 5 | ### dependencies ###
 6 | source("../global_aes_out.R")
 7 | source("../dependency_files.R")
 8 | source("../load_somatic.R")
 9 | 
# Per-gene tallies: rows of the somatic driver table vs rows of the germline table (pathVarP)
somatic_gene_count = setNames(data.frame(table(somatic_likelyfunctional_driver$Hugo_Symbol)),
                              c("Gene","PredictedFunctionalSomaticMutationCount"))
germline_gene_count = setNames(data.frame(table(pathVarP$HUGO_Symbol)),
                               c("Gene","PathogenicGermlineVariantCount"))
# outer join so genes seen on only one side are kept, with the missing count set to 0
gene_count = merge(somatic_gene_count,germline_gene_count,by="Gene",all=T)
gene_count[is.na(gene_count)] = 0

# genes to label: very frequent on either axis, or moderately frequent on both
highlight_g = with(gene_count, as.character(Gene[PredictedFunctionalSomaticMutationCount > 400 |
                                                  PathogenicGermlineVariantCount > 20 |
                                                  (PredictedFunctionalSomaticMutationCount > 140 & PathogenicGermlineVariantCount > 3)]))
# TSG is assigned last, so it takes precedence for genes present in both lists
gene_count$GeneClass = "Others"
gene_count$GeneClass[gene_count$Gene %in% all_oncogenes] = "Oncogene"
gene_count$GeneClass[gene_count$Gene %in% all_TSGs] = "TSG"

p = ggplot(gene_count,aes(y=PredictedFunctionalSomaticMutationCount, x =PathogenicGermlineVariantCount, color = GeneClass)) +
  geom_point(stroke=0,alpha = 0.2) +
  theme_bw() + #+ guides(color=FALSE)
  #geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5)
  geom_text_repel(aes(label=ifelse(as.character(Gene) %in% highlight_g,as.character(Gene), NA))) +
  theme(axis.title = element_text(size=16),
        axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5),
        axis.text.y = element_text(colour="black", size=14)) +
  #scale_x_log10() + scale_y_log10()
  expand_limits(x = 0,y=0) +
  ylim(0,1100)
#p = p + coord_equal() + getLOHColorScale()
p
fn = "out/somatic_vs_germline_var_counts_by_gene.pdf"
ggsave(file=fn, width=5, h =5, useDingbats=FALSE)


# by cancer: the same tallies split by gene and cancer type
somatic_cancer_gene_count = setNames(data.frame(table(somatic_likelyfunctional_driver$Hugo_Symbol, somatic_likelyfunctional_driver$cancer)),
                                     c("Gene","Cancer","PredictedFunctionalSomaticMutationCount"))
germline_cancer_gene_count = setNames(data.frame(table(pathVarP$HUGO_Symbol,pathVarP$cancer)),
                                      c("Gene","Cancer","PathogenicGermlineVariantCount"))
cancer_gene_count = merge(somatic_cancer_gene_count,germline_cancer_gene_count,by=c("Gene","Cancer"),all=T)
cancer_gene_count[is.na(cancer_gene_count)] = 0

# a gene is labelled in every facet once it passes the thresholds in any cancer type
highlight_g = with(cancer_gene_count, as.character(Gene[PredictedFunctionalSomaticMutationCount > 400 |
                                                         PathogenicGermlineVariantCount > 20 |
                                                         (PredictedFunctionalSomaticMutationCount > 140 & PathogenicGermlineVariantCount > 3)]))
cancer_gene_count$GeneClass = "Others"
cancer_gene_count$GeneClass[cancer_gene_count$Gene %in% all_oncogenes] = "Oncogene"
cancer_gene_count$GeneClass[cancer_gene_count$Gene %in% all_TSGs] = "TSG"

p = ggplot(cancer_gene_count,aes(y=PredictedFunctionalSomaticMutationCount, x =PathogenicGermlineVariantCount, color = GeneClass)) +
  facet_wrap(~Cancer) +
  geom_point(stroke=0,alpha = 0.2) +
  theme_bw() + #+ guides(color=FALSE)
  #geom_abline(intercept = 0, slope=1, alpha=0.2) #+ geom_density2d(alpha=0.5)
  geom_text_repel(aes(label=ifelse(as.character(Gene) %in% highlight_g,as.character(Gene), NA)),size=1) +
  theme(axis.title = element_text(size=16),
        axis.text.x = element_text(colour="black", size=14, angle=90,vjust=0.5),
        axis.text.y = element_text(colour="black", size=14)) +
  scale_x_log10() +
  scale_y_log10() +
  expand_limits(x = 0,y=0) #+ ylim(0,1100)
#p = p + coord_equal() + getLOHColorScale()
p
fn = "out/somatic_vs_germline_var_counts_by_gene_by_cancer.pdf"
ggsave(file=fn, width=10, h =10, useDingbats=FALSE)
61 | 


--------------------------------------------------------------------------------
/util/edit_vcf_samplenames/list_vcf_source-sample_pairs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # (rjm), July 2017
 4 | # Syntax:   uncompressed vcf |  $thisScript
 5 | 
 6 | 
 7 | use strict;
 8 | use warnings;
 9 | 
# One record per vcf-merge input file: { inputfile => ..., samplename => ... }
my @myList = ();

# Scan the VCF header only; stop at the first line that does not start with '#'.
while (my $line = <STDIN>) {
    chomp $line;
    last unless $line =~ /^#/;

    # get list of input filenames given to merge:
    # every whitespace-separated token after the first is recorded as an input file
    if ($line =~ /vcf-merge/) {
        my @tokens = split /\s+/, $line;
        foreach my $idx (1 .. $#tokens) {
            push @myList, { 'inputfile' => $tokens[$idx], 'samplename' => "" };
        }
    }

    # column header: fields from index 9 onward are paired positionally with the input files
    if ($line =~ /^#CHROM/) {
        my @fields = split /\t/, $line;
        foreach my $col (9 .. $#fields) {
            $myList[ $col - 9 ]{'samplename'} = $fields[$col];
        }
    }
}

# Emit: index <TAB> input file <TAB> sample name
foreach my $j (0 .. $#myList) {
    my $pair = $myList[$j];
    print $j, "\t", $pair->{'inputfile'}, "\t", $pair->{'samplename'}, "\n";
}
39 | 


--------------------------------------------------------------------------------
/util/edit_vcf_samplenames/replace_vcf_header_sample_with_source.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Jay Mashl, July 2017
 4 | # Syntax:   uncompressed vcf |  $thisScript
 5 | 
 6 | 
 7 | use strict;
 8 | use warnings;
 9 | 
# One record per vcf-merge input file: { inputfile => ..., samplename => ... }
my @myList = ();

# Copy the VCF header to STDOUT, rewriting the sample columns of the #CHROM line.
# Reading stops at the first line that does not start with '#'.
while (my $line = <STDIN>) {
    chomp $line;
    last unless $line =~ /^#/;

    # get list of input filenames given to merge:
    # every whitespace-separated token after the first is recorded as an input file
    if ($line =~ /vcf-merge/) {
        my @tokens = split /\s+/, $line;
        for my $i (1 .. $#tokens) {
            push @myList, { 'inputfile' => $tokens[$i], 'samplename' => "" };
        }
    }

    if ($line =~ /^#CHROM/) {
        my @cols = split /\t/, $line;
        for my $i (9 .. $#cols) {
            my $entry = $myList[ $i - 9 ];

            # More sample columns than recorded input files (or no vcf-merge line seen):
            # keep the original sample name rather than emitting an empty one.
            unless (defined $entry && defined $entry->{'inputfile'}) {
                warn "No vcf-merge input file for sample column $i ($cols[$i]); keeping original name\n";
                next;
            }

            # in this application, extract unique identifier from first field
            my @b = split /\./, $entry->{'inputfile'};
            $cols[$i] = $b[0] if defined $b[0] && length $b[0];
        }
        print join("\t", @cols), "\n";
    } else {
        print $line, "\n";
    }
}
46 | 
47 | #for(my $j=0 ; $j < scalar @myList; $j++) {
48 | #    print $j,"\t", $myList[$j]{'inputfile'},"\t", $myList[$j]{'samplename'}, "\n";
49 | #}
50 | 


--------------------------------------------------------------------------------
/util/edit_vcf_samplenames/uniqueify_merged_samplenames.driver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Jay Mashl, July 2017
 4 | 
 5 | for ct in ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LAML LGG LIHC LUAD LUSC MESO OV PAAD PCA PCPG PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM  ; do
 6 |     echo '------'
 7 |     echo $ct
 8 |     echo '------'
 9 |     gsutil -m cp gs://dinglab/isb-cgc/tcga/germline/production/merge/$ct.merge.vcf.gz  gs://dinglab/isb-cgc/tcga/germline/production/merge/$ct.merge.vcf.gz.tbi .
10 |     gunzip -dc $ct.merge.vcf.gz | ./replace_vcf_header_sample_with_source.pl   > $ct.merge.newheader.txt
11 |     tabix -r $ct.merge.newheader.txt   $ct.merge.vcf.gz   >  $ct.merge.fixedHeader.vcf.gz
12 |     tabix -p vcf $ct.merge.fixedHeader.vcf.gz
13 | 
14 |     rm -f $ct.merge.vcf.gz $ct.merge.vcf.gz.tbi
15 | 
16 | done
17 | 
18 |     
19 |     
20 | 


--------------------------------------------------------------------------------
/util/edit_vcf_samplenames/uniqueify_merged_samplenames.template.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Jay Mashl, July 2017
 3 | 
 4 | SOURCE=variants.vcf.gz
 5 | DEST=${SOURCE/variants/variants.fixedHeader}
 6 | 
 7 | gunzip -dc $SOURCE  | ./replace_vcf_header_sample_with_source.pl   > newheader.txt
 8 | tabix -r newheader.txt   $SOURCE  >  $DEST
 9 | tabix -p vcf $DEST
10 | 


--------------------------------------------------------------------------------