├── LICENSE ├── README.md ├── data_and_refs ├── SV_colors.txt └── gnomAD_population_colors.txt ├── gnomad_sv_analysis_scripts ├── .DS_Store ├── apply_minGQ_filter.py ├── clean_frequencies_table.R ├── correlate_batches_singlePop.R ├── count_mendelian_violations.py ├── create_minGQ_lookup_table.R ├── create_minGQ_tranches_table.R ├── determine_lowQual_site_categories.R ├── determine_svcount_outliers.R ├── filter_cleanup_and_QUAL_recalibration.py ├── find_batch_effects.R ├── find_batch_effects.shard_helper.R ├── gather_trio_MVR_data.py ├── gather_trio_genos.py ├── helper_median_counts_per_trio.R ├── label_batch_effects.py ├── make_batch_effect_reclassification_table.R ├── make_batch_pairs_list.R ├── merge_batch_freq_tables.R ├── merge_filter_columns.py ├── optimize_GQ_ROC.R ├── optimize_minGQ_ROC_v2.R ├── parse_KING_results.R ├── prePCA_vcf_filter.py ├── runPCA_labelAncestries.R ├── subset_minGQ_trio_data.R └── sum_svcounts_perSample.R ├── gnomad_sv_analysis_wdls ├── MVR_collection_helper.wdl ├── apply_GQ_filter.wdl ├── assign_lowQuality_sites.wdl ├── check_batch_effects.wdl ├── compute_simple_AFs_singleChrom.wdl ├── filter_cleanup_and_QUAL_recalibration.wdl ├── final_outlier_sample_filter.wdl ├── gather_batch_effects_helper.wdl ├── label_ancestries_and_relatedness.wdl ├── minGQ_ROC_helper.wdl ├── minGQ_filter_procedure_v2.wdl ├── minGQ_filter_procedure_wrapper.wdl ├── optimize_GQ_filter.wdl ├── prune_and_add_vafs.wdl └── sharded_vcf2bed.wdl ├── gnomad_sv_manuscript_code ├── .DS_Store ├── metadata_generation │ ├── AF_reconcilliation_helper.R │ ├── collect_sample_level_summary_data.R │ ├── merge_downsample_sv_per_gene.R │ ├── seed_downsampling.R │ └── simplify_downsample_results.R └── plotting_code │ ├── chromosome_sv_density_analysis.R │ ├── gene_level_analysis.R │ ├── high_low_quality_callset_comparisons.plot.R │ ├── noncoding_analysis.plot.R │ ├── plot_PCAs.R │ ├── plot_deNovo_rate_analysis.R │ ├── plot_downsampling_analyses.R │ ├── plot_sample_level_analysis.R │ ├── secondary_prop_singletons_analysis.R │ ├── snv_sv_ld_analyses.plot.R │ ├── thousand_genomes_comparisons.plot.R │ ├── ukbb_gd_analysis.plot.R │ └── vcf_wide_site_summaries.plot.R ├── gnomad_sv_pipeline_scripts ├── .DS_Store ├── module_00 │ ├── .DS_Store │ ├── vcf2baf.sh │ └── vcf2baf_helper.py └── module_01 │ ├── make_depth_rdtest_bed.py │ └── make_pesr_rdtest_bed.py └── gnomad_sv_pipeline_wdls ├── .DS_Store ├── module_00 ├── 00_batch_BAF_merging.wdl ├── 00_batch_PESRRD_merging.wdl ├── 00_batch_SR_merging.wdl ├── 00_batch_evidence_merging.wdl ├── 00_depth_preprocessing.wdl ├── 00_pesr_preprocessing.wdl └── 00_pesr_processing_single_algorithm.wdl ├── module_01 ├── 01_depth_clustering.wdl ├── 01_depth_clustering_by_chrom.wdl ├── 01_pesr_clustering.wdl └── 01_pesr_clustering_single_algorithm.wdl ├── module_02 ├── 02_aggregate.wdl ├── 02_assess_evidence_single_vcf.wdl ├── 02_baftest.wdl ├── 02_baftest_autosome.wdl ├── 02_petest.wdl ├── 02_petest_allosome.wdl ├── 02_petest_autosome.wdl ├── 02_rdtest.wdl ├── 02_rdtest_allosome.wdl ├── 02_rdtest_autosome.wdl ├── 02_srtest.wdl ├── 02_srtest_allosome.wdl └── 02_srtest_autosome.wdl ├── module_03 ├── 03_filter_outliers.wdl ├── 03_filter_vcf.wdl └── 03_variant_filtering.wdl ├── module_04 ├── 04_preprocess.wdl ├── 04_v2_PE_genotyping_train.wdl ├── 04_v2_RD_genotyping_train.wdl ├── 04_v2_SR_genotyping_train.wdl ├── 04_v2_genotype_batch.wdl ├── 04_v2_genotype_depth_part1.wdl ├── 04_v2_genotype_depth_part2.wdl ├── 04_v2_genotype_pesr_part1.wdl ├── 04_v2_genotype_pesr_part2.wdl └── 
04_v2_make_cohort_VCFs.wdl ├── module_05 ├── 04_bp_overlap_filter_by_chrom.wdl ├── 04_genotype_CPX_CNVs.wdl ├── 04_genotype_CPX_CNVs_perBatch.wdl ├── 04_integrate_batches.wdl ├── 04_integrate_resolved_vcfs.wdl ├── 04_merge_allvar_invonly_vcfs.wdl ├── 04_pesr_depth_overlap.wdl ├── 04_resolve_complex_by_chrom.wdl ├── 04_resolve_complex_sv.wdl ├── 04_sharded_vcfcluster.wdl ├── 04b_batch_integration.wdl ├── 04b_genotype_CPX_CNVs.wdl ├── 04b_preprocess.wdl ├── 04b_resolve_complex_sv.wdl ├── 04b_scatter_CPX_genotyping.wdl └── 04b_vcfcluster_single_chrom.wdl ├── module_06 ├── 05_cleanVCF.wdl ├── 05_cleanVCF_part2.wdl ├── 05_cleanVCF_part4.wdl └── 05_cleanVCF_scatter.wdl └── module_07 ├── 06_annotate.wdl └── 06_annotate_per_chrom.wdl /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Talkowski Laboratory 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data_and_refs/SV_colors.txt: -------------------------------------------------------------------------------- 1 | DEL #D43925 2 | DUP #2376B2 3 | MCNV #7459B2 4 | INS #D474E0 5 | INV #FA931E 6 | CPX #71E38C 7 | OTH #397246 8 | -------------------------------------------------------------------------------- /data_and_refs/gnomAD_population_colors.txt: -------------------------------------------------------------------------------- 1 | pop color name 2 | AFR #941594 "African/African-American" 3 | AMR #EE1D24 Latino 4 | EAS #108C43 "East Asian" 5 | EUR #69A5CC European 6 | OTH #ABB9B9 "Other/Unk." 7 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_analysis_scripts/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/clean_frequencies_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Clean allele frequency table across batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | svtypes <- c("DEL","DUP","INS","INV","CPX","BND") 15 | allpops <- c("AFR","ASN","EUR","HSP") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Collapse multiallelic ACs 22 | clean.ACs <- function(ACs){ 23 | ACs.to.clean <- grep(",",ACs,fixed=T) 24 | cleaned.ACs <- as.vector(as.numeric(sapply(ACs[ACs.to.clean],function(s){ 25 | return(sum(as.numeric(unlist(strsplit(s,split=",")))[-2])) 26 | }))) 27 | ACs[ACs.to.clean] <- cleaned.ACs 28 | return(as.numeric(ACs)) 29 | } 30 | #Import a single table of freq data 31 | import.freq.table <- function(path,pops){ 32 | #Read data 33 | dat <- read.table(path,header=T,comment.char="",sep="\t") 34 | colnames(dat)[1:3] <- c("VID","SVLEN","SVTYPE") 35 | #Clean multiallelic ACs 36 | dat[,grep("_AC",colnames(dat),fixed=T)] <- apply(dat[,grep("_AC",colnames(dat),fixed=T)],2,clean.ACs) 37 | #Convert all ANs to numerics 38 | dat[,grep("_AN",colnames(dat),fixed=T)] <- apply(dat[,grep("_AN",colnames(dat),fixed=T)],2,as.numeric) 39 | #Double counts for MCNVs (since first allele for all MCNVs is nulled out) 40 | MCNV.idx <- grep("_MCNV_",dat[,1],fixed=T) 41 | dat[MCNV.idx,grep("_AC",colnames(dat),fixed=T)] <- 2*dat[MCNV.idx,grep("_AC",colnames(dat),fixed=T)] 42 | dat[MCNV.idx,grep("_AN",colnames(dat),fixed=T)] <- 2*dat[MCNV.idx,grep("_AN",colnames(dat),fixed=T)] 43 | #Adjust calls on sex chromosomes 44 | for(pop in pops){ 45 | x.idx <- unique(c(grep("_X_",dat[,1],fixed=T), 46 | grep("_chrX_",dat[,1],fixed=T))) 47 | y.idx <- unique(c(grep("_Y_",dat[,1],fixed=T), 48 | grep("_chrY_",dat[,1],fixed=T))) 49 | master.AC.idx <- which(colnames(dat)==paste(pop,"AC",sep="_")) 50 | master.AN.idx <- which(colnames(dat)==paste(pop,"AN",sep="_")) 51 | male.AC.idx <- which(colnames(dat)==paste(pop,"MALE","AC",sep="_")) 52 | male.AN.idx <- which(colnames(dat)==paste(pop,"MALE","AN",sep="_")) 53 | female.AC.idx <- which(colnames(dat)==paste(pop,"FEMALE","AC",sep="_")) 54 | female.AN.idx <- which(colnames(dat)==paste(pop,"FEMALE","AN",sep="_")) 55 | #For variants on chrX, overwrite master counts with female-specific counts 56 | dat[x.idx,master.AC.idx] <- dat[x.idx,female.AC.idx] 57 | dat[x.idx,master.AN.idx] <- dat[x.idx,female.AN.idx] 58 | #For all variants on chrY, overwrite master counts with male-specific counts 59 | dat[y.idx,master.AC.idx] <- dat[y.idx,male.AC.idx] 60 | dat[y.idx,master.AN.idx] <- dat[y.idx,male.AN.idx] 61 | } 62 | #Drop all male- and female-specific columns from table 63 | dat <- dat[,-grep("MALE",colnames(dat),fixed=T)] 64 | #Add batch name to colnames 65 | return(dat) 66 | } 67 | 68 | 69 | ###Read command-line arguments 70 | args <- commandArgs(trailingOnly=T) 71 | INFILE <- as.character(args[1]) 72 | OUTFILE <- as.character(args[2]) 73 | 74 | # #Dev parameters (local) 75 | # INFILE <- "~/scratch/gnomAD_v2_SV_PCRPLUS_Q1_batch_1.frequencies.preclean.txt.gz" 76 | 77 | 78 | ###Process input data 79 | dat <- import.freq.table(path=INFILE,pops=allpops) 80 | 81 | 82 | ###Write output data 83 | write.table(dat,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 84 | system(paste("gzip -f ",OUTFILE,sep=""),wait=T,intern=F) 85 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/find_batch_effects.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | allpops <- c("AFR","ASN","EUR","HSP") 15 | svtypes <- c("DEL","DUP","MCNV","INS","INV","CPX") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Allele frequency correlation plot between datasets 22 | plot.AFcorr <- function(dat,batch1=NULL,batch2=NULL,title=NULL,axlims){ 23 | #Prepare plot 24 | AF.pairs <- data.frame(dat$b1.AF,dat$b2.AF) 25 | #Artificially assign non-zero AFs for sites that appear in 0 samples 26 | AF.pairs[which(AF.pairs[,1]==0),1] <- 1/900 27 | AF.pairs[which(AF.pairs[,2]==0),2] <- 1/900 28 | logscale.all <- log10(as.numeric(unlist(sapply(c(0:9),function(i){(1:9)*(10^i)})))) 29 | logscale.major <- 0:9 30 | major.labels <- sapply(logscale.major,function(i){expression(paste(i^"th"))}) 31 | par(mar=c(3.2,1.5,1.5,3.2)) 32 | plot(x=log10(c(min(AF.pairs[,1],na.rm=T),1)), 33 | y=log10(c(min(AF.pairs[,2],na.rm=T),1)), 34 | type="n",xaxt="n",yaxt="n",xlab="",ylab="", 35 | xlim=log10(axlims),ylim=log10(axlims)) 36 | axis(1,at=-logscale.all,labels=NA,tck=-0.015,lwd=0.7) 37 | axis(1,at=-logscale.major,labels=NA,tck=-0.03,lwd=1.1) 38 | mtext(1,text=bquote("log"[10] ~ "("*.(batch1) ~ "AF)"),line=2) 39 | axis(4,at=-logscale.all,labels=NA,tck=-0.015,lwd=0.7) 40 | axis(4,at=-logscale.major,labels=NA,tck=-0.03,lwd=1.1) 41 | mtext(4,text=bquote("log"[10] ~ "("*.(batch2) ~ "AF)"),line=2) 42 | sapply(-logscale.major,function(i){ 43 | # axis(1,at=i,labels=bquote('10'^.(i)),tick=F,line=-0.6,cex.axis=0.8) 44 | # axis(4,at=i,labels=bquote('10'^.(i)),tick=F,line=-0.4,cex.axis=0.8,las=2) 45 | axis(1,at=i,labels=i,tick=F,line=-0.6,cex.axis=0.8) 46 | axis(4,at=i,labels=i,tick=F,line=-0.4,cex.axis=0.8,las=2) 47 | }) 48 | abline(h=log10(1/900),v=log10(1/900),lty=3) 49 | axis(1,at=log10(1/900),labels="AC=0",line=-0.8,cex.axis=0.7,tick=F) 50 | axis(4,at=log10(1/900),labels="AC=0",line=-0.6,cex.axis=0.7,tick=F,las=2) 51 | mtext(3,text=title,line=0.1,font=2) 52 | 53 | #Add points 54 | pt.cex <- 0.4 55 | alpha <- 0.25 56 | points(x=log10(AF.pairs[which(dat$bonf.p>=0.05),1]), 57 | y=log10(AF.pairs[which(dat$bonf.p>=0.05),2]), 58 | pch=19,cex=pt.cex,lwd=0, 59 | col=adjustcolor("gray50",alpha=alpha)) 60 | points(x=log10(AF.pairs[which(dat$bonf.p<0.05),1]), 61 | y=log10(AF.pairs[which(dat$bonf.p<0.05),2]), 62 | pch=19,cex=pt.cex,lwd=0, 63 | col=adjustcolor("red",alpha=alpha)) 64 | 65 | #Add stats 66 | abline(lm(AF.pairs[,1] ~ AF.pairs[,2]),col="gray10",lty=2) 67 | AB.cor <- format(round(cor(AF.pairs[,1],AF.pairs[,2])^2,3),nsmall=3) 68 | text(x=par("usr")[1],y=par("usr")[4]-(0.085*(par("usr")[4]-par("usr")[3])), 69 | labels=bquote(italic(R)^2 == .(AB.cor)),cex=1.4,pos=4) 70 | } 71 | 72 | 73 | ###Read command-line arguments 74 | args <- commandArgs(trailingOnly=T) 75 | infile <- as.character(args[1]) 76 | batch1 <- as.character(args[2]) 77 | batch2 <- as.character(args[3]) 78 | OUTPREFIX <- as.character(args[4]) 79 | 80 | # #Dev parameters: 81 | # infile <- "~/scratch/gnomAD_AF_table.merged.txt.gz" 82 | # infile <- 
"~/scratch/gnomAD_v2_SV_MASTER.gnomAD_v2_SV_PCRPLUS_Q1_batch_1_vs_gnomAD_v2_SV_PCRMINUS_Q2_batch_1.AF_comparison_table.txt.gz" 83 | # batch1 <- "gnomAD_v2_SV_PCRPLUS_Q1_batch_1" 84 | # batch2 <- "gnomAD_v2_SV_PCRMINUS_Q2_batch_1" 85 | # OUTPREFIX <- "~/scratch/gnomAD_v2_SV_MASTER" 86 | 87 | 88 | ###Process input data 89 | dat <- read.table(infile,header=T,sep="\t",comment.char="") 90 | dat$bonf.p <- p.adjust(dat$chisq.p,method="bonferroni") 91 | write.table(dat,paste(OUTPREFIX,".",batch1,"_vs_",batch2,".freq_table_wBonferroni.txt",sep=""), 92 | col.names=T,row.names=F,sep="\t",quote=F) 93 | 94 | 95 | ###Write list of significant batch effect variants 96 | bad.vars <- dat$VID[which(dat$bonf.p<0.05)] 97 | write.table(bad.vars,paste(OUTPREFIX,".",batch1,"_vs_",batch2,".batch_effect_variants.txt",sep=""), 98 | col.names=F,row.names=F,sep="\t",quote=F) 99 | 100 | 101 | ###Plot AF correlations, one per SVTYPE 102 | axlims <- c(1/900,1) 103 | png(paste(OUTPREFIX,".",batch1,"_vs_",batch2,".AF_correlation_scatterplot.ALL.png",sep=""), 104 | height=6*300,width=6*300,res=400) 105 | plot.AFcorr(dat=dat,batch1=batch1,batch2=batch2,title="All SV",axlims=axlims) 106 | dev.off() 107 | sapply(svtypes,function(svtype){ 108 | subdat <- dat[grep(svtype,dat$VID,fixed=T),] 109 | if(nrow(subdat)>0){ 110 | png(paste(OUTPREFIX,".",batch1,"_vs_",batch2,".AF_correlation_scatterplot.",svtype,".png",sep=""), 111 | height=6*300,width=6*300,res=400) 112 | plot.AFcorr(dat=subdat,batch1=batch1,batch2=batch2,title=svtype,axlims=axlims) 113 | dev.off() 114 | } 115 | }) 116 | 117 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/find_batch_effects.shard_helper.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | allpops <- c("AFR","ASN","EUR","HSP") 15 | 16 | 17 | ################### 18 | ###HELPER FUNCTIONS 19 | ################### 20 | #For any two batches, find most comparable AFs for each variant and run chi-sqared test 21 | compare.batches <- function(dat,batch1,batch2,min.AN=60){ 22 | #Subset data for each batch (for convenience) 23 | #1: restrict to sites with >0 AC in at least one batch 24 | b1.dat <- dat[,c(1:3,grep(batch1,colnames(dat),fixed=T))] 25 | b1.maxAC <- apply(b1.dat[,grep("_AC",colnames(b1.dat),fixed=T)],1,max) 26 | if(batch2 != "ALL_OTHERS"){ 27 | b2.dat <- dat[,c(1:3,grep(batch2,colnames(dat),fixed=T))] 28 | b2.maxAC <- apply(b2.dat[,grep("_AC",colnames(b2.dat),fixed=T)],1,max) 29 | }else{ 30 | b2.consolidated.dat <- do.call("cbind", lapply(allpops,function(pop){ 31 | ACs <- apply(dat[,setdiff(grep(paste(pop,"AC",sep="_"),colnames(dat),fixed=T), 32 | grep(batch1,colnames(dat),fixed=T))],1,sum,na.rm=T) 33 | ANs <- apply(dat[,setdiff(grep(paste(pop,"AN",sep="_"),colnames(dat),fixed=T), 34 | grep(batch1,colnames(dat),fixed=T))],1,sum,na.rm=T) 35 | dtmp <- data.frame(ANs,ACs) 36 | colnames(dtmp) <- c(paste(pop,"_AN.ALL_OTHERS",sep=""), 37 | paste(pop,"_AC.ALL_OTHERS",sep="")) 38 | return(dtmp) 39 | })) 40 | b2.dat <- cbind(dat[,1:3],b2.consolidated.dat) 41 | b2.maxAC <- apply(b2.dat[,grep("_AC",colnames(b2.dat),fixed=T)],1,max) 42 | } 43 | b1.dat <- b1.dat[which(b1.maxAC > 0 | b2.maxAC > 0),] 44 | b2.dat <- b2.dat[which(b1.maxAC > 0 | b2.maxAC > 0),] 45 | #Iterate over variants and process each 46 | res <- do.call("rbind", lapply(as.character(b1.dat$VID),function(VID){ 47 | #Find pop with largest min AN and at least one alternate allele between the two batches 48 | AN.bypop <- sapply(allpops,function(pop){ 49 | min(b1.dat[which(b1.dat$VID==VID), 50 | grep(paste(pop,"AN",sep="_"),colnames(b1.dat))], 51 | b2.dat[which(b2.dat$VID==VID), 52 | grep(paste(pop,"AN",sep="_"),colnames(b2.dat))], 53 | na.rm=T) 54 | }) 55 | AC.bypop <- sapply(allpops,function(pop){ 56 | max(b1.dat[which(b1.dat$VID==VID), 57 | grep(paste(pop,"AC",sep="_"),colnames(b1.dat))], 58 | b2.dat[which(b2.dat$VID==VID), 59 | grep(paste(pop,"AC",sep="_"),colnames(b2.dat))], 60 | na.rm=T) 61 | }) 62 | AN.bypop[which(AC.bypop<1)] <- 0 63 | #Only process if at least one pop has min AN > min.AN 64 | if(any(AN.bypop>min.AN)){ 65 | bestpop <- names(AN.bypop)[which(AN.bypop==max(AN.bypop,na.rm=T))] 66 | b1.AC <- as.numeric(b1.dat[which(b1.dat$VID==VID), 67 | grep(paste(bestpop,"AC",sep="_"),colnames(b1.dat),fixed=T)]) 68 | b1.AN <- as.numeric(b1.dat[which(b1.dat$VID==VID), 69 | grep(paste(bestpop,"AN",sep="_"),colnames(b1.dat),fixed=T)]) 70 | if(b1.AC>b1.AN){ 71 | b1.AC <- b1.AN 72 | } 73 | b1.AF <- b1.AC/b1.AN 74 | b2.AC <- as.numeric(b2.dat[which(b2.dat$VID==VID), 75 | grep(paste(bestpop,"AC",sep="_"),colnames(b2.dat),fixed=T)]) 76 | b2.AN <- as.numeric(b2.dat[which(b2.dat$VID==VID), 77 | grep(paste(bestpop,"AN",sep="_"),colnames(b2.dat),fixed=T)]) 78 | if(b2.AC>b2.AN){ 79 | b2.AC <- b2.AN 80 | } 81 | b2.AF <- b2.AC/b2.AN 82 | b1b2.p <- chisq.test(matrix(c(b1.AN-b1.AC,b1.AC, 83 | b2.AN-b2.AC,b2.AC), 84 | nrow=2,byrow=F))$p.value 85 | #Output row 86 | out.v <- data.frame("VID"=VID,"pop"=bestpop,"b1.AF"=b1.AF,"b2.AF"=b2.AF,"chisq.p"=b1b2.p) 87 | }else{ 88 | out.v <- 
data.frame("VID"=VID,"pop"=NA,"b1.AF"=NA,"b2.AF"=NA,"chisq.p"=NA) 89 | } 90 | return(out.v) 91 | })) 92 | rownames(res) <- NULL 93 | res <- res[which(!is.na(res$pop)),] 94 | # res$chisq.bonf <- p.adjust(res$chisq.p,method="bonferroni") 95 | res[,-c(1:2)] <- apply(res[,-(1:2)],2,as.numeric) 96 | return(res) 97 | } 98 | 99 | 100 | ###Read command-line arguments 101 | args <- commandArgs(trailingOnly=T) 102 | infile <- as.character(args[1]) 103 | batch1 <- as.character(args[2]) 104 | batch2 <- as.character(args[3]) 105 | OUTFILE <- as.character(args[4]) 106 | 107 | # #Dev parameters: 108 | # infile <- "~/scratch/gnomAD_v2_SV_MASTER.merged_AF_table.txt.gz" 109 | # batch1 <- "gnomAD_v2_SV_PCRMINUS_Q4_batch_4" 110 | # # batch2 <- "gnomAD_v2_SV_PCRMINUS_Q4_batch_5" 111 | # batch2 <- "ALL_OTHERS" 112 | 113 | ###Process data & write output 114 | dat <- read.table(infile,header=T,sep="\t",comment.char="") 115 | res <- compare.batches(dat=dat,batch1=batch1,batch2=batch2) 116 | write.table(res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 117 | 118 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/helper_median_counts_per_trio.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Helper script to count median # of filtered variants per trio 8 | # for minGQ optimization filtering workflow 9 | 10 | 11 | ###Set master parameters 12 | options(stringsAsFactors=F,scipen=1000) 13 | 14 | 15 | 16 | ################ 17 | ###RSCRIPT BLOCK 18 | ################ 19 | require(optparse) 20 | ###List of command-line options 21 | option_list <- list( 22 | make_option(c("--ID"), type="character", default="condition", 23 | help="condition ID [default %default]") 24 | ) 25 | 26 | ###Get command-line arguments & options 27 | args <- parse_args(OptionParser(usage="%prog INFILE FAMFILE OUTFILE", 28 | option_list=option_list), 29 | positional_arguments=TRUE) 30 | opts <- args$options 31 | 32 | ###Checks for appropriate positional arguments 33 | if(length(args$args) != 3){ 34 | stop("Incorrect number of required positional arguments\n") 35 | } 36 | 37 | ###Writes args & opts to vars 38 | INFILE <- args$args[1] 39 | FAMFILE <- args$args[2] 40 | OUTFILE <- args$args[3] 41 | ID <- opts$ID 42 | 43 | ###Reads data 44 | dat <- as.character(read.table(INFILE,header=F)[,1]) 45 | fams <- unique(as.character(read.table(FAMFILE,header=F)[,1])) 46 | 47 | ###Computes # of variants per family 48 | counts <- as.integer(sapply(fams,function(fam){ 49 | return(length(which(dat==fam))) 50 | })) 51 | 52 | ###Reports results 53 | out <- data.frame("condition"=ID, 54 | "hetsPerProband_median"=median(counts,na.rm=T), 55 | "hetsPerProband_Q1"=quantile(counts,probs=0.25,na.rm=T), 56 | "hetsPerProband_Q3"=quantile(counts,probs=0.75,na.rm=T)) 57 | colnames(out)[1] <- "#condition" 58 | write.table(out,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 59 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/make_batch_pairs_list.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | 15 | 16 | ###Read command-line arguments 17 | args <- commandArgs(trailingOnly=T) 18 | in.list <- as.character(args[1]) 19 | OUTFILE <- as.character(args[2]) 20 | 21 | # #Dev parameters 22 | # in.list <- "~/scratch/af_test/input.list" 23 | # OUTFILE <- "~/scratch/batch_pairs_test.txt" 24 | 25 | 26 | ###Process input data 27 | batches <- as.character(read.table(in.list,header=F,sep="\t")[,1]) 28 | 29 | 30 | ###Generate list of all possible pairs 31 | out <- do.call("rbind", lapply(1:length(batches),function(a){ 32 | do.call("rbind", lapply(2:length(batches),function(b){ 33 | if(a 5 | # Distributed under terms of the MIT license. 6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Merge AF tables across batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | svtypes <- c("DEL","DUP","INS","INV","CPX","BND") 15 | allpops <- c("AFR","ASN","EUR","HSP") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Collapse multiallelic ACs 22 | clean.ACs <- function(ACs){ 23 | ACs.to.clean <- grep(",",ACs,fixed=T) 24 | cleaned.ACs <- as.vector(as.numeric(sapply(ACs[ACs.to.clean],function(s){ 25 | return(sum(as.numeric(unlist(strsplit(s,split=",")))[-2])) 26 | }))) 27 | ACs[ACs.to.clean] <- cleaned.ACs 28 | return(as.numeric(ACs)) 29 | } 30 | #Import a single table of freq data 31 | import.freq.table <- function(batch,path){ 32 | #Read data 33 | dat <- read.table(path,header=T,comment.char="",sep="\t") 34 | #Clean multiallelic ACs 35 | dat[,grep("_AC",colnames(dat),fixed=T)] <- apply(dat[,grep("_AC",colnames(dat),fixed=T)],2,clean.ACs) 36 | #Convert all ANs to numerics 37 | dat[,grep("_AN",colnames(dat),fixed=T)] <- apply(dat[,grep("_AN",colnames(dat),fixed=T)],2,as.numeric) 38 | #Add batch name to colnames 39 | colnames(dat)[-c(1:3)] <- paste(colnames(dat)[-c(1:3)],batch,sep=".") 40 | colnames(dat)[1:3] <- c("VID","SVLEN","SVTYPE") 41 | return(dat) 42 | } 43 | #Import a list of freq data tables and merge them 44 | import.freq.data <- function(batches.list){ 45 | merged <- import.freq.table(batch=batches.list[1,1],path=batches.list[1,2]) 46 | for(i in 2:nrow(batches.list)){ 47 | newdat <- import.freq.table(batch=batches.list[i,1],path=batches.list[i,2]) 48 | merged <- merge(x=merged,y=newdat,by=c("VID","SVLEN","SVTYPE"),sort=F,all=T) 49 | rm(newdat) 50 | } 51 | return(merged) 52 | } 53 | 54 | 55 | ###Read command-line arguments 56 | args <- commandArgs(trailingOnly=T) 57 | in.list <- as.character(args[1]) 58 | OUTFILE <- as.character(args[2]) 59 | 60 | # #Dev parameters (local) 61 | # in.list <- "~/scratch/af_test/input.list" 62 | # OUTFILE <- "~/scratch/gnomAD_AF_table.merged.txt" 63 | 64 | 65 | ###Process input data 66 | batches.list <- read.table(in.list,header=F,sep="\t") 67 | colnames(batches.list) <- c("batch","path") 68 | dat <- import.freq.data(batches.list) 69 | 70 | ###Write output data 71 | write.table(dat,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 72 | system(paste("gzip -f ",OUTFILE,sep=""),wait=T,intern=F) 73 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/merge_filter_columns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2018 Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Sanitize two filter columns stripped from paired VCFs 9 | """ 10 | 11 | import argparse 12 | 13 | 14 | def main(): 15 | parser = argparse.ArgumentParser(description=__doc__, 16 | formatter_class=argparse.RawDescriptionHelpFormatter) 17 | parser.add_argument('file1', help='Input FILTER values from VCF 1.') 18 | parser.add_argument('file2', help='Input FILTER values from VCF 2.') 19 | parser.add_argument('fout', help='Output FILTER values.') 20 | 21 | args = parser.parse_args() 22 | 23 | fout = open(args.fout, 'w') 24 | 25 | with open(args.file1) as f1, open(args.file2) as f2: 26 | for x, y in zip(f1, f2): 27 | x = x.strip().split(';') 28 | y = y.strip().split(';') 29 | #Only return PASS if both are PASS with no other filters 30 | if x == ['PASS'] and y == ['PASS']: 31 | newfilt = 'PASS' 32 | else: 33 | x = [f for f in x if f != 'PASS'] 34 | y = [f for f in y if f != 'PASS'] 35 | newfilt = ';'.join(sorted(list(set(x + y)))) 36 | 37 | fout.write(newfilt + '\n') 38 | 39 | fout.close() 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/prePCA_vcf_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2018 Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Filter vcf to clean, autosomal, biallelic sites prior to cohort-wide PCA 9 | """ 10 | 11 | 12 | import argparse 13 | import sys 14 | from collections import defaultdict 15 | import pysam 16 | 17 | 18 | NULL_GTs = [(0, 0), (None, None), (0, ), (None, ), (None, 2)] 19 | 20 | 21 | def get_call_rate(record): 22 | n_samples = len(record.samples) 23 | n_non_null = len([s for s in record.samples if record.samples[s]['GT'] not in NULL_GTs]) 24 | 25 | callrate = n_non_null / n_samples 26 | 27 | return callrate 28 | 29 | 30 | def filter_vcf(vcf, fout_common, minAF=0.01, fout_all=None, minCallRate=0.99): 31 | for record in vcf: 32 | # #Do not include UNRESOLVED variants 33 | # if 'UNRESOLVED' in record.info.keys() \ 34 | # or 'UNRESOLVED_TYPE' in record.info.keys() \ 35 | # or 'UNRESOLVED' in record.filter: 36 | # continue 37 | 38 | # #Do not include multiallelic variants 39 | # if 'MULTIALLELIC' in record.info.keys() \ 40 | # or 'MULTIALLELIC' in record.filter \ 41 | # or len(record.alts) > 1: 42 | # continue 43 | 44 | #Do not include variants on X or Y 45 | allosomes = ['X', 'Y', 'chrX', 'chrY'] 46 | if record.chrom in allosomes: 47 | continue 48 | 49 | #Only include PASS variants 50 | if 'PASS' not in record.filter: 51 | continue 52 | 53 | #Only include variants with ≥minCallRate 54 | if get_call_rate(record) < minCallRate: 55 | continue 56 | 57 | #Only keep common variants 58 | if 'AF' in record.info.keys(): 59 | if record.info['AF'][0] >= minAF: 60 | fout_common.write(record) 61 | 62 | #Write AF-unfiltered variants, if optioned 63 | if fout_all is not None: 64 | fout_all.write(record) 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description=__doc__, 70 | formatter_class=argparse.RawDescriptionHelpFormatter) 71 | parser.add_argument('vcf') 72 | parser.add_argument('fout') 73 | parser.add_argument('--minAF', type=float, default=0.01, 74 | help='Minimum allele frequency. 
[0.01]') 75 | parser.add_argument('--minCallRate', type=float, default=0.99, 76 | help='Minimum call rate. [0.99]') 77 | parser.add_argument('--noAFoutput', default=None, 78 | help='Output file for all variants unfiltered on AF.') 79 | 80 | args = parser.parse_args() 81 | 82 | #Open input VCF 83 | if args.vcf in '- stdin'.split(): 84 | vcf = pysam.VariantFile(sys.stdin) 85 | else: 86 | vcf = pysam.VariantFile(args.vcf) 87 | 88 | header = vcf.header 89 | 90 | #Open outut VCFs 91 | fout_common = pysam.VariantFile(args.fout, 'w', header=header) 92 | if args.noAFoutput is not None: 93 | fout_all = pysam.VariantFile(args.noAFoutput, 'w', header=header) 94 | else: 95 | fout_all = None 96 | 97 | #Filter VCF 98 | filter_vcf(vcf, fout_common, args.minAF, fout_all, args.minCallRate) 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/sum_svcounts_perSample.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Helper script to merge outputs from count_svtypes task 8 | # in final_outlier_sample_filter.wdl 9 | 10 | 11 | ###Set master parameters & read arguments 12 | options(stringsAsFactors=F,scipen=1000) 13 | args <- commandArgs(trailingOnly=TRUE) 14 | INFILE <- args[1] 15 | OUTFILE <- args[2] 16 | 17 | ###Read input data & reformat 18 | dat <- read.table(INFILE,header=F) 19 | colnames(dat) <- c("sample","svtype","count","chrom") 20 | samples <- as.character(unique(dat$sample)) 21 | svtypes <- as.character(unique(dat$svtype)) 22 | 23 | ###Get sum of counts per sample per svtype 24 | summed.res <- do.call("rbind", lapply(samples,function(sample){ 25 | return(do.call("rbind", lapply(svtypes,function(svtype){ 26 | return(data.frame("sample"=sample, 27 | "svtype"=svtype, 28 | "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype),]$count,na.rm=T))) 29 | }))) 30 | })) 31 | 32 | ###Write summed results to outfile 33 | write.table(summed.res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 34 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/MVR_collection_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Helper WDL to parallelize collection of Mendelian violation rate data for 9 | # the Talkowski lab SV pipeline 10 | 11 | workflow mvr_colection_helper { 12 | File vcf 13 | File vcf_idx 14 | String contig 15 | String prefix 16 | File trios_famfile 17 | File PCRPLUS_samples_list 18 | Int sv_per_shard 19 | 20 | call shard_vcf { 21 | input: 22 | vcf=vcf, 23 | vcf_idx=vcf_idx, 24 | contig=contig, 25 | sv_per_shard=sv_per_shard 26 | } 27 | 28 | scatter ( shard in shard_vcf.shard_vcfs ){ 29 | call gather_MVR_data { 30 | input: 31 | vcf=shard, 32 | prefix="${prefix}.${contig}", 33 | famfile=trios_famfile, 34 | PCRPLUS_samples_list=PCRPLUS_samples_list 35 | } 36 | } 37 | 38 | output { 39 | Array[File] mvr_data = gather_MVR_data.MVR_data 40 | } 41 | } 42 | 43 | 44 | # Shard VCF into fixed size chunks 45 | task shard_vcf { 46 | File vcf 47 | File vcf_idx 48 | String contig 49 | Int sv_per_shard 50 | 51 | command { 52 | #Tabix chromosome of interest 53 | tabix -h 
${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 54 | #Then shard VCF 55 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 56 | ${contig}.vcf.gz \ 57 | ${sv_per_shard} \ 58 | "vcf.shard." 59 | } 60 | 61 | output { 62 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 63 | } 64 | 65 | runtime { 66 | preemptible: 1 67 | docker: "talkowski/sv-pipeline@sha256:07160ad5fad8b8b9faa60a64caf9990e374a47fa63e8f2160d3645f5e4545c48" 68 | memory: "4 GB" 69 | disks: "local-disk 250 SSD" 70 | } 71 | } 72 | 73 | 74 | # Subset compute all data needed for downstream filter determination 75 | task gather_MVR_data { 76 | File vcf 77 | String prefix 78 | File famfile 79 | File PCRPLUS_samples_list 80 | 81 | command <<< 82 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/count_mendelian_violations.py \ 83 | ${vcf} ${famfile} ${PCRPLUS_samples_list} "${prefix}.MVR_data.txt" 84 | gzip -f "${prefix}.MVR_data.txt" 85 | >>> 86 | 87 | output { 88 | File MVR_data = "${prefix}.MVR_data.txt.gz" 89 | } 90 | 91 | runtime { 92 | docker: "talkowski/sv-pipeline@sha256:58b67cb4e4edf285b89250d2ebab72e17c0247e3bf6891c2c2fcda646b2a6cf4" 93 | preemptible: 1 94 | disks: "local-disk 20 HDD" 95 | memory: "4 GB" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/apply_GQ_filter.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is an analysis WDL to apply a per-sample GQ cutoff to all variants in an SV VCF 9 | 10 | workflow apply_GQ_filter { 11 | File vcf 12 | File vcf_idx 13 | String prefix 14 | File contiglist 15 | Int minGQ_global 16 | File minGQ_perSVTYPE_table 17 | Float max_noCallRate 18 | 19 | Array[Array[String]] contigs=read_tsv(contiglist) 20 | 21 | #Split vcf per chromosome 22 | scatter ( contig in contigs ) { 23 | #Subset vcf to contig 24 | call shard_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | prefix=prefix, 29 | contig=contig[0] 30 | } 31 | 32 | #Apply minGQ filter 33 | call filter_GQ { 34 | input: 35 | vcf=shard_vcf.shard, 36 | prefix="${prefix}.${contig[0]}", 37 | minGQ_global=minGQ_global, 38 | minGQ_perSVTYPE_table=minGQ_perSVTYPE_table, 39 | max_noCallRate=max_noCallRate 40 | } 41 | } 42 | 43 | #Merge sharded VCFs 44 | call combine { 45 | input: 46 | vcfs=filter_GQ.filtered_vcf, 47 | prefix=prefix 48 | } 49 | 50 | output { 51 | File filtered_vcf = combine.out 52 | File filtered_vcf_idx = combine.idx 53 | } 54 | } 55 | 56 | 57 | # Shard VCF per chromosome 58 | task shard_vcf { 59 | File vcf 60 | File vcf_idx 61 | String prefix 62 | String contig 63 | 64 | command { 65 | tabix -h ${vcf} ${contig} | bgzip -c > "${prefix}.${contig}.vcf.gz" 66 | } 67 | 68 | output { 69 | File shard = "${prefix}.${contig}.vcf.gz" 70 | } 71 | 72 | runtime { 73 | preemptible: 1 74 | docker: "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 75 | memory: "4 GB" 76 | disks: "local-disk 275 SSD" 77 | } 78 | } 79 | 80 | 81 | # Apply minGQ filter 82 | task filter_GQ { 83 | File vcf 84 | String prefix 85 | Int minGQ_global 86 | File minGQ_perSVTYPE_table 87 | Float max_noCallRate 88 | 89 | command { 90 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/apply_minGQ_filter.py \ 91 | --dropEmpties \ 92 | -m ${minGQ_global} \ 93 | --maxNCR ${max_noCallRate} \ 94 | -t ${minGQ_perSVTYPE_table} \ 95 | ${vcf} \ 96 | 
"${prefix}.filtered.vcf" 97 | bgzip -f "${prefix}.filtered.vcf" 98 | } 99 | 100 | output { 101 | File filtered_vcf = "${prefix}.filtered.vcf.gz" 102 | } 103 | 104 | runtime { 105 | preemptible: 1 106 | docker: "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 107 | memory: "4 GB" 108 | disks: "local-disk 30 SSD" 109 | } 110 | } 111 | 112 | 113 | # Merge VCF shards 114 | task combine { 115 | Array[File] vcfs 116 | String prefix 117 | 118 | command { 119 | vcf-concat ${sep=" " vcfs} | vcf-sort | bgzip -c > "${prefix}.minGQ_filtered.vcf.gz"; 120 | tabix -p vcf "${prefix}.minGQ_filtered.vcf.gz" 121 | } 122 | 123 | runtime { 124 | preemptible: 1 125 | docker : "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 126 | disks: "local-disk 500 SSD" 127 | memory: "4 GB" 128 | } 129 | 130 | output { 131 | File out="${prefix}.minGQ_filtered.vcf.gz" 132 | File idx="${prefix}.minGQ_filtered.vcf.gz.tbi" 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/assign_lowQuality_sites.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL for downstream processing of Talkowski SV pipeline callests 9 | # that determines categories of SV with high Mendelian violation rates based on 10 | # parent-child trio analyses, and tags those sites as LOW_QUALITY in the 11 | # VCF FILTER field 12 | 13 | # QC is performed on the final VCF separated by LOW_QUALITY and non-LOW_QUALITY 14 | 15 | 16 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/73/plain-WDL/descriptor" as QC 17 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:MVR_collection_helper/versions/5/plain-WDL/descriptor" as collect 18 | 19 | 20 | workflow assign_lowQuality_sites { 21 | File vcf 22 | File vcf_idx 23 | String prefix 24 | File contiglist 25 | File trios_famfile 26 | File PCRPLUS_samples_list 27 | Int sv_per_shard 28 | 29 | Array[Array[String]] contigs=read_tsv(contiglist) 30 | 31 | # Shard VCF per-chromosome and collect MVR data 32 | scatter ( contig in contigs ) { 33 | call collect.mvr_colection_helper as gather_MVR_data_perChrom { 34 | input: 35 | vcf=vcf, 36 | vcf_idx=vcf_idx, 37 | contig=contig[0], 38 | prefix=prefix, 39 | trios_famfile=trios_famfile, 40 | PCRPLUS_samples_list=PCRPLUS_samples_list, 41 | sv_per_shard=sv_per_shard 42 | } 43 | call combine_MVR_data as combine_MVR_data_perChrom { 44 | input: 45 | MVR_data=gather_MVR_data_perChrom.mvr_data, 46 | prefix=prefix 47 | } 48 | } 49 | 50 | # Merge MVR data 51 | call combine_MVR_data as combine_MVR_data_crossChrom { 52 | input: 53 | MVR_data=combine_MVR_data_perChrom.merged_data, 54 | prefix=prefix 55 | } 56 | 57 | 58 | # Final outputs 59 | output { 60 | File merged_MVR_data = combine_MVR_data_crossChrom.merged_data 61 | } 62 | } 63 | 64 | 65 | # Combine MVR data from per-chromosome shards 66 | task combine_MVR_data { 67 | Array[File] MVR_data 68 | String prefix 69 | 70 | command <<< 71 | zcat ${MVR_data[0]} | sed -n '1p' > "${prefix}.merged_MVR_data.txt" 72 | zcat ${sep=' ' MVR_data} | fgrep -v "#" >> "${prefix}.merged_MVR_data.txt" 73 | gzip -f "${prefix}.merged_MVR_data.txt" 74 | >>> 75 | 76 | output { 77 | File merged_data = "${prefix}.merged_MVR_data.txt.gz" 78 | } 79 | 80 | runtime { 
81 | docker: "talkowski/sv-pipeline@sha256:07160ad5fad8b8b9faa60a64caf9990e374a47fa63e8f2160d3645f5e4545c48" 82 | preemptible: 1 83 | disks: "local-disk 30 HDD" 84 | } 85 | } 86 | 87 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/compute_simple_AFs_singleChrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Laboratory 2 | # Contact: Ryan Collins 3 | # Distributed under terms of the MIT license. 4 | 5 | # Helper workflow to calculate basic AF statistics for a single chromosome on an input VCF 6 | 7 | workflow getAFs_singleChrom { 8 | File vcf 9 | File vcf_idx 10 | String contig 11 | Int sv_per_shard 12 | String prefix 13 | File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample 14 | File? famfile #Used for M/F AF calculations 15 | String? drop_empty_records 16 | 17 | 18 | # Tabix to chromosome of interest, and shard input VCF for stats collection 19 | call shard_vcf { 20 | input: 21 | vcf=vcf, 22 | vcf_idx=vcf_idx, 23 | contig=contig, 24 | sv_per_shard=sv_per_shard 25 | } 26 | 27 | # Scatter over VCF shards 28 | scatter ( shard in shard_vcf.shard_vcfs ) { 29 | # Collect AF summary stats 30 | call compute_shard_AFs { 31 | input: 32 | vcf=shard, 33 | prefix="${prefix}.${contig}", 34 | sample_pop_assignments=sample_pop_assignments, 35 | famfile=famfile 36 | } 37 | } 38 | 39 | # Merge shards into single VCF 40 | call combine_sharded_vcfs { 41 | input: 42 | vcfs=compute_shard_AFs.shard_wAFs, 43 | prefix="${prefix}.${contig}", 44 | drop_empty_records=drop_empty_records 45 | } 46 | 47 | # Final output 48 | output { 49 | File vcf_wAFs = combine_sharded_vcfs.vcf_out 50 | File vcf_wAFs_idx = combine_sharded_vcfs.vcf_out_idx 51 | } 52 | } 53 | 54 | 55 | # Shard VCF into fixed size chunks 56 | task shard_vcf { 57 | File vcf 58 | File vcf_idx 59 | String contig 60 | Int sv_per_shard 61 | 62 | command { 63 | set -euo pipefail 64 | #Tabix chromosome of interest 65 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 66 | #Then shard VCF 67 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 68 | ${contig}.vcf.gz \ 69 | ${sv_per_shard} \ 70 | "vcf.shard." 71 | } 72 | 73 | output { 74 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 75 | } 76 | 77 | runtime { 78 | preemptible: 1 79 | maxRetries: 1 80 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 81 | memory: "4 GB" 82 | disks: "local-disk 250 SSD" 83 | } 84 | } 85 | 86 | 87 | # Subset a vcf to a single chromosome, and add global AF information (no subpop) 88 | task compute_shard_AFs { 89 | File vcf 90 | String prefix 91 | File? sample_pop_assignments 92 | File? 
famfile 93 | 94 | 95 | command <<< 96 | set -euo pipefail 97 | optionals=" " 98 | if [ ${default="SKIP" sample_pop_assignments} != "SKIP" ]; then 99 | optionals="$( echo "$optionals" ) -p ${sample_pop_assignments}" 100 | fi 101 | if [ ${default="SKIP" famfile} != "SKIP" ]; then 102 | optionals="$( echo "$optionals" ) -f ${famfile}" 103 | fi 104 | echo -e "OPTIONALS INTERPRETED AS: $optionals" 105 | echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ${vcf} stdout" 106 | #Tabix chromosome of interest & compute AN, AC, and AF 107 | /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "${vcf}" stdout \ 108 | | bgzip -c \ 109 | > "${prefix}.wAFs.vcf.gz" 110 | >>> 111 | 112 | output { 113 | File shard_wAFs = "${prefix}.wAFs.vcf.gz" 114 | } 115 | 116 | runtime { 117 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 118 | preemptible: 1 119 | maxRetries: 1 120 | memory: "4 GB" 121 | disks: "local-disk 20 SSD" 122 | } 123 | } 124 | 125 | 126 | # Merge VCF shards & drop records with zero remaining non-ref alleles 127 | task combine_sharded_vcfs { 128 | Array[File] vcfs 129 | String prefix 130 | String? drop_empty_records 131 | 132 | command { 133 | set -euo pipefail 134 | vcf-concat ${sep=" " vcfs} \ 135 | | vcf-sort \ 136 | > merged.vcf 137 | if [ ${default="TRUE" drop_empty_records} == "TRUE" ]; then 138 | /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ 139 | merged.vcf stdout \ 140 | | bgzip -c \ 141 | > "${prefix}.wAFs.vcf.gz" 142 | else 143 | cat merged.vcf | bgzip -c > "${prefix}.wAFs.vcf.gz" 144 | fi 145 | tabix -p vcf "${prefix}.wAFs.vcf.gz" 146 | } 147 | 148 | runtime { 149 | preemptible: 1 150 | maxRetries: 1 151 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 152 | disks: "local-disk 250 SSD" 153 | bootDiskSizeGb: 30 154 | memory: "4 GB" 155 | } 156 | 157 | output { 158 | File vcf_out = "${prefix}.wAFs.vcf.gz" 159 | File vcf_out_idx = "${prefix}.wAFs.vcf.gz.tbi" 160 | } 161 | } 162 | 163 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/filter_cleanup_and_QUAL_recalibration.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL to perform FILTER cleanup and recalibrate 9 | # variant QUAL scores at the end of the Talkowski SV pipeline 10 | 11 | 12 | workflow filter_cleanup_qual_recalibration { 13 | File vcf 14 | File vcf_idx 15 | File PCRPLUS_samples_list 16 | File famfile 17 | Float min_callrate_global 18 | Float min_callrate_smallDels 19 | File contiglist 20 | String prefix 21 | 22 | Array[Array[String]] contigs = read_tsv(contiglist) 23 | 24 | scatter ( contig in contigs ) { 25 | call cleanup { 26 | input: 27 | vcf=vcf, 28 | vcf_idx=vcf_idx, 29 | contig=contig[0], 30 | PCRPLUS_samples_list=PCRPLUS_samples_list, 31 | famfile=famfile, 32 | min_callrate_global=min_callrate_global, 33 | min_callrate_smallDels=min_callrate_smallDels, 34 | prefix=prefix 35 | } 36 | } 37 | 38 | call concat_vcfs { 39 | input: 40 | vcfs=cleanup.out_vcf, 41 | outfile_prefix="${prefix}.cleaned_filters_qual_recalibrated" 42 | } 43 | 44 | output { 45 | File cleaned_vcf = concat_vcfs.concat_vcf 46 | File cleaned_vcf_idx = concat_vcfs.concat_vcf_idx 47 | } 48 | } 49 | 50 | 51 | # 
Applies filters & cleanup to VCF for a single chromosome 52 | task cleanup { 53 | File vcf 54 | File vcf_idx 55 | String contig 56 | File PCRPLUS_samples_list 57 | File famfile 58 | Float min_callrate_global 59 | Float min_callrate_smallDels 60 | String prefix 61 | 62 | command <<< 63 | set -euo pipefail 64 | #Subset to chromosome of interest 65 | tabix -h ${vcf} ${contig} | bgzip -c > input.vcf.gz 66 | #Get list of PCR- samples 67 | tabix -H ${vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' \ 68 | > all.samples.list 69 | fgrep -wvf ${PCRPLUS_samples_list} all.samples.list \ 70 | > pcrminus.samples.list 71 | #Restrict famfiles 72 | while read ptn; do fgrep -w $ptn ${famfile}; done < all.samples.list > revised.fam 73 | fgrep -wf pcrminus.samples.list revised.fam > revised.pcrminus.fam 74 | #Compute fraction of missing genotypes per variant 75 | zcat input.vcf.gz \ 76 | | awk '{ if ($7 !~ /MULTIALLELIC/) print $0 }' \ 77 | | bgzip -c \ 78 | > input.noMCNV.vcf.gz 79 | plink2 \ 80 | --missing variant-only \ 81 | --max-alleles 2 \ 82 | --keep-fam revised.pcrminus.fam \ 83 | --fam revised.fam \ 84 | --vcf input.noMCNV.vcf.gz 85 | fgrep -v "#" plink2.vmiss \ 86 | | awk -v OFS="\t" '{ print $2, 1-$NF }' \ 87 | > callrates.txt 88 | #Clean up VCF 89 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/filter_cleanup_and_QUAL_recalibration.py \ 90 | --callrate-table callrates.txt \ 91 | --min-callrate-global ${min_callrate_global} \ 92 | --min-callrate-smallDels ${min_callrate_smallDels} \ 93 | input.vcf.gz \ 94 | stdout \ 95 | | bgzip -c \ 96 | > "${prefix}.${contig}.cleaned_filters_qual_recalibrated.vcf.gz" 97 | # tabix -p vcf -f "${prefix}.cleaned_filters_qual_recalibrated.vcf.gz" 98 | >>> 99 | 100 | output { 101 | File out_vcf = "${prefix}.${contig}.cleaned_filters_qual_recalibrated.vcf.gz" 102 | # File out_vcf_idx = "${prefix}.cleaned_filters_qual_recalibrated.vcf.gz.tbi" 103 | } 104 | 105 | runtime { 106 | docker : "talkowski/sv-pipeline@sha256:4587376100d71d66fb864740f95e0cc5f343bb1fe6e892f5b8116c789c38333f" 107 | preemptible: 1 108 | maxRetries: 0 109 | disks: "local-disk 50 HDD" 110 | memory: "4 GB" 111 | } 112 | } 113 | 114 | 115 | #General task to combine and sort multiple VCFs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String outfile_prefix 119 | 120 | command <<< 121 | set -euo pipefail 122 | vcf-concat ${sep=' ' vcfs} | bgzip -c > ${outfile_prefix}.vcf.gz; 123 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 124 | >>> 125 | 126 | output { 127 | File concat_vcf = "${outfile_prefix}.vcf.gz" 128 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 129 | } 130 | 131 | runtime { 132 | docker: "talkowski/sv-pipeline@sha256:4587376100d71d66fb864740f95e0cc5f343bb1fe6e892f5b8116c789c38333f" 133 | preemptible: 0 134 | maxRetries: 1 135 | memory: "4 GB" 136 | bootDiskSizeGb: 30 137 | disks: "local-disk 250 HDD" 138 | } 139 | } 140 | 141 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/gather_batch_effects_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL to perform pairwise comparisons of batches in the 9 | # Talkowski lab SV pipeline, and mark sites that appear batch-specific 10 | 11 | 12 | workflow check_batch_effects { 13 | File freq_table 14 | String batch1 15 | String batch2 16 | String prefix 17 | Int 
variants_per_shard 18 | 19 | # Shard frequency table 20 | call shard_table { 21 | input: 22 | freq_table=freq_table, 23 | variants_per_shard=variants_per_shard 24 | } 25 | 26 | # Scatter over shards and compute AF correlations for each variant 27 | scatter ( shard in shard_table.shards ) { 28 | call compare_batches { 29 | input: 30 | freq_table=shard, 31 | batch1=batch1, 32 | batch2=batch2, 33 | prefix=prefix 34 | } 35 | } 36 | 37 | # Combine shards, perform bonferroni correction to determine significant batch effects, and plot AF correlation scatter 38 | call combine_shards { 39 | input: 40 | freq_tables=compare_batches.results, 41 | batch1=batch1, 42 | batch2=batch2, 43 | prefix=prefix 44 | } 45 | 46 | # Outputs 47 | output { 48 | File comparison_table = combine_shards.merged_table 49 | File batch_effect_variants = combine_shards.batch_effect_variants 50 | File scatterplots_tarball = combine_shards.correlation_scatterplots_tarball 51 | } 52 | } 53 | 54 | 55 | # Shard a frequency table into an even number of evenly sized shards 56 | task shard_table { 57 | File freq_table 58 | Int variants_per_shard 59 | 60 | command <<< 61 | set -euo pipefail 62 | #Split variant lines 63 | zcat ${freq_table} | sed '1d' | \ 64 | split -l ${variants_per_shard} --numeric-suffixes=00001 -a 5 /dev/stdin freq_table_shard_ || true 65 | #Add header & gzip each shard 66 | zcat ${freq_table} | sed -n '1p' > header.txt 67 | maxshard=$( find / -name "freq_table_shard_*" | awk -v FS="_" '{ print $NF }' \ 68 | | sort -Vrk1,1 | sed -n '1p' || true ) 69 | for i in $( seq -w 00001 "$maxshard" ); do 70 | cat header.txt "freq_table_shard_$i" \ 71 | | gzip -c \ 72 | > "freq_table_shard_$i.txt.gz" || true 73 | done 74 | >>> 75 | 76 | output { 77 | Array[File] shards = glob("freq_table_shard_*.txt.gz") 78 | } 79 | 80 | runtime { 81 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 82 | preemptible: 1 83 | maxRetries: 1 84 | } 85 | } 86 | 87 | 88 | # Compare AF stats per variant between a pair of batches 89 | task compare_batches { 90 | File freq_table 91 | String batch1 92 | String batch2 93 | String prefix 94 | 95 | command <<< 96 | set -euo pipefail 97 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/find_batch_effects.shard_helper.R \ 98 | ${freq_table} \ 99 | "${batch1}" \ 100 | "${batch2}" \ 101 | "${prefix}.${batch1}_vs_${batch2}.results.txt" 102 | gzip "${prefix}.${batch1}_vs_${batch2}.results.txt" 103 | >>> 104 | 105 | output { 106 | File results = "${prefix}.${batch1}_vs_${batch2}.results.txt.gz" 107 | } 108 | 109 | runtime { 110 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 111 | memory: "4 GB" 112 | preemptible: 1 113 | maxRetries: 1 114 | } 115 | } 116 | 117 | 118 | # Merge sharded comparison results and perform analysis for batch effects 119 | task combine_shards { 120 | Array[File] freq_tables 121 | String batch1 122 | String batch2 123 | String prefix 124 | 125 | command <<< 126 | set -euo pipefail 127 | #Write header 128 | zcat ${freq_tables[0]} | sed -n '1p' > header.txt || true 129 | #Iterate over files and cat 130 | while read file; do 131 | zcat "$file" | sed '1d' 132 | done < ${write_lines(freq_tables)} \ 133 | | cat header.txt - \ 134 | | gzip -c \ 135 | > "${prefix}.${batch1}_vs_${batch2}.AF_comparison_table.txt.gz" || true 136 | #Analyze 137 | mkdir "${batch1}_vs_${batch2}" 138 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/find_batch_effects.R \ 139 | 
"${prefix}.${batch1}_vs_${batch2}.AF_comparison_table.txt.gz" \ 140 | "${batch1}" \ 141 | "${batch2}" \ 142 | "${batch1}_vs_${batch2}/${prefix}" 143 | gzip -f "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.freq_table_wBonferroni.txt" 144 | tar -czvf "${batch1}_vs_${batch2}.tar.gz" \ 145 | "${batch1}_vs_${batch2}" 146 | >>> 147 | 148 | output { 149 | File merged_table = "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.freq_table_wBonferroni.txt.gz" 150 | File batch_effect_variants = "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.batch_effect_variants.txt" 151 | File correlation_scatterplots_tarball = "${batch1}_vs_${batch2}.tar.gz" 152 | } 153 | 154 | runtime { 155 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 156 | memory: "4 GB" 157 | preemptible: 1 158 | maxRetries: 1 159 | } 160 | } 161 | 162 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/minGQ_ROC_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is a helper WDL that performs minGQ ROC optimization for a filtered VCF 9 | #See parent WDL: optimize_GQ_filter 10 | 11 | workflow optimize_ROC { 12 | File vcf 13 | Array[File] famfiles 14 | String prefix 15 | String max_fdr 16 | Int minGQ 17 | Int maxGQ 18 | Int GQstepsize 19 | 20 | #Get de novo stats for each famfile shard 21 | scatter( famfile in famfiles ){ 22 | call gather_denovo_stats as gather_stats { 23 | input: 24 | vcf=vcf, 25 | famfile=famfile, 26 | minGQ=minGQ, 27 | maxGQ=maxGQ, 28 | GQstepsize=GQstepsize 29 | } 30 | 31 | call cat_denovo_stats as cat_stats_pershard { 32 | input: 33 | dn_stats=gather_stats.dn_stats_glob, 34 | prefix="${prefix}.pershard" 35 | } 36 | } 37 | 38 | #Merge de novo stats files across all shards 39 | call cat_denovo_stats as cat_stats_allshards { 40 | input: 41 | dn_stats=cat_stats_pershard.merged_stats, 42 | prefix="${prefix}" 43 | } 44 | 45 | #Run ROC analysis 46 | call ROC_optimization { 47 | input: 48 | merged_stats=cat_stats_allshards.merged_stats, 49 | prefix="${prefix}", 50 | max_fdr=max_fdr 51 | } 52 | 53 | output { 54 | File minGQ_ROC_plot = ROC_optimization.tarball 55 | File minGQ_ROC_table = ROC_optimization.minGQ_table 56 | } 57 | } 58 | 59 | 60 | #Gather de novo stats for a set of trios from a vcf 61 | task gather_denovo_stats { 62 | File vcf 63 | File famfile 64 | Int minGQ 65 | Int maxGQ 66 | Int GQstepsize 67 | 68 | command <<< 69 | #Get list of sample IDs & column numbers from VCF header 70 | zcat ${vcf} | head -n1000 | fgrep "#" | fgrep -v "##" | sed 's/\t/\n/g' \ 71 | | awk -v OFS="\t" '{ print $1, NR }' > vcf_header_columns.txt 72 | #Iterate over families & subset VCF 73 | while read famID pro fa mo prosex pheno; do 74 | pro_idx=$( awk -v ID=$pro '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 75 | fa_idx=$( awk -v ID=$fa '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 76 | mo_idx=$( awk -v ID=$mo '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 77 | if ! [ -z $pro_idx ] && ! [ -z $fa_idx ] && ! 
[ -z $mo_idx ]; then 78 | echo -e "ANALYZING $famID, which contains $pro, $fa, and $mo" 79 | #Get variant stats 80 | zcat ${vcf} | cut -f1-9,"$pro_idx","$fa_idx","$mo_idx" \ 81 | | fgrep -v "MULTIALLELIC" \ 82 | | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/gather_trio_genos.py \ 83 | stdin stdout "$pro" "$fa" "$mo" \ 84 | | gzip -c \ 85 | > "$famID".trio_variant_info.txt.gz 86 | #Titrate GQs & count de novos 87 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compute_denovo_stats.R \ 88 | --famID "$famID" \ 89 | --minGQ ${minGQ} \ 90 | --maxGQ ${maxGQ} \ 91 | --step ${GQstepsize} \ 92 | "$famID".trio_variant_info.txt.gz \ 93 | "$famID".trio_denovo_summary.txt 94 | gzip -f "$famID".trio_denovo_summary.txt 95 | fi 96 | done < ${famfile} 97 | >>> 98 | 99 | output { 100 | Array[File] dn_stats_glob = glob("*.trio_denovo_summary.txt.gz") 101 | } 102 | 103 | runtime { 104 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 105 | preemptible: 1 106 | memory: "4 GB" 107 | disks: "local-disk 250 HDD" 108 | } 109 | } 110 | 111 | 112 | #Combine de novo stats into one long melted table 113 | task cat_denovo_stats { 114 | Array[File] dn_stats 115 | String prefix 116 | 117 | command <<< 118 | zcat ${dn_stats[0]} | sed -n '1p' > ${prefix}.merged_denovo_stats.txt 119 | while read statsfile; do 120 | zcat "$statsfile" | sed '1d' 121 | done < ${write_lines(dn_stats)} \ 122 | | sort -nk1,1 \ 123 | >> ${prefix}.merged_denovo_stats.txt 124 | gzip -f ${prefix}.merged_denovo_stats.txt 125 | >>> 126 | 127 | output { 128 | File merged_stats = "${prefix}.merged_denovo_stats.txt.gz" 129 | } 130 | 131 | runtime { 132 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 133 | preemptible: 1 134 | } 135 | } 136 | 137 | 138 | #Run de novo ROC analysis 139 | task ROC_optimization { 140 | File merged_stats 141 | String prefix 142 | String max_fdr 143 | 144 | command <<< 145 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/optimize_GQ_ROC.R \ 146 | --prefix ${prefix} \ 147 | --fdr ${max_fdr} \ 148 | -S /opt/sv-pipeline/ref/vcf_qc_refs/SV_colors.txt \ 149 | ${merged_stats} \ 150 | ./${prefix}_minGQ_ROC_results/ 151 | cp ./${prefix}_minGQ_ROC_results/${prefix}.minGQ_ROC_results.txt \ 152 | ${prefix}.minGQ_ROC_results.txt 153 | tar -czvf ${prefix}_minGQ_ROC_results.tar.gz ./${prefix}_minGQ_ROC_results 154 | >>> 155 | 156 | output { 157 | File tarball = "${prefix}_minGQ_ROC_results.tar.gz" 158 | File minGQ_table = "${prefix}.minGQ_ROC_results.txt" 159 | } 160 | 161 | runtime { 162 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 163 | preemptible: 1 164 | memory: "8 GB" 165 | disks: "local-disk 20 HDD" 166 | } 167 | } -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/prune_and_add_vafs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to perform final sample pruning & compute all relevant AF statistics 9 | # for a VCF from the Talkowski SV pipeline 10 | 11 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:compute_simple_AFs_singleChrom/versions/14/plain-WDL/descriptor" as calcAF 12 | 13 | workflow prune_and_add_vafs { 14 | File vcf 15 | File vcf_idx 16 | String prefix 17 | File? 
sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample 18 | File? prune_list #List of samples to be excluded from the output vcf 19 | File? famfile #Used for M/F AF calculations 20 | Int sv_per_shard 21 | File contiglist 22 | String? drop_empty_records 23 | 24 | Array[Array[String]] contigs=read_tsv(contiglist) 25 | 26 | 27 | #Iterate over chromosomes 28 | scatter (contig in contigs) { 29 | #Prune VCF 30 | call prune_vcf { 31 | input: 32 | vcf=vcf, 33 | vcf_idx=vcf_idx, 34 | contig=contig[0], 35 | prune_list=prune_list, 36 | prefix=prefix 37 | } 38 | #Compute AC, AN, and AF per population & sex combination 39 | call calcAF.getAFs_singleChrom as getAFs { 40 | input: 41 | vcf=prune_vcf.pruned_vcf, 42 | vcf_idx=prune_vcf.pruned_vcf_idx, 43 | contig=contig[0], 44 | sv_per_shard=sv_per_shard, 45 | prefix=prefix, 46 | sample_pop_assignments=sample_pop_assignments, 47 | famfile=famfile, 48 | drop_empty_records=drop_empty_records 49 | } 50 | } 51 | 52 | 53 | #Merge pruned VCFs with allele info 54 | call concat_vcfs { 55 | input: 56 | vcfs=getAFs.vcf_wAFs, 57 | outfile_prefix="${prefix}.pruned_wAFs" 58 | } 59 | 60 | output { 61 | File output_vcf = concat_vcfs.concat_vcf 62 | File output_vcf_idx = concat_vcfs.concat_vcf_idx 63 | } 64 | } 65 | 66 | 67 | #Shard vcf into single chromosome shards & drop pruned samples 68 | task prune_vcf { 69 | File vcf 70 | File vcf_idx 71 | String contig 72 | File? prune_list 73 | String prefix 74 | 75 | command <<< 76 | #Tabix chromosome of interest 77 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 78 | #Get column indexes corresponding to samples to drop, if any exist 79 | if [ "${default="SKIP" prune_list}" != "SKIP" ]; then 80 | dropidx=$( zcat ${contig}.vcf.gz | sed -n '1,500p' | fgrep "#" | fgrep -v "##" \ 81 | | sed 's/\t/\n/g' | awk -v OFS="\t" '{ print NR, $1 }' \ 82 | | fgrep -wf ${prune_list} | cut -f1 | paste -s -d, ) 83 | zcat ${contig}.vcf.gz \ 84 | | cut --complement -f"$dropidx" \ 85 | | bgzip -c \ 86 | > "${prefix}.${contig}.pruned.vcf.gz" 87 | else 88 | cp "${contig}.vcf.gz" "${prefix}.${contig}.pruned.vcf.gz" 89 | fi 90 | tabix -f "${prefix}.${contig}.pruned.vcf.gz" 91 | >>> 92 | 93 | output { 94 | File pruned_vcf = "${prefix}.${contig}.pruned.vcf.gz" 95 | File pruned_vcf_idx = "${prefix}.${contig}.pruned.vcf.gz.tbi" 96 | } 97 | 98 | runtime { 99 | docker: "talkowski/sv-pipeline@sha256:4900cae92f1f8bc98c54f89444a00e134ac4c86ca55543e2646f024270a29a69" 100 | preemptible: 1 101 | maxRetries: 1 102 | memory: "4 GB" 103 | disks: "local-disk 250 SSD" 104 | } 105 | } 106 | 107 | 108 | #General task to combine multiple VCFs 109 | task concat_vcfs { 110 | Array[File] vcfs 111 | String outfile_prefix 112 | 113 | command <<< 114 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${outfile_prefix}.vcf.gz; 115 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 116 | >>> 117 | 118 | output { 119 | File concat_vcf = "${outfile_prefix}.vcf.gz" 120 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 121 | } 122 | 123 | runtime { 124 | docker: "talkowski/sv-pipeline@sha256:4900cae92f1f8bc98c54f89444a00e134ac4c86ca55543e2646f024270a29a69" 125 | preemptible: 1 126 | maxRetries: 1 127 | memory: "16 GB" 128 | disks: "local-disk 250 SSD" 129 | } 130 | } 131 | 132 | 133 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/sharded_vcf2bed.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski 
Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is a helper WDL that runs svtk vcf2bed parallelized across many shards for 9 | # a single chromosome 10 | 11 | workflow sharded_vcf2bed { 12 | File vcf 13 | File vcf_idx 14 | String contig 15 | Int sv_per_shard 16 | String prefix 17 | 18 | # Tabix to chromosome of interest, and shard input VCF for stats collection 19 | call shard_vcf { 20 | input: 21 | vcf=vcf, 22 | vcf_idx=vcf_idx, 23 | contig=contig, 24 | sv_per_shard=sv_per_shard 25 | } 26 | 27 | # Scatter over VCF shards 28 | scatter (shard in shard_vcf.shard_vcfs) { 29 | # Run vcf2bed 30 | call vcf2bed_sub { 31 | input: 32 | vcf=shard, 33 | prefix="${prefix}.shard" 34 | } 35 | } 36 | 37 | # Merge vcf2bed_sub outputs 38 | call merge_vcf2bed_sub { 39 | input: 40 | vcf2bed_sub_shards=vcf2bed_sub.vcf2bed_sub_out, 41 | prefix=prefix 42 | } 43 | 44 | output { 45 | File vcf2bed_out=merge_vcf2bed_sub.merged_vcf2bed_out 46 | } 47 | } 48 | 49 | 50 | # Shard VCF into fixed size chunks 51 | task shard_vcf { 52 | File vcf 53 | File vcf_idx 54 | String contig 55 | Int sv_per_shard 56 | 57 | command { 58 | #Tabix chromosome of interest 59 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 60 | #Then shard VCF 61 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 62 | ${contig}.vcf.gz \ 63 | ${sv_per_shard} \ 64 | "vcf.shard." 65 | } 66 | 67 | output { 68 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 69 | } 70 | 71 | runtime { 72 | preemptible: 1 73 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 74 | memory: "4 GB" 75 | disks: "local-disk 270 SSD" 76 | } 77 | } 78 | 79 | 80 | # Run vcf2bed_sub on an input vcf 81 | task vcf2bed_sub { 82 | File vcf 83 | String prefix 84 | 85 | command { 86 | svtk vcf2bed \ 87 | --info ALL \ 88 | --include-filters \ 89 | --no-samples \ 90 | ${vcf} \ 91 | stdout \ 92 | | bgzip -c \ 93 | > "${prefix}.vcf2bed.bed.gz" 94 | } 95 | 96 | output { 97 | File vcf2bed_sub_out = "${prefix}.vcf2bed.bed.gz" 98 | } 99 | 100 | runtime { 101 | preemptible: 1 102 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 103 | memory: "4 GB" 104 | disks: "local-disk 25 SSD" 105 | } 106 | } 107 | 108 | # Merge vcf2bed_sub shards 109 | task merge_vcf2bed_sub { 110 | Array[File] vcf2bed_sub_shards 111 | String prefix 112 | 113 | command <<< 114 | zcat ${vcf2bed_sub_shards[0]} | sed -n '1p' > header.txt 115 | zcat ${sep=' ' vcf2bed_sub_shards} | fgrep -v "#" \ 116 | | sort -Vk1,1 -k2,2n -k3,3n \ 117 | | cat header.txt - \ 118 | | bgzip -c \ 119 | > "${prefix}.vcf2bed_sub.bed.gz" 120 | >>> 121 | 122 | output { 123 | File merged_vcf2bed_out = "${prefix}.vcf2bed_sub.bed.gz" 124 | } 125 | 126 | runtime { 127 | preemptible: 1 128 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 129 | memory: "4 GB" 130 | disks: "local-disk 200 SSD" 131 | } 132 | } 133 | 134 | 135 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_manuscript_code/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/AF_reconcilliation_helper.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis script 8 | 9 | # Helper script to sanitize AFs for 1kG comparisons 10 | 11 | 12 | ###Set master parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | 15 | ###Read command line args 16 | args <- commandArgs(trailingOnly=TRUE) 17 | 18 | ###Read data 19 | dat <- read.table(args[1],header=T,comment.char="",sep="\t") 20 | 21 | ###Clean AFs 22 | AFs.to.clean <- grep(",",dat$AF,fixed=T) 23 | cleaned.AFs <- as.vector(as.numeric(sapply(dat$AF[AFs.to.clean],function(s){ 24 | return(sum(round(as.numeric(unlist(strsplit(s,split=","))),10)[-2])) 25 | }))) 26 | cleaned.AFs[which(cleaned.AFs>1)] <- 1 27 | dat$AF[AFs.to.clean] <- cleaned.AFs 28 | 29 | ###Write data 30 | colnames(dat)[1] <- "#chr" 31 | write.table(dat,args[1],col.names=T,row.names=F,sep="\t",quote=F) 32 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/merge_downsample_sv_per_gene.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis helper script 8 | 9 | # Get average SV per gene from downsampling experiments 10 | 11 | 12 | ###Set master parameters & load libraries 13 | options(stringsAsFactors=F,scipen=1000) 14 | effects <- c("lof.any","lof.del","lof.other","cg","plof","inv") 15 | 16 | 17 | ###Helper functions 18 | #Import list of sv-per-gene counts, and take mean across downsampling points 19 | process.counts <- function(counts.list.in,seed.table.in){ 20 | #Read seeds 21 | down.sizes <- as.integer(read.table(seed.table.in,header=F,sep="\t")[,1]) 22 | #Read tables as list 23 | counts.list <- as.character(read.table(counts.list.in,header=F,sep="\t")[,1]) 24 | counts <- lapply(counts.list,function(l){ 25 | read.table(l,header=T,sep="\t") 26 | }) 27 | #Sanity check to make sure all counts have same number of rows 28 | if(length(unique(unlist(lapply(counts,nrow)))) > 1){ 29 | stop("Some sv_per_gene.txt tables don't have the same number of lines.") 30 | } 31 | #Take mean counts per gene per downsample size 32 | down.points <- sort(unique(down.sizes)) 33 | res <- do.call("cbind", lapply(down.points,function(n){ 34 | idxs <- which(down.sizes==n) 35 | merged <- do.call("rbind", counts[idxs]) 36 | meaned <- as.data.frame(t(sapply(sort(unique(as.character(merged$gene))),function(gene){ 37 | as.numeric(apply(merged[which(merged$gene==gene),-1],2,mean,na.rm=T)) 38 | }))) 39 | colnames(meaned) <- paste(effects,n,sep=".") 40 | return(meaned) 41 | })) 42 | #Add gene name & return output data frame 43 | res <- data.frame("gene"=rownames(res),res) 44 | rownames(res) <- NULL 45 | return(res) 46 | } 47 | 48 | 49 | ###Read command-line arguments 50 | args <- commandArgs(trailingOnly=T) 51 | counts.list.in <- as.character(args[1]) 52 | seed.table.in <- as.character(args[2]) 53 | OUTFILE <- as.character(args[3]) 54 | 55 | # #Dev parameters (local) 56 | # counts.list.in <- "~/scratch/sv_per_gene.input.list" 57 | # seed.table.in <- "~/scratch/tmp_seeds_input.txt" 58 | # OUTFILE <- "~/scratch/merged_sv_per_gene.test.txt" 59 | 60 | 61 | ###Process data & write out 62 | res <- 
process.counts(counts.list.in,seed.table.in) 63 | write.table(res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 64 | 65 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/seed_downsampling.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis helper script 8 | 9 | # Create list of unique downsample seeds 10 | 11 | 12 | ###Set master parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | sample.sizes <- c(1,2,3,4,5,6,7,8,9,10,25,50,75,100,250,500,750,1000,2500,5000,7500,10000) 15 | sample.sizes <- sample.sizes[which(sample.sizes<=10000)] 16 | 17 | ###Read command-line arguments 18 | args <- commandArgs(trailingOnly=T) 19 | N <- as.numeric(args[1]) 20 | # master.seed <- as.numeric(args[2]) 21 | OUTFILE <- as.character(args[2]) 22 | 23 | ###Create data frame of random seeds & sample sizes 24 | s <- as.numeric(sapply(sample.sizes,rep,times=N)) 25 | set.seed(123456789) 26 | r <- ceiling(runif(length(s), 0, 10^12)) 27 | out <- data.frame("sample.size"=s, 28 | "random.seed"=r) 29 | write.table(out,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F) 30 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_scripts/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_00/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_scripts/module_00/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_00/vcf2baf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################# 4 | # gnomAD SV Discovery # 5 | ############################# 6 | 7 | # Copyright (c) 2018 Harold Wang, Ryan L. Collins, and the Talkowski Lab 8 | # Distributed under terms of the MIT License (see LICENSE) 9 | # Contact: Ryan L. Collins 10 | # gnomAD credits: http://gnomad.broadinstitute.org/ 11 | 12 | #Wrapper to handle pre-filtering for vcf2baf_helper.py 13 | #Collects BAF data for all samples present in a given VCF input 14 | 15 | 16 | #####Usage statement 17 | usage(){ 18 | cat < ${OUTFILE} 69 | 70 | 71 | #####Bgzip & tabix index OUTFILE, if optioned 72 | if [ ${BGZIP} -gt 0 ]; then 73 | bgzip -f ${OUTFILE} 74 | tabix -s 1 -b 2 -e 2 -f ${OUTFILE}.gz 75 | fi 76 | 77 | 78 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_01/make_depth_rdtest_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 Matthew Stone 5 | # Distributed under terms of the MIT license. 
6 | 7 | """ 8 | Convert bedcluster output to RdTest format 9 | """ 10 | 11 | import argparse 12 | import sys 13 | import pandas as pd 14 | 15 | 16 | def make_depth_rdtest_bed(svof): 17 | svof['#chrom'] = svof['#chrom'].astype(str) 18 | svof['start'] = svof.start.astype(int) 19 | svof['end'] = svof.end.astype(int) 20 | bed = svof['#chrom start end name svtype'.split()].drop_duplicates() 21 | 22 | # Add samples 23 | def agg_samples(samples): 24 | return ','.join(sorted(set(samples))) 25 | samples = svof.groupby('name')['sample'].agg(agg_samples) 26 | samples = samples.rename('samples').reset_index() 27 | bed = pd.merge(bed, samples, on='name', how='left') 28 | 29 | # Format 30 | bed['svtype'] = bed.svtype.str.upper() 31 | 32 | cols = '#chrom start end name samples svtype'.split() 33 | return bed[cols] 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser( 38 | description=__doc__, 39 | formatter_class=argparse.RawDescriptionHelpFormatter) 40 | parser.add_argument('bed', help='Input BED') 41 | parser.add_argument('fout', help='Output BED', type=argparse.FileType('w'), 42 | default=sys.stdout, nargs='?') 43 | args = parser.parse_args() 44 | 45 | clustered = pd.read_table(args.bed) 46 | 47 | bed = make_depth_rdtest_bed(clustered) 48 | 49 | bed.to_csv(args.fout, sep='\t', index=False) 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_01/make_pesr_rdtest_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 Matthew Stone 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Convert CNV records in clustered VCF to RdTest bed format. 
9 | """ 10 | 11 | import argparse 12 | import sys 13 | from collections import deque 14 | from pysam import VariantFile 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description=__doc__, 20 | formatter_class=argparse.RawDescriptionHelpFormatter) 21 | parser.add_argument('vcf', help='Input VCF') 22 | parser.add_argument('bed', help='Output BED', type=argparse.FileType('w'), 23 | default=sys.stdout, nargs='?') 24 | args = parser.parse_args() 25 | 26 | # Prep bed 27 | header = '#chrom\tstart\tend\tname\tsamples\tsvtype\n' 28 | args.bed.write(header) 29 | entry = '{chrom}\t{start}\t{end}\t{name}\t{samples}\t{svtype}\n' 30 | 31 | vcf = VariantFile(args.vcf) 32 | for record in vcf: 33 | # Skip non-CNV 34 | if record.info['SVTYPE'] not in 'DEL DUP'.split(): 35 | continue 36 | 37 | # Get bed interval and metadata 38 | chrom = record.chrom 39 | start = record.pos 40 | end = record.stop 41 | name = record.id 42 | svtype = record.info['SVTYPE'] 43 | 44 | # Get list of called samples 45 | samples = deque() 46 | null_GTs = [(0, 0), (None, None), (0, ), (None, )] 47 | for sample in record.samples: 48 | gt = record.samples[sample]['GT'] 49 | if gt not in null_GTs: 50 | samples.append(sample) 51 | if len(samples) == 0: 52 | continue 53 | samples = ','.join(sorted(set(samples))) 54 | 55 | args.bed.write(entry.format(**locals())) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_wdls/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_BAF_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] BAF_files 3 | Array[File] BAF_indexes 4 | Array[String] samples 5 | String batch 6 | File inclusion_bed 7 | 8 | call merge_PESR_files as merge_BAF_files { 9 | input: 10 | files=BAF_files, 11 | indexes=BAF_indexes, 12 | batch=batch, 13 | evidence="BAF", 14 | inclusion_bed=inclusion_bed 15 | } 16 | 17 | output { 18 | File merged_BAF = merge_BAF_files.merged 19 | File merged_BAF_idx = merge_BAF_files.merged_idx 20 | } 21 | } 22 | 23 | task merge_PESR_files { 24 | Array[File] files 25 | Array[File] indexes 26 | String batch 27 | String evidence 28 | File inclusion_bed 29 | 30 | command <<< 31 | tmpdir=$(mktemp -d); 32 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 33 | while read file; do 34 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 35 | done < ${write_tsv(files)}; 36 | echo "$cmd" 37 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 38 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 39 | >>> 40 | 41 | output { 42 | File merged = "${batch}.${evidence}.txt.gz" 43 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 44 | } 45 | 46 | runtime { 47 | docker: "talkowski/sv-pipeline-remote-pysam" 48 | memory: "8 GB" 49 | disks: "local-disk 5000 HDD" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_PESRRD_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] PE_files 3 | Array[File] PE_indexes 4 | Array[File] 
SR_files 5 | Array[File] SR_indexes 6 | Array[File] bincov_files 7 | Array[File] bincov_indexes 8 | Array[String] samples 9 | String batch 10 | File inclusion_bed 11 | 12 | call merge_PESR_files as merge_PE_files { 13 | input: 14 | files=PE_files, 15 | indexes=PE_indexes, 16 | batch=batch, 17 | evidence="PE", 18 | inclusion_bed=inclusion_bed 19 | } 20 | 21 | call merge_PESR_files as merge_SR_files { 22 | input: 23 | files=SR_files, 24 | indexes=SR_indexes, 25 | batch=batch, 26 | evidence="SR", 27 | inclusion_bed=inclusion_bed 28 | } 29 | 30 | call make_bincov_matrix { 31 | input: 32 | samples=samples, 33 | filepaths=bincov_files, 34 | batch=batch, 35 | } 36 | 37 | output { 38 | File merged_PE = merge_PE_files.merged 39 | File merged_PE_idx = merge_PE_files.merged_idx 40 | File merged_SR = merge_SR_files.merged 41 | File merged_SR_idx = merge_SR_files.merged_idx 42 | File merged_bincov = make_bincov_matrix.bincov_matrix 43 | File merged_bincov_idx = make_bincov_matrix.bincov_matrix_idx 44 | } 45 | } 46 | 47 | task merge_PESR_files { 48 | Array[File] files 49 | Array[File] indexes 50 | String batch 51 | String evidence 52 | File inclusion_bed 53 | 54 | command <<< 55 | tmpdir=$(mktemp -d); 56 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 57 | while read file; do 58 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 59 | done < ${write_tsv(files)}; 60 | echo "$cmd" 61 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 62 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 63 | >>> 64 | 65 | output { 66 | File merged = "${batch}.${evidence}.txt.gz" 67 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 68 | } 69 | 70 | runtime { 71 | docker: "talkowski/sv-pipeline-remote-pysam" 72 | memory: "8 GB" 73 | disks: "local-disk 5000 HDD" 74 | } 75 | } 76 | 77 | task make_bincov_matrix { 78 | Array[String] samples 79 | Array[File] filepaths 80 | String batch 81 | 82 | command <<< 83 | paste ${write_tsv(samples)} ${write_tsv(filepaths)} > samples.key; 84 | makeMatrix.sh -z -N -o ${batch}.bincov.bed.gz samples.key 85 | >>> 86 | 87 | output { 88 | File bincov_matrix = "${batch}.bincov.bed.gz" 89 | File bincov_matrix_idx = "${batch}.bincov.bed.gz.tbi" 90 | } 91 | 92 | runtime { 93 | docker: "talkowski/sv-pipeline-remote-pysam" 94 | disks: "local-disk 1000 HDD" 95 | } 96 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_SR_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] SR_files 3 | Array[File] SR_indexes 4 | Array[String] samples 5 | String batch 6 | File inclusion_bed 7 | 8 | call merge_PESR_files as merge_SR_files { 9 | input: 10 | files=SR_files, 11 | indexes=SR_indexes, 12 | batch=batch, 13 | evidence="SR", 14 | inclusion_bed=inclusion_bed 15 | } 16 | 17 | output { 18 | File merged_SR = merge_SR_files.merged 19 | File merged_SR_idx = merge_SR_files.merged_idx 20 | } 21 | } 22 | 23 | task merge_PESR_files { 24 | Array[File] files 25 | Array[File] indexes 26 | String batch 27 | String evidence 28 | File inclusion_bed 29 | 30 | command <<< 31 | tmpdir=$(mktemp -d); 32 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 33 | while read file; do 34 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 35 | done < ${write_tsv(files)}; 36 | echo "$cmd" 37 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 38 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 39 | >>> 40 | 41 | output { 42 | File merged = "${batch}.${evidence}.txt.gz" 43 
| File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 44 | } 45 | 46 | runtime { 47 | docker: "talkowski/sv-pipeline-remote-pysam" 48 | memory: "8 GB" 49 | disks: "local-disk 5000 HDD" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_evidence_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] PE_files 3 | Array[File] PE_indexes 4 | Array[File] SR_files 5 | Array[File] SR_indexes 6 | Array[File] bincov_files 7 | Array[File] bincov_indexes 8 | Array[File] BAF_files 9 | Array[File] BAF_indexes 10 | Array[String] samples 11 | String batch 12 | File inclusion_bed 13 | 14 | call merge_PESR_files as merge_PE_files { 15 | input: 16 | files=PE_files, 17 | indexes=PE_indexes, 18 | batch=batch, 19 | evidence="PE", 20 | inclusion_bed=inclusion_bed 21 | } 22 | 23 | call merge_PESR_files as merge_SR_files { 24 | input: 25 | files=SR_files, 26 | indexes=SR_indexes, 27 | batch=batch, 28 | evidence="SR", 29 | inclusion_bed=inclusion_bed 30 | } 31 | 32 | call make_bincov_matrix { 33 | input: 34 | samples=samples, 35 | filepaths=bincov_files, 36 | batch=batch, 37 | } 38 | 39 | call merge_PESR_files as merge_BAF_files { 40 | input: 41 | files=BAF_files, 42 | indexes=BAF_indexes, 43 | batch=batch, 44 | evidence="BAF", 45 | inclusion_bed=inclusion_bed 46 | } 47 | 48 | output { 49 | File merged_PE = merge_PE_files.merged 50 | File merged_PE_idx = merge_PE_files.merged_idx 51 | File merged_SR = merge_SR_files.merged 52 | File merged_SR_idx = merge_SR_files.merged_idx 53 | File merged_bincov = make_bincov_matrix.bincov_matrix 54 | File merged_bincov_idx = make_bincov_matrix.bincov_matrix_idx 55 | File merged_BAF = merge_BAF_files.merged 56 | File merged_BAF_idx = merge_BAF_files.merged_idx 57 | } 58 | } 59 | 60 | task merge_PESR_files { 61 | Array[File] files 62 | Array[File] indexes 63 | String batch 64 | String evidence 65 | File inclusion_bed 66 | 67 | command <<< 68 | tmpdir=$(mktemp -d); 69 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 70 | while read file; do 71 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 72 | done < ${write_tsv(files)}; 73 | echo "$cmd" 74 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 75 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 76 | >>> 77 | 78 | output { 79 | File merged = "${batch}.${evidence}.txt.gz" 80 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 81 | } 82 | 83 | runtime { 84 | docker: "talkowski/sv-pipeline-remote-pysam" 85 | memory: "8 GB" 86 | disks: "local-disk 5000 HDD" 87 | } 88 | } 89 | 90 | task make_bincov_matrix { 91 | Array[String] samples 92 | Array[File] filepaths 93 | String batch 94 | 95 | command <<< 96 | paste ${write_tsv(samples)} ${write_tsv(filepaths)} > samples.key; 97 | makeMatrix.sh -z -N -o ${batch}.bincov.bed.gz samples.key 98 | >>> 99 | 100 | output { 101 | File bincov_matrix = "${batch}.bincov.bed.gz" 102 | File bincov_matrix_idx = "${batch}.bincov.bed.gz.tbi" 103 | } 104 | 105 | runtime { 106 | docker: "talkowski/sv-pipeline-remote-pysam" 107 | disks: "local-disk 1000 HDD" 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_depth_preprocessing.wdl: -------------------------------------------------------------------------------- 1 | workflow preprocess_depth { 2 | Array[File] beds 3 | String batch 4 | 5 | call concat_batch as preprocess_DELs { 6 | input: 
7 | beds=beds, 8 | batch=batch, 9 | svtype="DEL" 10 | } 11 | 12 | call concat_batch as preprocess_DUPs { 13 | input: 14 | beds=beds, 15 | batch=batch, 16 | svtype="DUP" 17 | } 18 | 19 | output { 20 | File del_bed = preprocess_DELs.bed 21 | File dup_bed = preprocess_DUPs.bed 22 | File del_bed_idx = preprocess_DELs.bed_idx 23 | File dup_bed_idx = preprocess_DUPs.bed_idx 24 | } 25 | } 26 | 27 | task concat_batch { 28 | Array[File] beds 29 | String svtype 30 | String batch 31 | 32 | command <<< 33 | zcat ${sep=' ' beds} \ 34 | | sed -e '/^#chr/d' -e 's/cn.MOPS/cnmops/g' \ 35 | | awk -v svtype=${svtype} '($6==svtype)' \ 36 | | sort -k1,1V -k2,2n \ 37 | | awk -v OFS="\t" -v svtype=${svtype} -v batch=${batch} '{$4=batch"_"svtype"_"NR; print}' \ 38 | | cat <(echo -e "#chr\tstart\tend\tname\tsample\tsvtype\tsources") - \ 39 | | bgzip -c \ 40 | > ${batch}.${svtype}.bed.gz; 41 | tabix -p bed ${batch}.${svtype}.bed.gz 42 | >>> 43 | 44 | output { 45 | File bed="${batch}.${svtype}.bed.gz" 46 | File bed_idx="${batch}.${svtype}.bed.gz.tbi" 47 | } 48 | 49 | runtime { 50 | docker: "talkowski/sv-pipeline" 51 | preemptible: 3 52 | } 53 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_pesr_preprocessing.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:00_pesr_processing_single_algorithm/versions/4/plain-WDL/descriptor" as pp 2 | 3 | workflow preprocess_pesr { 4 | String sample # Sample ID 5 | File manta_vcf # Manta VCF 6 | File delly_vcf # Delly VCF 7 | File melt_vcf # Melt VCF 8 | File contigs # .fai file of whitelisted contigs 9 | Int min_svsize # Minimum SV length to include 10 | 11 | call pp.preprocess_algorithm as process_manta { 12 | input: 13 | vcf=manta_vcf, 14 | contigs=contigs, 15 | min_svsize=min_svsize, 16 | algorithm="manta", 17 | sample=sample 18 | } 19 | 20 | call pp.preprocess_algorithm as process_delly { 21 | input: 22 | vcf=delly_vcf, 23 | contigs=contigs, 24 | min_svsize=min_svsize, 25 | algorithm="delly", 26 | sample=sample 27 | } 28 | 29 | call pp.preprocess_algorithm as process_melt { 30 | input: 31 | vcf=melt_vcf, 32 | contigs=contigs, 33 | min_svsize=min_svsize, 34 | algorithm="melt", 35 | sample=sample 36 | } 37 | 38 | output { 39 | File std_manta_vcf = process_manta.std_vcf 40 | File std_delly_vcf = process_delly.std_vcf 41 | File std_melt_vcf = process_melt.std_vcf 42 | } 43 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_pesr_processing_single_algorithm.wdl: -------------------------------------------------------------------------------- 1 | workflow preprocess_algorithm { 2 | File vcf 3 | File contigs 4 | String sample 5 | String algorithm 6 | Int min_svsize 7 | 8 | call standardize_vcf { 9 | input: 10 | raw_vcf=vcf, 11 | algorithm=algorithm, 12 | group=sample, 13 | contigs=contigs, 14 | min_svsize=min_svsize 15 | } 16 | 17 | call sort_vcf { 18 | input: 19 | unsorted_vcf=standardize_vcf.std_vcf, 20 | algorithm=algorithm, 21 | group=sample 22 | } 23 | 24 | output { 25 | File std_vcf = sort_vcf.sorted_vcf 26 | } 27 | } 28 | 29 | task standardize_vcf { 30 | File raw_vcf 31 | File contigs 32 | Int min_svsize 33 | String algorithm 34 | String group 35 | 36 | command { 37 | svtk standardize --prefix ${algorithm}_${group} --contigs ${contigs} --min-size ${min_svsize} ${raw_vcf} ${algorithm}.${group}.vcf ${algorithm} 38 | } 39 | 40 | output { 
41 | File std_vcf="${algorithm}.${group}.vcf" 42 | String group_="${group}" 43 | } 44 | 45 | runtime { 46 | docker: "talkowski/sv-pipeline" 47 | } 48 | } 49 | 50 | task sort_vcf { 51 | File unsorted_vcf 52 | String algorithm 53 | String group 54 | 55 | command { 56 | vcf-sort -c ${unsorted_vcf} | bgzip -c > ${algorithm}.${group}.vcf.gz; 57 | tabix -p vcf ${algorithm}.${group}.vcf.gz 58 | } 59 | 60 | output { 61 | File sorted_vcf="${algorithm}.${group}.vcf.gz" 62 | } 63 | 64 | runtime { 65 | docker: "talkowski/sv-pipeline" 66 | } 67 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_depth_clustering.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:01_depth_clustering_by_chrom/versions/4/plain-WDL/descriptor" as dibc 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/47/plain-WDL/descriptor" as vcf_qc 3 | 4 | workflow cluster_depth { 5 | File del_bed 6 | File dup_bed 7 | File contigs 8 | Float frac 9 | String flags 10 | String batch 11 | File famfile 12 | File trios_famfile 13 | String ref_build 14 | File Sanders_2015_tarball 15 | File Collins_2017_tarball 16 | File Werling_2018_tarball 17 | 18 | call dibc.bedcluster_by_chrom as cluster_DELs { 19 | input: 20 | batch=batch, 21 | svtype="DEL", 22 | bed=del_bed, 23 | contigs=contigs, 24 | frac=frac, 25 | flags=flags 26 | } 27 | 28 | call dibc.bedcluster_by_chrom as cluster_DUPs { 29 | input: 30 | batch=batch, 31 | svtype="DUP", 32 | bed=dup_bed, 33 | contigs=contigs, 34 | frac=frac, 35 | flags=flags 36 | } 37 | 38 | call make_rdtest_bed { 39 | input: 40 | dels=cluster_DELs.clustered_bed, 41 | dups=cluster_DUPs.clustered_bed, 42 | batch=batch, 43 | } 44 | 45 | call make_depth_vcf { 46 | input: 47 | bed=make_rdtest_bed.bed, 48 | batch=batch, 49 | contigs=contigs 50 | } 51 | 52 | call vcf_qc.master_vcf_qc as vcf_qc { 53 | input: 54 | vcf=make_depth_vcf.vcf, 55 | famfile=trios_famfile, 56 | ref_build=ref_build, 57 | prefix="${batch}_clustered_depth_vcf", 58 | sv_per_shard=10000, 59 | samples_per_shard=100, 60 | Sanders_2015_tarball=Sanders_2015_tarball, 61 | Collins_2017_tarball=Collins_2017_tarball, 62 | Werling_2018_tarball=Werling_2018_tarball 63 | } 64 | 65 | output { 66 | File clustered_vcf = make_depth_vcf.vcf 67 | File clustered_vcf_qc = vcf_qc.sv_vcf_qc_output 68 | } 69 | } 70 | 71 | task make_rdtest_bed { 72 | File dels 73 | File dups 74 | File script 75 | String batch 76 | 77 | command <<< 78 | cat \ 79 | <(python3 ${script} ${dels} | sed '1d') \ 80 | <(python3 ${script} ${dups} | sed '1d') \ 81 | | sort -k1,1V -k2,2n \ 82 | | cat <(echo -e "#chrom start end name samples svtype" | sed -e 's/ /\t/g') - \ 83 | > ${batch}.depth.bed; 84 | >>> 85 | 86 | output { 87 | File bed = "${batch}.depth.bed" 88 | } 89 | 90 | runtime { 91 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 92 | preemptible: 3 93 | } 94 | } 95 | 96 | task make_depth_vcf { 97 | File bed 98 | File contigs 99 | String batch 100 | 101 | command <<< 102 | cut -f5 ${bed} | sed -e '1d' -e 's/,/\n/g' | sort -u > samples.list; 103 | svtk rdtest2vcf --contigs ${contigs} ${bed} samples.list ${batch}.depth.vcf.gz; 104 | >>> 105 | 106 | output { 107 | File vcf = "${batch}.depth.vcf.gz" 108 | } 109 | 110 | runtime { 111 | docker: 
"talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 112 | preemptible: 3 113 | } 114 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_depth_clustering_by_chrom.wdl: -------------------------------------------------------------------------------- 1 | workflow bedcluster_by_chrom { 2 | String batch 3 | String svtype 4 | File bed 5 | File contigs 6 | Float frac 7 | String flags 8 | 9 | Array[Array[String]] contiglist = read_tsv(contigs) 10 | 11 | scatter (contig in contiglist) { 12 | call bedcluster { 13 | input: 14 | batch=batch, 15 | svtype=svtype, 16 | chrom=contig[0], 17 | bed=bed, 18 | frac=frac, 19 | flags=flags 20 | } 21 | } 22 | 23 | call concat_beds { 24 | input: 25 | batch=batch, 26 | svtype=svtype, 27 | beds=bedcluster.clustered_bed 28 | } 29 | 30 | output { 31 | File clustered_bed = concat_beds.merged_bed 32 | } 33 | } 34 | 35 | task bedcluster { 36 | String batch 37 | String svtype 38 | String chrom 39 | File bed 40 | 41 | Float frac 42 | String flags 43 | 44 | command { 45 | tabix -p bed ${bed}; 46 | svtk bedcluster ${bed} -r ${chrom} \ 47 | -p ${batch}_depth_${svtype}_${chrom} \ 48 | -f ${frac} \ 49 | ${flags} \ 50 | > ${batch}.${svtype}.${chrom}.bed 51 | } 52 | 53 | output { 54 | File clustered_bed="${batch}.${svtype}.${chrom}.bed" 55 | } 56 | 57 | runtime { 58 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 59 | preemptible: 3 60 | } 61 | } 62 | 63 | task concat_beds { 64 | String batch 65 | String svtype 66 | Array[File] beds 67 | 68 | command <<< 69 | awk 'FNR==1 && NR!=1 { while (/^#chrom/) getline; } 1 {print}' ${sep=' ' beds} > ${batch}.${svtype}.bed 70 | >>> 71 | 72 | output { 73 | File merged_bed = "${batch}.${svtype}.bed" 74 | } 75 | 76 | runtime { 77 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 78 | preemptible: 3 79 | } 80 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_pesr_clustering.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:01_pesr_clustering_single_algorithm/versions/15/plain-WDL/descriptor" as single 2 | 3 | workflow cluster_pesr { 4 | Array[File] manta_vcfs 5 | Array[File] delly_vcfs 6 | Array[File] melt_vcfs 7 | File contigs 8 | String batch 9 | File trios_famfile 10 | String ref_build 11 | File Sanders_2015_tarball 12 | File Collins_2017_tarball 13 | File Werling_2018_tarball 14 | 15 | Int dist 16 | Float frac 17 | File blacklist 18 | Int svsize 19 | String flags 20 | 21 | call single.cluster_pesr_algorithm as cluster_manta { 22 | input: 23 | vcfs=manta_vcfs, 24 | batch=batch, 25 | algorithm="manta", 26 | contigs=contigs, 27 | dist=dist, 28 | frac=frac, 29 | blacklist=blacklist, 30 | svsize=svsize, 31 | flags=flags, 32 | svtypes="DEL,DUP,INV,BND,INS", 33 | famfile=famfile, 34 | ref_build=ref_build, 35 | Sanders_2015_tarball=Sanders_2015_tarball, 36 | Werling_2018_tarball=Werling_2018_tarball 37 | } 38 | 39 | call single.cluster_pesr_algorithm as cluster_delly { 40 | input: 41 | vcfs=delly_vcfs, 42 | batch=batch, 43 | algorithm="delly", 44 | contigs=contigs, 45 | dist=dist, 46 | frac=frac, 47 | blacklist=blacklist, 48 | svsize=svsize, 49 | flags=flags, 50 | svtypes="DEL,DUP,INV,BND", 51 | famfile=famfile, 52 | ref_build=ref_build, 53 | 
Sanders_2015_tarball=Sanders_2015_tarball, 54 | Werling_2018_tarball=Werling_2018_tarball 55 | } 56 | 57 | call single.cluster_pesr_algorithm as cluster_melt { 58 | input: 59 | vcfs=melt_vcfs, 60 | batch=batch, 61 | algorithm="melt", 62 | contigs=contigs, 63 | dist=dist, 64 | frac=frac, 65 | blacklist=blacklist, 66 | svsize=svsize, 67 | flags=flags, 68 | svtypes="INS", 69 | famfile=famfile, 70 | ref_build=ref_build, 71 | Sanders_2015_tarball=Sanders_2015_tarball, 72 | Werling_2018_tarball=Werling_2018_tarball 73 | } 74 | 75 | # call merge_vcf_qc { 76 | # input: 77 | # manta_vcf_qc=cluster_manta.clustered_vcf_qc, 78 | # delly_vcf_qc=cluster_delly.clustered_vcf_qc, 79 | # melt_vcf_qc=cluster_melt.clustered_vcf_qc 80 | # } 81 | 82 | output { 83 | File manta_vcf = cluster_manta.clustered_vcf 84 | File manta_vcf_qc = cluster_manta.clustered_vcf_qc 85 | File delly_vcf = cluster_delly.clustered_vcf 86 | File delly_vcf_qc = cluster_delly.clustered_vcf_qc 87 | File melt_vcf = cluster_melt.clustered_vcf 88 | File melt_vcf_qc = cluster_melt.clustered_vcf_qc 89 | # File merged_vcf_qc = merge_vcf_qc.merged_qc 90 | } 91 | } 92 | 93 | # task merge_vcf_qc { 94 | # File manta_vcf_qc 95 | # File delly_vcf_qc 96 | # File melt_vcf_qc 97 | 98 | # command <<< 99 | # mkdir merged_pesr_clustering_vcf_qc/ 100 | # mv ${manta_vcf_qc} merged_pesr_clustering_vcf_qc/ 101 | # mv ${delly_vcf_qc} merged_pesr_clustering_vcf_qc/ 102 | # mv ${melt_vcf_qc} merged_pesr_clustering_vcf_qc/ 103 | # tar -czvf merged_pesr_clustering_vcf_qc.tar.gz \ 104 | # merged_pesr_clustering_vcf_qc 105 | # >>> 106 | 107 | # output { 108 | # File merged_qc = "merged_pesr_clustering_vcf_qc.tar.gz" 109 | # } 110 | 111 | # runtime { 112 | # docker: "talkowski/sv-pipeline" 113 | # preemptible: 3 114 | # } 115 | # } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_pesr_clustering_single_algorithm.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/47/plain-WDL/descriptor" as vcf_qc 2 | 3 | workflow cluster_pesr_algorithm { 4 | Array[File] vcfs 5 | File contigs 6 | String batch 7 | String algorithm 8 | File famfile 9 | File trios_famfile 10 | String ref_build 11 | File Sanders_2015_tarball 12 | File collins_2017_tarball 13 | File Werling_2018_tarball 14 | 15 | # VCFcluster parameters 16 | Int dist 17 | Float frac 18 | File blacklist 19 | Int svsize 20 | String svtypes 21 | String flags 22 | 23 | Array[Array[String]] contiglist = read_tsv(contigs) 24 | 25 | scatter (contig in contiglist) { 26 | call vcfcluster { 27 | input: 28 | vcfs=vcfs, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=contig[0], 32 | dist=dist, 33 | frac=frac, 34 | blacklist=blacklist, 35 | svsize=svsize, 36 | flags=flags, 37 | svtypes=svtypes 38 | } 39 | } 40 | 41 | call concat_vcfs { 42 | input: 43 | vcfs=vcfcluster.clustered_vcf, 44 | batch=batch, 45 | algorithm=algorithm 46 | } 47 | 48 | call vcf_qc.master_vcf_qc as vcf_qc { 49 | input: 50 | vcf=concat_vcfs.vcf, 51 | famfile=trios_famfile, 52 | ref_build=ref_build, 53 | prefix="${batch}_clustered_${algorithm}_vcf", 54 | sv_per_shard=10000, 55 | samples_per_shard=100, 56 | Sanders_2015_tarball=Sanders_2015_tarball, 57 | Collins_2017_tarball=Collins_2017_tarball, 58 | Werling_2018_tarball=Werling_2018_tarball 59 | } 60 | 61 | output { 62 | File clustered_vcf = concat_vcfs.vcf 63 | File clustered_vcf_qc = vcf_qc.sv_vcf_qc_output 64 | 
} 65 | } 66 | 67 | task vcfcluster { 68 | Array[File] vcfs 69 | String batch 70 | String algorithm 71 | String chrom 72 | 73 | # VCFcluster parameters 74 | Int dist 75 | Float frac 76 | File blacklist 77 | Int svsize 78 | String svtypes 79 | String flags 80 | 81 | command <<< 82 | for f in ${sep=' ' vcfs}; do tabix -p vcf -f $f; done; 83 | tabix -p bed ${blacklist}; 84 | 85 | svtk vcfcluster ${write_tsv(vcfs)} stdout \ 86 | -r ${chrom} \ 87 | -p ${batch}_${algorithm}_${chrom} \ 88 | -d ${dist} \ 89 | -f ${frac} \ 90 | -x ${blacklist} \ 91 | -z ${svsize} \ 92 | -t ${svtypes} \ 93 | ${flags} \ 94 | | vcf-sort -c \ 95 | | bgzip -c > ${batch}.${algorithm}.${chrom}.vcf.gz 96 | >>> 97 | 98 | output { 99 | File clustered_vcf="${batch}.${algorithm}.${chrom}.vcf.gz" 100 | } 101 | 102 | runtime { 103 | docker: "talkowski/sv-pipeline" 104 | disks: "local-disk 300 HDD" 105 | preemptible: 3 106 | } 107 | } 108 | 109 | task concat_vcfs { 110 | Array[File] vcfs 111 | String batch 112 | String algorithm 113 | 114 | command { 115 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${batch}.${algorithm}.vcf.gz; 116 | tabix -p vcf ${batch}.${algorithm}.vcf.gz; 117 | } 118 | 119 | output { 120 | File vcf="${batch}.${algorithm}.vcf.gz" 121 | File idx="${batch}.${algorithm}.vcf.gz.tbi" 122 | } 123 | 124 | runtime { 125 | docker: "talkowski/sv-pipeline" 126 | disks: "local-disk 300 HDD" 127 | preemptible: 3 128 | } 129 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_aggregate.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_assess_evidence_single_vcf/versions/31/plain-WDL/descriptor" as assess 2 | workflow assess_evidence_batch { 3 | File mantavcf # Input VCF 4 | File meltvcf # Input VCF 5 | File dellyvcf # Input VCF 6 | File depthvcf # Input VCF 7 | File discfile # Discordant pair file 8 | File discfile_idx # Tabix index of discordant pair file 9 | File splitfile # Split read file 10 | File splitfile_idx # Tabix index of split read file 11 | File coveragefile # Bincov matrix 12 | File coveragefile_idx # Tabix index of bincov matrix 13 | File medianfile # Median coverage of each sample 14 | File baf_metrics # Matrix of BAF statistics 15 | File baf_metrics_idx # Tabix index of BAF matrix 16 | File famfile # Batch fam file 17 | File autosome_contigs # Autosomes .fai 18 | File allosome_contigs # Allosomes .fai 19 | File rmsk # Repeatmasker track 20 | File segdups # Seg dups track 21 | String batch # Batch ID 22 | Int PE_split_size # Number of lines in each petest split 23 | Int SR_split_size # Number of lines in each srtest split 24 | Int RD_split_size # Number of lines in each rdtest split 25 | Int BAF_split_size # Number of lines in each baftest split 26 | File svc_acct_key 27 | Array[String] samples 28 | call assess.assess_evidence as assessmanta{input: 29 | vcf=mantavcf, 30 | samples=samples, 31 | svc_acct_key=svc_acct_key, 32 | discfile=discfile, 33 | discfile_idx=discfile_idx, 34 | splitfile=splitfile, 35 | splitfile_idx=splitfile_idx, 36 | coveragefile=coveragefile, 37 | coveragefile_idx=coveragefile_idx, 38 | medianfile=medianfile, 39 | baf_metrics=baf_metrics, 40 | baf_metrics_idx=baf_metrics_idx, 41 | famfile=famfile, 42 | autosome_contigs=autosome_contigs, 43 | allosome_contigs=allosome_contigs, 44 | rmsk=rmsk, 45 | segdups=segdups, 46 | batch=batch, 47 | algorithm="manta", 48 | PE_split_size=PE_split_size, 49 | 
SR_split_size=SR_split_size, 50 | RD_split_size=RD_split_size, 51 | BAF_split_size=BAF_split_size, 52 | } 53 | call assess.assess_evidence as assessmelt{input: 54 | vcf=meltvcf, 55 | samples=samples, 56 | svc_acct_key=svc_acct_key, 57 | discfile=discfile, 58 | discfile_idx=discfile_idx, 59 | splitfile=splitfile, 60 | splitfile_idx=splitfile_idx, 61 | coveragefile=coveragefile, 62 | coveragefile_idx=coveragefile_idx, 63 | medianfile=medianfile, 64 | baf_metrics=baf_metrics, 65 | baf_metrics_idx=baf_metrics_idx, 66 | famfile=famfile, 67 | autosome_contigs=autosome_contigs, 68 | allosome_contigs=allosome_contigs, 69 | rmsk=rmsk, 70 | segdups=segdups, 71 | batch=batch, 72 | algorithm="melt", 73 | PE_split_size=PE_split_size, 74 | SR_split_size=SR_split_size, 75 | RD_split_size=RD_split_size, 76 | BAF_split_size=BAF_split_size,} 77 | call assess.assess_evidence as assessdelly{input: 78 | vcf=dellyvcf, 79 | samples=samples, 80 | svc_acct_key=svc_acct_key, 81 | discfile=discfile, 82 | discfile_idx=discfile_idx, 83 | splitfile=splitfile, 84 | splitfile_idx=splitfile_idx, 85 | coveragefile=coveragefile, 86 | coveragefile_idx=coveragefile_idx, 87 | medianfile=medianfile, 88 | baf_metrics=baf_metrics, 89 | baf_metrics_idx=baf_metrics_idx, 90 | famfile=famfile, 91 | autosome_contigs=autosome_contigs, 92 | allosome_contigs=allosome_contigs, 93 | rmsk=rmsk, 94 | segdups=segdups, 95 | batch=batch, 96 | algorithm="delly", 97 | PE_split_size=PE_split_size, 98 | SR_split_size=SR_split_size, 99 | RD_split_size=RD_split_size, 100 | BAF_split_size=BAF_split_size,} 101 | call assess.assess_evidence as assessdepth{input: 102 | vcf=depthvcf, 103 | samples=samples, 104 | svc_acct_key=svc_acct_key, 105 | discfile=discfile, 106 | discfile_idx=discfile_idx, 107 | splitfile=splitfile, 108 | splitfile_idx=splitfile_idx, 109 | coveragefile=coveragefile, 110 | coveragefile_idx=coveragefile_idx, 111 | medianfile=medianfile, 112 | baf_metrics=baf_metrics, 113 | baf_metrics_idx=baf_metrics_idx, 114 | famfile=famfile, 115 | autosome_contigs=autosome_contigs, 116 | allosome_contigs=allosome_contigs, 117 | rmsk=rmsk, 118 | segdups=segdups, 119 | batch=batch, 120 | algorithm="depth", 121 | PE_split_size=PE_split_size, 122 | SR_split_size=SR_split_size, 123 | RD_split_size=RD_split_size, 124 | BAF_split_size=BAF_split_size,} 125 | call aggregate_metric{input: 126 | batch=batch,mantametric=assessmanta.metrics,dellymetric=assessdelly.metrics,meltmetric=assessmelt.metrics,depthmetric=assessdepth.metrics} 127 | output{ 128 | File metrics=aggregate_metric.metrics 129 | } 130 | } 131 | task aggregate_metric{ 132 | String batch 133 | File mantametric 134 | File dellymetric 135 | File depthmetric 136 | File meltmetric 137 | command <<< 138 | python3 <>> 148 | output{ 149 | File metrics="${batch}.metrics" 150 | } 151 | runtime { 152 | preemptible: 3 153 | docker: "talkowski/sv-pipeline-remote-pysam" 154 | memory: "20 GB" 155 | disks: "local-disk 100 HDD" 156 | } 157 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_baftest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_baftest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | 3 | # Parallelize baftest on a single VCF across chromosomes 4 | workflow baftest_by_chrom { 5 | File vcf # Input VCF 6 | String baf_metrics # Matrix of BAF statistics 7 | File baf_metrics_idx # Tabix index of BAF matrix 8 | File 
autosome_contigs # Autosomes .fai 9 | File svc_acct_key # Service account json 10 | Array[String] samples # List of samples in batch 11 | String batch # Batch ID 12 | String algorithm # Algorithm ID 13 | Int split_size # Number of lines in each baftest split 14 | 15 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 16 | 17 | # Run baftest on each autosome 18 | scatter (autosome in autosomes) { 19 | call auto.baftest_autosome { 20 | input: 21 | vcf=vcf, 22 | baf_metrics=baf_metrics, 23 | baf_metrics_idx=baf_metrics_idx, 24 | batch=batch, 25 | algorithm=algorithm, 26 | chrom=autosome[0], 27 | split_size=split_size, 28 | samples=samples, 29 | svc_acct_key=svc_acct_key 30 | } 31 | } 32 | 33 | # Combine baftest results into single file 34 | call merge_baftest { 35 | input: 36 | autosomes=baftest_autosome.stats, 37 | prefix="${batch}.${algorithm}" 38 | } 39 | 40 | output { 41 | File baftest = merge_baftest.merged_stats 42 | } 43 | } 44 | 45 | # Combine per-chromosome baftest results into single table 46 | task merge_baftest { 47 | Array[File] autosomes 48 | String prefix 49 | 50 | command <<< 51 | while read split; do 52 | sed -e '1d' $split; 53 | done < ${write_tsv(autosomes)} | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 54 | >>> 55 | 56 | output { 57 | File merged_stats = "${prefix}.stats" 58 | } 59 | 60 | runtime { 61 | preemptible: 3 62 | docker: "talkowski/sv-pipeline-remote-pysam" 63 | } 64 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_baftest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run baftest on a single autosome, parallelizing across a fixed split size 2 | workflow baftest_autosome { 3 | File vcf # Input VCF 4 | String baf_metrics # Matrix of BAF statistics 5 | File baf_metrics_idx # Tabix index of BAF matrix 6 | Array[String] samples # list of samples in batch 7 | File svc_acct_key # Service account json 8 | String batch # Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each baftest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run baftest on each split 33 | scatter (split in split_vcf.split_beds) { 34 | # Run baftest 35 | call baftest { 36 | input: 37 | bed=split, 38 | prefix=basename(split), 39 | baf_metrics=baf_metrics, 40 | baf_metrics_idx=baf_metrics_idx, 41 | samples=samples, 42 | batch=batch, 43 | svc_acct_key=svc_acct_key 44 | } 45 | } 46 | 47 | # Merge splits into single file 48 | call merge_splits { 49 | input: 50 | stats=baftest.stats, 51 | prefix="${batch}.${algorithm}.${chrom}" 52 | } 53 | 54 | output { 55 | File stats = merge_splits.merged_stats 56 | } 57 | } 58 | 59 | # Compute the length of the suffix necessary to accommodate all splits 60 | task compute_suffix_len { 61 | File vcf 62 | String chrom 63 | Int split_size 64 | 65 | command <<< 66 | tabix -p vcf ${vcf}; 67 | python3 <>> 79 | 80 | output { 81 | Int len = read_int(stdout()) 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline-remote-pysam" 87 | } 88 | } 89 | 90 
| # Split VCF into fixed size chunks 91 | task split_vcf { 92 | File vcf 93 | String batch 94 | String algorithm 95 | String chrom 96 | 97 | Int split_size 98 | Int suffix_len 99 | 100 | command <<< 101 | tabix -p vcf ${vcf}; 102 | tabix -h ${vcf} ${chrom} \ 103 | | svtk vcf2bed --no-header stdin stdout \ 104 | | fgrep -e "DEL" -e "DUP" \ 105 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 106 | | awk '($3-$2>=10000 && $3-$2<10000000)' \ 107 | | split -a ${suffix_len} -d -l 300 - ${batch}.${algorithm}.split.gt10kb. 108 | tabix -h ${vcf} ${chrom} \ 109 | | svtk vcf2bed --no-header stdin stdout \ 110 | | fgrep -e "DEL" -e "DUP" \ 111 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 112 | | awk '($3-$2<10000)' \ 113 | | sort -k1,1V -k2,2n \ 114 | | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 115 | >>> 116 | 117 | output { 118 | Array[File] split_beds = glob("${batch}.${algorithm}.split.*") 119 | } 120 | 121 | runtime { 122 | preemptible: 3 123 | docker: "talkowski/sv-pipeline-remote-pysam" 124 | } 125 | } 126 | 127 | # Run baftest 128 | task baftest { 129 | File bed 130 | String baf_metrics 131 | File baf_metrics_idx 132 | Array[String] samples 133 | File svc_acct_key 134 | String prefix 135 | String batch 136 | 137 | command <<< 138 | echo -e "sample\tgroup\tbatch" > batch.key; 139 | awk -v batch=${batch} -v OFS="\t" '{print $1, $1, batch}' ${write_tsv(samples)} >> batch.key; 140 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${baf_metrics} | sed '1d' | cut -f 4); 141 | start=$(cut -f2 ${bed} | sort -k1,1n | head -n1); 142 | end=$(cut -f3 ${bed} | sort -k1,1n | tail -n1); 143 | chrom=$(cut -f1 ${bed} | head -n1); 144 | svtk remote_tabix "$url" ${baf_metrics_idx} "$chrom":"$start"-"$end" | bgzip -c > local_baf.bed.gz; 145 | tabix -b2 local_baf.bed.gz; 146 | svtk baf-test ${bed} local_baf.bed.gz --batch batch.key > ${prefix}.metrics 147 | >>> 148 | 149 | output { 150 | File stats = "${prefix}.metrics" 151 | } 152 | 153 | runtime { 154 | preemptible: 3 155 | memory: "10 GB" 156 | disks: "local-disk 50 SSD" 157 | docker: "talkowski/sv-pipeline-remote-pysam" 158 | } 159 | } 160 | 161 | # Merge split baftest results into single file 162 | task merge_splits { 163 | Array[File] stats 164 | String prefix 165 | 166 | command <<< 167 | echo -n "chrom start end name samples svtype delstat snp_ratio " > ${prefix}.stats; 168 | echo -n "del_loglik dupstat KS_stat KS_pval total_case_snps " >> ${prefix}.stats; 169 | echo -n "total_snps n_nonROH_cases n_samples mean_control_snps " >> ${prefix}.stats; 170 | echo -n "n_nonROH_controls n_controls" >> ${prefix}.stats; 171 | sed -i -e 's/ /\t/g' ${prefix}.stats; 172 | while read split; do 173 | cat $split; 174 | done < ${write_tsv(stats)} >> ${prefix}.stats 175 | >>> 176 | 177 | output { 178 | File merged_stats = "${prefix}.stats" 179 | } 180 | 181 | runtime { 182 | preemptible: 3 183 | docker: "talkowski/sv-pipeline-remote-pysam" 184 | } 185 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_petest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_petest_autosome/versions/14/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_petest_allosome/versions/10/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize petest on a single VCF across chromosomes 5 | workflow petest_by_chrom { 6 | File vcf # Input VCF 7 
| String discfile # Discordant pair file 8 | String medianfile # Medianfile 9 | File discfile_idx # Tabix index of discordant pair file 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each petest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run petest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.petest_autosome { 24 | input: 25 | vcf=vcf, 26 | discfile=discfile, 27 | medianfile=medianfile, 28 | discfile_idx=discfile_idx, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=autosome[0], 32 | split_size=split_size, 33 | svc_acct_key=svc_acct_key 34 | } 35 | } 36 | 37 | # Run petest on each allosome 38 | scatter (allosome in allosomes) { 39 | call allo.petest_allosome { 40 | input: 41 | vcf=vcf, 42 | discfile=discfile, 43 | medianfile=medianfile, 44 | discfile_idx=discfile_idx, 45 | famfile=famfile, 46 | batch=batch, 47 | algorithm=algorithm, 48 | chrom=allosome[0], 49 | split_size=split_size, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Combine petest results into single file 55 | call merge_petest { 56 | input: 57 | autosomes=petest_autosome.stats, 58 | allosomes=petest_allosome.stats, 59 | prefix="${batch}.${algorithm}" 60 | } 61 | 62 | output { 63 | File petest = merge_petest.merged_stats 64 | } 65 | } 66 | 67 | # Combine per-chromosome petest results into single table 68 | task merge_petest { 69 | Array[File] autosomes 70 | Array[File] allosomes 71 | String prefix 72 | 73 | command <<< 74 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 75 | while read split; do 76 | sed -e '1d' $split; 77 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 78 | >>> 79 | 80 | output { 81 | File merged_stats = "${prefix}.stats" 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline" 87 | } 88 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_petest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run petest on a single autosome, parallelizing across a fixed split size 2 | workflow petest_autosome { 3 | File vcf # Input VCF 4 | String discfile # Discordant pair file 5 | File medianfile # Median file 6 | File discfile_idx # Tabix index of discordant pair file 7 | File svc_acct_key # Service account key 8 | String batch # Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each petest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run petest on each split 33 | scatter (split in split_vcf.split_vcfs) { 34 | # Add VCF header to split 35 | call reheader_split { 36 | input: 37 | vcf=vcf, 38 | split=split 39 | } 40 | 41 | # Run petest 42 | call petest { 43 | input: 44 | vcf=reheader_split.split_w_header, 45 | 
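# Note: discfile is declared as a String rather than a File, so the discordant-pair
# matrix is not localized to the worker; the petest task below generates a 24-hour
# signed URL with gsutil signurl and passes that URL (plus discfile_idx) to svtk pe-test.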
prefix=basename(split), 46 | discfile=discfile, 47 | medianfile=medianfile, 48 | discfile_idx=discfile_idx, 49 | svc_acct_key=svc_acct_key 50 | } 51 | } 52 | 53 | # Merge splits into single file 54 | call merge_splits { 55 | input: 56 | stats=petest.stats, 57 | prefix="${batch}.${algorithm}.${chrom}" 58 | } 59 | 60 | output { 61 | File stats = merge_splits.merged_stats 62 | } 63 | } 64 | 65 | # Compute the length of the suffix necessary to accommodate all splits 66 | task compute_suffix_len { 67 | File vcf 68 | String chrom 69 | Int split_size 70 | 71 | command <<< 72 | tabix -p vcf ${vcf}; 73 | python3 <>> 85 | 86 | output { 87 | Int len = read_int(stdout()) 88 | } 89 | 90 | runtime { 91 | preemptible: 3 92 | docker: "talkowski/sv-pipeline-remote-pysam" 93 | } 94 | } 95 | 96 | # Split VCF into fixed size chunks 97 | task split_vcf { 98 | File vcf 99 | String batch 100 | String algorithm 101 | String chrom 102 | 103 | Int split_size 104 | Int suffix_len 105 | 106 | command { 107 | tabix -p vcf ${vcf}; 108 | tabix ${vcf} ${chrom} | sort -R | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 109 | } 110 | 111 | output { 112 | Array[File] split_vcfs = glob("${batch}.${algorithm}.split.*") 113 | } 114 | 115 | runtime { 116 | preemptible: 3 117 | docker: "talkowski/sv-pipeline-remote-pysam" 118 | } 119 | } 120 | 121 | # Restore VCF header to split files 122 | task reheader_split { 123 | File vcf 124 | File split 125 | 126 | command { 127 | cat <(zcat ${vcf} | sed -n -e '/^#/p') ${split} | bgzip -c > ${basename(split)}.vcf.gz 128 | } 129 | 130 | output { 131 | File split_w_header = "${basename(split)}.vcf.gz" 132 | } 133 | 134 | runtime { 135 | preemptible: 3 136 | docker: "talkowski/sv-pipeline-remote-pysam" 137 | } 138 | } 139 | 140 | # Run petest 141 | task petest { 142 | File vcf 143 | String discfile 144 | File medianfile 145 | File discfile_idx 146 | String prefix 147 | File svc_acct_key 148 | 149 | command { 150 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${discfile} | sed '1d' | cut -f 4); 151 | echo $url; 152 | svtk pe-test -o 1000 --index ${discfile_idx} --medianfile ${medianfile} ${vcf} "$url" ${prefix}.stats 153 | } 154 | 155 | output { 156 | File stats = "${prefix}.stats" 157 | } 158 | 159 | runtime { 160 | preemptible: 3 161 | docker: "talkowski/sv-pipeline-remote-pysam" 162 | } 163 | } 164 | 165 | # Merge split petest results into single file 166 | task merge_splits { 167 | Array[File] stats 168 | String prefix 169 | 170 | command <<< 171 | while read split; do 172 | sed -e '1d' $split; 173 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > ${prefix}.stats 174 | >>> 175 | 176 | output { 177 | File merged_stats = "${prefix}.stats" 178 | } 179 | 180 | runtime { 181 | preemptible: 3 182 | docker: "talkowski/sv-pipeline-remote-pysam" 183 | } 184 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_rdtest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_rdtest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_rdtest_allosome/versions/7/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize rdtest on a single VCF across chromosomes 5 | workflow rdtest_by_chrom { 6 | File vcf # Input VCF 7 | String coveragefile # Bincov matrix 8 | File coveragefile_idx # Tabix index of bincov matrix 9 | File medianfile # Median 
coverage of each sample 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key # Service account json 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each rdtest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run rdtest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.rdtest_autosome { 24 | input: 25 | vcf=vcf, 26 | coveragefile=coveragefile, 27 | coveragefile_idx=coveragefile_idx, 28 | medianfile=medianfile, 29 | famfile=famfile, 30 | batch=batch, 31 | algorithm=algorithm, 32 | chrom=autosome[0], 33 | split_size=split_size, 34 | svc_acct_key=svc_acct_key 35 | } 36 | } 37 | 38 | # Run rdtest on each allosome 39 | scatter (allosome in allosomes) { 40 | call allo.rdtest_allosome { 41 | input: 42 | vcf=vcf, 43 | coveragefile=coveragefile, 44 | coveragefile_idx=coveragefile_idx, 45 | medianfile=medianfile, 46 | famfile=famfile, 47 | batch=batch, 48 | algorithm=algorithm, 49 | chrom=allosome[0], 50 | split_size=split_size, 51 | svc_acct_key=svc_acct_key 52 | } 53 | } 54 | 55 | # Combine rdtest results into single file 56 | call merge_rdtest { 57 | input: 58 | autosomes=rdtest_autosome.stats, 59 | allosomes=rdtest_allosome.stats, 60 | prefix="${batch}.${algorithm}" 61 | } 62 | 63 | output { 64 | File rdtest = merge_rdtest.merged_stats 65 | } 66 | } 67 | 68 | # Combine per-chromosome rdtest results into single table 69 | task merge_rdtest { 70 | Array[File] autosomes 71 | Array[File] allosomes 72 | String prefix 73 | 74 | command <<< 75 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 76 | while read split; do 77 | sed -e '1d' $split; 78 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 79 | >>> 80 | 81 | output { 82 | File merged_stats = "${prefix}.stats" 83 | } 84 | 85 | runtime { 86 | preemptible: 3 87 | docker: "talkowski/sv-pipeline-remote-pysam" 88 | } 89 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_rdtest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run rdtest on a single autosome, parallelizing across a fixed split size 2 | workflow rdtest_autosome { 3 | File vcf # Input VCF 4 | String coveragefile # Bincov matrix 5 | File coveragefile_idx # Tabix index of bincov matrix 6 | File medianfile # Median coverage of each sample 7 | File famfile # Batch fam file 8 | File svc_acct_key # Service account json 9 | String batch # Batch ID 10 | String algorithm # Algorithm ID 11 | String chrom # Chromosome being processed 12 | Int split_size # Number of lines in each rdtest split 13 | 14 | # Compute the length of the suffix needed to accomodate all splits 15 | call compute_suffix_len { 16 | input: 17 | vcf=vcf, 18 | chrom=chrom, 19 | split_size=split_size 20 | } 21 | 22 | # Split the VCF into smaller chunks 23 | call split_vcf { 24 | input: 25 | vcf=vcf, 26 | batch=batch, 27 | algorithm=algorithm, 28 | chrom=chrom, 29 | split_size=split_size, 30 | suffix_len=compute_suffix_len.len 31 | } 32 | 33 | call get_whitelist { 34 | input: 35 | famfile=famfile 36 | } 37 | 38 | # Run rdtest on each split 39 | scatter (split in split_vcf.split_beds) { 40 | # Run rdtest 41 | call rdtest { 42 | input: 43 | bed=split, 44 | prefix=basename(split), 45 | 
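# Note: coveragefile is a String (bucket path), not a File; the rdtest task streams only
# the bincov rows spanning this split's coordinate range through svtk remote_tabix and a
# signed URL into local_coverage.bed.gz before invoking RdTest.R.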
coveragefile=coveragefile, 46 | coveragefile_idx=coveragefile_idx, 47 | medianfile=medianfile, 48 | famfile=famfile, 49 | whitelist=get_whitelist.whitelist, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Merge splits into single file 55 | call merge_splits { 56 | input: 57 | stats=rdtest.stats, 58 | prefix="${batch}.${algorithm}.${chrom}" 59 | } 60 | 61 | output { 62 | File stats = merge_splits.merged_stats 63 | } 64 | } 65 | 66 | # Compute the length of the suffix necessary to accommodate all splits 67 | task compute_suffix_len { 68 | File vcf 69 | String chrom 70 | Int split_size 71 | 72 | command <<< 73 | tabix -p vcf ${vcf}; 74 | python3 <>> 86 | 87 | output { 88 | Int len = read_int(stdout()) 89 | } 90 | 91 | runtime { 92 | preemptible: 3 93 | docker: "talkowski/sv-pipeline-remote-pysam" 94 | } 95 | } 96 | 97 | # Split VCF into fixed size chunks 98 | task split_vcf { 99 | File vcf 100 | String batch 101 | String algorithm 102 | String chrom 103 | 104 | Int split_size 105 | Int suffix_len 106 | 107 | command <<< 108 | tabix -p vcf ${vcf}; 109 | tabix -h ${vcf} ${chrom} \ 110 | | svtk vcf2bed --no-header stdin stdout \ 111 | | fgrep -e "DEL" -e "DUP" \ 112 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 113 | | awk '($3-$2>=10000)' \ 114 | > ${batch}.${algorithm}.split.gt10kb; 115 | tabix -h ${vcf} ${chrom} \ 116 | | svtk vcf2bed --no-header stdin stdout \ 117 | | fgrep -e "DEL" -e "DUP" \ 118 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 119 | | awk '($3-$2<10000)' \ 120 | | sort -k1,1V -k2,2n \ 121 | | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 122 | >>> 123 | 124 | output { 125 | Array[File] split_beds = glob("${batch}.${algorithm}.split.*") 126 | } 127 | 128 | runtime { 129 | preemptible: 3 130 | docker: "talkowski/sv-pipeline-remote-pysam" 131 | } 132 | } 133 | 134 | task get_whitelist { 135 | File famfile 136 | 137 | command { 138 | cut -f2 ${famfile} > samples.list 139 | } 140 | 141 | output { 142 | File whitelist = "samples.list" 143 | } 144 | 145 | runtime { 146 | preemptible: 3 147 | docker: "talkowski/sv-pipeline-remote-pysam" 148 | } 149 | } 150 | 151 | # Run rdtest 152 | task rdtest { 153 | File bed 154 | String coveragefile 155 | File coveragefile_idx 156 | File medianfile 157 | File famfile 158 | File whitelist 159 | File svc_acct_key 160 | String prefix 161 | 162 | command <<< 163 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${coveragefile} | sed '1d' | cut -f 4); 164 | start=$(cut -f2 ${bed} | sort -k1,1n | head -n1); 165 | end=$(cut -f3 ${bed} | sort -k1,1n | tail -n1); 166 | chrom=$(cut -f1 ${bed} | head -n1); 167 | svtk remote_tabix --header "$url" ${coveragefile_idx} "$chrom":"$start"-"$end" |sed 's/Chr/chr/g'|sed 's/Start/start/g'|sed 's/End/end/' | bgzip -c > local_coverage.bed.gz; 168 | tabix -p bed local_coverage.bed.gz; 169 | Rscript /opt/RdTest/RdTest.R \ 170 | -b ${bed} \ 171 | -n ${prefix} \ 172 | -c local_coverage.bed.gz \ 173 | -m ${medianfile} \ 174 | -f ${famfile} \ 175 | -w ${whitelist} 176 | >>> 177 | 178 | output { 179 | File stats = "${prefix}.metrics" 180 | File local_coverage = "local_coverage.bed.gz" 181 | } 182 | 183 | runtime { 184 | preemptible: 3 185 | docker: "talkowski/sv-pipeline-rdtest" 186 | } 187 | } 188 | 189 | # Merge split rdtest results into single file 190 | task merge_splits { 191 | Array[File] stats 192 | String prefix 193 | 194 | command <<< 195 | while read split; do 196 | sed -e '1d' $split; 197 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > 
${prefix}.stats 198 | >>> 199 | 200 | output { 201 | File merged_stats = "${prefix}.stats" 202 | } 203 | 204 | runtime { 205 | preemptible: 3 206 | docker: "talkowski/sv-pipeline-remote-pysam" 207 | } 208 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_srtest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_srtest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_srtest_allosome/versions/11/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize srtest on a single VCF across chromosomes 5 | workflow srtest_by_chrom { 6 | File vcf # Input VCF 7 | String splitfile # Split read file 8 | String medianfile # Medianfile 9 | File splitfile_idx # Tabix index of split read file 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key # Service account json 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each srtest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run srtest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.srtest_autosome { 24 | input: 25 | vcf=vcf, 26 | splitfile=splitfile, 27 | medianfile=medianfile, 28 | splitfile_idx=splitfile_idx, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=autosome[0], 32 | split_size=split_size, 33 | svc_acct_key=svc_acct_key 34 | } 35 | } 36 | 37 | # Run srtest on each allosome 38 | scatter (allosome in allosomes) { 39 | call allo.srtest_allosome { 40 | input: 41 | vcf=vcf, 42 | splitfile=splitfile, 43 | medianfile=medianfile, 44 | splitfile_idx=splitfile_idx, 45 | famfile=famfile, 46 | batch=batch, 47 | algorithm=algorithm, 48 | chrom=allosome[0], 49 | split_size=split_size, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Combine srtest results into single file 55 | call merge_srtest { 56 | input: 57 | autosomes=srtest_autosome.stats, 58 | allosomes=srtest_allosome.stats, 59 | prefix="${batch}.${algorithm}" 60 | } 61 | 62 | output { 63 | File srtest = merge_srtest.merged_stats 64 | } 65 | } 66 | 67 | # Combine per-chromosome srtest results into single table 68 | task merge_srtest { 69 | Array[File] autosomes 70 | Array[File] allosomes 71 | String prefix 72 | 73 | command <<< 74 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 75 | while read split; do 76 | sed -e '1d' $split; 77 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 78 | >>> 79 | 80 | output { 81 | File merged_stats = "${prefix}.stats" 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline-remote-pysam" 87 | disks: "local-disk 100 SSD" 88 | } 89 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_srtest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run srtest on a single autosome, parallelizing across a fixed split size 2 | workflow srtest_autosome { 3 | File vcf # Input VCF 4 | String splitfile # Split read file 5 | File medianfile # Median file 6 | File splitfile_idx # Tabix index of split read file 7 | File svc_acct_key # Service account key json 8 | String batch # 
Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each srtest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run srtest on each split 33 | scatter (split in split_vcf.split_vcfs) { 34 | # Add VCF header to split 35 | call reheader_split { 36 | input: 37 | vcf=vcf, 38 | split=split 39 | } 40 | 41 | # Run srtest 42 | call srtest { 43 | input: 44 | vcf=reheader_split.split_w_header, 45 | prefix=basename(split), 46 | splitfile=splitfile, 47 | medianfile=medianfile, 48 | splitfile_idx=splitfile_idx, 49 | svc_acct_key=svc_acct_key 50 | } 51 | } 52 | 53 | # Merge splits into single file 54 | call merge_splits { 55 | input: 56 | stats=srtest.stats, 57 | prefix="${batch}.${algorithm}.${chrom}" 58 | } 59 | 60 | output { 61 | File stats = merge_splits.merged_stats 62 | } 63 | } 64 | 65 | # Compute the length of the suffix necessary to accommodate all splits 66 | task compute_suffix_len { 67 | File vcf 68 | String chrom 69 | Int split_size 70 | 71 | command <<< 72 | tabix -p vcf ${vcf}; 73 | python3 <>> 85 | 86 | output { 87 | Int len = read_int(stdout()) 88 | } 89 | 90 | runtime { 91 | preemptible: 3 92 | docker: "talkowski/sv-pipeline-remote-pysam" 93 | } 94 | } 95 | 96 | # Split VCF into fixed size chunks 97 | task split_vcf { 98 | File vcf 99 | String batch 100 | String algorithm 101 | String chrom 102 | 103 | Int split_size 104 | Int suffix_len 105 | 106 | command { 107 | tabix -p vcf ${vcf}; 108 | tabix ${vcf} ${chrom} | sort -R | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 
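# Records for this chromosome are shuffled (sort -R) before splitting, presumably so each
# chunk carries a mix of variant types and sizes. Chunks are named with a zero-padded
# numeric suffix whose width is suffix_len; suffix_len (from compute_suffix_len above)
# should be at least ceil(log10(n_records / split_size)) so split does not exhaust its suffixes.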
109 | } 110 | 111 | output { 112 | Array[File] split_vcfs = glob("${batch}.${algorithm}.split.*") 113 | } 114 | 115 | runtime { 116 | preemptible: 3 117 | docker: "talkowski/sv-pipeline-remote-pysam" 118 | } 119 | } 120 | 121 | # Restore VCF header to split files 122 | task reheader_split { 123 | File vcf 124 | File split 125 | 126 | command { 127 | cat <(zcat ${vcf} | sed -n -e '/^#/p') ${split} | bgzip -c > ${basename(split)}.vcf.gz 128 | } 129 | 130 | output { 131 | File split_w_header = "${basename(split)}.vcf.gz" 132 | } 133 | 134 | runtime { 135 | preemptible: 3 136 | docker: "talkowski/sv-pipeline-remote-pysam" 137 | } 138 | } 139 | 140 | # Run srtest 141 | task srtest { 142 | File vcf 143 | String splitfile 144 | File medianfile 145 | File splitfile_idx 146 | File svc_acct_key 147 | String prefix 148 | 149 | command <<< 150 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${splitfile} | sed '1d' | cut -f 4); 151 | echo $url; 152 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed 153 | awk -v OFS="\t" '{if ($2-250>0){print $1,$2-250,$2+250}else{print $1,0,$2+250}}' test.bed >> region.bed 154 | awk -v OFS="\t" '{if ($3-250>0){print $1,$3-250,$3+250}else{print $1,0,$3+250}}' test.bed >> region.bed 155 | sort -k1,1 -k2,2n region.bed > region.sorted.bed 156 | bedtools merge -i region.sorted.bed > region.merged.bed 157 | svtk remote_tabix "$url" ${splitfile_idx} -R region.merged.bed | bgzip -c > SR.txt.gz 158 | tabix -b 2 -e 2 SR.txt.gz 159 | svtk sr-test -w 50 --log --index SR.txt.gz.tbi --medianfile ${medianfile} ${vcf} SR.txt.gz ${prefix}.stats 160 | >>> 161 | 162 | output { 163 | File stats = "${prefix}.stats" 164 | } 165 | 166 | runtime { 167 | disks: "local-disk 30 SSD" 168 | preemptible: 3 169 | docker: "talkowski/sv-pipeline-remote-pysam" 170 | } 171 | } 172 | 173 | # Merge split srtest results into single file 174 | task merge_splits { 175 | Array[File] stats 176 | String prefix 177 | 178 | command <<< 179 | while read split; do 180 | sed -e '1d' $split; 181 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > ${prefix}.stats 182 | >>> 183 | 184 | output { 185 | File merged_stats = "${prefix}.stats" 186 | } 187 | 188 | runtime { 189 | preemptible: 3 190 | docker: "talkowski/sv-pipeline-remote-pysam" 191 | } 192 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_03/03_filter_vcf.wdl: -------------------------------------------------------------------------------- 1 | workflow RF_filter_vcf { 2 | File vcf 3 | File metrics 4 | File scores 5 | File cutoffs 6 | String prefix 7 | 8 | call filter_vcf { 9 | input: 10 | vcf=vcf, 11 | scores=scores, 12 | prefix=prefix 13 | } 14 | 15 | call rewrite_SR_coords { 16 | input: 17 | vcf=filter_vcf.filtered_vcf, 18 | metrics=metrics, 19 | cutoffs=cutoffs, 20 | prefix=prefix 21 | } 22 | 23 | call annotate_RF_evidence { 24 | input: 25 | vcf=rewrite_SR_coords.corrected_vcf, 26 | scores=scores, 27 | prefix=prefix 28 | } 29 | 30 | output { 31 | File filtered_vcf = annotate_RF_evidence.annotated_vcf 32 | } 33 | } 34 | 35 | task filter_vcf { 36 | File vcf 37 | File scores 38 | String prefix 39 | 40 | command <<< 41 | cat \ 42 | <(sed -e '1d' ${scores} | fgrep -e DEL -e DUP | awk '($3>=0.5)' | cut -f1 | fgrep -w -f - <(zcat ${vcf})) \ 43 | <(sed -e '1d' ${scores} | fgrep -e INV -e BND -e INS | awk '($3>=0.5)' | cut -f1 | fgrep -w -f - <(zcat ${vcf}) | sed -e 's/SVTYPE=DEL/SVTYPE=BND/' -e 's/SVTYPE=DUP/SVTYPE=BND/' -e 's///' -e 's///') \ 44 | | cat <(sed -n -e '/^#/p' <(zcat 
${vcf})) - \ 45 | | vcf-sort -c \ 46 | | bgzip -c \ 47 | > ${prefix}.filtered.vcf.gz 48 | >>> 49 | 50 | output { 51 | File filtered_vcf = "${prefix}.filtered.vcf.gz" 52 | } 53 | 54 | runtime { 55 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 56 | preemptible: 3 57 | } 58 | } 59 | 60 | task rewrite_SR_coords { 61 | File vcf 62 | File metrics 63 | File cutoffs 64 | String prefix 65 | 66 | command <<< 67 | set -o pipefail; 68 | /opt/sv-pipeline/03_variant_filtering/scripts/rewrite_SR_coords.py ${vcf} ${metrics} ${cutoffs} stdout \ 69 | | vcf-sort -c \ 70 | | bgzip -c \ 71 | > ${prefix}.corrected_coords.vcf.gz 72 | >>> 73 | 74 | output { 75 | File corrected_vcf = "${prefix}.corrected_coords.vcf.gz" 76 | } 77 | 78 | runtime { 79 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 80 | memory: "10 GB" 81 | preemptible: 3 82 | } 83 | } 84 | 85 | task annotate_RF_evidence { 86 | File vcf 87 | File scores 88 | String prefix 89 | 90 | command <<< 91 | /opt/sv-pipeline/03_variant_filtering/scripts/annotate_RF_evidence.py ${vcf} ${scores} ${prefix}.with_evidence.vcf; 92 | bgzip ${prefix}.with_evidence.vcf 93 | >>> 94 | 95 | output { 96 | File annotated_vcf = "${prefix}.with_evidence.vcf.gz" 97 | } 98 | 99 | runtime { 100 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 101 | preemptible: 3 102 | } 103 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_preprocess.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:gather_attribute_paths_multiSampleSet/versions/7/plain-WDL/descriptor" as getAttribute 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to preprocess all files needed for per-batch genotyping in module 04a 11 | workflow preprocess_04a_files { 12 | 13 | File sample_set_list 14 | File svcActKeyJson 15 | String workspaceProject 16 | String workspaceName 17 | 18 | # Get cohort_filtered_pesr_vcf_list 19 | call getAttribute.gather_attribute_paths_multiSampleSet as get_pesr_vcf_list { 20 | input: 21 | sample_set_list=sample_set_list, 22 | Attribute="filtered_pesr_vcf", 23 | svcActKeyJson=svcActKeyJson, 24 | workspaceProject=workspaceProject, 25 | workspaceName=workspaceName 26 | } 27 | 28 | # Get cohort_filtered_depth_vcf_list 29 | call getAttribute.gather_attribute_paths_multiSampleSet as get_depth_vcf_list { 30 | input: 31 | sample_set_list=sample_set_list, 32 | Attribute="filtered_depth_vcf", 33 | svcActKeyJson=svcActKeyJson, 34 | workspaceProject=workspaceProject, 35 | workspaceName=workspaceName 36 | } 37 | 38 | # Outputs 39 | output { 40 | File filtered_pesr_vcf_list = get_pesr_vcf_list.attribute_list 41 | File filtered_depth_vcf_list = get_depth_vcf_list.attribute_list 42 | } 43 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_PE_genotyping_train.wdl: -------------------------------------------------------------------------------- 1 | workflow PE_genotype_train { 2 | File batch_vcf # variants from just the batch in question 3 | String discfile 4 | Int n_per_split 5 | File medianfile 6 | File discfile_idx 7 | File svc_acct_key 8 | Array[String] samples 9 | String batch_ID 10 | File 
RF_cutoffs 11 | File RD_genotypes 12 | File RD_melted_genotypes 13 | File blacklist 14 | 15 | call vcf2bed as make_batch_bed { 16 | input: 17 | vcf=batch_vcf, 18 | prefix=batch_ID 19 | } 20 | 21 | call split_vcf as split_batch_vcf { 22 | input: 23 | vcf=batch_vcf, 24 | n_per_split=n_per_split 25 | } 26 | 27 | scatter (vcf in split_batch_vcf.vcfs) { 28 | call count_pe as count_batch_pe { 29 | input: 30 | vcf=vcf, 31 | discfile=discfile, 32 | discfile_idx=discfile_idx, 33 | medianfile=medianfile, 34 | svc_acct_key=svc_acct_key, 35 | samples=samples 36 | } 37 | } 38 | 39 | call merge_pe_counts { 40 | input: 41 | count_list=count_batch_pe.pe_counts 42 | } 43 | 44 | call genotype_PE_part1 { 45 | input: 46 | bed=make_batch_bed.bed, 47 | RF_cutoffs=RF_cutoffs, 48 | PE_counts=merge_pe_counts.counts, 49 | RD_genotypes=RD_genotypes, 50 | RD_melted_genotypes=RD_melted_genotypes, 51 | blacklist=blacklist 52 | } 53 | 54 | output { 55 | File PE_genotypes = genotype_PE_part1.genotypes 56 | File PE_varGQ = genotype_PE_part1.varGQ 57 | File PE_metrics = genotype_PE_part1.PE_metrics 58 | File PE_train = genotype_PE_part1.PE_train 59 | } 60 | } 61 | 62 | task vcf2bed { 63 | File vcf 64 | String prefix 65 | 66 | command { 67 | svtk vcf2bed ${vcf} -i ALGORITHMS ${prefix}.bed 68 | } 69 | 70 | output { 71 | File bed = "${prefix}.bed" 72 | } 73 | 74 | runtime { 75 | preemptible: 3 76 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 77 | } 78 | } 79 | 80 | task split_vcf { 81 | File vcf 82 | Int n_per_split 83 | 84 | command <<< 85 | if [[ ${vcf} == *.gz ]] ; then 86 | zcat ${vcf} | sed -n -e '/^#/p' > header.vcf; 87 | zcat ${vcf} | sed -e '/^#/d' | split -l ${n_per_split} - pe; 88 | else 89 | sed -n -e '/^#/p' ${vcf} > header.vcf; 90 | sed -e '/^#/d' ${vcf} | split -l ${n_per_split} - pe; 91 | fi 92 | for f in pe*; do cat header.vcf $f > $f.vcf; done 93 | >>> 94 | 95 | output { 96 | Array[File] vcfs = glob("pe*.vcf") 97 | } 98 | 99 | runtime { 100 | preemptible: 3 101 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 102 | } 103 | } 104 | 105 | task count_pe { 106 | File vcf 107 | String discfile 108 | File discfile_idx 109 | File medianfile 110 | File svc_acct_key 111 | Array[String] samples 112 | 113 | String prefix = basename(vcf, ".vcf") 114 | 115 | command <<< 116 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${discfile} | sed '1d' | cut -f 4); 117 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed; 118 | awk -v OFS="\t" -v window=5000 '{if ($2-window>0){print $1,$2-window,$2+window}else{print $1,0,$2+window}}' test.bed >> region.bed; 119 | awk -v OFS="\t" -v window=5000 '{if ($3-window>0){print $1,$3-window,$3+window}else{print $1,0,$3+window}}' test.bed >> region.bed; 120 | sort -k1,1 -k2,2n region.bed > region.sorted.bed; 121 | bedtools merge -i region.sorted.bed > region.merged.bed; 122 | svtk remote_tabix "$url" ${discfile_idx} -R region.merged.bed | bgzip -c > PE.txt.gz; 123 | tabix -b 2 -e 2 PE.txt.gz; 124 | svtk count-pe --index PE.txt.gz.tbi -s ${write_tsv(samples)} --medianfile ${medianfile} ${vcf} PE.txt.gz ${prefix}.pe_counts.txt; 125 | gzip ${prefix}.pe_counts.txt 126 | >>> 127 | 128 | output { 129 | File pe_counts = "${prefix}.pe_counts.txt.gz" 130 | } 131 | 132 | runtime { 133 | preemptible: 3 134 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:41a84644c1f7d339813c1176fdd6d42ed1ac770e430b053975d47da6e99f5f26" 135 | } 136 | } 137 | 138 | task merge_pe_counts { 139 | 
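# Concatenates the gzipped per-shard PE count tables, drops repeated header rows (any
# line containing "name"), and re-compresses them into a single pe_counts.txt.gz that is
# passed to genotype_PE_part1 as PE_counts.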
Array[File] count_list 140 | 141 | command { 142 | zcat ${sep=' ' count_list} | fgrep -v -e "name" | gzip -c > pe_counts.txt.gz 143 | } 144 | 145 | output { 146 | File counts = "pe_counts.txt.gz" 147 | } 148 | 149 | runtime { 150 | preemptible: 3 151 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 152 | disks: "local-disk 50 SSD" 153 | } 154 | } 155 | 156 | task genotype_PE_part1 { 157 | File bed 158 | File RF_cutoffs 159 | File PE_counts 160 | File RD_genotypes 161 | File RD_melted_genotypes 162 | File blacklist 163 | 164 | command <<< 165 | /opt/sv-pipeline/04_variant_resolution/scripts/PE_genotype.sh \ 166 | ${bed} \ 167 | ${PE_counts} \ 168 | ${RD_genotypes} \ 169 | ${RD_melted_genotypes} \ 170 | ${RF_cutoffs} \ 171 | ${blacklist} \ 172 | /opt/RdTest/generate_cutoff_PE.R 173 | >>> 174 | 175 | output { 176 | File PE_train = "pe.train.include.txt" 177 | File PE_metrics = "pe_metric_file.txt" 178 | File genotypes = "pe.geno.withquality.txt.gz" 179 | File varGQ = "pe.variant.quality.final.txt.gz" 180 | } 181 | 182 | runtime { 183 | preemptible: 0 184 | docker: "talkowski/sv-pipeline-rdtest@sha256:764635fce650adac449b013058388a55653e8c7e6c075452a80f6e2a104754cd" 185 | disks: "local-disk 50 SSD" 186 | } 187 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_SR_genotyping_train.wdl: -------------------------------------------------------------------------------- 1 | workflow SR_genotype_train { 2 | File batch_vcf 3 | String splitfile 4 | Int n_per_split 5 | File medianfile 6 | File splitfile_idx 7 | File svc_acct_key 8 | Array[String] samples 9 | String batch_ID 10 | File RF_cutoffs 11 | File RD_melted_genotypes 12 | File PE_train 13 | File PE_genotypes 14 | 15 | call split_vcf as split_batch_vcf { 16 | input: 17 | vcf=batch_vcf, 18 | n_per_split=n_per_split 19 | } 20 | 21 | scatter (vcf in split_batch_vcf.vcfs) { 22 | call count_sr as count_batch_sr { 23 | input: 24 | vcf=vcf, 25 | splitfile=splitfile, 26 | splitfile_idx=splitfile_idx, 27 | medianfile=medianfile, 28 | svc_acct_key=svc_acct_key, 29 | samples=samples 30 | } 31 | } 32 | 33 | call merge_sr_counts { 34 | input: 35 | count_list=count_batch_sr.sr_counts, 36 | sum_list=count_batch_sr.sr_sum 37 | } 38 | 39 | call genotype_SR_part1 { 40 | input: 41 | vcf=batch_vcf, 42 | RF_cutoffs=RF_cutoffs, 43 | SR_counts=merge_sr_counts.counts, 44 | SR_sum=merge_sr_counts.sum, 45 | RD_melted_genotypes=RD_melted_genotypes, 46 | PE_train=PE_train, 47 | samples=samples, 48 | PE_genotypes=PE_genotypes 49 | } 50 | 51 | output { 52 | File SR_metrics = genotype_SR_part1.SR_metrics 53 | } 54 | } 55 | 56 | task split_vcf { 57 | File vcf 58 | Int n_per_split 59 | 60 | command <<< 61 | if [[ ${vcf} == *.gz ]] ; then 62 | echo "gzipped"; 63 | zcat ${vcf} | sed -n -e '/^#/p' > header.vcf; 64 | zcat ${vcf} | sed -e '/^#/d' | split -l ${n_per_split} - sr; 65 | else 66 | echo "plaintext"; 67 | sed -n -e '/^#/p' ${vcf} > header.vcf; 68 | sed -e '/^#/d' ${vcf} | split -l ${n_per_split} - sr; 69 | fi 70 | for f in sr*; do cat header.vcf $f | bgzip -c > $f.vcf.gz; done 71 | >>> 72 | 73 | output { 74 | Array[File] vcfs = glob("sr*.vcf.gz") 75 | } 76 | 77 | runtime { 78 | preemptible: 3 79 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 80 | } 81 | } 82 | 83 | task count_sr { 84 | File vcf 85 | String splitfile 86 | File splitfile_idx 87 | File medianfile 88 | File svc_acct_key 89 | 
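# samples: sample IDs for this batch; written to a temporary TSV via write_tsv() and
# supplied to svtk count-sr through its -s option in the command below.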
Array[String] samples 90 | 91 | String prefix = basename(vcf, ".vcf") 92 | 93 | command <<< 94 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${splitfile} | sed '1d' | cut -f 4); 95 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed; 96 | awk -v OFS="\t" '{if ($2-250>0){print $1,$2-250,$2+250}else{print $1,0,$2+250}}' test.bed >> region.bed; 97 | awk -v OFS="\t" '{if ($3-250>0){print $1,$3-250,$3+250}else{print $1,0,$3+250}}' test.bed >> region.bed; 98 | sort -k1,1 -k2,2n region.bed > region.sorted.bed; 99 | bedtools merge -i region.sorted.bed > region.merged.bed; 100 | svtk remote_tabix "$url" ${splitfile_idx} -R region.merged.bed | bgzip -c > SR.txt.gz; 101 | tabix -b 2 -e 2 SR.txt.gz; 102 | svtk count-sr --index SR.txt.gz.tbi -s ${write_tsv(samples)} --medianfile ${medianfile} ${vcf} SR.txt.gz ${prefix}.sr_counts.txt; 103 | /opt/sv-pipeline/04_variant_resolution/scripts/sum_SR.sh ${prefix}.sr_counts.txt ${prefix}.sr_sum.txt.gz; 104 | gzip ${prefix}.sr_counts.txt 105 | >>> 106 | 107 | output { 108 | File sr_counts = "${prefix}.sr_counts.txt.gz" 109 | File sr_sum = "${prefix}.sr_sum.txt.gz" 110 | } 111 | 112 | runtime { 113 | preemptible: 3 114 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:41a84644c1f7d339813c1176fdd6d42ed1ac770e430b053975d47da6e99f5f26" 115 | } 116 | } 117 | 118 | task merge_sr_counts { 119 | Array[File] count_list 120 | Array[File] sum_list 121 | 122 | command { 123 | zcat ${sep=' ' count_list} | fgrep -v -e "name" | gzip -c > sr_counts.txt.gz; 124 | cat ${sep=' ' sum_list} > sr_sum.txt.gz 125 | } 126 | 127 | output { 128 | File counts = "sr_counts.txt.gz" 129 | File sum = "sr_sum.txt.gz" 130 | } 131 | 132 | runtime { 133 | preemptible: 3 134 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 135 | disks: "local-disk 60 SSD" 136 | } 137 | } 138 | 139 | task genotype_SR_part1 { 140 | File vcf 141 | File SR_counts 142 | File SR_sum 143 | File RD_melted_genotypes 144 | File RF_cutoffs 145 | Array[String] samples 146 | File PE_train 147 | File PE_genotypes 148 | 149 | command <<< 150 | /opt/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh \ 151 | ${vcf} \ 152 | ${SR_counts} \ 153 | ${SR_sum} \ 154 | ${RD_melted_genotypes} \ 155 | ${RF_cutoffs} \ 156 | ${write_tsv(samples)} \ 157 | ${PE_train} \ 158 | ${PE_genotypes} 159 | >>> 160 | 161 | output { 162 | File SR_metrics = "sr_metric_file.txt" 163 | } 164 | 165 | runtime { 166 | preemptible: 0 167 | docker: "talkowski/sv-pipeline-rdtest@sha256:764635fce650adac449b013058388a55653e8c7e6c075452a80f6e2a104754cd" 168 | disks: "local-disk 60 SSD" 169 | memory: "16 GB" 170 | } 171 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_genotype_depth_part1.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_RD_genotyping_train/versions/7/plain-WDL/descriptor" as RD_genotype_train 2 | 3 | workflow genotype_depth_part1 { 4 | File batch_vcf 5 | String batch 6 | String coveragefile # batch coverage file 7 | File coveragefile_idx 8 | File medianfile # batch median file 9 | File famfile # batch famfile 10 | File svc_acct_key 11 | File rf_cutoffs # Random forest cutoffs 12 | File seed_cutoffs 13 | Array[String] samples # List of samples in batch 14 | Int n_RD_genotype_bins # number of RdTest bins 15 | Int n_per_RD_split # number of variants per RdTest split 16 | String reference_build 
#hg19 or hg38 17 | 18 | call RD_genotype_train.RD_genotype_train { 19 | input: 20 | vcf=batch_vcf, 21 | coveragefile=coveragefile, 22 | coveragefile_idx=coveragefile_idx, 23 | medianfile=medianfile, 24 | famfile=famfile, 25 | svc_acct_key=svc_acct_key, 26 | rf_cutoffs=rf_cutoffs, 27 | seed_cutoffs=seed_cutoffs, 28 | samples=samples, 29 | prefix=batch, 30 | n_bins=n_RD_genotype_bins, 31 | n_per_split=n_per_RD_split, 32 | reference_build=reference_build 33 | } 34 | 35 | output { 36 | File RD_pesr_sepcutoff = RD_genotype_train.pesr_sepcutoff 37 | File RD_depth_sepcutoff = RD_genotype_train.depth_sepcutoff 38 | } 39 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_genotype_pesr_part1.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_RD_genotyping_train/versions/7/plain-WDL/descriptor" as RD_genotype_train 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_PE_genotyping_train/versions/6/plain-WDL/descriptor" as PE_genotype_train 3 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_SR_genotyping_train/versions/6/plain-WDL/descriptor" as SR_genotype_train 4 | 5 | workflow genotype_pesr_part1 { 6 | File batch_vcf 7 | String batch 8 | String coveragefile # batch coverage file 9 | File coveragefile_idx 10 | File medianfile # batch median file 11 | File famfile # batch famfile 12 | File svc_acct_key 13 | File rf_cutoffs # Random forest cutoffs 14 | File seed_cutoffs 15 | Array[String] samples # List of samples in batch 16 | Int n_RD_genotype_bins # number of RdTest bins 17 | Int n_per_RD_split # number of variants per RdTest split 18 | Int n_per_PE_split 19 | String discfile 20 | File discfile_idx 21 | File pesr_blacklist 22 | String splitfile 23 | Int n_per_SR_split 24 | File splitfile_idx 25 | String reference_build #hg19 or hg38 26 | 27 | call RD_genotype_train.RD_genotype_train { 28 | input: 29 | vcf=batch_vcf, 30 | coveragefile=coveragefile, 31 | coveragefile_idx=coveragefile_idx, 32 | medianfile=medianfile, 33 | famfile=famfile, 34 | svc_acct_key=svc_acct_key, 35 | rf_cutoffs=rf_cutoffs, 36 | seed_cutoffs=seed_cutoffs, 37 | samples=samples, 38 | prefix=batch, 39 | n_bins=n_RD_genotype_bins, 40 | n_per_split=n_per_RD_split, 41 | reference_build=reference_build 42 | } 43 | 44 | call PE_genotype_train.PE_genotype_train { 45 | input: 46 | batch_vcf=batch_vcf, 47 | discfile=discfile, 48 | n_per_split=n_per_PE_split, 49 | medianfile=medianfile, 50 | discfile_idx=discfile_idx, 51 | svc_acct_key=svc_acct_key, 52 | samples=samples, 53 | batch_ID=batch, 54 | RF_cutoffs=rf_cutoffs, 55 | RD_genotypes=RD_genotype_train.genotypes, 56 | RD_melted_genotypes=RD_genotype_train.melted_genotypes, 57 | blacklist=pesr_blacklist 58 | } 59 | 60 | call SR_genotype_train.SR_genotype_train { 61 | input: 62 | batch_vcf=batch_vcf, 63 | splitfile=splitfile, 64 | n_per_split=n_per_SR_split, 65 | medianfile=medianfile, 66 | splitfile_idx=splitfile_idx, 67 | svc_acct_key=svc_acct_key, 68 | samples=samples, 69 | batch_ID=batch, 70 | RF_cutoffs=rf_cutoffs, 71 | RD_melted_genotypes=RD_genotype_train.melted_genotypes, 72 | PE_train=PE_genotype_train.PE_train, 73 | PE_genotypes=PE_genotype_train.PE_genotypes 74 | } 75 | 76 | output { 77 | File RD_pesr_sepcutoff = RD_genotype_train.pesr_sepcutoff 78 | File RD_depth_sepcutoff = RD_genotype_train.depth_sepcutoff 79 | File PE_metrics = PE_genotype_train.PE_metrics 
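# The separation cutoffs and PE/SR metric tables emitted here are training artifacts,
# presumably consumed by the corresponding part-2 per-batch genotyping workflow.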
80 | File SR_metrics = SR_genotype_train.SR_metrics 81 | } 82 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_make_cohort_VCFs.wdl: -------------------------------------------------------------------------------- 1 | workflow make_cohort_VCFs { 2 | File pesr_vcfs_list 3 | File depth_vcfs_list 4 | 5 | call merge_vcfs as merge_pesr_vcfs { 6 | input: 7 | vcfs_list=pesr_vcfs_list, 8 | prefix="all_batches.pesr" 9 | } 10 | 11 | call merge_vcfs as merge_depth_vcfs { 12 | input: 13 | vcfs_list=depth_vcfs_list, 14 | prefix="all_batches.depth" 15 | } 16 | 17 | output { 18 | File cohort_pesr_vcf = merge_pesr_vcfs.merged_vcf 19 | File cohort_depth_vcf = merge_depth_vcfs.merged_vcf 20 | } 21 | } 22 | 23 | task merge_vcfs { 24 | File vcfs_list 25 | String prefix 26 | 27 | command { 28 | /opt/sv-pipeline/04_variant_resolution/scripts/merge_vcfs.sh ${vcfs_list} ${prefix} 29 | } 30 | 31 | output { 32 | File merged_vcf = "${prefix}.vcf.gz" 33 | } 34 | 35 | runtime { 36 | docker: "talkowski/sv-pipeline@sha256:aaf0b5fa587fbe4f4d137532a4c1be292f9ea104422494e1a7d8ac7a5d8459e6" 37 | preemptible: 3 38 | } 39 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_bp_overlap_filter_by_chrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize same-bp overlap filter per chromosome 9 | workflow same_bp_filter { 10 | String vcf 11 | File vcf_idx 12 | String prefix 13 | File contiglist 14 | File svc_acct_key 15 | File bothside_pass 16 | File background_fail 17 | 18 | Array[Array[String]] contigs = read_tsv(contiglist) 19 | 20 | #Run same-bp overlap filter, scattered by chromosome 21 | scatter (contig in contigs) { 22 | 23 | #Remote tabix each vcf & join into a single vcf 24 | call subset_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | contig=contig[0], 29 | prefix=prefix, 30 | svc_acct_key=svc_acct_key 31 | } 32 | 33 | #Run same-bp overlap filter per chromosome 34 | call bp_overlap_filter { 35 | input: 36 | vcf=subset_vcf.subsetted_vcf, 37 | prefix="${prefix}.${contig[0]}", 38 | bothside_pass=bothside_pass, 39 | background_fail=background_fail 40 | } 41 | } 42 | 43 | #Merge filtered vcfs across chromosomes 44 | call concat_vcfs { 45 | input: 46 | vcfs=bp_overlap_filter.bp_filtered_vcf, 47 | prefix="${prefix}.non_redundant" 48 | } 49 | 50 | output { 51 | File filtered_vcf = concat_vcfs.concat_vcf 52 | File filtered_vcf_idx = concat_vcfs.concat_vcf_idx 53 | } 54 | } 55 | 56 | 57 | #Remote tabix a single chromosome per VCFs 58 | task subset_vcf { 59 | String vcf 60 | File vcf_idx 61 | String contig 62 | String prefix 63 | File svc_acct_key 64 | 65 | command <<< 66 | #Remote tabix to chromosome of interest 67 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 68 | echo $url; 69 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 70 | bgzip -f "${prefix}.${contig}.vcf" 71 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 72 | >>> 73 | 74 | output { 75 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 76 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 77 | } 78 | 79 | runtime { 80 | docker: 
"talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 81 | preemptible: 1 82 | disks: "local-disk 50 SSD" 83 | } 84 | } 85 | 86 | 87 | # Run Harrison's overlapping breakpoint filter prior to complex resolution 88 | task bp_overlap_filter { 89 | File vcf 90 | String prefix 91 | File bothside_pass 92 | File background_fail 93 | 94 | command <<< 95 | /opt/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh \ 96 | ${vcf} \ 97 | ${background_fail} \ 98 | ${bothside_pass}; 99 | mv non_redundant.vcf.gz "${prefix}.non_redundant.vcf.gz" 100 | >>> 101 | 102 | output { 103 | File bp_filtered_vcf = "${prefix}.non_redundant.vcf.gz" 104 | } 105 | 106 | runtime { 107 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 108 | preemptible: 1 109 | memory: "4 GB" 110 | disks: "local-disk 250 SSD" 111 | } 112 | } 113 | 114 | 115 | #Merge multiple vcfs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String prefix 119 | 120 | command <<< 121 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 122 | tabix -f -p vcf ${prefix}.vcf.gz 123 | >>> 124 | 125 | output { 126 | File concat_vcf = "${prefix}.vcf.gz" 127 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 128 | } 129 | 130 | runtime { 131 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 132 | preemptible: 1 133 | disks: "local-disk 1000 SSD" 134 | } 135 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_genotype_CPX_CNVs_perBatch.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to perform depth-based genotyping per batch 9 | # on predicted CPX CNVs from 04b 10 | 11 | workflow genotype_CPX_CNVs_perBatch { 12 | File cpx_bed 13 | File RD_depth_sepcutoff 14 | Int n_per_split_small 15 | Int n_per_split_large 16 | Int n_RdTest_bins 17 | String batch 18 | File medianfile 19 | File famfile 20 | File svc_acct_key 21 | File sampleslist 22 | String coveragefile 23 | File coveragefile_idx 24 | 25 | Array[String] samples = read_lines(sampleslist) 26 | 27 | call shard_bed { 28 | input: 29 | bed=cpx_bed, 30 | n_per_split_small=n_per_split_small, 31 | n_per_split_large=n_per_split_large, 32 | sampleslist=sampleslist 33 | } 34 | 35 | scatter (lt5kb_bed in shard_bed.lt5kb_beds) { 36 | call RdTest_genotype as RD_genotype_lt5kb { 37 | input: 38 | bed=lt5kb_bed, 39 | coveragefile=coveragefile, 40 | coveragefile_idx=coveragefile_idx, 41 | svc_acct_key=svc_acct_key, 42 | medianfile=medianfile, 43 | famfile=famfile, 44 | samples=samples, 45 | gt_cutoffs=RD_depth_sepcutoff, 46 | n_bins=n_RdTest_bins, 47 | prefix=basename(lt5kb_bed, ".bed") 48 | } 49 | } 50 | 51 | scatter (gt5kb_bed in shard_bed.gt5kb_beds) { 52 | call RdTest_genotype as RD_genotype_gt5kb { 53 | input: 54 | bed=gt5kb_bed, 55 | coveragefile=coveragefile, 56 | coveragefile_idx=coveragefile_idx, 57 | svc_acct_key=svc_acct_key, 58 | medianfile=medianfile, 59 | famfile=famfile, 60 | samples=samples, 61 | gt_cutoffs=RD_depth_sepcutoff, 62 | n_bins=n_RdTest_bins, 63 | prefix=basename(gt5kb_bed) 64 | } 65 | } 66 | 67 | call concat_melted_genotypes { 68 | input: 69 | lt5kb_genos=RD_genotype_lt5kb.melted_genotypes, 70 | gt5kb_genos=RD_genotype_gt5kb.melted_genotypes, 71 | batch=batch 
72 | } 73 | 74 | output { 75 | File genotypes = concat_melted_genotypes.genotypes 76 | } 77 | } 78 | 79 | task shard_bed { 80 | File bed 81 | Int n_per_split_small 82 | Int n_per_split_large 83 | File sampleslist 84 | 85 | command <<< 86 | set -euo pipefail 87 | if [ $( zcat ${bed} | fgrep -v "#" | wc -l ) -gt 0 ]; then 88 | #First, repace samples in input bed with full list of all samples in batch 89 | zcat ${bed} \ 90 | | fgrep -v "#" \ 91 | | awk -v OFS="\t" -v samples=$( cat ${sampleslist} | paste -s -d, ) \ 92 | '{ print $1, $2, $3, $4, samples, "DUP" }' \ 93 | | sort -Vk1,1 -k2,2n -k3,3n \ 94 | | bgzip -c \ 95 | > newBed_wSamples.bed.gz || true 96 | #Second, split by small vs large CNVs 97 | zcat newBed_wSamples.bed.gz \ 98 | | awk -v OFS="\t" '($3-$2<5000) {print $0}' \ 99 | | split -l ${n_per_split_small} -a 6 - lt5kb. || true 100 | zcat newBed_wSamples.bed.gz \ 101 | | awk -v OFS="\t" '($3-$2>=5000) {print $0}' \ 102 | | split -l ${n_per_split_large} -a 6 - gt5kb. || true 103 | fi 104 | if [ $( find ./ -name "lt5kb.*" | wc -l ) -eq 0 ]; then 105 | touch lt5kb.aaaaaa 106 | fi 107 | if [ $( find ./ -name "gt5kb.*" | wc -l ) -eq 0 ]; then 108 | touch gt5kb.aaaaaa 109 | fi 110 | >>> 111 | 112 | output { 113 | Array[File] lt5kb_beds = glob("lt5kb.*") 114 | Array[File] gt5kb_beds = glob("gt5kb.*") 115 | } 116 | 117 | runtime { 118 | preemptible: 3 119 | maxRetries: 1 120 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 121 | disks: "local-disk 50 HDD" 122 | } 123 | } 124 | 125 | 126 | # Run depth-based genotyping 127 | task RdTest_genotype { 128 | File bed 129 | String coveragefile 130 | File medianfile 131 | File svc_acct_key 132 | File coveragefile_idx 133 | File famfile 134 | Array[String] samples 135 | File gt_cutoffs 136 | Int n_bins 137 | String prefix 138 | 139 | command <<< 140 | set -euo pipefail 141 | /opt/RdTest/localize_bincov.sh \ 142 | ${bed} \ 143 | ${coveragefile} \ 144 | ${coveragefile_idx} \ 145 | ${svc_acct_key}; 146 | Rscript /opt/RdTest/RdTest.R \ 147 | -b ${bed} \ 148 | -c local_coverage.bed.gz \ 149 | -m ${medianfile} \ 150 | -f ${famfile} \ 151 | -n ${prefix} \ 152 | -w ${write_tsv(samples)} \ 153 | -i ${n_bins} \ 154 | -r ${gt_cutoffs} \ 155 | -y /opt/RdTest/bin_exclude.bed.gz \ 156 | -g TRUE; 157 | /opt/sv-pipeline/04_variant_resolution/scripts/merge_RdTest_genotypes.py \ 158 | ${prefix}.geno \ 159 | ${prefix}.gq \ 160 | rd.geno.cnv.bed; 161 | sort -k1,1V -k2,2n rd.geno.cnv.bed | uniq | bgzip -c > rd.geno.cnv.bed.gz 162 | >>> 163 | 164 | output { 165 | # File genotypes = "${prefix}.geno" 166 | # File copy_states = "${prefix}.median_geno" 167 | # File metrics = "${prefix}.metrics" 168 | # File gq = "${prefix}.gq" 169 | # File varGQ = "${prefix}.vargq" 170 | File melted_genotypes = "rd.geno.cnv.bed.gz" 171 | } 172 | 173 | runtime { 174 | preemptible: 3 175 | docker: "talkowski/sv-pipeline-rdtest@sha256:0393ca5260e523f8646a72a2a739863384de73670383d3f0b32c6ccceba010e8" 176 | disks: "local-disk 100 HDD" 177 | bootDiskSizeGb: "30" 178 | memory: "8 GB" 179 | maxRetries: 1 180 | } 181 | } 182 | 183 | 184 | # Merge melted genotype files 185 | task concat_melted_genotypes { 186 | Array[File] lt5kb_genos 187 | Array[File] gt5kb_genos 188 | String batch 189 | 190 | command <<< 191 | zcat ${sep=' ' lt5kb_genos} ${sep=' ' gt5kb_genos} \ 192 | | sort -Vk1,1 -k2,2n -k3,3n \ 193 | | bgzip -c \ 194 | > ${batch}.rd_genos.bed.gz 195 | >>> 196 | 197 | output { 198 | File genotypes = "${batch}.rd_genos.bed.gz" 199 | } 200 | 
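# Merges the per-shard melted RD genotype tables (lt5kb and gt5kb splits) into a single
# coordinate-sorted, bgzipped BED for the batch.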
201 | runtime { 202 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 203 | preemptible: 3 204 | maxRetries: 1 205 | memory: "16 GB" 206 | disks: "local-disk 250 HDD" 207 | } 208 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_integrate_resolved_vcfs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize integration of all-variant and inv-only svtk resolve results per chromosome 9 | workflow integrate_invonly_allvars { 10 | String inv_res_vcf 11 | String all_res_vcf 12 | File inv_res_vcf_idx 13 | File all_res_vcf_idx 14 | String prefix 15 | File contiglist 16 | File svc_acct_key 17 | File bothside_pass 18 | File background_fail 19 | 20 | Array[Array[String]] contigs = read_tsv(contiglist) 21 | 22 | #Merge, scattered by chromosome 23 | scatter (contig in contigs) { 24 | 25 | #Remote tabix each vcf 26 | call subset_vcf as subset_inv { 27 | input: 28 | vcf=inv_res_vcf, 29 | vcf_idx=inv_res_vcf_idx, 30 | contig=contig[0], 31 | prefix="${prefix}.inv_only.${contig[0]}", 32 | svc_acct_key=svc_acct_key 33 | } 34 | call subset_vcf as subset_all { 35 | input: 36 | vcf=all_res_vcf, 37 | vcf_idx=all_res_vcf_idx, 38 | contig=contig[0], 39 | prefix="${prefix}.all_variants.${contig[0]}", 40 | svc_acct_key=svc_acct_key 41 | } 42 | 43 | #Run integration per chromosome 44 | call integrate_resolved_vcfs { 45 | input: 46 | inv_res_vcf=subset_inv.subsetted_vcf, 47 | all_res_vcf=subset_all.subsetted_vcf, 48 | prefix="${prefix}.resolved.${contig[0]}" 49 | } 50 | } 51 | 52 | #Merge integrated vcfs across chromosomes 53 | call concat_vcfs { 54 | input: 55 | vcfs=integrate_resolved_vcfs.integrated_vcf, 56 | prefix="${prefix}.resolved" 57 | } 58 | 59 | output { 60 | File integrated_vcf = concat_vcfs.concat_vcf 61 | File integrated_vcf_idx = concat_vcfs.concat_vcf_idx 62 | } 63 | } 64 | 65 | 66 | #Remote tabix a single chromosome per VCFs 67 | task subset_vcf { 68 | String vcf 69 | File vcf_idx 70 | String contig 71 | String prefix 72 | File svc_acct_key 73 | 74 | command <<< 75 | #Remote tabix to chromosome of interest 76 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 77 | echo $url; 78 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 79 | bgzip -f "${prefix}.${contig}.vcf" 80 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 81 | >>> 82 | 83 | output { 84 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 85 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 86 | } 87 | 88 | runtime { 89 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 90 | preemptible: 1 91 | disks: "local-disk 50 SSD" 92 | } 93 | } 94 | 95 | 96 | # Merge inversion-only and all-variant cpx-resolved outputs 97 | task integrate_resolved_vcfs { 98 | File inv_res_vcf 99 | File all_res_vcf 100 | String prefix 101 | 102 | command <<< 103 | /opt/sv-pipeline/04_variant_resolution/scripts/Complex_Inversion_Integration.sh \ 104 | ${inv_res_vcf} \ 105 | ${all_res_vcf} \ 106 | ${prefix}.integrated_resolved.vcf.gz 107 | >>> 108 | 109 | output { 110 | File integrated_vcf = "${prefix}.integrated_resolved.vcf.gz" 111 | } 112 | 113 | runtime { 114 | docker: 
"talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 115 | preemptible: 1 116 | memory: "4 GB" 117 | disks: "local-disk 250 SSD" 118 | } 119 | } 120 | 121 | 122 | #Merge multiple vcfs 123 | task concat_vcfs { 124 | Array[File] vcfs 125 | String prefix 126 | 127 | command <<< 128 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 129 | tabix -f -p vcf ${prefix}.vcf.gz 130 | >>> 131 | 132 | output { 133 | File concat_vcf = "${prefix}.vcf.gz" 134 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 135 | } 136 | 137 | runtime { 138 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 139 | preemptible: 1 140 | disks: "local-disk 1000 SSD" 141 | } 142 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_merge_allvar_invonly_vcfs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize same-bp overlap filter per chromosome 9 | workflow same_bp_filter { 10 | String vcf 11 | File vcf_idx 12 | String prefix 13 | File contiglist 14 | File svc_acct_key 15 | File bothside_pass 16 | File background_fail 17 | 18 | Array[Array[String]] contigs = read_tsv(contiglist) 19 | 20 | #Run same-bp overlap filter, scattered by chromosome 21 | scatter (contig in contigs) { 22 | 23 | #Remote tabix each vcf & join into a single vcf 24 | call subset_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | contig=contig[0], 29 | prefix=prefix, 30 | svc_acct_key=svc_acct_key 31 | } 32 | 33 | #Run same-bp overlap filter per chromosome 34 | call bp_overlap_filter { 35 | input: 36 | vcf=subset_vcf.subsetted_vcf, 37 | prefix="${prefix}.${contig[0]}", 38 | bothside_pass=bothside_pass, 39 | background_fail=background_fail 40 | } 41 | } 42 | 43 | #Merge filtered vcfs across chromosomes 44 | call concat_vcfs { 45 | input: 46 | vcfs=bp_overlap_filter.bp_filtered_vcf, 47 | prefix="${prefix}.non_redundant" 48 | } 49 | 50 | output { 51 | File filtered_vcf = concat_vcfs.concat_vcf 52 | File filtered_vcf_idx = concat_vcfs.concat_vcf_idx 53 | } 54 | } 55 | 56 | 57 | #Remote tabix a single chromosome per VCFs 58 | task subset_vcf { 59 | String vcf 60 | File vcf_idx 61 | String contig 62 | String prefix 63 | File svc_acct_key 64 | 65 | command <<< 66 | #Remote tabix to chromosome of interest 67 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 68 | echo $url; 69 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 70 | bgzip -f "${prefix}.${contig}.vcf" 71 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 72 | >>> 73 | 74 | output { 75 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 76 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 77 | } 78 | 79 | runtime { 80 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 81 | preemptible: 1 82 | disks: "local-disk 50 SSD" 83 | } 84 | } 85 | 86 | 87 | # Run Harrison's overlapping breakpoint filter prior to complex resolution 88 | task bp_overlap_filter { 89 | File vcf 90 | String prefix 91 | File bothside_pass 92 | File background_fail 93 | 94 | command <<< 95 | /opt/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh \ 96 | ${vcf} \ 97 | ${background_fail} \ 98 | 
${bothside_pass}; 99 | mv non_redundant.vcf.gz "${prefix}.non_redundant.vcf.gz" 100 | >>> 101 | 102 | output { 103 | File bp_filtered_vcf = "${prefix}.non_redundant.vcf.gz" 104 | } 105 | 106 | runtime { 107 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 108 | preemptible: 1 109 | memory: "4 GB" 110 | disks: "local-disk 250 SSD" 111 | } 112 | } 113 | 114 | 115 | #Merge multiple vcfs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String prefix 119 | 120 | command <<< 121 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 122 | tabix -f -p vcf ${prefix}.vcf.gz 123 | >>> 124 | 125 | output { 126 | File concat_vcf = "${prefix}.vcf.gz" 127 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 128 | } 129 | 130 | runtime { 131 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 132 | preemptible: 1 133 | disks: "local-disk 1000 SSD" 134 | } 135 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_pesr_depth_overlap.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize vcf clustering per chromosome 9 | workflow pesr_depth_overlap { 10 | String pesr_vcf 11 | File pesr_vcf_idx 12 | String depth_vcf 13 | File depth_vcf_idx 14 | File contigs 15 | Array[String] samples 16 | File svc_acct_key 17 | 18 | Array[Array[String]] contiglist = read_tsv(contigs) 19 | 20 | scatter (contig in contiglist) { 21 | call subset_vcf as subset_pesr_vcf { 22 | input: 23 | vcf=pesr_vcf, 24 | vcf_idx=pesr_vcf_idx, 25 | contig=contig[0], 26 | prefix="all_batches.pesr", 27 | svc_acct_key=svc_acct_key 28 | } 29 | 30 | call subset_vcf as subset_depth_vcf { 31 | input: 32 | vcf=depth_vcf, 33 | vcf_idx=depth_vcf_idx, 34 | contig=contig[0], 35 | prefix="all_batches.depth", 36 | svc_acct_key=svc_acct_key 37 | } 38 | 39 | call merge_pesr_depth { 40 | input: 41 | pesr_vcf=subset_pesr_vcf.subsetted_vcf, 42 | depth_vcf=subset_depth_vcf.subsetted_vcf, 43 | contig=contig[0] 44 | } 45 | } 46 | 47 | call concat_vcfs { 48 | input: 49 | vcfs=merge_pesr_depth.merged_vcf, 50 | prefix="all_batches.pesr_depth" 51 | } 52 | 53 | output { 54 | File merged_vcf = concat_vcfs.concat_vcf 55 | File merged_vcf_idx = concat_vcfs.concat_vcf_idx 56 | } 57 | } 58 | 59 | task subset_vcf { 60 | String vcf 61 | File vcf_idx 62 | String contig 63 | String prefix 64 | File svc_acct_key 65 | 66 | command <<< 67 | # tabix -p vcf ${vcf}; 68 | # tabix -h ${vcf} ${contig} | bgzip -c > ${prefix}.${contig}.vcf.gz 69 | url=$( gsutil signurl -d 24h ${svc_acct_key} ${vcf} | sed '1d' | cut -f 4 ); 70 | echo $url; 71 | svtk remote_tabix --header "$url" "${vcf_idx}" "$contig" \ 72 | | bgzip -c \ 73 | > "${prefix}.${contig}.vcf.gz" 74 | >>> 75 | 76 | output { 77 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 78 | } 79 | 80 | runtime { 81 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:0c21137179665254ca0d9ebe4d21251ae2ff6679337fd9b3e9d6e6ab808db6a8" 82 | preemptible: 3 83 | disks: "local-disk 100 SSD" 84 | } 85 | } 86 | 87 | task merge_pesr_depth { 88 | File pesr_vcf 89 | File depth_vcf 90 | String contig 91 | 92 | command <<< 93 | /opt/sv-pipeline/04_variant_resolution/scripts/PESR_RD_merge_wrapper.sh \ 94 | ${pesr_vcf} \ 95 | ${depth_vcf} \ 96 | ${contig} \ 97 | 
all_batches.pesr_depth.${contig}.vcf.gz 98 | >>> 99 | 100 | output { 101 | File merged_vcf = "all_batches.pesr_depth.${contig}.vcf.gz" 102 | } 103 | 104 | runtime { 105 | docker: "talkowski/sv-pipeline@sha256:3f9d99b8154dff67eb33b0da0a4358ac149461d65f819e7eb64958953d478900" 106 | preemptible: 1 107 | memory: "16 GB" 108 | disks: "local-disk 500 SSD" 109 | } 110 | } 111 | 112 | task concat_vcfs { 113 | Array[File] vcfs 114 | String prefix 115 | 116 | command <<< 117 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz; 118 | tabix -p vcf -f ${prefix}.vcf.gz 119 | >>> 120 | 121 | output { 122 | File concat_vcf = "${prefix}.vcf.gz" 123 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 124 | } 125 | 126 | runtime { 127 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 128 | preemptible: 1 129 | memory: "8 GB" 130 | disks: "local-disk 5000 SSD" 131 | } 132 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_resolve_complex_sv.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_resolve_complex_by_chrom/versions/63/plain-WDL/descriptor" as resolve_complex_by_chrom 2 | 3 | workflow resolve_complex_sv { 4 | File vcf 5 | File contigs 6 | Int max_shards_per_chrom 7 | Int min_variants_per_shard 8 | File cytobands 9 | File cytobands_idx 10 | File mei_bed 11 | File discfile_list 12 | File discfile_idx_list 13 | File pe_blacklist 14 | File pe_blacklist_idx 15 | File svc_acct_key 16 | File rf_cutoffs 17 | 18 | Array[Array[String]] contiglist = read_tsv(contigs) 19 | 20 | # Get SR count cutoff from RF metrics to use in single-ender rescan procedure 21 | call get_se_cutoff { 22 | input: 23 | rf_cutoffs=rf_cutoffs 24 | } 25 | 26 | 27 | scatter (contig in contiglist) { 28 | call subset_vcf { 29 | input: 30 | vcf=vcf, 31 | chrom=contig[0] 32 | } 33 | 34 | call resolve_complex_by_chrom.resolve_complex_by_chrom as resolve_perChrom { 35 | input: 36 | vcf=subset_vcf.single_chrom, 37 | vcf_idx=subset_vcf.single_chrom_idx, 38 | contig=contig[0], 39 | max_shards=max_shards_per_chrom, 40 | min_variants_per_shard=min_variants_per_shard, 41 | cytobands=cytobands, 42 | cytobands_idx=cytobands_idx, 43 | discfile_list=discfile_list, 44 | discfile_idx_list=discfile_idx_list, 45 | mei_bed=mei_bed, 46 | pe_blacklist=pe_blacklist, 47 | pe_blacklist_idx=pe_blacklist_idx, 48 | svc_acct_key=svc_acct_key, 49 | se_pe_cutoff=get_se_cutoff.median_PE_cutoff 50 | } 51 | } 52 | 53 | call resolve_complex_by_chrom.concat_vcfs as concat_resolved { 54 | input: 55 | vcfs=resolve_perChrom.res_vcf, 56 | vcftype="resolved" 57 | } 58 | 59 | # call resolve_complex_by_chrom.concat_vcfs as concat_unresolved { 60 | # input: 61 | # vcfs=resolve_perChrom.unres_vcf, 62 | # vcftype="unresolved" 63 | # } 64 | 65 | output { 66 | File resolved_vcf_merged = concat_resolved.concat_vcf 67 | File resolved_vcf_merged_idx = concat_resolved.concat_vcf_idx 68 | # File unresolved_vcf_merged = concat_unresolved.concat_vcf 69 | } 70 | } 71 | 72 | #Subset VCF per chromosome 73 | task subset_vcf { 74 | File vcf 75 | String chrom 76 | 77 | String prefix = basename(vcf, ".vcf.gz") 78 | 79 | command <<< 80 | tabix -p vcf ${vcf}; 81 | tabix --print-header ${vcf} ${chrom} | bgzip -c > ${prefix}.${chrom}.vcf.gz 82 | tabix -f ${prefix}.${chrom}.vcf.gz 83 | >>> 84 | 85 | output { 86 | File single_chrom = "${prefix}.${chrom}.vcf.gz" 87 
| File single_chrom_idx = "${prefix}.${chrom}.vcf.gz.tbi" 88 | } 89 | 90 | runtime { 91 | docker: "talkowski/sv-pipeline@sha256:96d07aa2c7c3e8bd12f2621a0644a5a8fca99f922926922724497ad2aad9364d" 92 | preemptible: 3 93 | disks: "local-disk 1000 SSD" 94 | } 95 | } 96 | 97 | # Get SE cutoff 98 | task get_se_cutoff { 99 | File rf_cutoffs 100 | 101 | command <<< 102 | mkdir rf_cutoff_files/ 103 | cat ${rf_cutoffs} | gsutil cp -I rf_cutoff_files/ 104 | while read file; do 105 | /opt/sv-pipeline/04_variant_resolution/scripts/convert_poisson_p.py \ 106 | $( awk -F '\t' '{if ( $5=="PE_log_pval") print $2 }' $file | head -n1 ) 107 | done < <( find rf_cutoff_files/ -name "*cutoffs" ) | \ 108 | Rscript -e "cat(floor(median(scan('stdin',quiet=T))),sep='\n')" > \ 109 | median_cutoff.txt 110 | >>> 111 | 112 | output { 113 | Int median_PE_cutoff = read_tsv("median_cutoff.txt")[0][0] 114 | } 115 | 116 | runtime { 117 | docker: "talkowski/sv-pipeline@sha256:96d07aa2c7c3e8bd12f2621a0644a5a8fca99f922926922724497ad2aad9364d" 118 | preemptible: 3 119 | } 120 | } 121 | 122 | # 123 | ##Combine multiple VCFs 124 | #task concat_vcfs { 125 | # Array[File] vcfs 126 | # 127 | # command <<< 128 | # vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > all_batches.vcf.gz 129 | # >>> 130 | # 131 | # output { 132 | # File concat_vcf = "all_batches.vcf.gz" 133 | # } 134 | # 135 | # runtime { 136 | # docker: "talkowski/sv-pipeline@sha256:b0455d30df2fbdbd4649466d968cada0a44d02a7159d94982308b629dd1aef78" 137 | # preemptible: 3 138 | # } 139 | #} 140 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_sharded_vcfcluster.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | # Workflow to shard a filtered vcf & run vcfcluster (sub-sub-sub workflow) 8 | workflow sharded_cluster { 9 | File vcf 10 | Int dist 11 | Float frac 12 | Int max_shards 13 | Int min_per_shard 14 | String prefix 15 | String contig 16 | String svtype 17 | Float sample_overlap 18 | String do_blacklist 19 | File blacklist 20 | File blacklist_idx 21 | Int svsize 22 | Array[String] svtypes 23 | 24 | #New as of November 2, 2018: perform sharding and return list of variant IDs 25 | # for each shard, rather than VCF shards themselves, which should dramatically 26 | # improve speed of sharding task (previously took 1-6 hours for 14k samples in 27 | # gnomAD v2) 28 | call shard_vcf { 29 | input: 30 | vcf=vcf, 31 | dist=dist, 32 | frac=frac, 33 | max_shards=max_shards, 34 | min_per_shard=min_per_shard, 35 | prefix="${prefix}.${contig}.${svtype}" 36 | } 37 | 38 | #Run vcfcluster per shard 39 | scatter ( VIDs_list in shard_vcf.VID_list_shards ) { 40 | call vcfcluster { 41 | input: 42 | vcf=vcf, 43 | VIDs=VIDs_list, 44 | prefix="${prefix}.${contig}.${svtype}", 45 | dist=dist, 46 | frac=frac, 47 | sample_overlap=sample_overlap, 48 | do_blacklist=do_blacklist, 49 | blacklist=blacklist, 50 | blacklist_idx=blacklist_idx, 51 | svsize=svsize, 52 | svtypes=svtypes 53 | } 54 | } 55 | 56 | #Merge shards per svtype 57 | call concat_vcfs as concat_shards { 58 | input: 59 | vcfs=vcfcluster.clustered_vcf, 60 | prefix="${prefix}.${contig}.${svtype}" 61 | } 62 | 63 | #Output 64 | output { 65 | File clustered_vcf = concat_shards.concat_vcf 66 | } 67 | } 68 | 69 | 70 | #Intelligently shard a VCF for parallelized clustering 71 | task shard_vcf { 72 | File vcf 73 | 
Int dist 74 | Float frac 75 | Int max_shards 76 | Int min_per_shard 77 | String prefix 78 | 79 | command <<< 80 | set -eu -o pipefail 81 | 82 | tabix -f -p vcf ${vcf} 83 | /opt/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh \ 84 | -D ${dist} \ 85 | -R ${frac} \ 86 | -L ${min_per_shard} \ 87 | -S ${max_shards} \ 88 | -P ${prefix} \ 89 | ${vcf} 90 | >>> 91 | 92 | output { 93 | Array[File] VID_list_shards = glob("*.VIDs.list") 94 | } 95 | 96 | runtime { 97 | preemptible: 1 98 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 99 | disks: "local-disk 250 SSD" 100 | } 101 | } 102 | 103 | 104 | #Run svtk vcfcluster 105 | task vcfcluster { 106 | File vcf 107 | File VIDs 108 | String prefix 109 | Int dist 110 | Float frac 111 | Float sample_overlap 112 | String do_blacklist 113 | File blacklist 114 | File blacklist_idx 115 | Int svsize 116 | Array[String] svtypes 117 | 118 | command <<< 119 | set -eu -o pipefail 120 | 121 | # Don't generate random characters for vcf name, it produces problems with caching on cromwell 122 | # You *could* pass a seed like so: 123 | # INPUT_HASH=$(tr -d '/+' < <(openssl enc -a -aes-256-ctr -pass pass:"$SEED" -nosalt /dev/null) | head -c16) 124 | # But if you hash filtered input vcf, you accomplish the same goal of avoiding similar-named files in the loop, 125 | # without introducing randomness: 126 | INPUT_HASH=$(md5sum ${vcf} | awk '{print $1}') 127 | # concat prefix and hash to create unique vcf name: 128 | VCF_NAME="${prefix}-$INPUT_HASH" 129 | 130 | #Prep vcf 131 | zcat ${vcf} | sed -n '1,1000p' | fgrep "#" > header.vcf 132 | zcat ${vcf} | fgrep -v "#" | fgrep -wf ${VIDs} | cat header.vcf - | bgzip -c \ 133 | > input.vcf.gz 134 | #Run clustering 135 | echo "input.vcf.gz" > unclustered_vcfs.list; 136 | if [ ${do_blacklist} == "YES" ]; then 137 | svtk vcfcluster unclustered_vcfs.list $VCF_NAME.vcf \ 138 | -d ${dist} \ 139 | -f ${frac} \ 140 | -x ${blacklist} \ 141 | -z ${svsize} \ 142 | -p ${prefix} \ 143 | -t ${sep=',' svtypes} \ 144 | -o ${sample_overlap} \ 145 | --preserve-ids \ 146 | --preserve-genotypes \ 147 | --preserve-header 148 | else 149 | svtk vcfcluster unclustered_vcfs.list $VCF_NAME.vcf \ 150 | -d ${dist} \ 151 | -f ${frac} \ 152 | -z ${svsize} \ 153 | -p ${prefix} \ 154 | -t ${sep=',' svtypes} \ 155 | -o ${sample_overlap} \ 156 | --preserve-ids \ 157 | --preserve-genotypes \ 158 | --preserve-header 159 | fi 160 | bgzip -f $VCF_NAME.vcf 161 | >>> 162 | 163 | output { 164 | # need to use glob since cromwell will not be aware of the value of INPUT hash 165 | File clustered_vcf = glob("${prefix}*.vcf.gz")[0] 166 | } 167 | 168 | runtime { 169 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 170 | preemptible: 1 171 | maxRetries: 1 172 | memory: "8 GB" 173 | disks: "local-disk 20 SSD" 174 | } 175 | } 176 | 177 | 178 | #Merge multiple vcfs 179 | task concat_vcfs { 180 | Array[File] vcfs 181 | String prefix 182 | 183 | command <<< 184 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 185 | tabix -f -p vcf ${prefix}.vcf.gz 186 | >>> 187 | 188 | output { 189 | File concat_vcf = "${prefix}.vcf.gz" 190 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 191 | } 192 | 193 | runtime { 194 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 195 | preemptible: 1 196 | maxRetries: 1 197 | disks: "local-disk 500 SSD" 198 | } 199 | } 
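The shard_vcf task above hands the actual splitting off to shardVCF_preClustering_part1.sh, which is not included in this listing. As a rough illustration of the VID-list idea described in the workflow comment (shards are defined by variant-ID lists rather than physical VCF slices, and task vcfcluster later re-extracts each shard's records with fgrep -wf against the parent VCF), the standalone sketch below splits an ID list into at most max_shards chunks of at least min_per_shard records. It is an illustration only: the real script also takes the -D distance and -R reciprocal-overlap parameters, presumably so that variants likely to cluster together land in the same shard, which this sketch ignores.

#!/usr/bin/env bash
# Minimal sketch of VID-list sharding (illustration only; the real logic lives
# in shardVCF_preClustering_part1.sh inside the sv-pipeline docker image and is
# additionally aware of the -D / -R clustering parameters).
set -eu -o pipefail

vcf=$1             # block-gzipped input VCF
min_per_shard=$2   # corresponds to -L above
max_shards=$3      # corresponds to -S above
prefix=$4          # corresponds to -P above

# Collect every variant ID once
zcat "$vcf" | grep -v '^#' | cut -f3 > all_VIDs.list
n=$( wc -l < all_VIDs.list )

# Cap the shard count at max_shards, but never drop below min_per_shard records
per_shard=$(( (n + max_shards - 1) / max_shards ))
if [ "$per_shard" -lt "$min_per_shard" ]; then
  per_shard=$min_per_shard
fi

# Emit one VID list per shard, named to match the *.VIDs.list glob used above
split -d -l "$per_shard" all_VIDs.list "$prefix".shard_
for f in "$prefix".shard_*; do
  mv "$f" "$f.VIDs.list"
done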
-------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04b_genotype_CPX_CNVs.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_genotype_CPX_CNVs_perBatch/versions/28/plain-WDL/descriptor" as rd_gt_perbatch 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to perform depth-based genotyping for a single vcf shard scattered 11 | # across batches on predicted CPX CNVs from 04b 12 | workflow genotype_CPX_CNVs { 13 | File vcf 14 | File gt_input_files 15 | Int n_per_split_small 16 | Int n_per_split_large 17 | Int n_RdTest_bins 18 | File svc_acct_key 19 | String prefix 20 | File famfile 21 | String contig 22 | 23 | Array[Array[String]] gt_input_array = read_tsv(gt_input_files) 24 | 25 | # Convert VCF to bed of CPX CNV intervals 26 | call get_cpx_cnv_intervals { 27 | input: 28 | vcf=vcf, 29 | prefix="${prefix}.${contig}" 30 | } 31 | 32 | # Scatter over each batch (row) in gt_input_files and run depth genotyping 33 | scatter (gt_inputs in gt_input_array) { 34 | call rd_gt_perbatch.genotype_CPX_CNVs_perBatch as gt_batch { 35 | input: 36 | cpx_bed=get_cpx_cnv_intervals.CPX_CNV_BED, 37 | batch=gt_inputs[0], 38 | coveragefile=gt_inputs[1], 39 | coveragefile_idx=gt_inputs[2], 40 | RD_depth_sepcutoff=gt_inputs[3], 41 | sampleslist=gt_inputs[4], 42 | famfile=gt_inputs[5], 43 | medianfile=gt_inputs[6], 44 | n_per_split_small=n_per_split_small, 45 | n_per_split_large=n_per_split_large, 46 | n_RdTest_bins=n_RdTest_bins, 47 | svc_acct_key=svc_acct_key 48 | } 49 | } 50 | 51 | # Merge melted genotypes across all batches 52 | call merge_melted_gts { 53 | input: 54 | melted_gts=gt_batch.genotypes, 55 | prefix="${prefix}.${contig}" 56 | } 57 | 58 | # Parse genotyping results 59 | call parse_gts { 60 | input: 61 | vcf=vcf, 62 | intervals=get_cpx_cnv_intervals.CPX_CNV_BED, 63 | genotypes=merge_melted_gts.merged_genotypes, 64 | prefix="${prefix}.${contig}", 65 | famfile=famfile, 66 | contig=contig 67 | } 68 | 69 | # Final output 70 | output { 71 | File cpx_depth_gt_resolved_vcf = parse_gts.cpx_depth_gt_resolved_vcf 72 | File reclassification_table = parse_gts.reclassification_table 73 | File interval_genotype_counts_table = parse_gts.gt_counts_table 74 | } 75 | } 76 | 77 | 78 | # Get CNV intervals from complex SV for depth genotyping 79 | task get_cpx_cnv_intervals { 80 | File vcf 81 | String prefix 82 | 83 | command <<< 84 | /opt/sv-pipeline/04_variant_resolution/scripts/gather_cpx_intervals_for_rd_gt.sh \ 85 | ${vcf} \ 86 | ${prefix}.complex_CNV_intervals_to_test.bed.gz 87 | >>> 88 | 89 | output { 90 | File CPX_CNV_BED = "${prefix}.complex_CNV_intervals_to_test.bed.gz" 91 | } 92 | 93 | runtime { 94 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 95 | preemptible: 1 96 | maxRetries: 1 97 | memory: "8 GB" 98 | disks: "local-disk 100 HDD" 99 | } 100 | } 101 | 102 | 103 | # Merge output from per-batch genotyping 104 | task merge_melted_gts { 105 | Array[File] melted_gts 106 | String prefix 107 | 108 | command <<< 109 | while read file; do 110 | zcat $file 111 | done < ${write_tsv(melted_gts)} \ 112 | | sort -Vk1,1 -k2,2n -k3,3n -k4,4V -k5,5V \ 113 | | bgzip -c \ 114 | > ${prefix}.CPX_intervals.merged_rd_genos.bed.gz 115 | >>> 116 | 117 | output { 118 | File merged_genotypes = 
"${prefix}.CPX_intervals.merged_rd_genos.bed.gz" 119 | } 120 | 121 | runtime { 122 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 123 | preemptible: 1 124 | maxRetries: 1 125 | disks: "local-disk 100 HDD" 126 | } 127 | } 128 | 129 | 130 | # Parse genotyping results 131 | task parse_gts { 132 | File vcf 133 | File intervals 134 | File genotypes 135 | File famfile 136 | String prefix 137 | String contig 138 | 139 | command <<< 140 | /opt/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh \ 141 | -R ${prefix}.CPXregenotyping_reclassification_table.${contig}.txt \ 142 | -G ${prefix}.CPXregenotyping_raw_genotype_counts_table.${contig}.txt \ 143 | ${vcf} \ 144 | ${intervals} \ 145 | ${genotypes} \ 146 | ${famfile} \ 147 | ${prefix}.postCPXregenotyping.${contig}.vcf.gz 148 | >>> 149 | 150 | output { 151 | File cpx_depth_gt_resolved_vcf = "${prefix}.postCPXregenotyping.${contig}.vcf.gz" 152 | File reclassification_table = "${prefix}.CPXregenotyping_reclassification_table.${contig}.txt" 153 | File gt_counts_table = "${prefix}.CPXregenotyping_raw_genotype_counts_table.${contig}.txt" 154 | } 155 | 156 | runtime { 157 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 158 | preemptible: 1 159 | maxRetries: 1 160 | disks: "local-disk 100 HDD" 161 | } 162 | } 163 | 164 | 165 | # Combine multiple VCFs 166 | task concat_vcfs { 167 | Array[File] vcfs 168 | String prefix 169 | 170 | command <<< 171 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.mod04b_final.vcf.gz 172 | >>> 173 | 174 | output { 175 | File concat_vcf = "${prefix}.mod04b_final.vcf.gz" 176 | } 177 | 178 | runtime { 179 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 180 | preemptible: 1 181 | maxRetries: 1 182 | disks: "local-disk 300 HDD" 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04b_scatter_CPX_genotyping.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04b_genotype_CPX_CNVs/versions/25/plain-WDL/descriptor" as cpx_gt 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to perform depth-based genotyping for a single vcf shard scattered 11 | # across batches on predicted CPX CNVs from 04b 12 | workflow scatter_CPX_genotyping { 13 | File vcf 14 | File vcf_idx 15 | Int n_master_vcf_shards 16 | Int n_master_min_vars_per_vcf_shard 17 | File gt_input_files 18 | Int n_per_split_small 19 | Int n_per_split_large 20 | Int n_RdTest_bins 21 | File svc_acct_key 22 | String prefix 23 | File famfile 24 | String contig 25 | 26 | # Shard VCF into even slices 27 | call shard_vcf { 28 | input: 29 | vcf=vcf, 30 | vcf_idx=vcf_idx, 31 | prefix="${prefix}.${contig}", 32 | n_shards=n_master_vcf_shards, 33 | min_vars_per_shard=n_master_min_vars_per_vcf_shard 34 | } 35 | 36 | # Scatter genotyping over shards 37 | scatter ( shard in shard_vcf.vcf_shards ) { 38 | # Run genotyping 39 | call cpx_gt.genotype_CPX_CNVs as genotype_shard { 40 | input: 41 | vcf=shard, 42 | gt_input_files=gt_input_files, 43 | n_per_split_large=n_per_split_large, 44 | n_per_split_small=n_per_split_small, 45 | n_RdTest_bins=n_RdTest_bins, 46 | svc_acct_key=svc_acct_key, 47 | 
prefix=prefix, 48 | famfile=famfile, 49 | contig=contig 50 | } 51 | } 52 | 53 | # Merge VCF shards 54 | call concat_vcfs { 55 | input: 56 | vcfs=genotype_shard.cpx_depth_gt_resolved_vcf, 57 | outfile_prefix="${prefix}.${contig}.resolved" 58 | } 59 | 60 | # Output merged VCF 61 | output { 62 | File cpx_depth_gt_resolved_vcf = concat_vcfs.concat_vcf 63 | File cpx_depth_gt_resolved_vcf_idx = concat_vcfs.concat_vcf_idx 64 | } 65 | } 66 | 67 | 68 | #Shard a vcf into even chunks 69 | task shard_vcf { 70 | File vcf 71 | File vcf_idx 72 | String prefix 73 | Int n_shards 74 | Int min_vars_per_shard 75 | 76 | command <<< 77 | tabix -H ${vcf} > header.vcf; 78 | zcat ${vcf} | grep -ve '^#' | cut -f3 > all_VIDs.list; 79 | nrecords=$( cat all_VIDs.list | wc -l ); 80 | rec_per_shard=$( echo "$(( $nrecords / ${n_shards} ))" | cut -f1 -d\. ); 81 | if [ $rec_per_shard -lt ${min_vars_per_shard} ]; then 82 | rec_per_shard=${min_vars_per_shard} 83 | fi; 84 | /opt/sv-pipeline/04_variant_resolution/scripts/evenSplitter.R \ 85 | -L $rec_per_shard \ 86 | all_VIDs.list \ 87 | VIDs_split_ 88 | max_suf=$( find `pwd` -name "VIDs_split_*" | awk -v FS="_" '{ print $NF }' | sort -nrk1,1 | sed -n '1p' ) 89 | for i in $( seq 1 "$max_suf" ); do 90 | zcat ${vcf} \ 91 | | fgrep -wf VIDs_split_"$i" \ 92 | | cat header.vcf - \ 93 | | bgzip -c \ 94 | > ${prefix}.shard_"$i".vcf.gz 95 | rm VIDs_split_"$i" 96 | done 97 | >>> 98 | 99 | output { 100 | Array[File] vcf_shards = glob("${prefix}.shard_*.vcf.gz") 101 | } 102 | 103 | runtime { 104 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 105 | preemptible: 1 106 | maxRetries: 1 107 | memory: "4 GB" 108 | disks: "local-disk 500 HDD" 109 | } 110 | } 111 | 112 | 113 | #General task to combine multiple VCFs 114 | task concat_vcfs { 115 | Array[File] vcfs 116 | String outfile_prefix 117 | 118 | command <<< 119 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${outfile_prefix}.vcf.gz; 120 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 121 | >>> 122 | 123 | output { 124 | File concat_vcf = "${outfile_prefix}.vcf.gz" 125 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 126 | } 127 | 128 | runtime { 129 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 130 | preemptible: 1 131 | maxRetries: 1 132 | memory: "4 GB" 133 | disks: "local-disk 500 HDD" 134 | } 135 | } 136 | 137 | 138 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_part2.wdl: -------------------------------------------------------------------------------- 1 | workflow Clean { 2 | 3 | Array[File] whitelists 4 | File normal_revise_vcf 5 | File multi_cnvs 6 | File vcftools_idx 7 | 8 | scatter ( white in whitelists ){ 9 | call cleanvcf2{ 10 | input: 11 | normal_revise_vcf=normal_revise_vcf, 12 | whitelist=white, 13 | multi_cnvs=multi_cnvs, 14 | vcftools_idx=vcftools_idx 15 | } 16 | } 17 | 18 | call combine{ 19 | input: 20 | shards=cleanvcf2.out 21 | } 22 | 23 | output { 24 | File out=combine.out 25 | } 26 | } 27 | 28 | 29 | task cleanvcf2 { 30 | 31 | File normal_revise_vcf 32 | File whitelist 33 | File multi_cnvs 34 | File vcftools_idx 35 | 36 | command { 37 | bash /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh ${normal_revise_vcf} ${whitelist} ${multi_cnvs} "output.txt" 38 | } 39 | 40 | runtime { 41 | preemptible: 1 42 | docker: 
"talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 43 | disks: "local-disk 250 SSD" 44 | bootDiskSizeGb: 30 45 | memory: "32 GB" 46 | } 47 | 48 | output { 49 | File out="output.txt" 50 | } 51 | } 52 | 53 | 54 | task combine { 55 | 56 | Array[File] shards 57 | 58 | command { 59 | cat ${sep=" " shards} > output.txt 60 | } 61 | 62 | runtime { 63 | preemptible: 1 64 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 65 | disks: "local-disk 200 SSD" 66 | memory: "4 GB" 67 | } 68 | 69 | output { 70 | File out="output.txt" 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_part4.wdl: -------------------------------------------------------------------------------- 1 | workflow Clean4{ 2 | 3 | Array[File] RD_CN_revises 4 | File normal_revise_vcf 5 | 6 | scatter ( RD_CN_revise in RD_CN_revises ){ 7 | call cleanvcf4 { 8 | input: 9 | RD_CN_revise=RD_CN_revise, 10 | normal_revise_vcf=normal_revise_vcf, 11 | } 12 | } 13 | 14 | call combine as combine_revised { 15 | input: 16 | shards=cleanvcf4.out, 17 | outfile="revise.vcf.lines.txt.gz" 18 | } 19 | 20 | call combine as combine_multi_IDs { 21 | input: 22 | shards=cleanvcf4.multi_IDs, 23 | outfile="multi.geno.ids.txt.gz" 24 | } 25 | 26 | output { 27 | File out=combine_revised.out 28 | File multi_IDs=combine_multi_IDs.out 29 | } 30 | } 31 | 32 | task cleanvcf4 { 33 | File RD_CN_revise 34 | File normal_revise_vcf 35 | 36 | command { 37 | bash /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh ${RD_CN_revise} ${normal_revise_vcf} 38 | } 39 | 40 | runtime { 41 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 42 | disks: "local-disk 200 SSD" 43 | memory: "16 GB" 44 | } 45 | 46 | output { 47 | File out="revise.vcf.lines.txt.gz" 48 | File multi_IDs="multi.geno.ids.txt.gz" 49 | } 50 | } 51 | 52 | task combine { 53 | Array[File] shards 54 | String outfile 55 | 56 | command { 57 | zcat ${sep=" " shards} | bgzip -c > ${outfile} 58 | } 59 | 60 | runtime { 61 | preemptible: 1 62 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 63 | disks: "local-disk 250 SSD" 64 | memory: "8 GB" 65 | } 66 | 67 | output { 68 | File out="${outfile}" 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_scatter.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:05_CleanVCF/versions/93/plain-WDL/descriptor" as CleanVCF_chr 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/75/plain-WDL/descriptor" as QC 3 | 4 | workflow CleanVCF_scatter{ 5 | 6 | File vcf 7 | File chrlist 8 | File backgroundlist 9 | File famfile 10 | Int max_shards_per_chrom_step1 11 | Int min_records_per_shard_step1 12 | Int samples_per_step2_shard 13 | File trio_famfile 14 | String ref_build 15 | String prefix 16 | File Sanders_2015_tarball 17 | File Collins_2017_tarball 18 | File Werling_2018_tarball 19 | File? 
outlier_samples_list 20 | 21 | 22 | Array[Array[String]] chrs=read_tsv(chrlist) 23 | 24 | 25 | scatter ( chr in chrs ){ 26 | call CleanVCF_chr.CleanVCF { 27 | input: 28 | vcf=vcf, 29 | Chr=chr[0], 30 | backgroundlist=backgroundlist, 31 | famfile=famfile, 32 | prefix=prefix, 33 | max_shards_per_chrom_step1=max_shards_per_chrom_step1, 34 | min_records_per_shard_step1=min_records_per_shard_step1, 35 | samples_per_step2_shard=samples_per_step2_shard, 36 | outlier_samples_list=outlier_samples_list 37 | } 38 | } 39 | 40 | call combine { 41 | input: 42 | vcfs=CleanVCF.out 43 | } 44 | 45 | call QC.master_vcf_qc as QC_all { 46 | input: 47 | vcf=combine.out, 48 | vcf_idx=combine.idx, 49 | famfile=trio_famfile, 50 | ref_build=ref_build, 51 | prefix="${prefix}_cleanedVCF", 52 | sv_per_shard=10000, 53 | samples_per_shard=50, 54 | Sanders_2015_tarball=Sanders_2015_tarball, 55 | Collins_2017_tarball=Collins_2017_tarball, 56 | Werling_2018_tarball=Werling_2018_tarball, 57 | contiglist=chrlist 58 | } 59 | 60 | # call subset_pass { 61 | # input: 62 | # vcf=combine.out, 63 | # prefix=prefix 64 | # } 65 | 66 | # call QC.master_vcf_qc as QC_pass { 67 | # input: 68 | # vcf=subset_pass.filtered_vcf, 69 | # famfile=trio_famfile, 70 | # ref_build=ref_build, 71 | # prefix="${prefix}_cleanedVCF_filterPass", 72 | # sv_per_shard=10000, 73 | # samples_per_shard=100, 74 | # Sanders_2015_tarball=Sanders_2015_tarball, 75 | # Collins_2017_tarball=Collins_2017_tarball, 76 | # Werling_2018_tarball=Werling_2018_tarball 77 | # } 78 | 79 | # call subset_fail { 80 | # input: 81 | # vcf=combine.out, 82 | # prefix=prefix 83 | # } 84 | 85 | # call QC.master_vcf_qc as QC_fail { 86 | # input: 87 | # vcf=subset_fail.filtered_vcf, 88 | # famfile=trio_famfile, 89 | # ref_build=ref_build, 90 | # prefix="${prefix}_cleanedVCF_filterFail", 91 | # sv_per_shard=10000, 92 | # samples_per_shard=100, 93 | # Sanders_2015_tarball=Sanders_2015_tarball, 94 | # Collins_2017_tarball=Collins_2017_tarball, 95 | # Werling_2018_tarball=Werling_2018_tarball 96 | # } 97 | 98 | output { 99 | File cleaned_vcf = combine.out 100 | File cleaned_vcf_idx = combine.idx 101 | File all_variants_QC = QC_all.sv_vcf_qc_output 102 | # File passing_variants_QC = QC_pass.sv_vcf_qc_output 103 | # File failing_variants_QC = QC_fail.sv_vcf_qc_output 104 | } 105 | } 106 | 107 | 108 | # Merge per-chromosome VCF shards 109 | task combine { 110 | 111 | Array[File] vcfs 112 | String prefix 113 | 114 | command { 115 | vcf-concat ${sep=" " vcfs} | vcf-sort | bgzip -c > ${prefix}.cleanedvcf.vcf.gz; 116 | tabix -p vcf ${prefix}.cleanedvcf.vcf.gz 117 | } 118 | 119 | runtime { 120 | preemptible: 1 121 | docker : "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 122 | disks: "local-disk 500 SSD" 123 | memory: "8 GB" 124 | } 125 | 126 | output { 127 | File out="${prefix}.cleanedvcf.vcf.gz" 128 | File idx="${prefix}.cleanedvcf.vcf.gz.tbi" 129 | } 130 | } 131 | 132 | 133 | # Task to sunset variants with VCF FILTER = PASS | MULTIALLELIC 134 | task subset_pass { 135 | File vcf 136 | String prefix 137 | 138 | command <<< 139 | zcat ${vcf} \ 140 | | awk -v FS="\t" -v OFS="\t" \ 141 | '{ if ($1~"#" || $7=="PASS" || $7=="MULTIALLELIC") print $0 }' \ 142 | | vcf-sort \ 143 | | bgzip -c \ 144 | > ${prefix}.passing_variants.vcf.gz 145 | >>> 146 | 147 | runtime { 148 | docker: "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 149 | preemptible: 1 150 | disks: "local-disk 500 SSD" 151 | } 152 | 153 | output { 
154 | File filtered_vcf = "${prefix}.passing_variants.vcf.gz" 155 | } 156 | } 157 | 158 | 159 | # Task to sunset variants with VCF FILTER != PASS | MULTIALLELIC 160 | task subset_fail { 161 | File vcf 162 | String prefix 163 | 164 | command <<< 165 | zcat ${vcf} \ 166 | | awk -v FS="\t" -v OFS="\t" \ 167 | '{ if ($1~"#" || ($7!="PASS" && $7!="MULTIALLELIC") ) print $0 }' \ 168 | | vcf-sort \ 169 | | bgzip -c \ 170 | > ${prefix}.failing_variants.vcf.gz 171 | >>> 172 | 173 | runtime { 174 | docker: "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 175 | preemptible: 1 176 | disks: "local-disk 500 SSD" 177 | } 178 | 179 | output { 180 | File filtered_vcf = "${prefix}.failing_variants.vcf.gz" 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_07/06_annotate.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:06_annotate_per_chrom/versions/18/plain-WDL/descriptor" as annotate_by_chrom 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to parallelize VCF annotation by chromosome 11 | workflow parallelized_annotation { 12 | String vcf 13 | File vcf_idx 14 | String prefix 15 | File contiglist 16 | File protein_coding_gtf 17 | # File antisense_gtf 18 | File lincRNA_gtf 19 | # File processed_transcript_gtf 20 | # File pseudogene_gtf 21 | File promoter_bed 22 | File noncoding_bed 23 | File svc_acct_key 24 | 25 | Array[Array[String]] contigs = read_tsv(contiglist) 26 | 27 | #Annotate, scattered by chromosome 28 | scatter (contig in contigs) { 29 | 30 | #Remote tabix each chromosome 31 | call subset_vcf { 32 | input: 33 | vcf=vcf, 34 | vcf_idx=vcf_idx, 35 | contig=contig[0], 36 | prefix="${prefix}.${contig[0]}", 37 | svc_acct_key=svc_acct_key 38 | } 39 | 40 | #Annotate per chromosome 41 | call annotate_by_chrom.annotate as annotate { 42 | input: 43 | vcf=subset_vcf.subsetted_vcf, 44 | prefix="${prefix}.${contig[0]}", 45 | protein_coding_gtf=protein_coding_gtf, 46 | lincRNA_gtf=lincRNA_gtf, 47 | promoter_bed=promoter_bed, 48 | noncoding_bed=noncoding_bed 49 | } 50 | } 51 | 52 | #Merge integrated vcfs across chromosomes 53 | call concat_vcfs { 54 | input: 55 | vcfs=annotate.annotated_vcf, 56 | prefix="${prefix}.annotated" 57 | } 58 | 59 | output { 60 | File annotated_vcf = concat_vcfs.concat_vcf 61 | File annotated_vcf_idx = concat_vcfs.concat_vcf_idx 62 | } 63 | } 64 | 65 | 66 | #Remote tabix a single chromosome per VCFs 67 | task subset_vcf { 68 | String vcf 69 | File vcf_idx 70 | String contig 71 | String prefix 72 | File svc_acct_key 73 | 74 | command <<< 75 | #Remote tabix to chromosome of interest 76 | url=$( gsutil signurl -d 24h ${svc_acct_key} ${vcf} | sed '1d' | cut -f 4 ); 77 | echo "$url"; 78 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" \ 79 | | bgzip -c > "${prefix}.${contig}.vcf.gz" 80 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 81 | >>> 82 | 83 | output { 84 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 85 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 86 | } 87 | 88 | runtime { 89 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:13da9601b97e08ce2abb1aca494551dc7c09920e46dcca11768cd6aff3db37e5" 90 | preemptible: 1 91 | maxRetries: 1 92 | disks: "local-disk 50 SSD" 93 | } 94 | } 95 | 96 | 97 | #Merge multiple vcfs 
98 | task concat_vcfs { 99 | Array[File] vcfs 100 | String prefix 101 | 102 | command <<< 103 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 104 | tabix -f -p vcf ${prefix}.vcf.gz 105 | >>> 106 | 107 | output { 108 | File concat_vcf = "${prefix}.vcf.gz" 109 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 110 | } 111 | 112 | runtime { 113 | docker: "talkowski/sv-pipeline@sha256:6727434a18800d0453a973ca2386325b6b75330b6d05dd014ddb4bcd91dba31b" 114 | preemptible: 1 115 | maxRetries: 1 116 | disks: "local-disk 1000 SSD" 117 | } 118 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_07/06_annotate_per_chrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to annotate output of cleanVCF 9 | workflow annotate { 10 | File vcf 11 | String prefix 12 | File protein_coding_gtf 13 | # File antisense_gtf 14 | File lincRNA_gtf 15 | # File processed_transcript_gtf 16 | # File pseudogene_gtf 17 | File promoter_bed 18 | File noncoding_bed 19 | 20 | call annotate_coding as annotate_protein_coding { 21 | input: 22 | vcf=vcf, 23 | gtf=protein_coding_gtf, 24 | prefix=prefix, 25 | gene_set="protein_coding" 26 | } 27 | 28 | # call annotate_coding as annotate_antisense { 29 | # input: 30 | # vcf=vcf, 31 | # gtf=antisense_gtf, 32 | # prefix=prefix, 33 | # gene_set="antisense" 34 | # } 35 | 36 | call annotate_coding as annotate_lincRNA { 37 | input: 38 | vcf=vcf, 39 | gtf=lincRNA_gtf, 40 | prefix=prefix, 41 | gene_set="lincRNA" 42 | } 43 | 44 | # call annotate_coding as annotate_processed_transcript { 45 | # input: 46 | # vcf=vcf, 47 | # gtf=processed_transcript_gtf, 48 | # prefix=prefix, 49 | # gene_set="processed_transcript" 50 | # } 51 | 52 | # call annotate_coding as annotate_pseudogene { 53 | # input: 54 | # vcf=vcf, 55 | # gtf=pseudogene_gtf, 56 | # prefix=prefix, 57 | # gene_set="pseudogene" 58 | # } 59 | 60 | call annotate_noncoding as annotate_promoter { 61 | input: 62 | vcf=vcf, 63 | bed=promoter_bed, 64 | prefix=prefix, 65 | noncoding_set="promoter" 66 | } 67 | 68 | call annotate_noncoding as annotate_noncoding_elements { 69 | input: 70 | vcf=vcf, 71 | bed=noncoding_bed, 72 | prefix=prefix, 73 | noncoding_set="noncoding" 74 | } 75 | 76 | call merge_annotations { 77 | input: 78 | vcf=vcf, 79 | protein_coding_vcf=annotate_protein_coding.annotated_vcf, 80 | lincRNA_vcf=annotate_lincRNA.annotated_vcf, 81 | promoter_vcf=annotate_promoter.annotated_vcf, 82 | noncoding_vcf=annotate_noncoding_elements.annotated_vcf, 83 | prefix=prefix 84 | } 85 | 86 | output { 87 | File annotated_vcf = merge_annotations.annotated_vcf 88 | } 89 | } 90 | 91 | task annotate_coding { 92 | File vcf 93 | File gtf 94 | String prefix 95 | String gene_set 96 | 97 | command <<< 98 | set -euo pipefail 99 | # Note: as of BEDTools 2.28, there are issues with reading bgzip-compressed files 100 | # directly into intersect/coverage, so GTF needs to be decompressed first 101 | zcat ${gtf} > decompressed.gtf 102 | svtk annotate \ 103 | --gencode decompressed.gtf \ 104 | ${vcf} \ 105 | ${prefix}.${gene_set}.vcf 106 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 107 | new=$( cut -f1 ${prefix}.${gene_set}.vcf | fgrep -v "#" | wc -l ) 108 | if [ "$new" -ne "$orig" ]; then 109 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 110 | 
exit 1 111 | fi 112 | bgzip -f ${prefix}.${gene_set}.vcf 113 | >>> 114 | 115 | output { 116 | File annotated_vcf = "${prefix}.${gene_set}.vcf.gz" 117 | } 118 | 119 | runtime { 120 | preemptible: 1 121 | maxRetries: 1 122 | disks: "local-disk 50 SSD" 123 | memory: "4 GB" 124 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 125 | } 126 | } 127 | 128 | task annotate_noncoding { 129 | File vcf 130 | File bed 131 | String prefix 132 | String noncoding_set 133 | 134 | command <<< 135 | set -euo pipefail 136 | svtk annotate \ 137 | --noncoding ${bed} \ 138 | ${vcf} \ 139 | ${prefix}.${noncoding_set}.vcf 140 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 141 | new=$( cut -f1 ${prefix}.${noncoding_set}.vcf | fgrep -v "#" | wc -l ) 142 | if [ "$new" -ne "$orig" ]; then 143 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 144 | exit 1 145 | fi 146 | bgzip -f ${prefix}.${noncoding_set}.vcf 147 | >>> 148 | 149 | output { 150 | File annotated_vcf = "${prefix}.${noncoding_set}.vcf.gz" 151 | } 152 | 153 | runtime { 154 | preemptible: 1 155 | maxRetries: 1 156 | disks: "local-disk 50 SSD" 157 | memory: "4 GB" 158 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 159 | } 160 | } 161 | 162 | task merge_annotations { 163 | File vcf 164 | File protein_coding_vcf 165 | # File antisense_vcf 166 | File lincRNA_vcf 167 | # File processed_transcript_vcf 168 | # File pseudogene_vcf 169 | File promoter_vcf 170 | File noncoding_vcf 171 | String prefix 172 | 173 | command <<< 174 | set -euo pipefail 175 | /opt/sv-pipeline/05_annotation/scripts/merge_annotations.py \ 176 | ${vcf} \ 177 | ${protein_coding_vcf} \ 178 | ${lincRNA_vcf} \ 179 | ${promoter_vcf} \ 180 | ${noncoding_vcf} \ 181 | ${prefix}.annotated.vcf 182 | bgzip ${prefix}.annotated.vcf 183 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 184 | new=$( zcat ${prefix}.annotated.vcf.gz | cut -f1 | fgrep -v "#" | wc -l ) 185 | if [ "$new" -ne "$orig" ]; then 186 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 187 | exit 1 188 | fi 189 | >>> 190 | 191 | output { 192 | File annotated_vcf = "${prefix}.annotated.vcf.gz" 193 | } 194 | 195 | runtime { 196 | preemptible: 1 197 | maxRetries: 1 198 | disks: "local-disk 250 SSD" 199 | memory: "8 GB" 200 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 201 | } 202 | } --------------------------------------------------------------------------------
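The module 05-07 workflows above all follow the same scatter/gather shape: subset the cohort VCF to a single contig (remotely via gsutil signurl plus svtk remote_tabix when the VCF lives in a bucket, or locally with tabix), run the per-contig task, then stitch the shards back together with vcf-concat, vcf-sort, bgzip, and tabix. A minimal local sketch of that loop is shown below, assuming a one-contig-per-line contig list and a placeholder process_one_contig.sh standing in for any of the per-contig tasks; the helper name is illustrative and not part of the pipeline.

#!/usr/bin/env bash
# Local sketch of the per-chromosome scatter/gather pattern used throughout
# modules 05-07 (illustration only; in production this is driven by Cromwell
# scatter blocks rather than a serial shell loop).
set -eu -o pipefail

vcf=$1         # block-gzipped, tabix-indexed cohort VCF
contiglist=$2  # one contig per line
prefix=$3

while read -r contig; do
  # Scatter: subset to a single contig (remote runs replace this with
  # gsutil signurl + svtk remote_tabix, as in the subset_vcf tasks above)
  tabix -h "$vcf" "$contig" | bgzip -c > "$prefix.$contig.vcf.gz"

  # Per-contig work goes here; process_one_contig.sh is a hypothetical
  # stand-in for the resolve/integrate/annotate steps shown above
  ./process_one_contig.sh "$prefix.$contig.vcf.gz" "$prefix.$contig.done.vcf.gz"
done < "$contiglist"

# Gather: concatenate, sort, compress, and index, as in the concat_vcfs tasks
vcf-concat "$prefix".*.done.vcf.gz | vcf-sort -c | bgzip -c > "$prefix.merged.vcf.gz"
tabix -p vcf -f "$prefix.merged.vcf.gz"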