├── LICENSE ├── README.md ├── data_and_refs ├── SV_colors.txt └── gnomAD_population_colors.txt ├── gnomad_sv_analysis_scripts ├── .DS_Store ├── apply_minGQ_filter.py ├── clean_frequencies_table.R ├── correlate_batches_singlePop.R ├── count_mendelian_violations.py ├── create_minGQ_lookup_table.R ├── create_minGQ_tranches_table.R ├── determine_lowQual_site_categories.R ├── determine_svcount_outliers.R ├── filter_cleanup_and_QUAL_recalibration.py ├── find_batch_effects.R ├── find_batch_effects.shard_helper.R ├── gather_trio_MVR_data.py ├── gather_trio_genos.py ├── helper_median_counts_per_trio.R ├── label_batch_effects.py ├── make_batch_effect_reclassification_table.R ├── make_batch_pairs_list.R ├── merge_batch_freq_tables.R ├── merge_filter_columns.py ├── optimize_GQ_ROC.R ├── optimize_minGQ_ROC_v2.R ├── parse_KING_results.R ├── prePCA_vcf_filter.py ├── runPCA_labelAncestries.R ├── subset_minGQ_trio_data.R └── sum_svcounts_perSample.R ├── gnomad_sv_analysis_wdls ├── MVR_collection_helper.wdl ├── apply_GQ_filter.wdl ├── assign_lowQuality_sites.wdl ├── check_batch_effects.wdl ├── compute_simple_AFs_singleChrom.wdl ├── filter_cleanup_and_QUAL_recalibration.wdl ├── final_outlier_sample_filter.wdl ├── gather_batch_effects_helper.wdl ├── label_ancestries_and_relatedness.wdl ├── minGQ_ROC_helper.wdl ├── minGQ_filter_procedure_v2.wdl ├── minGQ_filter_procedure_wrapper.wdl ├── optimize_GQ_filter.wdl ├── prune_and_add_vafs.wdl └── sharded_vcf2bed.wdl ├── gnomad_sv_manuscript_code ├── .DS_Store ├── metadata_generation │ ├── AF_reconcilliation_helper.R │ ├── collect_sample_level_summary_data.R │ ├── merge_downsample_sv_per_gene.R │ ├── seed_downsampling.R │ └── simplify_downsample_results.R └── plotting_code │ ├── chromosome_sv_density_analysis.R │ ├── gene_level_analysis.R │ ├── high_low_quality_callset_comparisons.plot.R │ ├── noncoding_analysis.plot.R │ ├── plot_PCAs.R │ ├── plot_deNovo_rate_analysis.R │ ├── plot_downsampling_analyses.R │ ├── plot_sample_level_analysis.R │ ├── secondary_prop_singletons_analysis.R │ ├── snv_sv_ld_analyses.plot.R │ ├── thousand_genomes_comparisons.plot.R │ ├── ukbb_gd_analysis.plot.R │ └── vcf_wide_site_summaries.plot.R ├── gnomad_sv_pipeline_scripts ├── .DS_Store ├── module_00 │ ├── .DS_Store │ ├── vcf2baf.sh │ └── vcf2baf_helper.py └── module_01 │ ├── make_depth_rdtest_bed.py │ └── make_pesr_rdtest_bed.py └── gnomad_sv_pipeline_wdls ├── .DS_Store ├── module_00 ├── 00_batch_BAF_merging.wdl ├── 00_batch_PESRRD_merging.wdl ├── 00_batch_SR_merging.wdl ├── 00_batch_evidence_merging.wdl ├── 00_depth_preprocessing.wdl ├── 00_pesr_preprocessing.wdl └── 00_pesr_processing_single_algorithm.wdl ├── module_01 ├── 01_depth_clustering.wdl ├── 01_depth_clustering_by_chrom.wdl ├── 01_pesr_clustering.wdl └── 01_pesr_clustering_single_algorithm.wdl ├── module_02 ├── 02_aggregate.wdl ├── 02_assess_evidence_single_vcf.wdl ├── 02_baftest.wdl ├── 02_baftest_autosome.wdl ├── 02_petest.wdl ├── 02_petest_allosome.wdl ├── 02_petest_autosome.wdl ├── 02_rdtest.wdl ├── 02_rdtest_allosome.wdl ├── 02_rdtest_autosome.wdl ├── 02_srtest.wdl ├── 02_srtest_allosome.wdl └── 02_srtest_autosome.wdl ├── module_03 ├── 03_filter_outliers.wdl ├── 03_filter_vcf.wdl └── 03_variant_filtering.wdl ├── module_04 ├── 04_preprocess.wdl ├── 04_v2_PE_genotyping_train.wdl ├── 04_v2_RD_genotyping_train.wdl ├── 04_v2_SR_genotyping_train.wdl ├── 04_v2_genotype_batch.wdl ├── 04_v2_genotype_depth_part1.wdl ├── 04_v2_genotype_depth_part2.wdl ├── 04_v2_genotype_pesr_part1.wdl ├── 04_v2_genotype_pesr_part2.wdl └── 
04_v2_make_cohort_VCFs.wdl ├── module_05 ├── 04_bp_overlap_filter_by_chrom.wdl ├── 04_genotype_CPX_CNVs.wdl ├── 04_genotype_CPX_CNVs_perBatch.wdl ├── 04_integrate_batches.wdl ├── 04_integrate_resolved_vcfs.wdl ├── 04_merge_allvar_invonly_vcfs.wdl ├── 04_pesr_depth_overlap.wdl ├── 04_resolve_complex_by_chrom.wdl ├── 04_resolve_complex_sv.wdl ├── 04_sharded_vcfcluster.wdl ├── 04b_batch_integration.wdl ├── 04b_genotype_CPX_CNVs.wdl ├── 04b_preprocess.wdl ├── 04b_resolve_complex_sv.wdl ├── 04b_scatter_CPX_genotyping.wdl └── 04b_vcfcluster_single_chrom.wdl ├── module_06 ├── 05_cleanVCF.wdl ├── 05_cleanVCF_part2.wdl ├── 05_cleanVCF_part4.wdl └── 05_cleanVCF_scatter.wdl └── module_07 ├── 06_annotate.wdl └── 06_annotate_per_chrom.wdl /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Talkowski Laboratory 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data_and_refs/SV_colors.txt: -------------------------------------------------------------------------------- 1 | DEL #D43925 2 | DUP #2376B2 3 | MCNV #7459B2 4 | INS #D474E0 5 | INV #FA931E 6 | CPX #71E38C 7 | OTH #397246 8 | -------------------------------------------------------------------------------- /data_and_refs/gnomAD_population_colors.txt: -------------------------------------------------------------------------------- 1 | pop color name 2 | AFR #941594 "African/African-American" 3 | AMR #EE1D24 Latino 4 | EAS #108C43 "East Asian" 5 | EUR #69A5CC European 6 | OTH #ABB9B9 "Other/Unk." 7 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_analysis_scripts/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/clean_frequencies_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Clean allele frequency table across batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | svtypes <- c("DEL","DUP","INS","INV","CPX","BND") 15 | allpops <- c("AFR","ASN","EUR","HSP") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Collapse multiallelic ACs 22 | clean.ACs <- function(ACs){ 23 | ACs.to.clean <- grep(",",ACs,fixed=T) 24 | cleaned.ACs <- as.vector(as.numeric(sapply(ACs[ACs.to.clean],function(s){ 25 | return(sum(as.numeric(unlist(strsplit(s,split=",")))[-2])) 26 | }))) 27 | ACs[ACs.to.clean] <- cleaned.ACs 28 | return(as.numeric(ACs)) 29 | } 30 | #Import a single table of freq data 31 | import.freq.table <- function(path,pops){ 32 | #Read data 33 | dat <- read.table(path,header=T,comment.char="",sep="\t") 34 | colnames(dat)[1:3] <- c("VID","SVLEN","SVTYPE") 35 | #Clean multiallelic ACs 36 | dat[,grep("_AC",colnames(dat),fixed=T)] <- apply(dat[,grep("_AC",colnames(dat),fixed=T)],2,clean.ACs) 37 | #Convert all ANs to numerics 38 | dat[,grep("_AN",colnames(dat),fixed=T)] <- apply(dat[,grep("_AN",colnames(dat),fixed=T)],2,as.numeric) 39 | #Double counts for MCNVs (since first allele for all MCNVs is nulled out) 40 | MCNV.idx <- grep("_MCNV_",dat[,1],fixed=T) 41 | dat[MCNV.idx,grep("_AC",colnames(dat),fixed=T)] <- 2*dat[MCNV.idx,grep("_AC",colnames(dat),fixed=T)] 42 | dat[MCNV.idx,grep("_AN",colnames(dat),fixed=T)] <- 2*dat[MCNV.idx,grep("_AN",colnames(dat),fixed=T)] 43 | #Adjust calls on sex chromosomes 44 | for(pop in pops){ 45 | x.idx <- unique(c(grep("_X_",dat[,1],fixed=T), 46 | grep("_chrX_",dat[,1],fixed=T))) 47 | y.idx <- unique(c(grep("_Y_",dat[,1],fixed=T), 48 | grep("_chrY_",dat[,1],fixed=T))) 49 | master.AC.idx <- which(colnames(dat)==paste(pop,"AC",sep="_")) 50 | master.AN.idx <- which(colnames(dat)==paste(pop,"AN",sep="_")) 51 | male.AC.idx <- which(colnames(dat)==paste(pop,"MALE","AC",sep="_")) 52 | male.AN.idx <- which(colnames(dat)==paste(pop,"MALE","AN",sep="_")) 53 | female.AC.idx <- which(colnames(dat)==paste(pop,"FEMALE","AC",sep="_")) 54 | female.AN.idx <- which(colnames(dat)==paste(pop,"FEMALE","AN",sep="_")) 55 | #For variants on chrX, overwrite master counts with female-specific counts 56 | dat[x.idx,master.AC.idx] <- dat[x.idx,female.AC.idx] 57 | dat[x.idx,master.AN.idx] <- dat[x.idx,female.AN.idx] 58 | #For all variants on chrY, overwrite master counts with male-specific counts 59 | dat[y.idx,master.AC.idx] <- dat[y.idx,male.AC.idx] 60 | dat[y.idx,master.AN.idx] <- dat[y.idx,male.AN.idx] 61 | } 62 | #Drop all male- and female-specific columns from table 63 | dat <- dat[,-grep("MALE",colnames(dat),fixed=T)] 64 | #Add batch name to colnames 65 | return(dat) 66 | } 67 | 68 | 69 | ###Read command-line arguments 70 | args <- commandArgs(trailingOnly=T) 71 | INFILE <- as.character(args[1]) 72 | OUTFILE <- as.character(args[2]) 73 | 74 | # #Dev parameters (local) 75 | # INFILE <- "~/scratch/gnomAD_v2_SV_PCRPLUS_Q1_batch_1.frequencies.preclean.txt.gz" 76 | 77 | 78 | ###Process input data 79 | dat <- import.freq.table(path=INFILE,pops=allpops) 80 | 81 | 82 | ###Write output data 83 | write.table(dat,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 84 | system(paste("gzip -f ",OUTFILE,sep=""),wait=T,intern=F) 85 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/find_batch_effects.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | allpops <- c("AFR","ASN","EUR","HSP") 15 | svtypes <- c("DEL","DUP","MCNV","INS","INV","CPX") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Allele frequency correlation plot between datasets 22 | plot.AFcorr <- function(dat,batch1=NULL,batch2=NULL,title=NULL,axlims){ 23 | #Prepare plot 24 | AF.pairs <- data.frame(dat$b1.AF,dat$b2.AF) 25 | #Artificially assign non-zero AFs for sites that appear in 0 samples 26 | AF.pairs[which(AF.pairs[,1]==0),1] <- 1/900 27 | AF.pairs[which(AF.pairs[,2]==0),2] <- 1/900 28 | logscale.all <- log10(as.numeric(unlist(sapply(c(0:9),function(i){(1:9)*(10^i)})))) 29 | logscale.major <- 0:9 30 | major.labels <- sapply(logscale.major,function(i){expression(paste(i^"th"))}) 31 | par(mar=c(3.2,1.5,1.5,3.2)) 32 | plot(x=log10(c(min(AF.pairs[,1],na.rm=T),1)), 33 | y=log10(c(min(AF.pairs[,2],na.rm=T),1)), 34 | type="n",xaxt="n",yaxt="n",xlab="",ylab="", 35 | xlim=log10(axlims),ylim=log10(axlims)) 36 | axis(1,at=-logscale.all,labels=NA,tck=-0.015,lwd=0.7) 37 | axis(1,at=-logscale.major,labels=NA,tck=-0.03,lwd=1.1) 38 | mtext(1,text=bquote("log"[10] ~ "("*.(batch1) ~ "AF)"),line=2) 39 | axis(4,at=-logscale.all,labels=NA,tck=-0.015,lwd=0.7) 40 | axis(4,at=-logscale.major,labels=NA,tck=-0.03,lwd=1.1) 41 | mtext(4,text=bquote("log"[10] ~ "("*.(batch2) ~ "AF)"),line=2) 42 | sapply(-logscale.major,function(i){ 43 | # axis(1,at=i,labels=bquote('10'^.(i)),tick=F,line=-0.6,cex.axis=0.8) 44 | # axis(4,at=i,labels=bquote('10'^.(i)),tick=F,line=-0.4,cex.axis=0.8,las=2) 45 | axis(1,at=i,labels=i,tick=F,line=-0.6,cex.axis=0.8) 46 | axis(4,at=i,labels=i,tick=F,line=-0.4,cex.axis=0.8,las=2) 47 | }) 48 | abline(h=log10(1/900),v=log10(1/900),lty=3) 49 | axis(1,at=log10(1/900),labels="AC=0",line=-0.8,cex.axis=0.7,tick=F) 50 | axis(4,at=log10(1/900),labels="AC=0",line=-0.6,cex.axis=0.7,tick=F,las=2) 51 | mtext(3,text=title,line=0.1,font=2) 52 | 53 | #Add points 54 | pt.cex <- 0.4 55 | alpha <- 0.25 56 | points(x=log10(AF.pairs[which(dat$bonf.p>=0.05),1]), 57 | y=log10(AF.pairs[which(dat$bonf.p>=0.05),2]), 58 | pch=19,cex=pt.cex,lwd=0, 59 | col=adjustcolor("gray50",alpha=alpha)) 60 | points(x=log10(AF.pairs[which(dat$bonf.p<0.05),1]), 61 | y=log10(AF.pairs[which(dat$bonf.p<0.05),2]), 62 | pch=19,cex=pt.cex,lwd=0, 63 | col=adjustcolor("red",alpha=alpha)) 64 | 65 | #Add stats 66 | abline(lm(AF.pairs[,1] ~ AF.pairs[,2]),col="gray10",lty=2) 67 | AB.cor <- format(round(cor(AF.pairs[,1],AF.pairs[,2])^2,3),nsmall=3) 68 | text(x=par("usr")[1],y=par("usr")[4]-(0.085*(par("usr")[4]-par("usr")[3])), 69 | labels=bquote(italic(R)^2 == .(AB.cor)),cex=1.4,pos=4) 70 | } 71 | 72 | 73 | ###Read command-line arguments 74 | args <- commandArgs(trailingOnly=T) 75 | infile <- as.character(args[1]) 76 | batch1 <- as.character(args[2]) 77 | batch2 <- as.character(args[3]) 78 | OUTPREFIX <- as.character(args[4]) 79 | 80 | # #Dev parameters: 81 | # infile <- "~/scratch/gnomAD_AF_table.merged.txt.gz" 82 | # infile <- 
"~/scratch/gnomAD_v2_SV_MASTER.gnomAD_v2_SV_PCRPLUS_Q1_batch_1_vs_gnomAD_v2_SV_PCRMINUS_Q2_batch_1.AF_comparison_table.txt.gz" 83 | # batch1 <- "gnomAD_v2_SV_PCRPLUS_Q1_batch_1" 84 | # batch2 <- "gnomAD_v2_SV_PCRMINUS_Q2_batch_1" 85 | # OUTPREFIX <- "~/scratch/gnomAD_v2_SV_MASTER" 86 | 87 | 88 | ###Process input data 89 | dat <- read.table(infile,header=T,sep="\t",comment.char="") 90 | dat$bonf.p <- p.adjust(dat$chisq.p,method="bonferroni") 91 | write.table(dat,paste(OUTPREFIX,".",batch1,"_vs_",batch2,".freq_table_wBonferroni.txt",sep=""), 92 | col.names=T,row.names=F,sep="\t",quote=F) 93 | 94 | 95 | ###Write list of significant batch effect variants 96 | bad.vars <- dat$VID[which(dat$bonf.p<0.05)] 97 | write.table(bad.vars,paste(OUTPREFIX,".",batch1,"_vs_",batch2,".batch_effect_variants.txt",sep=""), 98 | col.names=F,row.names=F,sep="\t",quote=F) 99 | 100 | 101 | ###Plot AF correlations, one per SVTYPE 102 | axlims <- c(1/900,1) 103 | png(paste(OUTPREFIX,".",batch1,"_vs_",batch2,".AF_correlation_scatterplot.ALL.png",sep=""), 104 | height=6*300,width=6*300,res=400) 105 | plot.AFcorr(dat=dat,batch1=batch1,batch2=batch2,title="All SV",axlims=axlims) 106 | dev.off() 107 | sapply(svtypes,function(svtype){ 108 | subdat <- dat[grep(svtype,dat$VID,fixed=T),] 109 | if(nrow(subdat)>0){ 110 | png(paste(OUTPREFIX,".",batch1,"_vs_",batch2,".AF_correlation_scatterplot.",svtype,".png",sep=""), 111 | height=6*300,width=6*300,res=400) 112 | plot.AFcorr(dat=subdat,batch1=batch1,batch2=batch2,title=svtype,axlims=axlims) 113 | dev.off() 114 | } 115 | }) 116 | 117 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/find_batch_effects.shard_helper.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | allpops <- c("AFR","ASN","EUR","HSP") 15 | 16 | 17 | ################### 18 | ###HELPER FUNCTIONS 19 | ################### 20 | #For any two batches, find most comparable AFs for each variant and run chi-sqared test 21 | compare.batches <- function(dat,batch1,batch2,min.AN=60){ 22 | #Subset data for each batch (for convenience) 23 | #1: restrict to sites with >0 AC in at least one batch 24 | b1.dat <- dat[,c(1:3,grep(batch1,colnames(dat),fixed=T))] 25 | b1.maxAC <- apply(b1.dat[,grep("_AC",colnames(b1.dat),fixed=T)],1,max) 26 | if(batch2 != "ALL_OTHERS"){ 27 | b2.dat <- dat[,c(1:3,grep(batch2,colnames(dat),fixed=T))] 28 | b2.maxAC <- apply(b2.dat[,grep("_AC",colnames(b2.dat),fixed=T)],1,max) 29 | }else{ 30 | b2.consolidated.dat <- do.call("cbind", lapply(allpops,function(pop){ 31 | ACs <- apply(dat[,setdiff(grep(paste(pop,"AC",sep="_"),colnames(dat),fixed=T), 32 | grep(batch1,colnames(dat),fixed=T))],1,sum,na.rm=T) 33 | ANs <- apply(dat[,setdiff(grep(paste(pop,"AN",sep="_"),colnames(dat),fixed=T), 34 | grep(batch1,colnames(dat),fixed=T))],1,sum,na.rm=T) 35 | dtmp <- data.frame(ANs,ACs) 36 | colnames(dtmp) <- c(paste(pop,"_AN.ALL_OTHERS",sep=""), 37 | paste(pop,"_AC.ALL_OTHERS",sep="")) 38 | return(dtmp) 39 | })) 40 | b2.dat <- cbind(dat[,1:3],b2.consolidated.dat) 41 | b2.maxAC <- apply(b2.dat[,grep("_AC",colnames(b2.dat),fixed=T)],1,max) 42 | } 43 | b1.dat <- b1.dat[which(b1.maxAC > 0 | b2.maxAC > 0),] 44 | b2.dat <- b2.dat[which(b1.maxAC > 0 | b2.maxAC > 0),] 45 | #Iterate over variants and process each 46 | res <- do.call("rbind", lapply(as.character(b1.dat$VID),function(VID){ 47 | #Find pop with largest min AN and at least one alternate allele between the two batches 48 | AN.bypop <- sapply(allpops,function(pop){ 49 | min(b1.dat[which(b1.dat$VID==VID), 50 | grep(paste(pop,"AN",sep="_"),colnames(b1.dat))], 51 | b2.dat[which(b2.dat$VID==VID), 52 | grep(paste(pop,"AN",sep="_"),colnames(b2.dat))], 53 | na.rm=T) 54 | }) 55 | AC.bypop <- sapply(allpops,function(pop){ 56 | max(b1.dat[which(b1.dat$VID==VID), 57 | grep(paste(pop,"AC",sep="_"),colnames(b1.dat))], 58 | b2.dat[which(b2.dat$VID==VID), 59 | grep(paste(pop,"AC",sep="_"),colnames(b2.dat))], 60 | na.rm=T) 61 | }) 62 | AN.bypop[which(AC.bypop<1)] <- 0 63 | #Only process if at least one pop has min AN > min.AN 64 | if(any(AN.bypop>min.AN)){ 65 | bestpop <- names(AN.bypop)[which(AN.bypop==max(AN.bypop,na.rm=T))] 66 | b1.AC <- as.numeric(b1.dat[which(b1.dat$VID==VID), 67 | grep(paste(bestpop,"AC",sep="_"),colnames(b1.dat),fixed=T)]) 68 | b1.AN <- as.numeric(b1.dat[which(b1.dat$VID==VID), 69 | grep(paste(bestpop,"AN",sep="_"),colnames(b1.dat),fixed=T)]) 70 | if(b1.AC>b1.AN){ 71 | b1.AC <- b1.AN 72 | } 73 | b1.AF <- b1.AC/b1.AN 74 | b2.AC <- as.numeric(b2.dat[which(b2.dat$VID==VID), 75 | grep(paste(bestpop,"AC",sep="_"),colnames(b2.dat),fixed=T)]) 76 | b2.AN <- as.numeric(b2.dat[which(b2.dat$VID==VID), 77 | grep(paste(bestpop,"AN",sep="_"),colnames(b2.dat),fixed=T)]) 78 | if(b2.AC>b2.AN){ 79 | b2.AC <- b2.AN 80 | } 81 | b2.AF <- b2.AC/b2.AN 82 | b1b2.p <- chisq.test(matrix(c(b1.AN-b1.AC,b1.AC, 83 | b2.AN-b2.AC,b2.AC), 84 | nrow=2,byrow=F))$p.value 85 | #Output row 86 | out.v <- data.frame("VID"=VID,"pop"=bestpop,"b1.AF"=b1.AF,"b2.AF"=b2.AF,"chisq.p"=b1b2.p) 87 | }else{ 88 | out.v <- 
data.frame("VID"=VID,"pop"=NA,"b1.AF"=NA,"b2.AF"=NA,"chisq.p"=NA) 89 | } 90 | return(out.v) 91 | })) 92 | rownames(res) <- NULL 93 | res <- res[which(!is.na(res$pop)),] 94 | # res$chisq.bonf <- p.adjust(res$chisq.p,method="bonferroni") 95 | res[,-c(1:2)] <- apply(res[,-(1:2)],2,as.numeric) 96 | return(res) 97 | } 98 | 99 | 100 | ###Read command-line arguments 101 | args <- commandArgs(trailingOnly=T) 102 | infile <- as.character(args[1]) 103 | batch1 <- as.character(args[2]) 104 | batch2 <- as.character(args[3]) 105 | OUTFILE <- as.character(args[4]) 106 | 107 | # #Dev parameters: 108 | # infile <- "~/scratch/gnomAD_v2_SV_MASTER.merged_AF_table.txt.gz" 109 | # batch1 <- "gnomAD_v2_SV_PCRMINUS_Q4_batch_4" 110 | # # batch2 <- "gnomAD_v2_SV_PCRMINUS_Q4_batch_5" 111 | # batch2 <- "ALL_OTHERS" 112 | 113 | ###Process data & write output 114 | dat <- read.table(infile,header=T,sep="\t",comment.char="") 115 | res <- compare.batches(dat=dat,batch1=batch1,batch2=batch2) 116 | write.table(res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 117 | 118 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/helper_median_counts_per_trio.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Helper script to count median # of filtered variants per trio 8 | # for minGQ optimization filtering workflow 9 | 10 | 11 | ###Set master parameters 12 | options(stringsAsFactors=F,scipen=1000) 13 | 14 | 15 | 16 | ################ 17 | ###RSCRIPT BLOCK 18 | ################ 19 | require(optparse) 20 | ###List of command-line options 21 | option_list <- list( 22 | make_option(c("--ID"), type="character", default="condition", 23 | help="condition ID [default %default]") 24 | ) 25 | 26 | ###Get command-line arguments & options 27 | args <- parse_args(OptionParser(usage="%prog INFILE FAMFILE OUTFILE", 28 | option_list=option_list), 29 | positional_arguments=TRUE) 30 | opts <- args$options 31 | 32 | ###Checks for appropriate positional arguments 33 | if(length(args$args) != 3){ 34 | stop("Incorrect number of required positional arguments\n") 35 | } 36 | 37 | ###Writes args & opts to vars 38 | INFILE <- args$args[1] 39 | FAMFILE <- args$args[2] 40 | OUTFILE <- args$args[3] 41 | ID <- opts$ID 42 | 43 | ###Reads data 44 | dat <- as.character(read.table(INFILE,header=F)[,1]) 45 | fams <- unique(as.character(read.table(FAMFILE,header=F)[,1])) 46 | 47 | ###Computes # of variants per family 48 | counts <- as.integer(sapply(fams,function(fam){ 49 | return(length(which(dat==fam))) 50 | })) 51 | 52 | ###Reports results 53 | out <- data.frame("condition"=ID, 54 | "hetsPerProband_median"=median(counts,na.rm=T), 55 | "hetsPerProband_Q1"=quantile(counts,probs=0.25,na.rm=T), 56 | "hetsPerProband_Q3"=quantile(counts,probs=0.75,na.rm=T)) 57 | colnames(out)[1] <- "#condition" 58 | write.table(out,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 59 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/make_batch_pairs_list.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 
6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Make list of all nonredundant pairs of batches from an input list of batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | 15 | 16 | ###Read command-line arguments 17 | args <- commandArgs(trailingOnly=T) 18 | in.list <- as.character(args[1]) 19 | OUTFILE <- as.character(args[2]) 20 | 21 | # #Dev parameters 22 | # in.list <- "~/scratch/af_test/input.list" 23 | # OUTFILE <- "~/scratch/batch_pairs_test.txt" 24 | 25 | 26 | ###Process input data 27 | batches <- as.character(read.table(in.list,header=F,sep="\t")[,1]) 28 | 29 | 30 | ###Generate list of all possible pairs 31 | out <- do.call("rbind", lapply(1:length(batches),function(a){ 32 | do.call("rbind", lapply(2:length(batches),function(b){ 33 | if(a 5 | # Distributed under terms of the MIT license. 6 | 7 | # Talkowski SV pipeline downstream analysis helper script 8 | 9 | # Merge AF tables across batches 10 | 11 | 12 | ###Set global parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | svtypes <- c("DEL","DUP","INS","INV","CPX","BND") 15 | allpops <- c("AFR","ASN","EUR","HSP") 16 | 17 | 18 | ################### 19 | ###HELPER FUNCTIONS 20 | ################### 21 | #Collapse multiallelic ACs 22 | clean.ACs <- function(ACs){ 23 | ACs.to.clean <- grep(",",ACs,fixed=T) 24 | cleaned.ACs <- as.vector(as.numeric(sapply(ACs[ACs.to.clean],function(s){ 25 | return(sum(as.numeric(unlist(strsplit(s,split=",")))[-2])) 26 | }))) 27 | ACs[ACs.to.clean] <- cleaned.ACs 28 | return(as.numeric(ACs)) 29 | } 30 | #Import a single table of freq data 31 | import.freq.table <- function(batch,path){ 32 | #Read data 33 | dat <- read.table(path,header=T,comment.char="",sep="\t") 34 | #Clean multiallelic ACs 35 | dat[,grep("_AC",colnames(dat),fixed=T)] <- apply(dat[,grep("_AC",colnames(dat),fixed=T)],2,clean.ACs) 36 | #Convert all ANs to numerics 37 | dat[,grep("_AN",colnames(dat),fixed=T)] <- apply(dat[,grep("_AN",colnames(dat),fixed=T)],2,as.numeric) 38 | #Add batch name to colnames 39 | colnames(dat)[-c(1:3)] <- paste(colnames(dat)[-c(1:3)],batch,sep=".") 40 | colnames(dat)[1:3] <- c("VID","SVLEN","SVTYPE") 41 | return(dat) 42 | } 43 | #Import a list of freq data tables and merge them 44 | import.freq.data <- function(batches.list){ 45 | merged <- import.freq.table(batch=batches.list[1,1],path=batches.list[1,2]) 46 | for(i in 2:nrow(batches.list)){ 47 | newdat <- import.freq.table(batch=batches.list[i,1],path=batches.list[i,2]) 48 | merged <- merge(x=merged,y=newdat,by=c("VID","SVLEN","SVTYPE"),sort=F,all=T) 49 | rm(newdat) 50 | } 51 | return(merged) 52 | } 53 | 54 | 55 | ###Read command-line arguments 56 | args <- commandArgs(trailingOnly=T) 57 | in.list <- as.character(args[1]) 58 | OUTFILE <- as.character(args[2]) 59 | 60 | # #Dev parameters (local) 61 | # in.list <- "~/scratch/af_test/input.list" 62 | # OUTFILE <- "~/scratch/gnomAD_AF_table.merged.txt" 63 | 64 | 65 | ###Process input data 66 | batches.list <- read.table(in.list,header=F,sep="\t") 67 | colnames(batches.list) <- c("batch","path") 68 | dat <- import.freq.data(batches.list) 69 | 70 | ###Write output data 71 | write.table(dat,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 72 | system(paste("gzip -f ",OUTFILE,sep=""),wait=T,intern=F) 73 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/merge_filter_columns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2018 Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Sanitize two filter columns stripped from paired VCFs 9 | """ 10 | 11 | import argparse 12 | 13 | 14 | def main(): 15 | parser = argparse.ArgumentParser(description=__doc__, 16 | formatter_class=argparse.RawDescriptionHelpFormatter) 17 | parser.add_argument('file1', help='Input FILTER values from VCF 1.') 18 | parser.add_argument('file2', help='Input FILTER values from VCF 2.') 19 | parser.add_argument('fout', help='Output FILTER values.') 20 | 21 | args = parser.parse_args() 22 | 23 | fout = open(args.fout, 'w') 24 | 25 | with open(args.file1) as f1, open(args.file2) as f2: 26 | for x, y in zip(f1, f2): 27 | x = x.strip().split(';') 28 | y = y.strip().split(';') 29 | #Only return PASS if both are PASS with no other filters 30 | if x == ['PASS'] and y == ['PASS']: 31 | newfilt = 'PASS' 32 | else: 33 | x = [f for f in x if f != 'PASS'] 34 | y = [f for f in y if f != 'PASS'] 35 | newfilt = ';'.join(sorted(list(set(x + y)))) 36 | 37 | fout.write(newfilt + '\n') 38 | 39 | fout.close() 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/prePCA_vcf_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2018 Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Filter vcf to clean, autosomal, biallelic sites prior to cohort-wide PCA 9 | """ 10 | 11 | 12 | import argparse 13 | import sys 14 | from collections import defaultdict 15 | import pysam 16 | 17 | 18 | NULL_GTs = [(0, 0), (None, None), (0, ), (None, ), (None, 2)] 19 | 20 | 21 | def get_call_rate(record): 22 | n_samples = len(record.samples) 23 | n_non_null = len([s for s in record.samples if record.samples[s]['GT'] not in NULL_GTs]) 24 | 25 | callrate = n_non_null / n_samples 26 | 27 | return callrate 28 | 29 | 30 | def filter_vcf(vcf, fout_common, minAF=0.01, fout_all=None, minCallRate=0.99): 31 | for record in vcf: 32 | # #Do not include UNRESOLVED variants 33 | # if 'UNRESOLVED' in record.info.keys() \ 34 | # or 'UNRESOLVED_TYPE' in record.info.keys() \ 35 | # or 'UNRESOLVED' in record.filter: 36 | # continue 37 | 38 | # #Do not include multiallelic variants 39 | # if 'MULTIALLELIC' in record.info.keys() \ 40 | # or 'MULTIALLELIC' in record.filter \ 41 | # or len(record.alts) > 1: 42 | # continue 43 | 44 | #Do not include variants on X or Y 45 | allosomes = ['X', 'Y', 'chrX', 'chrY'] 46 | if record.chrom in allosomes: 47 | continue 48 | 49 | #Only include PASS variants 50 | if 'PASS' not in record.filter: 51 | continue 52 | 53 | #Only include variants with ≥minCallRate 54 | if get_call_rate(record) < minCallRate: 55 | continue 56 | 57 | #Only keep common variants 58 | if 'AF' in record.info.keys(): 59 | if record.info['AF'][0] >= minAF: 60 | fout_common.write(record) 61 | 62 | #Write AF-unfiltered variants, if optioned 63 | if fout_all is not None: 64 | fout_all.write(record) 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description=__doc__, 70 | formatter_class=argparse.RawDescriptionHelpFormatter) 71 | parser.add_argument('vcf') 72 | parser.add_argument('fout') 73 | parser.add_argument('--minAF', type=float, default=0.01, 74 | help='Minimum allele frequency. 
[0.01]') 75 | parser.add_argument('--minCallRate', type=float, default=0.99, 76 | help='Minimum call rate. [0.99]') 77 | parser.add_argument('--noAFoutput', default=None, 78 | help='Output file for all variants unfiltered on AF.') 79 | 80 | args = parser.parse_args() 81 | 82 | #Open input VCF 83 | if args.vcf in '- stdin'.split(): 84 | vcf = pysam.VariantFile(sys.stdin) 85 | else: 86 | vcf = pysam.VariantFile(args.vcf) 87 | 88 | header = vcf.header 89 | 90 | #Open outut VCFs 91 | fout_common = pysam.VariantFile(args.fout, 'w', header=header) 92 | if args.noAFoutput is not None: 93 | fout_all = pysam.VariantFile(args.noAFoutput, 'w', header=header) 94 | else: 95 | fout_all = None 96 | 97 | #Filter VCF 98 | filter_vcf(vcf, fout_common, args.minAF, fout_all, args.minCallRate) 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_scripts/sum_svcounts_perSample.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # Helper script to merge outputs from count_svtypes task 8 | # in final_outlier_sample_filter.wdl 9 | 10 | 11 | ###Set master parameters & read arguments 12 | options(stringsAsFactors=F,scipen=1000) 13 | args <- commandArgs(trailingOnly=TRUE) 14 | INFILE <- args[1] 15 | OUTFILE <- args[2] 16 | 17 | ###Read input data & reformat 18 | dat <- read.table(INFILE,header=F) 19 | colnames(dat) <- c("sample","svtype","count","chrom") 20 | samples <- as.character(unique(dat$sample)) 21 | svtypes <- as.character(unique(dat$svtype)) 22 | 23 | ###Get sum of counts per sample per svtype 24 | summed.res <- do.call("rbind", lapply(samples,function(sample){ 25 | return(do.call("rbind", lapply(svtypes,function(svtype){ 26 | return(data.frame("sample"=sample, 27 | "svtype"=svtype, 28 | "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype),]$count,na.rm=T))) 29 | }))) 30 | })) 31 | 32 | ###Write summed results to outfile 33 | write.table(summed.res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 34 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/MVR_collection_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Helper WDL to parallelize collection of Mendelian violation rate data for 9 | # the Talkowski lab SV pipeline 10 | 11 | workflow mvr_colection_helper { 12 | File vcf 13 | File vcf_idx 14 | String contig 15 | String prefix 16 | File trios_famfile 17 | File PCRPLUS_samples_list 18 | Int sv_per_shard 19 | 20 | call shard_vcf { 21 | input: 22 | vcf=vcf, 23 | vcf_idx=vcf_idx, 24 | contig=contig, 25 | sv_per_shard=sv_per_shard 26 | } 27 | 28 | scatter ( shard in shard_vcf.shard_vcfs ){ 29 | call gather_MVR_data { 30 | input: 31 | vcf=shard, 32 | prefix="${prefix}.${contig}", 33 | famfile=trios_famfile, 34 | PCRPLUS_samples_list=PCRPLUS_samples_list 35 | } 36 | } 37 | 38 | output { 39 | Array[File] mvr_data = gather_MVR_data.MVR_data 40 | } 41 | } 42 | 43 | 44 | # Shard VCF into fixed size chunks 45 | task shard_vcf { 46 | File vcf 47 | File vcf_idx 48 | String contig 49 | Int sv_per_shard 50 | 51 | command { 52 | #Tabix chromosome of interest 53 | tabix -h 
${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 54 | #Then shard VCF 55 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 56 | ${contig}.vcf.gz \ 57 | ${sv_per_shard} \ 58 | "vcf.shard." 59 | } 60 | 61 | output { 62 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 63 | } 64 | 65 | runtime { 66 | preemptible: 1 67 | docker: "talkowski/sv-pipeline@sha256:07160ad5fad8b8b9faa60a64caf9990e374a47fa63e8f2160d3645f5e4545c48" 68 | memory: "4 GB" 69 | disks: "local-disk 250 SSD" 70 | } 71 | } 72 | 73 | 74 | # Subset compute all data needed for downstream filter determination 75 | task gather_MVR_data { 76 | File vcf 77 | String prefix 78 | File famfile 79 | File PCRPLUS_samples_list 80 | 81 | command <<< 82 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/count_mendelian_violations.py \ 83 | ${vcf} ${famfile} ${PCRPLUS_samples_list} "${prefix}.MVR_data.txt" 84 | gzip -f "${prefix}.MVR_data.txt" 85 | >>> 86 | 87 | output { 88 | File MVR_data = "${prefix}.MVR_data.txt.gz" 89 | } 90 | 91 | runtime { 92 | docker: "talkowski/sv-pipeline@sha256:58b67cb4e4edf285b89250d2ebab72e17c0247e3bf6891c2c2fcda646b2a6cf4" 93 | preemptible: 1 94 | disks: "local-disk 20 HDD" 95 | memory: "4 GB" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/apply_GQ_filter.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is an analysis WDL to apply a per-sample GQ cutoff to all variants in an SV VCF 9 | 10 | workflow apply_GQ_filter { 11 | File vcf 12 | File vcf_idx 13 | String prefix 14 | File contiglist 15 | Int minGQ_global 16 | File minGQ_perSVTYPE_table 17 | Float max_noCallRate 18 | 19 | Array[Array[String]] contigs=read_tsv(contiglist) 20 | 21 | #Split vcf per chromosome 22 | scatter ( contig in contigs ) { 23 | #Subset vcf to contig 24 | call shard_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | prefix=prefix, 29 | contig=contig[0] 30 | } 31 | 32 | #Apply minGQ filter 33 | call filter_GQ { 34 | input: 35 | vcf=shard_vcf.shard, 36 | prefix="${prefix}.${contig[0]}", 37 | minGQ_global=minGQ_global, 38 | minGQ_perSVTYPE_table=minGQ_perSVTYPE_table, 39 | max_noCallRate=max_noCallRate 40 | } 41 | } 42 | 43 | #Merge sharded VCFs 44 | call combine { 45 | input: 46 | vcfs=filter_GQ.filtered_vcf, 47 | prefix=prefix 48 | } 49 | 50 | output { 51 | File filtered_vcf = combine.out 52 | File filtered_vcf_idx = combine.idx 53 | } 54 | } 55 | 56 | 57 | # Shard VCF per chromosome 58 | task shard_vcf { 59 | File vcf 60 | File vcf_idx 61 | String prefix 62 | String contig 63 | 64 | command { 65 | tabix -h ${vcf} ${contig} | bgzip -c > "${prefix}.${contig}.vcf.gz" 66 | } 67 | 68 | output { 69 | File shard = "${prefix}.${contig}.vcf.gz" 70 | } 71 | 72 | runtime { 73 | preemptible: 1 74 | docker: "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 75 | memory: "4 GB" 76 | disks: "local-disk 275 SSD" 77 | } 78 | } 79 | 80 | 81 | # Apply minGQ filter 82 | task filter_GQ { 83 | File vcf 84 | String prefix 85 | Int minGQ_global 86 | File minGQ_perSVTYPE_table 87 | Float max_noCallRate 88 | 89 | command { 90 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/apply_minGQ_filter.py \ 91 | --dropEmpties \ 92 | -m ${minGQ_global} \ 93 | --maxNCR ${max_noCallRate} \ 94 | -t ${minGQ_perSVTYPE_table} \ 95 | ${vcf} \ 96 | 
"${prefix}.filtered.vcf" 97 | bgzip -f "${prefix}.filtered.vcf" 98 | } 99 | 100 | output { 101 | File filtered_vcf = "${prefix}.filtered.vcf.gz" 102 | } 103 | 104 | runtime { 105 | preemptible: 1 106 | docker: "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 107 | memory: "4 GB" 108 | disks: "local-disk 30 SSD" 109 | } 110 | } 111 | 112 | 113 | # Merge VCF shards 114 | task combine { 115 | Array[File] vcfs 116 | String prefix 117 | 118 | command { 119 | vcf-concat ${sep=" " vcfs} | vcf-sort | bgzip -c > "${prefix}.minGQ_filtered.vcf.gz"; 120 | tabix -p vcf "${prefix}.minGQ_filtered.vcf.gz" 121 | } 122 | 123 | runtime { 124 | preemptible: 1 125 | docker : "talkowski/sv-pipeline@sha256:6bcf2b506fc66b13f5aa5e99ccf19e01891aec963b147b09b59e6510116f1adc" 126 | disks: "local-disk 500 SSD" 127 | memory: "4 GB" 128 | } 129 | 130 | output { 131 | File out="${prefix}.minGQ_filtered.vcf.gz" 132 | File idx="${prefix}.minGQ_filtered.vcf.gz.tbi" 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/assign_lowQuality_sites.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL for downstream processing of Talkowski SV pipeline callests 9 | # that determines categories of SV with high Mendelian violation rates based on 10 | # parent-child trio analyses, and tags those sites as LOW_QUALITY in the 11 | # VCF FILTER field 12 | 13 | # QC is performed on the final VCF separated by LOW_QUALITY and non-LOW_QUALITY 14 | 15 | 16 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/73/plain-WDL/descriptor" as QC 17 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:MVR_collection_helper/versions/5/plain-WDL/descriptor" as collect 18 | 19 | 20 | workflow assign_lowQuality_sites { 21 | File vcf 22 | File vcf_idx 23 | String prefix 24 | File contiglist 25 | File trios_famfile 26 | File PCRPLUS_samples_list 27 | Int sv_per_shard 28 | 29 | Array[Array[String]] contigs=read_tsv(contiglist) 30 | 31 | # Shard VCF per-chromosome and collect MVR data 32 | scatter ( contig in contigs ) { 33 | call collect.mvr_colection_helper as gather_MVR_data_perChrom { 34 | input: 35 | vcf=vcf, 36 | vcf_idx=vcf_idx, 37 | contig=contig[0], 38 | prefix=prefix, 39 | trios_famfile=trios_famfile, 40 | PCRPLUS_samples_list=PCRPLUS_samples_list, 41 | sv_per_shard=sv_per_shard 42 | } 43 | call combine_MVR_data as combine_MVR_data_perChrom { 44 | input: 45 | MVR_data=gather_MVR_data_perChrom.mvr_data, 46 | prefix=prefix 47 | } 48 | } 49 | 50 | # Merge MVR data 51 | call combine_MVR_data as combine_MVR_data_crossChrom { 52 | input: 53 | MVR_data=combine_MVR_data_perChrom.merged_data, 54 | prefix=prefix 55 | } 56 | 57 | 58 | # Final outputs 59 | output { 60 | File merged_MVR_data = combine_MVR_data_crossChrom.merged_data 61 | } 62 | } 63 | 64 | 65 | # Combine MVR data from per-chromosome shards 66 | task combine_MVR_data { 67 | Array[File] MVR_data 68 | String prefix 69 | 70 | command <<< 71 | zcat ${MVR_data[0]} | sed -n '1p' > "${prefix}.merged_MVR_data.txt" 72 | zcat ${sep=' ' MVR_data} | fgrep -v "#" >> "${prefix}.merged_MVR_data.txt" 73 | gzip -f "${prefix}.merged_MVR_data.txt" 74 | >>> 75 | 76 | output { 77 | File merged_data = "${prefix}.merged_MVR_data.txt.gz" 78 | } 79 | 80 | runtime { 
81 | docker: "talkowski/sv-pipeline@sha256:07160ad5fad8b8b9faa60a64caf9990e374a47fa63e8f2160d3645f5e4545c48" 82 | preemptible: 1 83 | disks: "local-disk 30 HDD" 84 | } 85 | } 86 | 87 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/compute_simple_AFs_singleChrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Laboratory 2 | # Contact: Ryan Collins 3 | # Distributed under terms of the MIT license. 4 | 5 | # Helper workflow to calculate basic AF statistics for a single chromosome on an input VCF 6 | 7 | workflow getAFs_singleChrom { 8 | File vcf 9 | File vcf_idx 10 | String contig 11 | Int sv_per_shard 12 | String prefix 13 | File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample 14 | File? famfile #Used for M/F AF calculations 15 | String? drop_empty_records 16 | 17 | 18 | # Tabix to chromosome of interest, and shard input VCF for stats collection 19 | call shard_vcf { 20 | input: 21 | vcf=vcf, 22 | vcf_idx=vcf_idx, 23 | contig=contig, 24 | sv_per_shard=sv_per_shard 25 | } 26 | 27 | # Scatter over VCF shards 28 | scatter ( shard in shard_vcf.shard_vcfs ) { 29 | # Collect AF summary stats 30 | call compute_shard_AFs { 31 | input: 32 | vcf=shard, 33 | prefix="${prefix}.${contig}", 34 | sample_pop_assignments=sample_pop_assignments, 35 | famfile=famfile 36 | } 37 | } 38 | 39 | # Merge shards into single VCF 40 | call combine_sharded_vcfs { 41 | input: 42 | vcfs=compute_shard_AFs.shard_wAFs, 43 | prefix="${prefix}.${contig}", 44 | drop_empty_records=drop_empty_records 45 | } 46 | 47 | # Final output 48 | output { 49 | File vcf_wAFs = combine_sharded_vcfs.vcf_out 50 | File vcf_wAFs_idx = combine_sharded_vcfs.vcf_out_idx 51 | } 52 | } 53 | 54 | 55 | # Shard VCF into fixed size chunks 56 | task shard_vcf { 57 | File vcf 58 | File vcf_idx 59 | String contig 60 | Int sv_per_shard 61 | 62 | command { 63 | set -euo pipefail 64 | #Tabix chromosome of interest 65 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 66 | #Then shard VCF 67 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 68 | ${contig}.vcf.gz \ 69 | ${sv_per_shard} \ 70 | "vcf.shard." 71 | } 72 | 73 | output { 74 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 75 | } 76 | 77 | runtime { 78 | preemptible: 1 79 | maxRetries: 1 80 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 81 | memory: "4 GB" 82 | disks: "local-disk 250 SSD" 83 | } 84 | } 85 | 86 | 87 | # Subset a vcf to a single chromosome, and add global AF information (no subpop) 88 | task compute_shard_AFs { 89 | File vcf 90 | String prefix 91 | File? sample_pop_assignments 92 | File? 
famfile 93 | 94 | 95 | command <<< 96 | set -euo pipefail 97 | optionals=" " 98 | if [ ${default="SKIP" sample_pop_assignments} != "SKIP" ]; then 99 | optionals="$( echo "$optionals" ) -p ${sample_pop_assignments}" 100 | fi 101 | if [ ${default="SKIP" famfile} != "SKIP" ]; then 102 | optionals="$( echo "$optionals" ) -f ${famfile}" 103 | fi 104 | echo -e "OPTIONALS INTERPRETED AS: $optionals" 105 | echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ${vcf} stdout" 106 | #Tabix chromosome of interest & compute AN, AC, and AF 107 | /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "${vcf}" stdout \ 108 | | bgzip -c \ 109 | > "${prefix}.wAFs.vcf.gz" 110 | >>> 111 | 112 | output { 113 | File shard_wAFs = "${prefix}.wAFs.vcf.gz" 114 | } 115 | 116 | runtime { 117 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 118 | preemptible: 1 119 | maxRetries: 1 120 | memory: "4 GB" 121 | disks: "local-disk 20 SSD" 122 | } 123 | } 124 | 125 | 126 | # Merge VCF shards & drop records with zero remaining non-ref alleles 127 | task combine_sharded_vcfs { 128 | Array[File] vcfs 129 | String prefix 130 | String? drop_empty_records 131 | 132 | command { 133 | set -euo pipefail 134 | vcf-concat ${sep=" " vcfs} \ 135 | | vcf-sort \ 136 | > merged.vcf 137 | if [ ${default="TRUE" drop_empty_records} == "TRUE" ]; then 138 | /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ 139 | merged.vcf stdout \ 140 | | bgzip -c \ 141 | > "${prefix}.wAFs.vcf.gz" 142 | else 143 | cat merged.vcf | bgzip -c > "${prefix}.wAFs.vcf.gz" 144 | fi 145 | tabix -p vcf "${prefix}.wAFs.vcf.gz" 146 | } 147 | 148 | runtime { 149 | preemptible: 1 150 | maxRetries: 1 151 | docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a" 152 | disks: "local-disk 250 SSD" 153 | bootDiskSizeGb: 30 154 | memory: "4 GB" 155 | } 156 | 157 | output { 158 | File vcf_out = "${prefix}.wAFs.vcf.gz" 159 | File vcf_out_idx = "${prefix}.wAFs.vcf.gz.tbi" 160 | } 161 | } 162 | 163 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/filter_cleanup_and_QUAL_recalibration.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL to perform FILTER cleanup and recalibrate 9 | # variant QUAL scores at the end of the Talkowski SV pipeline 10 | 11 | 12 | workflow filter_cleanup_qual_recalibration { 13 | File vcf 14 | File vcf_idx 15 | File PCRPLUS_samples_list 16 | File famfile 17 | Float min_callrate_global 18 | Float min_callrate_smallDels 19 | File contiglist 20 | String prefix 21 | 22 | Array[Array[String]] contigs = read_tsv(contiglist) 23 | 24 | scatter ( contig in contigs ) { 25 | call cleanup { 26 | input: 27 | vcf=vcf, 28 | vcf_idx=vcf_idx, 29 | contig=contig[0], 30 | PCRPLUS_samples_list=PCRPLUS_samples_list, 31 | famfile=famfile, 32 | min_callrate_global=min_callrate_global, 33 | min_callrate_smallDels=min_callrate_smallDels, 34 | prefix=prefix 35 | } 36 | } 37 | 38 | call concat_vcfs { 39 | input: 40 | vcfs=cleanup.out_vcf, 41 | outfile_prefix="${prefix}.cleaned_filters_qual_recalibrated" 42 | } 43 | 44 | output { 45 | File cleaned_vcf = concat_vcfs.concat_vcf 46 | File cleaned_vcf_idx = concat_vcfs.concat_vcf_idx 47 | } 48 | } 49 | 50 | 51 | # 
Applies filters & cleanup to VCF for a single chromosome 52 | task cleanup { 53 | File vcf 54 | File vcf_idx 55 | String contig 56 | File PCRPLUS_samples_list 57 | File famfile 58 | Float min_callrate_global 59 | Float min_callrate_smallDels 60 | String prefix 61 | 62 | command <<< 63 | set -euo pipefail 64 | #Subset to chromosome of interest 65 | tabix -h ${vcf} ${contig} | bgzip -c > input.vcf.gz 66 | #Get list of PCR- samples 67 | tabix -H ${vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' \ 68 | > all.samples.list 69 | fgrep -wvf ${PCRPLUS_samples_list} all.samples.list \ 70 | > pcrminus.samples.list 71 | #Restrict famfiles 72 | while read ptn; do fgrep -w $ptn ${famfile}; done < all.samples.list > revised.fam 73 | fgrep -wf pcrminus.samples.list revised.fam > revised.pcrminus.fam 74 | #Compute fraction of missing genotypes per variant 75 | zcat input.vcf.gz \ 76 | | awk '{ if ($7 !~ /MULTIALLELIC/) print $0 }' \ 77 | | bgzip -c \ 78 | > input.noMCNV.vcf.gz 79 | plink2 \ 80 | --missing variant-only \ 81 | --max-alleles 2 \ 82 | --keep-fam revised.pcrminus.fam \ 83 | --fam revised.fam \ 84 | --vcf input.noMCNV.vcf.gz 85 | fgrep -v "#" plink2.vmiss \ 86 | | awk -v OFS="\t" '{ print $2, 1-$NF }' \ 87 | > callrates.txt 88 | #Clean up VCF 89 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/filter_cleanup_and_QUAL_recalibration.py \ 90 | --callrate-table callrates.txt \ 91 | --min-callrate-global ${min_callrate_global} \ 92 | --min-callrate-smallDels ${min_callrate_smallDels} \ 93 | input.vcf.gz \ 94 | stdout \ 95 | | bgzip -c \ 96 | > "${prefix}.${contig}.cleaned_filters_qual_recalibrated.vcf.gz" 97 | # tabix -p vcf -f "${prefix}.cleaned_filters_qual_recalibrated.vcf.gz" 98 | >>> 99 | 100 | output { 101 | File out_vcf = "${prefix}.${contig}.cleaned_filters_qual_recalibrated.vcf.gz" 102 | # File out_vcf_idx = "${prefix}.cleaned_filters_qual_recalibrated.vcf.gz.tbi" 103 | } 104 | 105 | runtime { 106 | docker : "talkowski/sv-pipeline@sha256:4587376100d71d66fb864740f95e0cc5f343bb1fe6e892f5b8116c789c38333f" 107 | preemptible: 1 108 | maxRetries: 0 109 | disks: "local-disk 50 HDD" 110 | memory: "4 GB" 111 | } 112 | } 113 | 114 | 115 | #General task to combine and sort multiple VCFs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String outfile_prefix 119 | 120 | command <<< 121 | set -euo pipefail 122 | vcf-concat ${sep=' ' vcfs} | bgzip -c > ${outfile_prefix}.vcf.gz; 123 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 124 | >>> 125 | 126 | output { 127 | File concat_vcf = "${outfile_prefix}.vcf.gz" 128 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 129 | } 130 | 131 | runtime { 132 | docker: "talkowski/sv-pipeline@sha256:4587376100d71d66fb864740f95e0cc5f343bb1fe6e892f5b8116c789c38333f" 133 | preemptible: 0 134 | maxRetries: 1 135 | memory: "4 GB" 136 | bootDiskSizeGb: 30 137 | disks: "local-disk 250 HDD" 138 | } 139 | } 140 | 141 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/gather_batch_effects_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # This is an analysis WDL to perform pairwise comparisons of batches in the 9 | # Talkowski lab SV pipeline, and mark sites that appear batch-specific 10 | 11 | 12 | workflow check_batch_effects { 13 | File freq_table 14 | String batch1 15 | String batch2 16 | String prefix 17 | Int 
variants_per_shard 18 | 19 | # Shard frequency table 20 | call shard_table { 21 | input: 22 | freq_table=freq_table, 23 | variants_per_shard=variants_per_shard 24 | } 25 | 26 | # Scatter over shards and compute AF correlations for each variant 27 | scatter ( shard in shard_table.shards ) { 28 | call compare_batches { 29 | input: 30 | freq_table=shard, 31 | batch1=batch1, 32 | batch2=batch2, 33 | prefix=prefix 34 | } 35 | } 36 | 37 | # Combine shards, perform bonferroni correction to determine significant batch effects, and plot AF correlation scatter 38 | call combine_shards { 39 | input: 40 | freq_tables=compare_batches.results, 41 | batch1=batch1, 42 | batch2=batch2, 43 | prefix=prefix 44 | } 45 | 46 | # Outputs 47 | output { 48 | File comparison_table = combine_shards.merged_table 49 | File batch_effect_variants = combine_shards.batch_effect_variants 50 | File scatterplots_tarball = combine_shards.correlation_scatterplots_tarball 51 | } 52 | } 53 | 54 | 55 | # Shard a frequency table into an even number of evenly sized shards 56 | task shard_table { 57 | File freq_table 58 | Int variants_per_shard 59 | 60 | command <<< 61 | set -euo pipefail 62 | #Split variant lines 63 | zcat ${freq_table} | sed '1d' | \ 64 | split -l ${variants_per_shard} --numeric-suffixes=00001 -a 5 /dev/stdin freq_table_shard_ || true 65 | #Add header & gzip each shard 66 | zcat ${freq_table} | sed -n '1p' > header.txt 67 | maxshard=$( find / -name "freq_table_shard_*" | awk -v FS="_" '{ print $NF }' \ 68 | | sort -Vrk1,1 | sed -n '1p' || true ) 69 | for i in $( seq -w 00001 "$maxshard" ); do 70 | cat header.txt "freq_table_shard_$i" \ 71 | | gzip -c \ 72 | > "freq_table_shard_$i.txt.gz" || true 73 | done 74 | >>> 75 | 76 | output { 77 | Array[File] shards = glob("freq_table_shard_*.txt.gz") 78 | } 79 | 80 | runtime { 81 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 82 | preemptible: 1 83 | maxRetries: 1 84 | } 85 | } 86 | 87 | 88 | # Compare AF stats per variant between a pair of batches 89 | task compare_batches { 90 | File freq_table 91 | String batch1 92 | String batch2 93 | String prefix 94 | 95 | command <<< 96 | set -euo pipefail 97 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/find_batch_effects.shard_helper.R \ 98 | ${freq_table} \ 99 | "${batch1}" \ 100 | "${batch2}" \ 101 | "${prefix}.${batch1}_vs_${batch2}.results.txt" 102 | gzip "${prefix}.${batch1}_vs_${batch2}.results.txt" 103 | >>> 104 | 105 | output { 106 | File results = "${prefix}.${batch1}_vs_${batch2}.results.txt.gz" 107 | } 108 | 109 | runtime { 110 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 111 | memory: "4 GB" 112 | preemptible: 1 113 | maxRetries: 1 114 | } 115 | } 116 | 117 | 118 | # Merge sharded comparison results and perform analysis for batch effects 119 | task combine_shards { 120 | Array[File] freq_tables 121 | String batch1 122 | String batch2 123 | String prefix 124 | 125 | command <<< 126 | set -euo pipefail 127 | #Write header 128 | zcat ${freq_tables[0]} | sed -n '1p' > header.txt || true 129 | #Iterate over files and cat 130 | while read file; do 131 | zcat "$file" | sed '1d' 132 | done < ${write_lines(freq_tables)} \ 133 | | cat header.txt - \ 134 | | gzip -c \ 135 | > "${prefix}.${batch1}_vs_${batch2}.AF_comparison_table.txt.gz" || true 136 | #Analyze 137 | mkdir "${batch1}_vs_${batch2}" 138 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/find_batch_effects.R \ 139 | 
"${prefix}.${batch1}_vs_${batch2}.AF_comparison_table.txt.gz" \ 140 | "${batch1}" \ 141 | "${batch2}" \ 142 | "${batch1}_vs_${batch2}/${prefix}" 143 | gzip -f "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.freq_table_wBonferroni.txt" 144 | tar -czvf "${batch1}_vs_${batch2}.tar.gz" \ 145 | "${batch1}_vs_${batch2}" 146 | >>> 147 | 148 | output { 149 | File merged_table = "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.freq_table_wBonferroni.txt.gz" 150 | File batch_effect_variants = "${batch1}_vs_${batch2}/${prefix}.${batch1}_vs_${batch2}.batch_effect_variants.txt" 151 | File correlation_scatterplots_tarball = "${batch1}_vs_${batch2}.tar.gz" 152 | } 153 | 154 | runtime { 155 | docker : "talkowski/sv-pipeline@sha256:aef8156983cec6ac6a91fa6461b197a63835e5487fc9523ec857f947cfac660e" 156 | memory: "4 GB" 157 | preemptible: 1 158 | maxRetries: 1 159 | } 160 | } 161 | 162 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/minGQ_ROC_helper.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is a helper WDL that performs minGQ ROC optimization for a filtered VCF 9 | #See parent WDL: optimize_GQ_filter 10 | 11 | workflow optimize_ROC { 12 | File vcf 13 | Array[File] famfiles 14 | String prefix 15 | String max_fdr 16 | Int minGQ 17 | Int maxGQ 18 | Int GQstepsize 19 | 20 | #Get de novo stats for each famfile shard 21 | scatter( famfile in famfiles ){ 22 | call gather_denovo_stats as gather_stats { 23 | input: 24 | vcf=vcf, 25 | famfile=famfile, 26 | minGQ=minGQ, 27 | maxGQ=maxGQ, 28 | GQstepsize=GQstepsize 29 | } 30 | 31 | call cat_denovo_stats as cat_stats_pershard { 32 | input: 33 | dn_stats=gather_stats.dn_stats_glob, 34 | prefix="${prefix}.pershard" 35 | } 36 | } 37 | 38 | #Merge de novo stats files across all shards 39 | call cat_denovo_stats as cat_stats_allshards { 40 | input: 41 | dn_stats=cat_stats_pershard.merged_stats, 42 | prefix="${prefix}" 43 | } 44 | 45 | #Run ROC analysis 46 | call ROC_optimization { 47 | input: 48 | merged_stats=cat_stats_allshards.merged_stats, 49 | prefix="${prefix}", 50 | max_fdr=max_fdr 51 | } 52 | 53 | output { 54 | File minGQ_ROC_plot = ROC_optimization.tarball 55 | File minGQ_ROC_table = ROC_optimization.minGQ_table 56 | } 57 | } 58 | 59 | 60 | #Gather de novo stats for a set of trios from a vcf 61 | task gather_denovo_stats { 62 | File vcf 63 | File famfile 64 | Int minGQ 65 | Int maxGQ 66 | Int GQstepsize 67 | 68 | command <<< 69 | #Get list of sample IDs & column numbers from VCF header 70 | zcat ${vcf} | head -n1000 | fgrep "#" | fgrep -v "##" | sed 's/\t/\n/g' \ 71 | | awk -v OFS="\t" '{ print $1, NR }' > vcf_header_columns.txt 72 | #Iterate over families & subset VCF 73 | while read famID pro fa mo prosex pheno; do 74 | pro_idx=$( awk -v ID=$pro '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 75 | fa_idx=$( awk -v ID=$fa '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 76 | mo_idx=$( awk -v ID=$mo '{ if ($1==ID) print $2 }' vcf_header_columns.txt ) 77 | if ! [ -z $pro_idx ] && ! [ -z $fa_idx ] && ! 
[ -z $mo_idx ]; then 78 | echo -e "ANALYZING $famID, which contains $pro, $fa, and $mo" 79 | #Get variant stats 80 | zcat ${vcf} | cut -f1-9,"$pro_idx","$fa_idx","$mo_idx" \ 81 | | fgrep -v "MULTIALLELIC" \ 82 | | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/gather_trio_genos.py \ 83 | stdin stdout "$pro" "$fa" "$mo" \ 84 | | gzip -c \ 85 | > "$famID".trio_variant_info.txt.gz 86 | #Titrate GQs & count de novos 87 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compute_denovo_stats.R \ 88 | --famID "$famID" \ 89 | --minGQ ${minGQ} \ 90 | --maxGQ ${maxGQ} \ 91 | --step ${GQstepsize} \ 92 | "$famID".trio_variant_info.txt.gz \ 93 | "$famID".trio_denovo_summary.txt 94 | gzip -f "$famID".trio_denovo_summary.txt 95 | fi 96 | done < ${famfile} 97 | >>> 98 | 99 | output { 100 | Array[File] dn_stats_glob = glob("*.trio_denovo_summary.txt.gz") 101 | } 102 | 103 | runtime { 104 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 105 | preemptible: 1 106 | memory: "4 GB" 107 | disks: "local-disk 250 HDD" 108 | } 109 | } 110 | 111 | 112 | #Combine de novo stats into one long melted table 113 | task cat_denovo_stats { 114 | Array[File] dn_stats 115 | String prefix 116 | 117 | command <<< 118 | zcat ${dn_stats[0]} | sed -n '1p' > ${prefix}.merged_denovo_stats.txt 119 | while read statsfile; do 120 | zcat "$statsfile" | sed '1d' 121 | done < ${write_lines(dn_stats)} \ 122 | | sort -nk1,1 \ 123 | >> ${prefix}.merged_denovo_stats.txt 124 | gzip -f ${prefix}.merged_denovo_stats.txt 125 | >>> 126 | 127 | output { 128 | File merged_stats = "${prefix}.merged_denovo_stats.txt.gz" 129 | } 130 | 131 | runtime { 132 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 133 | preemptible: 1 134 | } 135 | } 136 | 137 | 138 | #Run de novo ROC analysis 139 | task ROC_optimization { 140 | File merged_stats 141 | String prefix 142 | String max_fdr 143 | 144 | command <<< 145 | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/optimize_GQ_ROC.R \ 146 | --prefix ${prefix} \ 147 | --fdr ${max_fdr} \ 148 | -S /opt/sv-pipeline/ref/vcf_qc_refs/SV_colors.txt \ 149 | ${merged_stats} \ 150 | ./${prefix}_minGQ_ROC_results/ 151 | cp ./${prefix}_minGQ_ROC_results/${prefix}.minGQ_ROC_results.txt \ 152 | ${prefix}.minGQ_ROC_results.txt 153 | tar -czvf ${prefix}_minGQ_ROC_results.tar.gz ./${prefix}_minGQ_ROC_results 154 | >>> 155 | 156 | output { 157 | File tarball = "${prefix}_minGQ_ROC_results.tar.gz" 158 | File minGQ_table = "${prefix}.minGQ_ROC_results.txt" 159 | } 160 | 161 | runtime { 162 | docker: "talkowski/sv-pipeline@sha256:d3844f6c7c26da55e679c9c521882d54dbecd169f884f09f05a12d6565bf6063" 163 | preemptible: 1 164 | memory: "8 GB" 165 | disks: "local-disk 20 HDD" 166 | } 167 | } -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/prune_and_add_vafs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to perform final sample pruning & compute all relevant AF statistics 9 | # for a VCF from the Talkowski SV pipeline 10 | 11 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:compute_simple_AFs_singleChrom/versions/14/plain-WDL/descriptor" as calcAF 12 | 13 | workflow prune_and_add_vafs { 14 | File vcf 15 | File vcf_idx 16 | String prefix 17 | File? 
sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample 18 | File? prune_list #List of samples to be excluded from the output vcf 19 | File? famfile #Used for M/F AF calculations 20 | Int sv_per_shard 21 | File contiglist 22 | String? drop_empty_records 23 | 24 | Array[Array[String]] contigs=read_tsv(contiglist) 25 | 26 | 27 | #Iterate over chromosomes 28 | scatter (contig in contigs) { 29 | #Prune VCF 30 | call prune_vcf { 31 | input: 32 | vcf=vcf, 33 | vcf_idx=vcf_idx, 34 | contig=contig[0], 35 | prune_list=prune_list, 36 | prefix=prefix 37 | } 38 | #Compute AC, AN, and AF per population & sex combination 39 | call calcAF.getAFs_singleChrom as getAFs { 40 | input: 41 | vcf=prune_vcf.pruned_vcf, 42 | vcf_idx=prune_vcf.pruned_vcf_idx, 43 | contig=contig[0], 44 | sv_per_shard=sv_per_shard, 45 | prefix=prefix, 46 | sample_pop_assignments=sample_pop_assignments, 47 | famfile=famfile, 48 | drop_empty_records=drop_empty_records 49 | } 50 | } 51 | 52 | 53 | #Merge pruned VCFs with allele info 54 | call concat_vcfs { 55 | input: 56 | vcfs=getAFs.vcf_wAFs, 57 | outfile_prefix="${prefix}.pruned_wAFs" 58 | } 59 | 60 | output { 61 | File output_vcf = concat_vcfs.concat_vcf 62 | File output_vcf_idx = concat_vcfs.concat_vcf_idx 63 | } 64 | } 65 | 66 | 67 | #Shard vcf into single chromosome shards & drop pruned samples 68 | task prune_vcf { 69 | File vcf 70 | File vcf_idx 71 | String contig 72 | File? prune_list 73 | String prefix 74 | 75 | command <<< 76 | #Tabix chromosome of interest 77 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 78 | #Get column indexes corresponding to samples to drop, if any exist 79 | if [ "${default="SKIP" prune_list}" != "SKIP" ]; then 80 | dropidx=$( zcat ${contig}.vcf.gz | sed -n '1,500p' | fgrep "#" | fgrep -v "##" \ 81 | | sed 's/\t/\n/g' | awk -v OFS="\t" '{ print NR, $1 }' \ 82 | | fgrep -wf ${prune_list} | cut -f1 | paste -s -d, ) 83 | zcat ${contig}.vcf.gz \ 84 | | cut --complement -f"$dropidx" \ 85 | | bgzip -c \ 86 | > "${prefix}.${contig}.pruned.vcf.gz" 87 | else 88 | cp "${contig}.vcf.gz" "${prefix}.${contig}.pruned.vcf.gz" 89 | fi 90 | tabix -f "${prefix}.${contig}.pruned.vcf.gz" 91 | >>> 92 | 93 | output { 94 | File pruned_vcf = "${prefix}.${contig}.pruned.vcf.gz" 95 | File pruned_vcf_idx = "${prefix}.${contig}.pruned.vcf.gz.tbi" 96 | } 97 | 98 | runtime { 99 | docker: "talkowski/sv-pipeline@sha256:4900cae92f1f8bc98c54f89444a00e134ac4c86ca55543e2646f024270a29a69" 100 | preemptible: 1 101 | maxRetries: 1 102 | memory: "4 GB" 103 | disks: "local-disk 250 SSD" 104 | } 105 | } 106 | 107 | 108 | #General task to combine multiple VCFs 109 | task concat_vcfs { 110 | Array[File] vcfs 111 | String outfile_prefix 112 | 113 | command <<< 114 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${outfile_prefix}.vcf.gz; 115 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 116 | >>> 117 | 118 | output { 119 | File concat_vcf = "${outfile_prefix}.vcf.gz" 120 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 121 | } 122 | 123 | runtime { 124 | docker: "talkowski/sv-pipeline@sha256:4900cae92f1f8bc98c54f89444a00e134ac4c86ca55543e2646f024270a29a69" 125 | preemptible: 1 126 | maxRetries: 1 127 | memory: "16 GB" 128 | disks: "local-disk 250 SSD" 129 | } 130 | } 131 | 132 | 133 | -------------------------------------------------------------------------------- /gnomad_sv_analysis_wdls/sharded_vcf2bed.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski 
Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | #This is a helper WDL that runs svtk vcf2bed parallelized across many shards for 9 | # a single chromosome 10 | 11 | workflow sharded_vcf2bed { 12 | File vcf 13 | File vcf_idx 14 | String contig 15 | Int sv_per_shard 16 | String prefix 17 | 18 | # Tabix to chromosome of interest, and shard input VCF for stats collection 19 | call shard_vcf { 20 | input: 21 | vcf=vcf, 22 | vcf_idx=vcf_idx, 23 | contig=contig, 24 | sv_per_shard=sv_per_shard 25 | } 26 | 27 | # Scatter over VCF shards 28 | scatter (shard in shard_vcf.shard_vcfs) { 29 | # Run vcf2bed 30 | call vcf2bed_sub { 31 | input: 32 | vcf=shard, 33 | prefix="${prefix}.shard" 34 | } 35 | } 36 | 37 | # Merge vcf2bed_sub outputs 38 | call merge_vcf2bed_sub { 39 | input: 40 | vcf2bed_sub_shards=vcf2bed_sub.vcf2bed_sub_out, 41 | prefix=prefix 42 | } 43 | 44 | output { 45 | File vcf2bed_out=merge_vcf2bed_sub.merged_vcf2bed_out 46 | } 47 | } 48 | 49 | 50 | # Shard VCF into fixed size chunks 51 | task shard_vcf { 52 | File vcf 53 | File vcf_idx 54 | String contig 55 | Int sv_per_shard 56 | 57 | command { 58 | #Tabix chromosome of interest 59 | tabix -h ${vcf} ${contig} | bgzip -c > ${contig}.vcf.gz 60 | #Then shard VCF 61 | /opt/sv-pipeline/scripts/shard_VCF.sh \ 62 | ${contig}.vcf.gz \ 63 | ${sv_per_shard} \ 64 | "vcf.shard." 65 | } 66 | 67 | output { 68 | Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") 69 | } 70 | 71 | runtime { 72 | preemptible: 1 73 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 74 | memory: "4 GB" 75 | disks: "local-disk 270 SSD" 76 | } 77 | } 78 | 79 | 80 | # Run vcf2bed_sub on an input vcf 81 | task vcf2bed_sub { 82 | File vcf 83 | String prefix 84 | 85 | command { 86 | svtk vcf2bed \ 87 | --info ALL \ 88 | --include-filters \ 89 | --no-samples \ 90 | ${vcf} \ 91 | stdout \ 92 | | bgzip -c \ 93 | > "${prefix}.vcf2bed.bed.gz" 94 | } 95 | 96 | output { 97 | File vcf2bed_sub_out = "${prefix}.vcf2bed.bed.gz" 98 | } 99 | 100 | runtime { 101 | preemptible: 1 102 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 103 | memory: "4 GB" 104 | disks: "local-disk 25 SSD" 105 | } 106 | } 107 | 108 | # Merge vcf2bed_sub shards 109 | task merge_vcf2bed_sub { 110 | Array[File] vcf2bed_sub_shards 111 | String prefix 112 | 113 | command <<< 114 | zcat ${vcf2bed_sub_shards[0]} | sed -n '1p' > header.txt 115 | zcat ${sep=' ' vcf2bed_sub_shards} | fgrep -v "#" \ 116 | | sort -Vk1,1 -k2,2n -k3,3n \ 117 | | cat header.txt - \ 118 | | bgzip -c \ 119 | > "${prefix}.vcf2bed_sub.bed.gz" 120 | >>> 121 | 122 | output { 123 | File merged_vcf2bed_out = "${prefix}.vcf2bed_sub.bed.gz" 124 | } 125 | 126 | runtime { 127 | preemptible: 1 128 | docker: "talkowski/sv-pipeline@sha256:ec7e6f578ba2a8796399fc6f0f9864ec2d34a4921c769a8a54bbbf5254337a8b" 129 | memory: "4 GB" 130 | disks: "local-disk 200 SSD" 131 | } 132 | } 133 | 134 | 135 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_manuscript_code/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/AF_reconcilliation_helper.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis script 8 | 9 | # Helper script to sanitize AFs for 1kG comparisons 10 | 11 | 12 | ###Set master parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | 15 | ###Read command line args 16 | args <- commandArgs(trailingOnly=TRUE) 17 | 18 | ###Read data 19 | dat <- read.table(args[1],header=T,comment.char="",sep="\t") 20 | 21 | ###Clean AFs 22 | AFs.to.clean <- grep(",",dat$AF,fixed=T) 23 | cleaned.AFs <- as.vector(as.numeric(sapply(dat$AF[AFs.to.clean],function(s){ 24 | return(sum(round(as.numeric(unlist(strsplit(s,split=","))),10)[-2])) 25 | }))) 26 | cleaned.AFs[which(cleaned.AFs>1)] <- 1 27 | dat$AF[AFs.to.clean] <- cleaned.AFs 28 | 29 | ###Write data 30 | colnames(dat)[1] <- "#chr" 31 | write.table(dat,args[1],col.names=T,row.names=F,sep="\t",quote=F) 32 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/merge_downsample_sv_per_gene.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis helper script 8 | 9 | # Get average SV per gene from downsampling experiments 10 | 11 | 12 | ###Set master parameters & load libraries 13 | options(stringsAsFactors=F,scipen=1000) 14 | effects <- c("lof.any","lof.del","lof.other","cg","plof","inv") 15 | 16 | 17 | ###Helper functions 18 | #Import list of sv-per-gene counts, and take mean across downsampling points 19 | process.counts <- function(counts.list.in,seed.table.in){ 20 | #Read seeds 21 | down.sizes <- as.integer(read.table(seed.table.in,header=F,sep="\t")[,1]) 22 | #Read tables as list 23 | counts.list <- as.character(read.table(counts.list.in,header=F,sep="\t")[,1]) 24 | counts <- lapply(counts.list,function(l){ 25 | read.table(l,header=T,sep="\t") 26 | }) 27 | #Sanity check to make sure all counts have same number of rows 28 | if(length(unique(unlist(lapply(counts,nrow)))) > 1){ 29 | stop("Some sv_per_gene.txt tables don't have the same number of lines.") 30 | } 31 | #Take mean counts per gene per downsample size 32 | down.points <- sort(unique(down.sizes)) 33 | res <- do.call("cbind", lapply(down.points,function(n){ 34 | idxs <- which(down.sizes==n) 35 | merged <- do.call("rbind", counts[idxs]) 36 | meaned <- as.data.frame(t(sapply(sort(unique(as.character(merged$gene))),function(gene){ 37 | as.numeric(apply(merged[which(merged$gene==gene),-1],2,mean,na.rm=T)) 38 | }))) 39 | colnames(meaned) <- paste(effects,n,sep=".") 40 | return(meaned) 41 | })) 42 | #Add gene name & return output data frame 43 | res <- data.frame("gene"=rownames(res),res) 44 | rownames(res) <- NULL 45 | return(res) 46 | } 47 | 48 | 49 | ###Read command-line arguments 50 | args <- commandArgs(trailingOnly=T) 51 | counts.list.in <- as.character(args[1]) 52 | seed.table.in <- as.character(args[2]) 53 | OUTFILE <- as.character(args[3]) 54 | 55 | # #Dev parameters (local) 56 | # counts.list.in <- "~/scratch/sv_per_gene.input.list" 57 | # seed.table.in <- "~/scratch/tmp_seeds_input.txt" 58 | # OUTFILE <- "~/scratch/merged_sv_per_gene.test.txt" 59 | 60 | 61 | ###Process data & write out 62 | res <- 
process.counts(counts.list.in,seed.table.in) 63 | write.table(res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F) 64 | 65 | -------------------------------------------------------------------------------- /gnomad_sv_manuscript_code/metadata_generation/seed_downsampling.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2018 Talkowski Laboratory 4 | # Contact: Ryan Collins 5 | # Distributed under terms of the MIT license. 6 | 7 | # gnomAD v2 SV analysis helper script 8 | 9 | # Create list of unique downsample seeds 10 | 11 | 12 | ###Set master parameters 13 | options(stringsAsFactors=F,scipen=1000) 14 | sample.sizes <- c(1,2,3,4,5,6,7,8,9,10,25,50,75,100,250,500,750,1000,2500,5000,7500,10000) 15 | sample.sizes <- sample.sizes[which(sample.sizes<=10000)] 16 | 17 | ###Read command-line arguments 18 | args <- commandArgs(trailingOnly=T) 19 | N <- as.numeric(args[1]) 20 | # master.seed <- as.numeric(args[2]) 21 | OUTFILE <- as.character(args[2]) 22 | 23 | ###Create data frame of random seeds & sample sizes 24 | s <- as.numeric(sapply(sample.sizes,rep,times=N)) 25 | set.seed(123456789) 26 | r <- ceiling(runif(length(s), 0, 10^12)) 27 | out <- data.frame("sample.size"=s, 28 | "random.seed"=r) 29 | write.table(out,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F) 30 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_scripts/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_00/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_scripts/module_00/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_00/vcf2baf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################# 4 | # gnomAD SV Discovery # 5 | ############################# 6 | 7 | # Copyright (c) 2018 Harold Wang, Ryan L. Collins, and the Talkowski Lab 8 | # Distributed under terms of the MIT License (see LICENSE) 9 | # Contact: Ryan L. Collins 10 | # gnomAD credits: http://gnomad.broadinstitute.org/ 11 | 12 | #Wrapper to handle pre-filtering for vcf2baf_helper.py 13 | #Collects BAF data for all samples present in a given VCF input 14 | 15 | 16 | #####Usage statement 17 | usage(){ 18 | cat < ${OUTFILE} 69 | 70 | 71 | #####Bgzip & tabix index OUTFILE, if optioned 72 | if [ ${BGZIP} -gt 0 ]; then 73 | bgzip -f ${OUTFILE} 74 | tabix -s 1 -b 2 -e 2 -f ${OUTFILE}.gz 75 | fi 76 | 77 | 78 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_01/make_depth_rdtest_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 Matthew Stone 5 | # Distributed under terms of the MIT license. 
6 | 7 | """ 8 | Convert bedcluster output to RdTest format 9 | """ 10 | 11 | import argparse 12 | import sys 13 | import pandas as pd 14 | 15 | 16 | def make_depth_rdtest_bed(svof): 17 | svof['#chrom'] = svof['#chrom'].astype(str) 18 | svof['start'] = svof.start.astype(int) 19 | svof['end'] = svof.end.astype(int) 20 | bed = svof['#chrom start end name svtype'.split()].drop_duplicates() 21 | 22 | # Add samples 23 | def agg_samples(samples): 24 | return ','.join(sorted(set(samples))) 25 | samples = svof.groupby('name')['sample'].agg(agg_samples) 26 | samples = samples.rename('samples').reset_index() 27 | bed = pd.merge(bed, samples, on='name', how='left') 28 | 29 | # Format 30 | bed['svtype'] = bed.svtype.str.upper() 31 | 32 | cols = '#chrom start end name samples svtype'.split() 33 | return bed[cols] 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser( 38 | description=__doc__, 39 | formatter_class=argparse.RawDescriptionHelpFormatter) 40 | parser.add_argument('bed', help='Input BED') 41 | parser.add_argument('fout', help='Output BED', type=argparse.FileType('w'), 42 | default=sys.stdout, nargs='?') 43 | args = parser.parse_args() 44 | 45 | clustered = pd.read_table(args.bed) 46 | 47 | bed = make_depth_rdtest_bed(clustered) 48 | 49 | bed.to_csv(args.fout, sep='\t', index=False) 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_scripts/module_01/make_pesr_rdtest_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 Matthew Stone 5 | # Distributed under terms of the MIT license. 6 | 7 | """ 8 | Convert CNV records in clustered VCF to RdTest bed format. 
9 | """ 10 | 11 | import argparse 12 | import sys 13 | from collections import deque 14 | from pysam import VariantFile 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description=__doc__, 20 | formatter_class=argparse.RawDescriptionHelpFormatter) 21 | parser.add_argument('vcf', help='Input VCF') 22 | parser.add_argument('bed', help='Output BED', type=argparse.FileType('w'), 23 | default=sys.stdout, nargs='?') 24 | args = parser.parse_args() 25 | 26 | # Prep bed 27 | header = '#chrom\tstart\tend\tname\tsamples\tsvtype\n' 28 | args.bed.write(header) 29 | entry = '{chrom}\t{start}\t{end}\t{name}\t{samples}\t{svtype}\n' 30 | 31 | vcf = VariantFile(args.vcf) 32 | for record in vcf: 33 | # Skip non-CNV 34 | if record.info['SVTYPE'] not in 'DEL DUP'.split(): 35 | continue 36 | 37 | # Get bed interval and metadata 38 | chrom = record.chrom 39 | start = record.pos 40 | end = record.stop 41 | name = record.id 42 | svtype = record.info['SVTYPE'] 43 | 44 | # Get list of called samples 45 | samples = deque() 46 | null_GTs = [(0, 0), (None, None), (0, ), (None, )] 47 | for sample in record.samples: 48 | gt = record.samples[sample]['GT'] 49 | if gt not in null_GTs: 50 | samples.append(sample) 51 | if len(samples) == 0: 52 | continue 53 | samples = ','.join(sorted(set(samples))) 54 | 55 | args.bed.write(entry.format(**locals())) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkowski-lab/gnomad-sv-pipeline/b7798895fc1b2c8d83b41b36148b0a4b3a8d25cb/gnomad_sv_pipeline_wdls/.DS_Store -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_BAF_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] BAF_files 3 | Array[File] BAF_indexes 4 | Array[String] samples 5 | String batch 6 | File inclusion_bed 7 | 8 | call merge_PESR_files as merge_BAF_files { 9 | input: 10 | files=BAF_files, 11 | indexes=BAF_indexes, 12 | batch=batch, 13 | evidence="BAF", 14 | inclusion_bed=inclusion_bed 15 | } 16 | 17 | output { 18 | File merged_BAF = merge_BAF_files.merged 19 | File merged_BAF_idx = merge_BAF_files.merged_idx 20 | } 21 | } 22 | 23 | task merge_PESR_files { 24 | Array[File] files 25 | Array[File] indexes 26 | String batch 27 | String evidence 28 | File inclusion_bed 29 | 30 | command <<< 31 | tmpdir=$(mktemp -d); 32 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 33 | while read file; do 34 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 35 | done < ${write_tsv(files)}; 36 | echo "$cmd" 37 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 38 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 39 | >>> 40 | 41 | output { 42 | File merged = "${batch}.${evidence}.txt.gz" 43 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 44 | } 45 | 46 | runtime { 47 | docker: "talkowski/sv-pipeline-remote-pysam" 48 | memory: "8 GB" 49 | disks: "local-disk 5000 HDD" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_PESRRD_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] PE_files 3 | Array[File] PE_indexes 4 | Array[File] 
SR_files 5 | Array[File] SR_indexes 6 | Array[File] bincov_files 7 | Array[File] bincov_indexes 8 | Array[String] samples 9 | String batch 10 | File inclusion_bed 11 | 12 | call merge_PESR_files as merge_PE_files { 13 | input: 14 | files=PE_files, 15 | indexes=PE_indexes, 16 | batch=batch, 17 | evidence="PE", 18 | inclusion_bed=inclusion_bed 19 | } 20 | 21 | call merge_PESR_files as merge_SR_files { 22 | input: 23 | files=SR_files, 24 | indexes=SR_indexes, 25 | batch=batch, 26 | evidence="SR", 27 | inclusion_bed=inclusion_bed 28 | } 29 | 30 | call make_bincov_matrix { 31 | input: 32 | samples=samples, 33 | filepaths=bincov_files, 34 | batch=batch, 35 | } 36 | 37 | output { 38 | File merged_PE = merge_PE_files.merged 39 | File merged_PE_idx = merge_PE_files.merged_idx 40 | File merged_SR = merge_SR_files.merged 41 | File merged_SR_idx = merge_SR_files.merged_idx 42 | File merged_bincov = make_bincov_matrix.bincov_matrix 43 | File merged_bincov_idx = make_bincov_matrix.bincov_matrix_idx 44 | } 45 | } 46 | 47 | task merge_PESR_files { 48 | Array[File] files 49 | Array[File] indexes 50 | String batch 51 | String evidence 52 | File inclusion_bed 53 | 54 | command <<< 55 | tmpdir=$(mktemp -d); 56 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 57 | while read file; do 58 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 59 | done < ${write_tsv(files)}; 60 | echo "$cmd" 61 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 62 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 63 | >>> 64 | 65 | output { 66 | File merged = "${batch}.${evidence}.txt.gz" 67 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 68 | } 69 | 70 | runtime { 71 | docker: "talkowski/sv-pipeline-remote-pysam" 72 | memory: "8 GB" 73 | disks: "local-disk 5000 HDD" 74 | } 75 | } 76 | 77 | task make_bincov_matrix { 78 | Array[String] samples 79 | Array[File] filepaths 80 | String batch 81 | 82 | command <<< 83 | paste ${write_tsv(samples)} ${write_tsv(filepaths)} > samples.key; 84 | makeMatrix.sh -z -N -o ${batch}.bincov.bed.gz samples.key 85 | >>> 86 | 87 | output { 88 | File bincov_matrix = "${batch}.bincov.bed.gz" 89 | File bincov_matrix_idx = "${batch}.bincov.bed.gz.tbi" 90 | } 91 | 92 | runtime { 93 | docker: "talkowski/sv-pipeline-remote-pysam" 94 | disks: "local-disk 1000 HDD" 95 | } 96 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_SR_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] SR_files 3 | Array[File] SR_indexes 4 | Array[String] samples 5 | String batch 6 | File inclusion_bed 7 | 8 | call merge_PESR_files as merge_SR_files { 9 | input: 10 | files=SR_files, 11 | indexes=SR_indexes, 12 | batch=batch, 13 | evidence="SR", 14 | inclusion_bed=inclusion_bed 15 | } 16 | 17 | output { 18 | File merged_SR = merge_SR_files.merged 19 | File merged_SR_idx = merge_SR_files.merged_idx 20 | } 21 | } 22 | 23 | task merge_PESR_files { 24 | Array[File] files 25 | Array[File] indexes 26 | String batch 27 | String evidence 28 | File inclusion_bed 29 | 30 | command <<< 31 | tmpdir=$(mktemp -d); 32 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 33 | while read file; do 34 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 35 | done < ${write_tsv(files)}; 36 | echo "$cmd" 37 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 38 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 39 | >>> 40 | 41 | output { 42 | File merged = "${batch}.${evidence}.txt.gz" 43 
| File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 44 | } 45 | 46 | runtime { 47 | docker: "talkowski/sv-pipeline-remote-pysam" 48 | memory: "8 GB" 49 | disks: "local-disk 5000 HDD" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_batch_evidence_merging.wdl: -------------------------------------------------------------------------------- 1 | workflow evidence_merging { 2 | Array[File] PE_files 3 | Array[File] PE_indexes 4 | Array[File] SR_files 5 | Array[File] SR_indexes 6 | Array[File] bincov_files 7 | Array[File] bincov_indexes 8 | Array[File] BAF_files 9 | Array[File] BAF_indexes 10 | Array[String] samples 11 | String batch 12 | File inclusion_bed 13 | 14 | call merge_PESR_files as merge_PE_files { 15 | input: 16 | files=PE_files, 17 | indexes=PE_indexes, 18 | batch=batch, 19 | evidence="PE", 20 | inclusion_bed=inclusion_bed 21 | } 22 | 23 | call merge_PESR_files as merge_SR_files { 24 | input: 25 | files=SR_files, 26 | indexes=SR_indexes, 27 | batch=batch, 28 | evidence="SR", 29 | inclusion_bed=inclusion_bed 30 | } 31 | 32 | call make_bincov_matrix { 33 | input: 34 | samples=samples, 35 | filepaths=bincov_files, 36 | batch=batch, 37 | } 38 | 39 | call merge_PESR_files as merge_BAF_files { 40 | input: 41 | files=BAF_files, 42 | indexes=BAF_indexes, 43 | batch=batch, 44 | evidence="BAF", 45 | inclusion_bed=inclusion_bed 46 | } 47 | 48 | output { 49 | File merged_PE = merge_PE_files.merged 50 | File merged_PE_idx = merge_PE_files.merged_idx 51 | File merged_SR = merge_SR_files.merged 52 | File merged_SR_idx = merge_SR_files.merged_idx 53 | File merged_bincov = make_bincov_matrix.bincov_matrix 54 | File merged_bincov_idx = make_bincov_matrix.bincov_matrix_idx 55 | File merged_BAF = merge_BAF_files.merged 56 | File merged_BAF_idx = merge_BAF_files.merged_idx 57 | } 58 | } 59 | 60 | task merge_PESR_files { 61 | Array[File] files 62 | Array[File] indexes 63 | String batch 64 | String evidence 65 | File inclusion_bed 66 | 67 | command <<< 68 | tmpdir=$(mktemp -d); 69 | cmd="sort -m -k1,1V -k2,2n -T $tmpdir"; 70 | while read file; do 71 | cmd="$cmd <( tabix -h -R ${inclusion_bed} $file )" 72 | done < ${write_tsv(files)}; 73 | echo "$cmd" 74 | eval "$cmd" | bgzip -c > ${batch}.${evidence}.txt.gz; 75 | tabix -f -s1 -b 2 -e 2 ${batch}.${evidence}.txt.gz 76 | >>> 77 | 78 | output { 79 | File merged = "${batch}.${evidence}.txt.gz" 80 | File merged_idx = "${batch}.${evidence}.txt.gz.tbi" 81 | } 82 | 83 | runtime { 84 | docker: "talkowski/sv-pipeline-remote-pysam" 85 | memory: "8 GB" 86 | disks: "local-disk 5000 HDD" 87 | } 88 | } 89 | 90 | task make_bincov_matrix { 91 | Array[String] samples 92 | Array[File] filepaths 93 | String batch 94 | 95 | command <<< 96 | paste ${write_tsv(samples)} ${write_tsv(filepaths)} > samples.key; 97 | makeMatrix.sh -z -N -o ${batch}.bincov.bed.gz samples.key 98 | >>> 99 | 100 | output { 101 | File bincov_matrix = "${batch}.bincov.bed.gz" 102 | File bincov_matrix_idx = "${batch}.bincov.bed.gz.tbi" 103 | } 104 | 105 | runtime { 106 | docker: "talkowski/sv-pipeline-remote-pysam" 107 | disks: "local-disk 1000 HDD" 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_depth_preprocessing.wdl: -------------------------------------------------------------------------------- 1 | workflow preprocess_depth { 2 | Array[File] beds 3 | String batch 4 | 5 | call concat_batch as preprocess_DELs { 6 | input: 
7 | beds=beds, 8 | batch=batch, 9 | svtype="DEL" 10 | } 11 | 12 | call concat_batch as preprocess_DUPs { 13 | input: 14 | beds=beds, 15 | batch=batch, 16 | svtype="DUP" 17 | } 18 | 19 | output { 20 | File del_bed = preprocess_DELs.bed 21 | File dup_bed = preprocess_DUPs.bed 22 | File del_bed_idx = preprocess_DELs.bed_idx 23 | File dup_bed_idx = preprocess_DUPs.bed_idx 24 | } 25 | } 26 | 27 | task concat_batch { 28 | Array[File] beds 29 | String svtype 30 | String batch 31 | 32 | command <<< 33 | zcat ${sep=' ' beds} \ 34 | | sed -e '/^#chr/d' -e 's/cn.MOPS/cnmops/g' \ 35 | | awk -v svtype=${svtype} '($6==svtype)' \ 36 | | sort -k1,1V -k2,2n \ 37 | | awk -v OFS="\t" -v svtype=${svtype} -v batch=${batch} '{$4=batch"_"svtype"_"NR; print}' \ 38 | | cat <(echo -e "#chr\tstart\tend\tname\tsample\tsvtype\tsources") - \ 39 | | bgzip -c \ 40 | > ${batch}.${svtype}.bed.gz; 41 | tabix -p bed ${batch}.${svtype}.bed.gz 42 | >>> 43 | 44 | output { 45 | File bed="${batch}.${svtype}.bed.gz" 46 | File bed_idx="${batch}.${svtype}.bed.gz.tbi" 47 | } 48 | 49 | runtime { 50 | docker: "talkowski/sv-pipeline" 51 | preemptible: 3 52 | } 53 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_pesr_preprocessing.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:00_pesr_processing_single_algorithm/versions/4/plain-WDL/descriptor" as pp 2 | 3 | workflow preprocess_pesr { 4 | String sample # Sample ID 5 | File manta_vcf # Manta VCF 6 | File delly_vcf # Delly VCF 7 | File melt_vcf # Melt VCF 8 | File contigs # .fai file of whitelisted contigs 9 | Int min_svsize # Minimum SV length to include 10 | 11 | call pp.preprocess_algorithm as process_manta { 12 | input: 13 | vcf=manta_vcf, 14 | contigs=contigs, 15 | min_svsize=min_svsize, 16 | algorithm="manta", 17 | sample=sample 18 | } 19 | 20 | call pp.preprocess_algorithm as process_delly { 21 | input: 22 | vcf=delly_vcf, 23 | contigs=contigs, 24 | min_svsize=min_svsize, 25 | algorithm="delly", 26 | sample=sample 27 | } 28 | 29 | call pp.preprocess_algorithm as process_melt { 30 | input: 31 | vcf=melt_vcf, 32 | contigs=contigs, 33 | min_svsize=min_svsize, 34 | algorithm="melt", 35 | sample=sample 36 | } 37 | 38 | output { 39 | File std_manta_vcf = process_manta.std_vcf 40 | File std_delly_vcf = process_delly.std_vcf 41 | File std_melt_vcf = process_melt.std_vcf 42 | } 43 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_00/00_pesr_processing_single_algorithm.wdl: -------------------------------------------------------------------------------- 1 | workflow preprocess_algorithm { 2 | File vcf 3 | File contigs 4 | String sample 5 | String algorithm 6 | Int min_svsize 7 | 8 | call standardize_vcf { 9 | input: 10 | raw_vcf=vcf, 11 | algorithm=algorithm, 12 | group=sample, 13 | contigs=contigs, 14 | min_svsize=min_svsize 15 | } 16 | 17 | call sort_vcf { 18 | input: 19 | unsorted_vcf=standardize_vcf.std_vcf, 20 | algorithm=algorithm, 21 | group=sample 22 | } 23 | 24 | output { 25 | File std_vcf = sort_vcf.sorted_vcf 26 | } 27 | } 28 | 29 | task standardize_vcf { 30 | File raw_vcf 31 | File contigs 32 | Int min_svsize 33 | String algorithm 34 | String group 35 | 36 | command { 37 | svtk standardize --prefix ${algorithm}_${group} --contigs ${contigs} --min-size ${min_svsize} ${raw_vcf} ${algorithm}.${group}.vcf ${algorithm} 38 | } 39 | 40 | output { 
41 | File std_vcf="${algorithm}.${group}.vcf" 42 | String group_="${group}" 43 | } 44 | 45 | runtime { 46 | docker: "talkowski/sv-pipeline" 47 | } 48 | } 49 | 50 | task sort_vcf { 51 | File unsorted_vcf 52 | String algorithm 53 | String group 54 | 55 | command { 56 | vcf-sort -c ${unsorted_vcf} | bgzip -c > ${algorithm}.${group}.vcf.gz; 57 | tabix -p vcf ${algorithm}.${group}.vcf.gz 58 | } 59 | 60 | output { 61 | File sorted_vcf="${algorithm}.${group}.vcf.gz" 62 | } 63 | 64 | runtime { 65 | docker: "talkowski/sv-pipeline" 66 | } 67 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_depth_clustering.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:01_depth_clustering_by_chrom/versions/4/plain-WDL/descriptor" as dibc 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/47/plain-WDL/descriptor" as vcf_qc 3 | 4 | workflow cluster_depth { 5 | File del_bed 6 | File dup_bed 7 | File contigs 8 | Float frac 9 | String flags 10 | String batch 11 | File famfile 12 | File trios_famfile 13 | String ref_build 14 | File Sanders_2015_tarball 15 | File Collins_2017_tarball 16 | File Werling_2018_tarball 17 | 18 | call dibc.bedcluster_by_chrom as cluster_DELs { 19 | input: 20 | batch=batch, 21 | svtype="DEL", 22 | bed=del_bed, 23 | contigs=contigs, 24 | frac=frac, 25 | flags=flags 26 | } 27 | 28 | call dibc.bedcluster_by_chrom as cluster_DUPs { 29 | input: 30 | batch=batch, 31 | svtype="DUP", 32 | bed=dup_bed, 33 | contigs=contigs, 34 | frac=frac, 35 | flags=flags 36 | } 37 | 38 | call make_rdtest_bed { 39 | input: 40 | dels=cluster_DELs.clustered_bed, 41 | dups=cluster_DUPs.clustered_bed, 42 | batch=batch, 43 | } 44 | 45 | call make_depth_vcf { 46 | input: 47 | bed=make_rdtest_bed.bed, 48 | batch=batch, 49 | contigs=contigs 50 | } 51 | 52 | call vcf_qc.master_vcf_qc as vcf_qc { 53 | input: 54 | vcf=make_depth_vcf.vcf, 55 | famfile=trios_famfile, 56 | ref_build=ref_build, 57 | prefix="${batch}_clustered_depth_vcf", 58 | sv_per_shard=10000, 59 | samples_per_shard=100, 60 | Sanders_2015_tarball=Sanders_2015_tarball, 61 | Collins_2017_tarball=Collins_2017_tarball, 62 | Werling_2018_tarball=Werling_2018_tarball 63 | } 64 | 65 | output { 66 | File clustered_vcf = make_depth_vcf.vcf 67 | File clustered_vcf_qc = vcf_qc.sv_vcf_qc_output 68 | } 69 | } 70 | 71 | task make_rdtest_bed { 72 | File dels 73 | File dups 74 | File script 75 | String batch 76 | 77 | command <<< 78 | cat \ 79 | <(python3 ${script} ${dels} | sed '1d') \ 80 | <(python3 ${script} ${dups} | sed '1d') \ 81 | | sort -k1,1V -k2,2n \ 82 | | cat <(echo -e "#chrom start end name samples svtype" | sed -e 's/ /\t/g') - \ 83 | > ${batch}.depth.bed; 84 | >>> 85 | 86 | output { 87 | File bed = "${batch}.depth.bed" 88 | } 89 | 90 | runtime { 91 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 92 | preemptible: 3 93 | } 94 | } 95 | 96 | task make_depth_vcf { 97 | File bed 98 | File contigs 99 | String batch 100 | 101 | command <<< 102 | cut -f5 ${bed} | sed -e '1d' -e 's/,/\n/g' | sort -u > samples.list; 103 | svtk rdtest2vcf --contigs ${contigs} ${bed} samples.list ${batch}.depth.vcf.gz; 104 | >>> 105 | 106 | output { 107 | File vcf = "${batch}.depth.vcf.gz" 108 | } 109 | 110 | runtime { 111 | docker: 
"talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 112 | preemptible: 3 113 | } 114 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_depth_clustering_by_chrom.wdl: -------------------------------------------------------------------------------- 1 | workflow bedcluster_by_chrom { 2 | String batch 3 | String svtype 4 | File bed 5 | File contigs 6 | Float frac 7 | String flags 8 | 9 | Array[Array[String]] contiglist = read_tsv(contigs) 10 | 11 | scatter (contig in contiglist) { 12 | call bedcluster { 13 | input: 14 | batch=batch, 15 | svtype=svtype, 16 | chrom=contig[0], 17 | bed=bed, 18 | frac=frac, 19 | flags=flags 20 | } 21 | } 22 | 23 | call concat_beds { 24 | input: 25 | batch=batch, 26 | svtype=svtype, 27 | beds=bedcluster.clustered_bed 28 | } 29 | 30 | output { 31 | File clustered_bed = concat_beds.merged_bed 32 | } 33 | } 34 | 35 | task bedcluster { 36 | String batch 37 | String svtype 38 | String chrom 39 | File bed 40 | 41 | Float frac 42 | String flags 43 | 44 | command { 45 | tabix -p bed ${bed}; 46 | svtk bedcluster ${bed} -r ${chrom} \ 47 | -p ${batch}_depth_${svtype}_${chrom} \ 48 | -f ${frac} \ 49 | ${flags} \ 50 | > ${batch}.${svtype}.${chrom}.bed 51 | } 52 | 53 | output { 54 | File clustered_bed="${batch}.${svtype}.${chrom}.bed" 55 | } 56 | 57 | runtime { 58 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 59 | preemptible: 3 60 | } 61 | } 62 | 63 | task concat_beds { 64 | String batch 65 | String svtype 66 | Array[File] beds 67 | 68 | command <<< 69 | awk 'FNR==1 && NR!=1 { while (/^#chrom/) getline; } 1 {print}' ${sep=' ' beds} > ${batch}.${svtype}.bed 70 | >>> 71 | 72 | output { 73 | File merged_bed = "${batch}.${svtype}.bed" 74 | } 75 | 76 | runtime { 77 | docker: "talkowski/sv-pipeline@sha256:a89824ac34b915f605d09bcf57516bc76d950bd762ad5c1f336d421be917be55" 78 | preemptible: 3 79 | } 80 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_pesr_clustering.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:01_pesr_clustering_single_algorithm/versions/15/plain-WDL/descriptor" as single 2 | 3 | workflow cluster_pesr { 4 | Array[File] manta_vcfs 5 | Array[File] delly_vcfs 6 | Array[File] melt_vcfs 7 | File contigs 8 | String batch 9 | File trios_famfile 10 | String ref_build 11 | File Sanders_2015_tarball 12 | File Collins_2017_tarball 13 | File Werling_2018_tarball 14 | 15 | Int dist 16 | Float frac 17 | File blacklist 18 | Int svsize 19 | String flags 20 | 21 | call single.cluster_pesr_algorithm as cluster_manta { 22 | input: 23 | vcfs=manta_vcfs, 24 | batch=batch, 25 | algorithm="manta", 26 | contigs=contigs, 27 | dist=dist, 28 | frac=frac, 29 | blacklist=blacklist, 30 | svsize=svsize, 31 | flags=flags, 32 | svtypes="DEL,DUP,INV,BND,INS", 33 | famfile=famfile, 34 | ref_build=ref_build, 35 | Sanders_2015_tarball=Sanders_2015_tarball, 36 | Werling_2018_tarball=Werling_2018_tarball 37 | } 38 | 39 | call single.cluster_pesr_algorithm as cluster_delly { 40 | input: 41 | vcfs=delly_vcfs, 42 | batch=batch, 43 | algorithm="delly", 44 | contigs=contigs, 45 | dist=dist, 46 | frac=frac, 47 | blacklist=blacklist, 48 | svsize=svsize, 49 | flags=flags, 50 | svtypes="DEL,DUP,INV,BND", 51 | famfile=famfile, 52 | ref_build=ref_build, 53 | 
Sanders_2015_tarball=Sanders_2015_tarball, 54 | Werling_2018_tarball=Werling_2018_tarball 55 | } 56 | 57 | call single.cluster_pesr_algorithm as cluster_melt { 58 | input: 59 | vcfs=melt_vcfs, 60 | batch=batch, 61 | algorithm="melt", 62 | contigs=contigs, 63 | dist=dist, 64 | frac=frac, 65 | blacklist=blacklist, 66 | svsize=svsize, 67 | flags=flags, 68 | svtypes="INS", 69 | famfile=famfile, 70 | ref_build=ref_build, 71 | Sanders_2015_tarball=Sanders_2015_tarball, 72 | Werling_2018_tarball=Werling_2018_tarball 73 | } 74 | 75 | # call merge_vcf_qc { 76 | # input: 77 | # manta_vcf_qc=cluster_manta.clustered_vcf_qc, 78 | # delly_vcf_qc=cluster_delly.clustered_vcf_qc, 79 | # melt_vcf_qc=cluster_melt.clustered_vcf_qc 80 | # } 81 | 82 | output { 83 | File manta_vcf = cluster_manta.clustered_vcf 84 | File manta_vcf_qc = cluster_manta.clustered_vcf_qc 85 | File delly_vcf = cluster_delly.clustered_vcf 86 | File delly_vcf_qc = cluster_delly.clustered_vcf_qc 87 | File melt_vcf = cluster_melt.clustered_vcf 88 | File melt_vcf_qc = cluster_melt.clustered_vcf_qc 89 | # File merged_vcf_qc = merge_vcf_qc.merged_qc 90 | } 91 | } 92 | 93 | # task merge_vcf_qc { 94 | # File manta_vcf_qc 95 | # File delly_vcf_qc 96 | # File melt_vcf_qc 97 | 98 | # command <<< 99 | # mkdir merged_pesr_clustering_vcf_qc/ 100 | # mv ${manta_vcf_qc} merged_pesr_clustering_vcf_qc/ 101 | # mv ${delly_vcf_qc} merged_pesr_clustering_vcf_qc/ 102 | # mv ${melt_vcf_qc} merged_pesr_clustering_vcf_qc/ 103 | # tar -czvf merged_pesr_clustering_vcf_qc.tar.gz \ 104 | # merged_pesr_clustering_vcf_qc 105 | # >>> 106 | 107 | # output { 108 | # File merged_qc = "merged_pesr_clustering_vcf_qc.tar.gz" 109 | # } 110 | 111 | # runtime { 112 | # docker: "talkowski/sv-pipeline" 113 | # preemptible: 3 114 | # } 115 | # } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_01/01_pesr_clustering_single_algorithm.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/47/plain-WDL/descriptor" as vcf_qc 2 | 3 | workflow cluster_pesr_algorithm { 4 | Array[File] vcfs 5 | File contigs 6 | String batch 7 | String algorithm 8 | File famfile 9 | File trios_famfile 10 | String ref_build 11 | File Sanders_2015_tarball 12 | File collins_2017_tarball 13 | File Werling_2018_tarball 14 | 15 | # VCFcluster parameters 16 | Int dist 17 | Float frac 18 | File blacklist 19 | Int svsize 20 | String svtypes 21 | String flags 22 | 23 | Array[Array[String]] contiglist = read_tsv(contigs) 24 | 25 | scatter (contig in contiglist) { 26 | call vcfcluster { 27 | input: 28 | vcfs=vcfs, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=contig[0], 32 | dist=dist, 33 | frac=frac, 34 | blacklist=blacklist, 35 | svsize=svsize, 36 | flags=flags, 37 | svtypes=svtypes 38 | } 39 | } 40 | 41 | call concat_vcfs { 42 | input: 43 | vcfs=vcfcluster.clustered_vcf, 44 | batch=batch, 45 | algorithm=algorithm 46 | } 47 | 48 | call vcf_qc.master_vcf_qc as vcf_qc { 49 | input: 50 | vcf=concat_vcfs.vcf, 51 | famfile=trios_famfile, 52 | ref_build=ref_build, 53 | prefix="${batch}_clustered_${algorithm}_vcf", 54 | sv_per_shard=10000, 55 | samples_per_shard=100, 56 | Sanders_2015_tarball=Sanders_2015_tarball, 57 | Collins_2017_tarball=Collins_2017_tarball, 58 | Werling_2018_tarball=Werling_2018_tarball 59 | } 60 | 61 | output { 62 | File clustered_vcf = concat_vcfs.vcf 63 | File clustered_vcf_qc = vcf_qc.sv_vcf_qc_output 64 | 
} 65 | } 66 | 67 | task vcfcluster { 68 | Array[File] vcfs 69 | String batch 70 | String algorithm 71 | String chrom 72 | 73 | # VCFcluster parameters 74 | Int dist 75 | Float frac 76 | File blacklist 77 | Int svsize 78 | String svtypes 79 | String flags 80 | 81 | command <<< 82 | for f in ${sep=' ' vcfs}; do tabix -p vcf -f $f; done; 83 | tabix -p bed ${blacklist}; 84 | 85 | svtk vcfcluster ${write_tsv(vcfs)} stdout \ 86 | -r ${chrom} \ 87 | -p ${batch}_${algorithm}_${chrom} \ 88 | -d ${dist} \ 89 | -f ${frac} \ 90 | -x ${blacklist} \ 91 | -z ${svsize} \ 92 | -t ${svtypes} \ 93 | ${flags} \ 94 | | vcf-sort -c \ 95 | | bgzip -c > ${batch}.${algorithm}.${chrom}.vcf.gz 96 | >>> 97 | 98 | output { 99 | File clustered_vcf="${batch}.${algorithm}.${chrom}.vcf.gz" 100 | } 101 | 102 | runtime { 103 | docker: "talkowski/sv-pipeline" 104 | disks: "local-disk 300 HDD" 105 | preemptible: 3 106 | } 107 | } 108 | 109 | task concat_vcfs { 110 | Array[File] vcfs 111 | String batch 112 | String algorithm 113 | 114 | command { 115 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${batch}.${algorithm}.vcf.gz; 116 | tabix -p vcf ${batch}.${algorithm}.vcf.gz; 117 | } 118 | 119 | output { 120 | File vcf="${batch}.${algorithm}.vcf.gz" 121 | File idx="${batch}.${algorithm}.vcf.gz.tbi" 122 | } 123 | 124 | runtime { 125 | docker: "talkowski/sv-pipeline" 126 | disks: "local-disk 300 HDD" 127 | preemptible: 3 128 | } 129 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_aggregate.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_assess_evidence_single_vcf/versions/31/plain-WDL/descriptor" as assess 2 | workflow assess_evidence_batch { 3 | File mantavcf # Input VCF 4 | File meltvcf # Input VCF 5 | File dellyvcf # Input VCF 6 | File depthvcf # Input VCF 7 | File discfile # Discordant pair file 8 | File discfile_idx # Tabix index of discordant pair file 9 | File splitfile # Split read file 10 | File splitfile_idx # Tabix index of split read file 11 | File coveragefile # Bincov matrix 12 | File coveragefile_idx # Tabix index of bincov matrix 13 | File medianfile # Median coverage of each sample 14 | File baf_metrics # Matrix of BAF statistics 15 | File baf_metrics_idx # Tabix index of BAF matrix 16 | File famfile # Batch fam file 17 | File autosome_contigs # Autosomes .fai 18 | File allosome_contigs # Allosomes .fai 19 | File rmsk # Repeatmasker track 20 | File segdups # Seg dups track 21 | String batch # Batch ID 22 | Int PE_split_size # Number of lines in each petest split 23 | Int SR_split_size # Number of lines in each srtest split 24 | Int RD_split_size # Number of lines in each rdtest split 25 | Int BAF_split_size # Number of lines in each baftest split 26 | File svc_acct_key 27 | Array[String] samples 28 | call assess.assess_evidence as assessmanta{input: 29 | vcf=mantavcf, 30 | samples=samples, 31 | svc_acct_key=svc_acct_key, 32 | discfile=discfile, 33 | discfile_idx=discfile_idx, 34 | splitfile=splitfile, 35 | splitfile_idx=splitfile_idx, 36 | coveragefile=coveragefile, 37 | coveragefile_idx=coveragefile_idx, 38 | medianfile=medianfile, 39 | baf_metrics=baf_metrics, 40 | baf_metrics_idx=baf_metrics_idx, 41 | famfile=famfile, 42 | autosome_contigs=autosome_contigs, 43 | allosome_contigs=allosome_contigs, 44 | rmsk=rmsk, 45 | segdups=segdups, 46 | batch=batch, 47 | algorithm="manta", 48 | PE_split_size=PE_split_size, 49 | 
SR_split_size=SR_split_size, 50 | RD_split_size=RD_split_size, 51 | BAF_split_size=BAF_split_size, 52 | } 53 | call assess.assess_evidence as assessmelt{input: 54 | vcf=meltvcf, 55 | samples=samples, 56 | svc_acct_key=svc_acct_key, 57 | discfile=discfile, 58 | discfile_idx=discfile_idx, 59 | splitfile=splitfile, 60 | splitfile_idx=splitfile_idx, 61 | coveragefile=coveragefile, 62 | coveragefile_idx=coveragefile_idx, 63 | medianfile=medianfile, 64 | baf_metrics=baf_metrics, 65 | baf_metrics_idx=baf_metrics_idx, 66 | famfile=famfile, 67 | autosome_contigs=autosome_contigs, 68 | allosome_contigs=allosome_contigs, 69 | rmsk=rmsk, 70 | segdups=segdups, 71 | batch=batch, 72 | algorithm="melt", 73 | PE_split_size=PE_split_size, 74 | SR_split_size=SR_split_size, 75 | RD_split_size=RD_split_size, 76 | BAF_split_size=BAF_split_size,} 77 | call assess.assess_evidence as assessdelly{input: 78 | vcf=dellyvcf, 79 | samples=samples, 80 | svc_acct_key=svc_acct_key, 81 | discfile=discfile, 82 | discfile_idx=discfile_idx, 83 | splitfile=splitfile, 84 | splitfile_idx=splitfile_idx, 85 | coveragefile=coveragefile, 86 | coveragefile_idx=coveragefile_idx, 87 | medianfile=medianfile, 88 | baf_metrics=baf_metrics, 89 | baf_metrics_idx=baf_metrics_idx, 90 | famfile=famfile, 91 | autosome_contigs=autosome_contigs, 92 | allosome_contigs=allosome_contigs, 93 | rmsk=rmsk, 94 | segdups=segdups, 95 | batch=batch, 96 | algorithm="delly", 97 | PE_split_size=PE_split_size, 98 | SR_split_size=SR_split_size, 99 | RD_split_size=RD_split_size, 100 | BAF_split_size=BAF_split_size,} 101 | call assess.assess_evidence as assessdepth{input: 102 | vcf=depthvcf, 103 | samples=samples, 104 | svc_acct_key=svc_acct_key, 105 | discfile=discfile, 106 | discfile_idx=discfile_idx, 107 | splitfile=splitfile, 108 | splitfile_idx=splitfile_idx, 109 | coveragefile=coveragefile, 110 | coveragefile_idx=coveragefile_idx, 111 | medianfile=medianfile, 112 | baf_metrics=baf_metrics, 113 | baf_metrics_idx=baf_metrics_idx, 114 | famfile=famfile, 115 | autosome_contigs=autosome_contigs, 116 | allosome_contigs=allosome_contigs, 117 | rmsk=rmsk, 118 | segdups=segdups, 119 | batch=batch, 120 | algorithm="depth", 121 | PE_split_size=PE_split_size, 122 | SR_split_size=SR_split_size, 123 | RD_split_size=RD_split_size, 124 | BAF_split_size=BAF_split_size,} 125 | call aggregate_metric{input: 126 | batch=batch,mantametric=assessmanta.metrics,dellymetric=assessdelly.metrics,meltmetric=assessmelt.metrics,depthmetric=assessdepth.metrics} 127 | output{ 128 | File metrics=aggregate_metric.metrics 129 | } 130 | } 131 | task aggregate_metric{ 132 | String batch 133 | File mantametric 134 | File dellymetric 135 | File depthmetric 136 | File meltmetric 137 | command <<< 138 | python3 <>> 148 | output{ 149 | File metrics="${batch}.metrics" 150 | } 151 | runtime { 152 | preemptible: 3 153 | docker: "talkowski/sv-pipeline-remote-pysam" 154 | memory: "20 GB" 155 | disks: "local-disk 100 HDD" 156 | } 157 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_baftest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_baftest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | 3 | # Parallelize baftest on a single VCF across chromosomes 4 | workflow baftest_by_chrom { 5 | File vcf # Input VCF 6 | String baf_metrics # Matrix of BAF statistics 7 | File baf_metrics_idx # Tabix index of BAF matrix 8 | File 
autosome_contigs # Autosomes .fai 9 | File svc_acct_key # Service account json 10 | Array[String] samples # List of samples in batch 11 | String batch # Batch ID 12 | String algorithm # Algorithm ID 13 | Int split_size # Number of lines in each baftest split 14 | 15 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 16 | 17 | # Run baftest on each autosome 18 | scatter (autosome in autosomes) { 19 | call auto.baftest_autosome { 20 | input: 21 | vcf=vcf, 22 | baf_metrics=baf_metrics, 23 | baf_metrics_idx=baf_metrics_idx, 24 | batch=batch, 25 | algorithm=algorithm, 26 | chrom=autosome[0], 27 | split_size=split_size, 28 | samples=samples, 29 | svc_acct_key=svc_acct_key 30 | } 31 | } 32 | 33 | # Combine baftest results into single file 34 | call merge_baftest { 35 | input: 36 | autosomes=baftest_autosome.stats, 37 | prefix="${batch}.${algorithm}" 38 | } 39 | 40 | output { 41 | File baftest = merge_baftest.merged_stats 42 | } 43 | } 44 | 45 | # Combine per-chromosome baftest results into single table 46 | task merge_baftest { 47 | Array[File] autosomes 48 | String prefix 49 | 50 | command <<< 51 | while read split; do 52 | sed -e '1d' $split; 53 | done < ${write_tsv(autosomes)} | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 54 | >>> 55 | 56 | output { 57 | File merged_stats = "${prefix}.stats" 58 | } 59 | 60 | runtime { 61 | preemptible: 3 62 | docker: "talkowski/sv-pipeline-remote-pysam" 63 | } 64 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_baftest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run baftest on a single autosome, parallelizing across a fixed split size 2 | workflow baftest_autosome { 3 | File vcf # Input VCF 4 | String baf_metrics # Matrix of BAF statistics 5 | File baf_metrics_idx # Tabix index of BAF matrix 6 | Array[String] samples # list of samples in batch 7 | File svc_acct_key # Service account json 8 | String batch # Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each baftest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run baftest on each split 33 | scatter (split in split_vcf.split_beds) { 34 | # Run baftest 35 | call baftest { 36 | input: 37 | bed=split, 38 | prefix=basename(split), 39 | baf_metrics=baf_metrics, 40 | baf_metrics_idx=baf_metrics_idx, 41 | samples=samples, 42 | batch=batch, 43 | svc_acct_key=svc_acct_key 44 | } 45 | } 46 | 47 | # Merge splits into single file 48 | call merge_splits { 49 | input: 50 | stats=baftest.stats, 51 | prefix="${batch}.${algorithm}.${chrom}" 52 | } 53 | 54 | output { 55 | File stats = merge_splits.merged_stats 56 | } 57 | } 58 | 59 | # Compute the length of the suffix necessary to accommodate all splits 60 | task compute_suffix_len { 61 | File vcf 62 | String chrom 63 | Int split_size 64 | 65 | command <<< 66 | tabix -p vcf ${vcf}; 67 | python3 <>> 79 | 80 | output { 81 | Int len = read_int(stdout()) 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline-remote-pysam" 87 | } 88 | } 89 | 90 
| # Split VCF into fixed size chunks 91 | task split_vcf { 92 | File vcf 93 | String batch 94 | String algorithm 95 | String chrom 96 | 97 | Int split_size 98 | Int suffix_len 99 | 100 | command <<< 101 | tabix -p vcf ${vcf}; 102 | tabix -h ${vcf} ${chrom} \ 103 | | svtk vcf2bed --no-header stdin stdout \ 104 | | fgrep -e "DEL" -e "DUP" \ 105 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 106 | | awk '($3-$2>=10000 && $3-$2<10000000)' \ 107 | | split -a ${suffix_len} -d -l 300 - ${batch}.${algorithm}.split.gt10kb. 108 | tabix -h ${vcf} ${chrom} \ 109 | | svtk vcf2bed --no-header stdin stdout \ 110 | | fgrep -e "DEL" -e "DUP" \ 111 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 112 | | awk '($3-$2<10000)' \ 113 | | sort -k1,1V -k2,2n \ 114 | | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 115 | >>> 116 | 117 | output { 118 | Array[File] split_beds = glob("${batch}.${algorithm}.split.*") 119 | } 120 | 121 | runtime { 122 | preemptible: 3 123 | docker: "talkowski/sv-pipeline-remote-pysam" 124 | } 125 | } 126 | 127 | # Run baftest 128 | task baftest { 129 | File bed 130 | String baf_metrics 131 | File baf_metrics_idx 132 | Array[String] samples 133 | File svc_acct_key 134 | String prefix 135 | String batch 136 | 137 | command <<< 138 | echo -e "sample\tgroup\tbatch" > batch.key; 139 | awk -v batch=${batch} -v OFS="\t" '{print $1, $1, batch}' ${write_tsv(samples)} >> batch.key; 140 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${baf_metrics} | sed '1d' | cut -f 4); 141 | start=$(cut -f2 ${bed} | sort -k1,1n | head -n1); 142 | end=$(cut -f3 ${bed} | sort -k1,1n | tail -n1); 143 | chrom=$(cut -f1 ${bed} | head -n1); 144 | svtk remote_tabix "$url" ${baf_metrics_idx} "$chrom":"$start"-"$end" | bgzip -c > local_baf.bed.gz; 145 | tabix -b2 local_baf.bed.gz; 146 | svtk baf-test ${bed} local_baf.bed.gz --batch batch.key > ${prefix}.metrics 147 | >>> 148 | 149 | output { 150 | File stats = "${prefix}.metrics" 151 | } 152 | 153 | runtime { 154 | preemptible: 3 155 | memory: "10 GB" 156 | disks: "local-disk 50 SSD" 157 | docker: "talkowski/sv-pipeline-remote-pysam" 158 | } 159 | } 160 | 161 | # Merge split baftest results into single file 162 | task merge_splits { 163 | Array[File] stats 164 | String prefix 165 | 166 | command <<< 167 | echo -n "chrom start end name samples svtype delstat snp_ratio " > ${prefix}.stats; 168 | echo -n "del_loglik dupstat KS_stat KS_pval total_case_snps " >> ${prefix}.stats; 169 | echo -n "total_snps n_nonROH_cases n_samples mean_control_snps " >> ${prefix}.stats; 170 | echo -n "n_nonROH_controls n_controls" >> ${prefix}.stats; 171 | sed -i -e 's/ /\t/g' ${prefix}.stats; 172 | while read split; do 173 | cat $split; 174 | done < ${write_tsv(stats)} >> ${prefix}.stats 175 | >>> 176 | 177 | output { 178 | File merged_stats = "${prefix}.stats" 179 | } 180 | 181 | runtime { 182 | preemptible: 3 183 | docker: "talkowski/sv-pipeline-remote-pysam" 184 | } 185 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_petest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_petest_autosome/versions/14/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_petest_allosome/versions/10/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize petest on a single VCF across chromosomes 5 | workflow petest_by_chrom { 6 | File vcf # Input VCF 7 
| String discfile # Discordant pair file 8 | String medianfile # Medianfile 9 | File discfile_idx # Tabix index of discordant pair file 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each petest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run petest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.petest_autosome { 24 | input: 25 | vcf=vcf, 26 | discfile=discfile, 27 | medianfile=medianfile, 28 | discfile_idx=discfile_idx, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=autosome[0], 32 | split_size=split_size, 33 | svc_acct_key=svc_acct_key 34 | } 35 | } 36 | 37 | # Run petest on each allosome 38 | scatter (allosome in allosomes) { 39 | call allo.petest_allosome { 40 | input: 41 | vcf=vcf, 42 | discfile=discfile, 43 | medianfile=medianfile, 44 | discfile_idx=discfile_idx, 45 | famfile=famfile, 46 | batch=batch, 47 | algorithm=algorithm, 48 | chrom=allosome[0], 49 | split_size=split_size, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Combine petest results into single file 55 | call merge_petest { 56 | input: 57 | autosomes=petest_autosome.stats, 58 | allosomes=petest_allosome.stats, 59 | prefix="${batch}.${algorithm}" 60 | } 61 | 62 | output { 63 | File petest = merge_petest.merged_stats 64 | } 65 | } 66 | 67 | # Combine per-chromosome petest results into single table 68 | task merge_petest { 69 | Array[File] autosomes 70 | Array[File] allosomes 71 | String prefix 72 | 73 | command <<< 74 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 75 | while read split; do 76 | sed -e '1d' $split; 77 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 78 | >>> 79 | 80 | output { 81 | File merged_stats = "${prefix}.stats" 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline" 87 | } 88 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_petest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run petest on a single autosome, parallelizing across a fixed split size 2 | workflow petest_autosome { 3 | File vcf # Input VCF 4 | String discfile # Discordant pair file 5 | File medianfile # Median file 6 | File discfile_idx # Tabix index of discordant pair file 7 | File svc_acct_key # Service account key 8 | String batch # Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each petest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run petest on each split 33 | scatter (split in split_vcf.split_vcfs) { 34 | # Add VCF header to split 35 | call reheader_split { 36 | input: 37 | vcf=vcf, 38 | split=split 39 | } 40 | 41 | # Run petest 42 | call petest { 43 | input: 44 | vcf=reheader_split.split_w_header, 45 | 
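# Note: discfile is declared as a String rather than a File, so the discordant-pair
# matrix is not localized to the worker; the petest task below generates a 24-hour
# signed URL with gsutil signurl and passes that URL (plus discfile_idx) to svtk pe-test.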
prefix=basename(split), 46 | discfile=discfile, 47 | medianfile=medianfile, 48 | discfile_idx=discfile_idx, 49 | svc_acct_key=svc_acct_key 50 | } 51 | } 52 | 53 | # Merge splits into single file 54 | call merge_splits { 55 | input: 56 | stats=petest.stats, 57 | prefix="${batch}.${algorithm}.${chrom}" 58 | } 59 | 60 | output { 61 | File stats = merge_splits.merged_stats 62 | } 63 | } 64 | 65 | # Compute the length of the suffix necessary to accommodate all splits 66 | task compute_suffix_len { 67 | File vcf 68 | String chrom 69 | Int split_size 70 | 71 | command <<< 72 | tabix -p vcf ${vcf}; 73 | python3 <>> 85 | 86 | output { 87 | Int len = read_int(stdout()) 88 | } 89 | 90 | runtime { 91 | preemptible: 3 92 | docker: "talkowski/sv-pipeline-remote-pysam" 93 | } 94 | } 95 | 96 | # Split VCF into fixed size chunks 97 | task split_vcf { 98 | File vcf 99 | String batch 100 | String algorithm 101 | String chrom 102 | 103 | Int split_size 104 | Int suffix_len 105 | 106 | command { 107 | tabix -p vcf ${vcf}; 108 | tabix ${vcf} ${chrom} | sort -R | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 109 | } 110 | 111 | output { 112 | Array[File] split_vcfs = glob("${batch}.${algorithm}.split.*") 113 | } 114 | 115 | runtime { 116 | preemptible: 3 117 | docker: "talkowski/sv-pipeline-remote-pysam" 118 | } 119 | } 120 | 121 | # Restore VCF header to split files 122 | task reheader_split { 123 | File vcf 124 | File split 125 | 126 | command { 127 | cat <(zcat ${vcf} | sed -n -e '/^#/p') ${split} | bgzip -c > ${basename(split)}.vcf.gz 128 | } 129 | 130 | output { 131 | File split_w_header = "${basename(split)}.vcf.gz" 132 | } 133 | 134 | runtime { 135 | preemptible: 3 136 | docker: "talkowski/sv-pipeline-remote-pysam" 137 | } 138 | } 139 | 140 | # Run petest 141 | task petest { 142 | File vcf 143 | String discfile 144 | File medianfile 145 | File discfile_idx 146 | String prefix 147 | File svc_acct_key 148 | 149 | command { 150 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${discfile} | sed '1d' | cut -f 4); 151 | echo $url; 152 | svtk pe-test -o 1000 --index ${discfile_idx} --medianfile ${medianfile} ${vcf} "$url" ${prefix}.stats 153 | } 154 | 155 | output { 156 | File stats = "${prefix}.stats" 157 | } 158 | 159 | runtime { 160 | preemptible: 3 161 | docker: "talkowski/sv-pipeline-remote-pysam" 162 | } 163 | } 164 | 165 | # Merge split petest results into single file 166 | task merge_splits { 167 | Array[File] stats 168 | String prefix 169 | 170 | command <<< 171 | while read split; do 172 | sed -e '1d' $split; 173 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > ${prefix}.stats 174 | >>> 175 | 176 | output { 177 | File merged_stats = "${prefix}.stats" 178 | } 179 | 180 | runtime { 181 | preemptible: 3 182 | docker: "talkowski/sv-pipeline-remote-pysam" 183 | } 184 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_rdtest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_rdtest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_rdtest_allosome/versions/7/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize rdtest on a single VCF across chromosomes 5 | workflow rdtest_by_chrom { 6 | File vcf # Input VCF 7 | String coveragefile # Bincov matrix 8 | File coveragefile_idx # Tabix index of bincov matrix 9 | File medianfile # Median 
coverage of each sample 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key # Service account json 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each rdtest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run rdtest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.rdtest_autosome { 24 | input: 25 | vcf=vcf, 26 | coveragefile=coveragefile, 27 | coveragefile_idx=coveragefile_idx, 28 | medianfile=medianfile, 29 | famfile=famfile, 30 | batch=batch, 31 | algorithm=algorithm, 32 | chrom=autosome[0], 33 | split_size=split_size, 34 | svc_acct_key=svc_acct_key 35 | } 36 | } 37 | 38 | # Run rdtest on each allosome 39 | scatter (allosome in allosomes) { 40 | call allo.rdtest_allosome { 41 | input: 42 | vcf=vcf, 43 | coveragefile=coveragefile, 44 | coveragefile_idx=coveragefile_idx, 45 | medianfile=medianfile, 46 | famfile=famfile, 47 | batch=batch, 48 | algorithm=algorithm, 49 | chrom=allosome[0], 50 | split_size=split_size, 51 | svc_acct_key=svc_acct_key 52 | } 53 | } 54 | 55 | # Combine rdtest results into single file 56 | call merge_rdtest { 57 | input: 58 | autosomes=rdtest_autosome.stats, 59 | allosomes=rdtest_allosome.stats, 60 | prefix="${batch}.${algorithm}" 61 | } 62 | 63 | output { 64 | File rdtest = merge_rdtest.merged_stats 65 | } 66 | } 67 | 68 | # Combine per-chromosome rdtest results into single table 69 | task merge_rdtest { 70 | Array[File] autosomes 71 | Array[File] allosomes 72 | String prefix 73 | 74 | command <<< 75 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 76 | while read split; do 77 | sed -e '1d' $split; 78 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 79 | >>> 80 | 81 | output { 82 | File merged_stats = "${prefix}.stats" 83 | } 84 | 85 | runtime { 86 | preemptible: 3 87 | docker: "talkowski/sv-pipeline-remote-pysam" 88 | } 89 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_rdtest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run rdtest on a single autosome, parallelizing across a fixed split size 2 | workflow rdtest_autosome { 3 | File vcf # Input VCF 4 | String coveragefile # Bincov matrix 5 | File coveragefile_idx # Tabix index of bincov matrix 6 | File medianfile # Median coverage of each sample 7 | File famfile # Batch fam file 8 | File svc_acct_key # Service account json 9 | String batch # Batch ID 10 | String algorithm # Algorithm ID 11 | String chrom # Chromosome being processed 12 | Int split_size # Number of lines in each rdtest split 13 | 14 | # Compute the length of the suffix needed to accomodate all splits 15 | call compute_suffix_len { 16 | input: 17 | vcf=vcf, 18 | chrom=chrom, 19 | split_size=split_size 20 | } 21 | 22 | # Split the VCF into smaller chunks 23 | call split_vcf { 24 | input: 25 | vcf=vcf, 26 | batch=batch, 27 | algorithm=algorithm, 28 | chrom=chrom, 29 | split_size=split_size, 30 | suffix_len=compute_suffix_len.len 31 | } 32 | 33 | call get_whitelist { 34 | input: 35 | famfile=famfile 36 | } 37 | 38 | # Run rdtest on each split 39 | scatter (split in split_vcf.split_beds) { 40 | # Run rdtest 41 | call rdtest { 42 | input: 43 | bed=split, 44 | prefix=basename(split), 45 | 
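# Note: coveragefile is a String (bucket path), not a File; the rdtest task streams only
# the bincov rows spanning this split's coordinate range through svtk remote_tabix and a
# signed URL into local_coverage.bed.gz before invoking RdTest.R.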
coveragefile=coveragefile, 46 | coveragefile_idx=coveragefile_idx, 47 | medianfile=medianfile, 48 | famfile=famfile, 49 | whitelist=get_whitelist.whitelist, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Merge splits into single file 55 | call merge_splits { 56 | input: 57 | stats=rdtest.stats, 58 | prefix="${batch}.${algorithm}.${chrom}" 59 | } 60 | 61 | output { 62 | File stats = merge_splits.merged_stats 63 | } 64 | } 65 | 66 | # Compute the length of the suffix necessary to accommodate all splits 67 | task compute_suffix_len { 68 | File vcf 69 | String chrom 70 | Int split_size 71 | 72 | command <<< 73 | tabix -p vcf ${vcf}; 74 | python3 <>> 86 | 87 | output { 88 | Int len = read_int(stdout()) 89 | } 90 | 91 | runtime { 92 | preemptible: 3 93 | docker: "talkowski/sv-pipeline-remote-pysam" 94 | } 95 | } 96 | 97 | # Split VCF into fixed size chunks 98 | task split_vcf { 99 | File vcf 100 | String batch 101 | String algorithm 102 | String chrom 103 | 104 | Int split_size 105 | Int suffix_len 106 | 107 | command <<< 108 | tabix -p vcf ${vcf}; 109 | tabix -h ${vcf} ${chrom} \ 110 | | svtk vcf2bed --no-header stdin stdout \ 111 | | fgrep -e "DEL" -e "DUP" \ 112 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 113 | | awk '($3-$2>=10000)' \ 114 | > ${batch}.${algorithm}.split.gt10kb; 115 | tabix -h ${vcf} ${chrom} \ 116 | | svtk vcf2bed --no-header stdin stdout \ 117 | | fgrep -e "DEL" -e "DUP" \ 118 | | awk -v OFS="\t" '{print $1, $2, $3, $4, $6, $5}' \ 119 | | awk '($3-$2<10000)' \ 120 | | sort -k1,1V -k2,2n \ 121 | | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 122 | >>> 123 | 124 | output { 125 | Array[File] split_beds = glob("${batch}.${algorithm}.split.*") 126 | } 127 | 128 | runtime { 129 | preemptible: 3 130 | docker: "talkowski/sv-pipeline-remote-pysam" 131 | } 132 | } 133 | 134 | task get_whitelist { 135 | File famfile 136 | 137 | command { 138 | cut -f2 ${famfile} > samples.list 139 | } 140 | 141 | output { 142 | File whitelist = "samples.list" 143 | } 144 | 145 | runtime { 146 | preemptible: 3 147 | docker: "talkowski/sv-pipeline-remote-pysam" 148 | } 149 | } 150 | 151 | # Run rdtest 152 | task rdtest { 153 | File bed 154 | String coveragefile 155 | File coveragefile_idx 156 | File medianfile 157 | File famfile 158 | File whitelist 159 | File svc_acct_key 160 | String prefix 161 | 162 | command <<< 163 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${coveragefile} | sed '1d' | cut -f 4); 164 | start=$(cut -f2 ${bed} | sort -k1,1n | head -n1); 165 | end=$(cut -f3 ${bed} | sort -k1,1n | tail -n1); 166 | chrom=$(cut -f1 ${bed} | head -n1); 167 | svtk remote_tabix --header "$url" ${coveragefile_idx} "$chrom":"$start"-"$end" |sed 's/Chr/chr/g'|sed 's/Start/start/g'|sed 's/End/end/' | bgzip -c > local_coverage.bed.gz; 168 | tabix -p bed local_coverage.bed.gz; 169 | Rscript /opt/RdTest/RdTest.R \ 170 | -b ${bed} \ 171 | -n ${prefix} \ 172 | -c local_coverage.bed.gz \ 173 | -m ${medianfile} \ 174 | -f ${famfile} \ 175 | -w ${whitelist} 176 | >>> 177 | 178 | output { 179 | File stats = "${prefix}.metrics" 180 | File local_coverage = "local_coverage.bed.gz" 181 | } 182 | 183 | runtime { 184 | preemptible: 3 185 | docker: "talkowski/sv-pipeline-rdtest" 186 | } 187 | } 188 | 189 | # Merge split rdtest results into single file 190 | task merge_splits { 191 | Array[File] stats 192 | String prefix 193 | 194 | command <<< 195 | while read split; do 196 | sed -e '1d' $split; 197 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > 
${prefix}.stats 198 | >>> 199 | 200 | output { 201 | File merged_stats = "${prefix}.stats" 202 | } 203 | 204 | runtime { 205 | preemptible: 3 206 | docker: "talkowski/sv-pipeline-remote-pysam" 207 | } 208 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_srtest.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_srtest_autosome/versions/12/plain-WDL/descriptor" as auto 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:02_srtest_allosome/versions/11/plain-WDL/descriptor" as allo 3 | 4 | # Parallelize srtest on a single VCF across chromosomes 5 | workflow srtest_by_chrom { 6 | File vcf # Input VCF 7 | String splitfile # Split read file 8 | String medianfile # Medianfile 9 | File splitfile_idx # Tabix index of split read file 10 | File famfile # Batch fam file 11 | File autosome_contigs # Autosomes .fai 12 | File allosome_contigs # Allosomes .fai 13 | File svc_acct_key # Service account json 14 | String batch # Batch ID 15 | String algorithm # Algorithm ID 16 | Int split_size # Number of lines in each srtest split 17 | 18 | Array[Array[String]] autosomes = read_tsv(autosome_contigs) 19 | Array[Array[String]] allosomes = read_tsv(allosome_contigs) 20 | 21 | # Run srtest on each autosome 22 | scatter (autosome in autosomes) { 23 | call auto.srtest_autosome { 24 | input: 25 | vcf=vcf, 26 | splitfile=splitfile, 27 | medianfile=medianfile, 28 | splitfile_idx=splitfile_idx, 29 | batch=batch, 30 | algorithm=algorithm, 31 | chrom=autosome[0], 32 | split_size=split_size, 33 | svc_acct_key=svc_acct_key 34 | } 35 | } 36 | 37 | # Run srtest on each allosome 38 | scatter (allosome in allosomes) { 39 | call allo.srtest_allosome { 40 | input: 41 | vcf=vcf, 42 | splitfile=splitfile, 43 | medianfile=medianfile, 44 | splitfile_idx=splitfile_idx, 45 | famfile=famfile, 46 | batch=batch, 47 | algorithm=algorithm, 48 | chrom=allosome[0], 49 | split_size=split_size, 50 | svc_acct_key=svc_acct_key 51 | } 52 | } 53 | 54 | # Combine srtest results into single file 55 | call merge_srtest { 56 | input: 57 | autosomes=srtest_autosome.stats, 58 | allosomes=srtest_allosome.stats, 59 | prefix="${batch}.${algorithm}" 60 | } 61 | 62 | output { 63 | File srtest = merge_srtest.merged_stats 64 | } 65 | } 66 | 67 | # Combine per-chromosome srtest results into single table 68 | task merge_srtest { 69 | Array[File] autosomes 70 | Array[File] allosomes 71 | String prefix 72 | 73 | command <<< 74 | cat ${write_tsv(autosomes)} ${write_tsv(allosomes)} > splits.list; 75 | while read split; do 76 | sed -e '1d' $split; 77 | done < splits.list | cat <(head -n1 ${autosomes[0]}) - > ${prefix}.stats 78 | >>> 79 | 80 | output { 81 | File merged_stats = "${prefix}.stats" 82 | } 83 | 84 | runtime { 85 | preemptible: 3 86 | docker: "talkowski/sv-pipeline-remote-pysam" 87 | disks: "local-disk 100 SSD" 88 | } 89 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_02/02_srtest_autosome.wdl: -------------------------------------------------------------------------------- 1 | # Run srtest on a single autosome, parallelizing across a fixed split size 2 | workflow srtest_autosome { 3 | File vcf # Input VCF 4 | String splitfile # Split read file 5 | File medianfile # Median file 6 | File splitfile_idx # Tabix index of split read file 7 | File svc_acct_key # Service account key json 8 | String batch # 
Batch ID 9 | String algorithm # Algorithm ID 10 | String chrom # Chromosome being processed 11 | Int split_size # Number of lines in each srtest split 12 | 13 | # Compute the length of the suffix needed to accomodate all splits 14 | call compute_suffix_len { 15 | input: 16 | vcf=vcf, 17 | chrom=chrom, 18 | split_size=split_size 19 | } 20 | 21 | # Split the VCF into smaller chunks 22 | call split_vcf { 23 | input: 24 | vcf=vcf, 25 | batch=batch, 26 | algorithm=algorithm, 27 | chrom=chrom, 28 | split_size=split_size, 29 | suffix_len=compute_suffix_len.len 30 | } 31 | 32 | # Run srtest on each split 33 | scatter (split in split_vcf.split_vcfs) { 34 | # Add VCF header to split 35 | call reheader_split { 36 | input: 37 | vcf=vcf, 38 | split=split 39 | } 40 | 41 | # Run srtest 42 | call srtest { 43 | input: 44 | vcf=reheader_split.split_w_header, 45 | prefix=basename(split), 46 | splitfile=splitfile, 47 | medianfile=medianfile, 48 | splitfile_idx=splitfile_idx, 49 | svc_acct_key=svc_acct_key 50 | } 51 | } 52 | 53 | # Merge splits into single file 54 | call merge_splits { 55 | input: 56 | stats=srtest.stats, 57 | prefix="${batch}.${algorithm}.${chrom}" 58 | } 59 | 60 | output { 61 | File stats = merge_splits.merged_stats 62 | } 63 | } 64 | 65 | # Compute the length of the suffix necessary to accommodate all splits 66 | task compute_suffix_len { 67 | File vcf 68 | String chrom 69 | Int split_size 70 | 71 | command <<< 72 | tabix -p vcf ${vcf}; 73 | python3 <>> 85 | 86 | output { 87 | Int len = read_int(stdout()) 88 | } 89 | 90 | runtime { 91 | preemptible: 3 92 | docker: "talkowski/sv-pipeline-remote-pysam" 93 | } 94 | } 95 | 96 | # Split VCF into fixed size chunks 97 | task split_vcf { 98 | File vcf 99 | String batch 100 | String algorithm 101 | String chrom 102 | 103 | Int split_size 104 | Int suffix_len 105 | 106 | command { 107 | tabix -p vcf ${vcf}; 108 | tabix ${vcf} ${chrom} | sort -R | split -a ${suffix_len} -d -l ${split_size} - ${batch}.${algorithm}.split. 
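# Records for this chromosome are shuffled (sort -R) before splitting, presumably so each
# chunk carries a mix of variant types and sizes. Chunks are named with a zero-padded
# numeric suffix whose width is suffix_len; suffix_len (from compute_suffix_len above)
# should be at least ceil(log10(n_records / split_size)) so split does not exhaust its suffixes.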
109 | } 110 | 111 | output { 112 | Array[File] split_vcfs = glob("${batch}.${algorithm}.split.*") 113 | } 114 | 115 | runtime { 116 | preemptible: 3 117 | docker: "talkowski/sv-pipeline-remote-pysam" 118 | } 119 | } 120 | 121 | # Restore VCF header to split files 122 | task reheader_split { 123 | File vcf 124 | File split 125 | 126 | command { 127 | cat <(zcat ${vcf} | sed -n -e '/^#/p') ${split} | bgzip -c > ${basename(split)}.vcf.gz 128 | } 129 | 130 | output { 131 | File split_w_header = "${basename(split)}.vcf.gz" 132 | } 133 | 134 | runtime { 135 | preemptible: 3 136 | docker: "talkowski/sv-pipeline-remote-pysam" 137 | } 138 | } 139 | 140 | # Run srtest 141 | task srtest { 142 | File vcf 143 | String splitfile 144 | File medianfile 145 | File splitfile_idx 146 | File svc_acct_key 147 | String prefix 148 | 149 | command <<< 150 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${splitfile} | sed '1d' | cut -f 4); 151 | echo $url; 152 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed 153 | awk -v OFS="\t" '{if ($2-250>0){print $1,$2-250,$2+250}else{print $1,0,$2+250}}' test.bed >> region.bed 154 | awk -v OFS="\t" '{if ($3-250>0){print $1,$3-250,$3+250}else{print $1,0,$3+250}}' test.bed >> region.bed 155 | sort -k1,1 -k2,2n region.bed > region.sorted.bed 156 | bedtools merge -i region.sorted.bed > region.merged.bed 157 | svtk remote_tabix "$url" ${splitfile_idx} -R region.merged.bed | bgzip -c > SR.txt.gz 158 | tabix -b 2 -e 2 SR.txt.gz 159 | svtk sr-test -w 50 --log --index SR.txt.gz.tbi --medianfile ${medianfile} ${vcf} SR.txt.gz ${prefix}.stats 160 | >>> 161 | 162 | output { 163 | File stats = "${prefix}.stats" 164 | } 165 | 166 | runtime { 167 | disks: "local-disk 30 SSD" 168 | preemptible: 3 169 | docker: "talkowski/sv-pipeline-remote-pysam" 170 | } 171 | } 172 | 173 | # Merge split srtest results into single file 174 | task merge_splits { 175 | Array[File] stats 176 | String prefix 177 | 178 | command <<< 179 | while read split; do 180 | sed -e '1d' $split; 181 | done < ${write_tsv(stats)} | cat <(head -n1 ${stats[0]}) - > ${prefix}.stats 182 | >>> 183 | 184 | output { 185 | File merged_stats = "${prefix}.stats" 186 | } 187 | 188 | runtime { 189 | preemptible: 3 190 | docker: "talkowski/sv-pipeline-remote-pysam" 191 | } 192 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_03/03_filter_vcf.wdl: -------------------------------------------------------------------------------- 1 | workflow RF_filter_vcf { 2 | File vcf 3 | File metrics 4 | File scores 5 | File cutoffs 6 | String prefix 7 | 8 | call filter_vcf { 9 | input: 10 | vcf=vcf, 11 | scores=scores, 12 | prefix=prefix 13 | } 14 | 15 | call rewrite_SR_coords { 16 | input: 17 | vcf=filter_vcf.filtered_vcf, 18 | metrics=metrics, 19 | cutoffs=cutoffs, 20 | prefix=prefix 21 | } 22 | 23 | call annotate_RF_evidence { 24 | input: 25 | vcf=rewrite_SR_coords.corrected_vcf, 26 | scores=scores, 27 | prefix=prefix 28 | } 29 | 30 | output { 31 | File filtered_vcf = annotate_RF_evidence.annotated_vcf 32 | } 33 | } 34 | 35 | task filter_vcf { 36 | File vcf 37 | File scores 38 | String prefix 39 | 40 | command <<< 41 | cat \ 42 | <(sed -e '1d' ${scores} | fgrep -e DEL -e DUP | awk '($3>=0.5)' | cut -f1 | fgrep -w -f - <(zcat ${vcf})) \ 43 | <(sed -e '1d' ${scores} | fgrep -e INV -e BND -e INS | awk '($3>=0.5)' | cut -f1 | fgrep -w -f - <(zcat ${vcf}) | sed -e 's/SVTYPE=DEL/SVTYPE=BND/' -e 's/SVTYPE=DUP/SVTYPE=BND/' -e 's///' -e 's///') \ 44 | | cat <(sed -n -e '/^#/p' <(zcat 
${vcf})) - \ 45 | | vcf-sort -c \ 46 | | bgzip -c \ 47 | > ${prefix}.filtered.vcf.gz 48 | >>> 49 | 50 | output { 51 | File filtered_vcf = "${prefix}.filtered.vcf.gz" 52 | } 53 | 54 | runtime { 55 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 56 | preemptible: 3 57 | } 58 | } 59 | 60 | task rewrite_SR_coords { 61 | File vcf 62 | File metrics 63 | File cutoffs 64 | String prefix 65 | 66 | command <<< 67 | set -o pipefail; 68 | /opt/sv-pipeline/03_variant_filtering/scripts/rewrite_SR_coords.py ${vcf} ${metrics} ${cutoffs} stdout \ 69 | | vcf-sort -c \ 70 | | bgzip -c \ 71 | > ${prefix}.corrected_coords.vcf.gz 72 | >>> 73 | 74 | output { 75 | File corrected_vcf = "${prefix}.corrected_coords.vcf.gz" 76 | } 77 | 78 | runtime { 79 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 80 | memory: "10 GB" 81 | preemptible: 3 82 | } 83 | } 84 | 85 | task annotate_RF_evidence { 86 | File vcf 87 | File scores 88 | String prefix 89 | 90 | command <<< 91 | /opt/sv-pipeline/03_variant_filtering/scripts/annotate_RF_evidence.py ${vcf} ${scores} ${prefix}.with_evidence.vcf; 92 | bgzip ${prefix}.with_evidence.vcf 93 | >>> 94 | 95 | output { 96 | File annotated_vcf = "${prefix}.with_evidence.vcf.gz" 97 | } 98 | 99 | runtime { 100 | docker: "talkowski/sv-pipeline@sha256:7e7e6163d6ac0fc5781eb99ee5a7eec4db37506f48d00f5063b96123f9ca5024" 101 | preemptible: 3 102 | } 103 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_preprocess.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:gather_attribute_paths_multiSampleSet/versions/7/plain-WDL/descriptor" as getAttribute 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to preprocess all files needed for per-batch genotyping in module 04a 11 | workflow preprocess_04a_files { 12 | 13 | File sample_set_list 14 | File svcActKeyJson 15 | String workspaceProject 16 | String workspaceName 17 | 18 | # Get cohort_filtered_pesr_vcf_list 19 | call getAttribute.gather_attribute_paths_multiSampleSet as get_pesr_vcf_list { 20 | input: 21 | sample_set_list=sample_set_list, 22 | Attribute="filtered_pesr_vcf", 23 | svcActKeyJson=svcActKeyJson, 24 | workspaceProject=workspaceProject, 25 | workspaceName=workspaceName 26 | } 27 | 28 | # Get cohort_filtered_depth_vcf_list 29 | call getAttribute.gather_attribute_paths_multiSampleSet as get_depth_vcf_list { 30 | input: 31 | sample_set_list=sample_set_list, 32 | Attribute="filtered_depth_vcf", 33 | svcActKeyJson=svcActKeyJson, 34 | workspaceProject=workspaceProject, 35 | workspaceName=workspaceName 36 | } 37 | 38 | # Outputs 39 | output { 40 | File filtered_pesr_vcf_list = get_pesr_vcf_list.attribute_list 41 | File filtered_depth_vcf_list = get_depth_vcf_list.attribute_list 42 | } 43 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_PE_genotyping_train.wdl: -------------------------------------------------------------------------------- 1 | workflow PE_genotype_train { 2 | File batch_vcf # variants from just the batch in question 3 | String discfile 4 | Int n_per_split 5 | File medianfile 6 | File discfile_idx 7 | File svc_acct_key 8 | Array[String] samples 9 | String batch_ID 10 | File 
RF_cutoffs 11 | File RD_genotypes 12 | File RD_melted_genotypes 13 | File blacklist 14 | 15 | call vcf2bed as make_batch_bed { 16 | input: 17 | vcf=batch_vcf, 18 | prefix=batch_ID 19 | } 20 | 21 | call split_vcf as split_batch_vcf { 22 | input: 23 | vcf=batch_vcf, 24 | n_per_split=n_per_split 25 | } 26 | 27 | scatter (vcf in split_batch_vcf.vcfs) { 28 | call count_pe as count_batch_pe { 29 | input: 30 | vcf=vcf, 31 | discfile=discfile, 32 | discfile_idx=discfile_idx, 33 | medianfile=medianfile, 34 | svc_acct_key=svc_acct_key, 35 | samples=samples 36 | } 37 | } 38 | 39 | call merge_pe_counts { 40 | input: 41 | count_list=count_batch_pe.pe_counts 42 | } 43 | 44 | call genotype_PE_part1 { 45 | input: 46 | bed=make_batch_bed.bed, 47 | RF_cutoffs=RF_cutoffs, 48 | PE_counts=merge_pe_counts.counts, 49 | RD_genotypes=RD_genotypes, 50 | RD_melted_genotypes=RD_melted_genotypes, 51 | blacklist=blacklist 52 | } 53 | 54 | output { 55 | File PE_genotypes = genotype_PE_part1.genotypes 56 | File PE_varGQ = genotype_PE_part1.varGQ 57 | File PE_metrics = genotype_PE_part1.PE_metrics 58 | File PE_train = genotype_PE_part1.PE_train 59 | } 60 | } 61 | 62 | task vcf2bed { 63 | File vcf 64 | String prefix 65 | 66 | command { 67 | svtk vcf2bed ${vcf} -i ALGORITHMS ${prefix}.bed 68 | } 69 | 70 | output { 71 | File bed = "${prefix}.bed" 72 | } 73 | 74 | runtime { 75 | preemptible: 3 76 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 77 | } 78 | } 79 | 80 | task split_vcf { 81 | File vcf 82 | Int n_per_split 83 | 84 | command <<< 85 | if [[ ${vcf} == *.gz ]] ; then 86 | zcat ${vcf} | sed -n -e '/^#/p' > header.vcf; 87 | zcat ${vcf} | sed -e '/^#/d' | split -l ${n_per_split} - pe; 88 | else 89 | sed -n -e '/^#/p' ${vcf} > header.vcf; 90 | sed -e '/^#/d' ${vcf} | split -l ${n_per_split} - pe; 91 | fi 92 | for f in pe*; do cat header.vcf $f > $f.vcf; done 93 | >>> 94 | 95 | output { 96 | Array[File] vcfs = glob("pe*.vcf") 97 | } 98 | 99 | runtime { 100 | preemptible: 3 101 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 102 | } 103 | } 104 | 105 | task count_pe { 106 | File vcf 107 | String discfile 108 | File discfile_idx 109 | File medianfile 110 | File svc_acct_key 111 | Array[String] samples 112 | 113 | String prefix = basename(vcf, ".vcf") 114 | 115 | command <<< 116 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${discfile} | sed '1d' | cut -f 4); 117 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed; 118 | awk -v OFS="\t" -v window=5000 '{if ($2-window>0){print $1,$2-window,$2+window}else{print $1,0,$2+window}}' test.bed >> region.bed; 119 | awk -v OFS="\t" -v window=5000 '{if ($3-window>0){print $1,$3-window,$3+window}else{print $1,0,$3+window}}' test.bed >> region.bed; 120 | sort -k1,1 -k2,2n region.bed > region.sorted.bed; 121 | bedtools merge -i region.sorted.bed > region.merged.bed; 122 | svtk remote_tabix "$url" ${discfile_idx} -R region.merged.bed | bgzip -c > PE.txt.gz; 123 | tabix -b 2 -e 2 PE.txt.gz; 124 | svtk count-pe --index PE.txt.gz.tbi -s ${write_tsv(samples)} --medianfile ${medianfile} ${vcf} PE.txt.gz ${prefix}.pe_counts.txt; 125 | gzip ${prefix}.pe_counts.txt 126 | >>> 127 | 128 | output { 129 | File pe_counts = "${prefix}.pe_counts.txt.gz" 130 | } 131 | 132 | runtime { 133 | preemptible: 3 134 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:41a84644c1f7d339813c1176fdd6d42ed1ac770e430b053975d47da6e99f5f26" 135 | } 136 | } 137 | 138 | task merge_pe_counts { 139 | 
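# Concatenates the gzipped per-shard PE count tables, drops repeated header rows (any
# line containing "name"), and re-compresses them into a single pe_counts.txt.gz that is
# passed to genotype_PE_part1 as PE_counts.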
Array[File] count_list 140 | 141 | command { 142 | zcat ${sep=' ' count_list} | fgrep -v -e "name" | gzip -c > pe_counts.txt.gz 143 | } 144 | 145 | output { 146 | File counts = "pe_counts.txt.gz" 147 | } 148 | 149 | runtime { 150 | preemptible: 3 151 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 152 | disks: "local-disk 50 SSD" 153 | } 154 | } 155 | 156 | task genotype_PE_part1 { 157 | File bed 158 | File RF_cutoffs 159 | File PE_counts 160 | File RD_genotypes 161 | File RD_melted_genotypes 162 | File blacklist 163 | 164 | command <<< 165 | /opt/sv-pipeline/04_variant_resolution/scripts/PE_genotype.sh \ 166 | ${bed} \ 167 | ${PE_counts} \ 168 | ${RD_genotypes} \ 169 | ${RD_melted_genotypes} \ 170 | ${RF_cutoffs} \ 171 | ${blacklist} \ 172 | /opt/RdTest/generate_cutoff_PE.R 173 | >>> 174 | 175 | output { 176 | File PE_train = "pe.train.include.txt" 177 | File PE_metrics = "pe_metric_file.txt" 178 | File genotypes = "pe.geno.withquality.txt.gz" 179 | File varGQ = "pe.variant.quality.final.txt.gz" 180 | } 181 | 182 | runtime { 183 | preemptible: 0 184 | docker: "talkowski/sv-pipeline-rdtest@sha256:764635fce650adac449b013058388a55653e8c7e6c075452a80f6e2a104754cd" 185 | disks: "local-disk 50 SSD" 186 | } 187 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_SR_genotyping_train.wdl: -------------------------------------------------------------------------------- 1 | workflow SR_genotype_train { 2 | File batch_vcf 3 | String splitfile 4 | Int n_per_split 5 | File medianfile 6 | File splitfile_idx 7 | File svc_acct_key 8 | Array[String] samples 9 | String batch_ID 10 | File RF_cutoffs 11 | File RD_melted_genotypes 12 | File PE_train 13 | File PE_genotypes 14 | 15 | call split_vcf as split_batch_vcf { 16 | input: 17 | vcf=batch_vcf, 18 | n_per_split=n_per_split 19 | } 20 | 21 | scatter (vcf in split_batch_vcf.vcfs) { 22 | call count_sr as count_batch_sr { 23 | input: 24 | vcf=vcf, 25 | splitfile=splitfile, 26 | splitfile_idx=splitfile_idx, 27 | medianfile=medianfile, 28 | svc_acct_key=svc_acct_key, 29 | samples=samples 30 | } 31 | } 32 | 33 | call merge_sr_counts { 34 | input: 35 | count_list=count_batch_sr.sr_counts, 36 | sum_list=count_batch_sr.sr_sum 37 | } 38 | 39 | call genotype_SR_part1 { 40 | input: 41 | vcf=batch_vcf, 42 | RF_cutoffs=RF_cutoffs, 43 | SR_counts=merge_sr_counts.counts, 44 | SR_sum=merge_sr_counts.sum, 45 | RD_melted_genotypes=RD_melted_genotypes, 46 | PE_train=PE_train, 47 | samples=samples, 48 | PE_genotypes=PE_genotypes 49 | } 50 | 51 | output { 52 | File SR_metrics = genotype_SR_part1.SR_metrics 53 | } 54 | } 55 | 56 | task split_vcf { 57 | File vcf 58 | Int n_per_split 59 | 60 | command <<< 61 | if [[ ${vcf} == *.gz ]] ; then 62 | echo "gzipped"; 63 | zcat ${vcf} | sed -n -e '/^#/p' > header.vcf; 64 | zcat ${vcf} | sed -e '/^#/d' | split -l ${n_per_split} - sr; 65 | else 66 | echo "plaintext"; 67 | sed -n -e '/^#/p' ${vcf} > header.vcf; 68 | sed -e '/^#/d' ${vcf} | split -l ${n_per_split} - sr; 69 | fi 70 | for f in sr*; do cat header.vcf $f | bgzip -c > $f.vcf.gz; done 71 | >>> 72 | 73 | output { 74 | Array[File] vcfs = glob("sr*.vcf.gz") 75 | } 76 | 77 | runtime { 78 | preemptible: 3 79 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 80 | } 81 | } 82 | 83 | task count_sr { 84 | File vcf 85 | String splitfile 86 | File splitfile_idx 87 | File medianfile 88 | File svc_acct_key 89 | 
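# samples: sample IDs for this batch; written to a temporary TSV via write_tsv() and
# supplied to svtk count-sr through its -s option in the command below.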
Array[String] samples 90 | 91 | String prefix = basename(vcf, ".vcf") 92 | 93 | command <<< 94 | url=$(gsutil signurl -d 24h ${svc_acct_key} ${splitfile} | sed '1d' | cut -f 4); 95 | svtk vcf2bed --split-bnd --no-header ${vcf} test.bed; 96 | awk -v OFS="\t" '{if ($2-250>0){print $1,$2-250,$2+250}else{print $1,0,$2+250}}' test.bed >> region.bed; 97 | awk -v OFS="\t" '{if ($3-250>0){print $1,$3-250,$3+250}else{print $1,0,$3+250}}' test.bed >> region.bed; 98 | sort -k1,1 -k2,2n region.bed > region.sorted.bed; 99 | bedtools merge -i region.sorted.bed > region.merged.bed; 100 | svtk remote_tabix "$url" ${splitfile_idx} -R region.merged.bed | bgzip -c > SR.txt.gz; 101 | tabix -b 2 -e 2 SR.txt.gz; 102 | svtk count-sr --index SR.txt.gz.tbi -s ${write_tsv(samples)} --medianfile ${medianfile} ${vcf} SR.txt.gz ${prefix}.sr_counts.txt; 103 | /opt/sv-pipeline/04_variant_resolution/scripts/sum_SR.sh ${prefix}.sr_counts.txt ${prefix}.sr_sum.txt.gz; 104 | gzip ${prefix}.sr_counts.txt 105 | >>> 106 | 107 | output { 108 | File sr_counts = "${prefix}.sr_counts.txt.gz" 109 | File sr_sum = "${prefix}.sr_sum.txt.gz" 110 | } 111 | 112 | runtime { 113 | preemptible: 3 114 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:41a84644c1f7d339813c1176fdd6d42ed1ac770e430b053975d47da6e99f5f26" 115 | } 116 | } 117 | 118 | task merge_sr_counts { 119 | Array[File] count_list 120 | Array[File] sum_list 121 | 122 | command { 123 | zcat ${sep=' ' count_list} | fgrep -v -e "name" | gzip -c > sr_counts.txt.gz; 124 | cat ${sep=' ' sum_list} > sr_sum.txt.gz 125 | } 126 | 127 | output { 128 | File counts = "sr_counts.txt.gz" 129 | File sum = "sr_sum.txt.gz" 130 | } 131 | 132 | runtime { 133 | preemptible: 3 134 | docker: "talkowski/sv-pipeline@sha256:e5c7ce65c2e0c851261679b62095a13f42d0e4b4fef70b1d0183f2767e4ec53c" 135 | disks: "local-disk 60 SSD" 136 | } 137 | } 138 | 139 | task genotype_SR_part1 { 140 | File vcf 141 | File SR_counts 142 | File SR_sum 143 | File RD_melted_genotypes 144 | File RF_cutoffs 145 | Array[String] samples 146 | File PE_train 147 | File PE_genotypes 148 | 149 | command <<< 150 | /opt/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh \ 151 | ${vcf} \ 152 | ${SR_counts} \ 153 | ${SR_sum} \ 154 | ${RD_melted_genotypes} \ 155 | ${RF_cutoffs} \ 156 | ${write_tsv(samples)} \ 157 | ${PE_train} \ 158 | ${PE_genotypes} 159 | >>> 160 | 161 | output { 162 | File SR_metrics = "sr_metric_file.txt" 163 | } 164 | 165 | runtime { 166 | preemptible: 0 167 | docker: "talkowski/sv-pipeline-rdtest@sha256:764635fce650adac449b013058388a55653e8c7e6c075452a80f6e2a104754cd" 168 | disks: "local-disk 60 SSD" 169 | memory: "16 GB" 170 | } 171 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_genotype_depth_part1.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_RD_genotyping_train/versions/7/plain-WDL/descriptor" as RD_genotype_train 2 | 3 | workflow genotype_depth_part1 { 4 | File batch_vcf 5 | String batch 6 | String coveragefile # batch coverage file 7 | File coveragefile_idx 8 | File medianfile # batch median file 9 | File famfile # batch famfile 10 | File svc_acct_key 11 | File rf_cutoffs # Random forest cutoffs 12 | File seed_cutoffs 13 | Array[String] samples # List of samples in batch 14 | Int n_RD_genotype_bins # number of RdTest bins 15 | Int n_per_RD_split # number of variants per RdTest split 16 | String reference_build 
#hg19 or hg38 17 | 18 | call RD_genotype_train.RD_genotype_train { 19 | input: 20 | vcf=batch_vcf, 21 | coveragefile=coveragefile, 22 | coveragefile_idx=coveragefile_idx, 23 | medianfile=medianfile, 24 | famfile=famfile, 25 | svc_acct_key=svc_acct_key, 26 | rf_cutoffs=rf_cutoffs, 27 | seed_cutoffs=seed_cutoffs, 28 | samples=samples, 29 | prefix=batch, 30 | n_bins=n_RD_genotype_bins, 31 | n_per_split=n_per_RD_split, 32 | reference_build=reference_build 33 | } 34 | 35 | output { 36 | File RD_pesr_sepcutoff = RD_genotype_train.pesr_sepcutoff 37 | File RD_depth_sepcutoff = RD_genotype_train.depth_sepcutoff 38 | } 39 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_genotype_pesr_part1.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_RD_genotyping_train/versions/7/plain-WDL/descriptor" as RD_genotype_train 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_PE_genotyping_train/versions/6/plain-WDL/descriptor" as PE_genotype_train 3 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_v2_SR_genotyping_train/versions/6/plain-WDL/descriptor" as SR_genotype_train 4 | 5 | workflow genotype_pesr_part1 { 6 | File batch_vcf 7 | String batch 8 | String coveragefile # batch coverage file 9 | File coveragefile_idx 10 | File medianfile # batch median file 11 | File famfile # batch famfile 12 | File svc_acct_key 13 | File rf_cutoffs # Random forest cutoffs 14 | File seed_cutoffs 15 | Array[String] samples # List of samples in batch 16 | Int n_RD_genotype_bins # number of RdTest bins 17 | Int n_per_RD_split # number of variants per RdTest split 18 | Int n_per_PE_split 19 | String discfile 20 | File discfile_idx 21 | File pesr_blacklist 22 | String splitfile 23 | Int n_per_SR_split 24 | File splitfile_idx 25 | String reference_build #hg19 or hg38 26 | 27 | call RD_genotype_train.RD_genotype_train { 28 | input: 29 | vcf=batch_vcf, 30 | coveragefile=coveragefile, 31 | coveragefile_idx=coveragefile_idx, 32 | medianfile=medianfile, 33 | famfile=famfile, 34 | svc_acct_key=svc_acct_key, 35 | rf_cutoffs=rf_cutoffs, 36 | seed_cutoffs=seed_cutoffs, 37 | samples=samples, 38 | prefix=batch, 39 | n_bins=n_RD_genotype_bins, 40 | n_per_split=n_per_RD_split, 41 | reference_build=reference_build 42 | } 43 | 44 | call PE_genotype_train.PE_genotype_train { 45 | input: 46 | batch_vcf=batch_vcf, 47 | discfile=discfile, 48 | n_per_split=n_per_PE_split, 49 | medianfile=medianfile, 50 | discfile_idx=discfile_idx, 51 | svc_acct_key=svc_acct_key, 52 | samples=samples, 53 | batch_ID=batch, 54 | RF_cutoffs=rf_cutoffs, 55 | RD_genotypes=RD_genotype_train.genotypes, 56 | RD_melted_genotypes=RD_genotype_train.melted_genotypes, 57 | blacklist=pesr_blacklist 58 | } 59 | 60 | call SR_genotype_train.SR_genotype_train { 61 | input: 62 | batch_vcf=batch_vcf, 63 | splitfile=splitfile, 64 | n_per_split=n_per_SR_split, 65 | medianfile=medianfile, 66 | splitfile_idx=splitfile_idx, 67 | svc_acct_key=svc_acct_key, 68 | samples=samples, 69 | batch_ID=batch, 70 | RF_cutoffs=rf_cutoffs, 71 | RD_melted_genotypes=RD_genotype_train.melted_genotypes, 72 | PE_train=PE_genotype_train.PE_train, 73 | PE_genotypes=PE_genotype_train.PE_genotypes 74 | } 75 | 76 | output { 77 | File RD_pesr_sepcutoff = RD_genotype_train.pesr_sepcutoff 78 | File RD_depth_sepcutoff = RD_genotype_train.depth_sepcutoff 79 | File PE_metrics = PE_genotype_train.PE_metrics 
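# The separation cutoffs and PE/SR metric tables emitted here are training artifacts,
# presumably consumed by the corresponding part-2 per-batch genotyping workflow.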
80 | File SR_metrics = SR_genotype_train.SR_metrics 81 | } 82 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_04/04_v2_make_cohort_VCFs.wdl: -------------------------------------------------------------------------------- 1 | workflow make_cohort_VCFs { 2 | File pesr_vcfs_list 3 | File depth_vcfs_list 4 | 5 | call merge_vcfs as merge_pesr_vcfs { 6 | input: 7 | vcfs_list=pesr_vcfs_list, 8 | prefix="all_batches.pesr" 9 | } 10 | 11 | call merge_vcfs as merge_depth_vcfs { 12 | input: 13 | vcfs_list=depth_vcfs_list, 14 | prefix="all_batches.depth" 15 | } 16 | 17 | output { 18 | File cohort_pesr_vcf = merge_pesr_vcfs.merged_vcf 19 | File cohort_depth_vcf = merge_depth_vcfs.merged_vcf 20 | } 21 | } 22 | 23 | task merge_vcfs { 24 | File vcfs_list 25 | String prefix 26 | 27 | command { 28 | /opt/sv-pipeline/04_variant_resolution/scripts/merge_vcfs.sh ${vcfs_list} ${prefix} 29 | } 30 | 31 | output { 32 | File merged_vcf = "${prefix}.vcf.gz" 33 | } 34 | 35 | runtime { 36 | docker: "talkowski/sv-pipeline@sha256:aaf0b5fa587fbe4f4d137532a4c1be292f9ea104422494e1a7d8ac7a5d8459e6" 37 | preemptible: 3 38 | } 39 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_bp_overlap_filter_by_chrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize same-bp overlap filter per chromosome 9 | workflow same_bp_filter { 10 | String vcf 11 | File vcf_idx 12 | String prefix 13 | File contiglist 14 | File svc_acct_key 15 | File bothside_pass 16 | File background_fail 17 | 18 | Array[Array[String]] contigs = read_tsv(contiglist) 19 | 20 | #Run same-bp overlap filter, scattered by chromosome 21 | scatter (contig in contigs) { 22 | 23 | #Remote tabix each vcf & join into a single vcf 24 | call subset_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | contig=contig[0], 29 | prefix=prefix, 30 | svc_acct_key=svc_acct_key 31 | } 32 | 33 | #Run same-bp overlap filter per chromosome 34 | call bp_overlap_filter { 35 | input: 36 | vcf=subset_vcf.subsetted_vcf, 37 | prefix="${prefix}.${contig[0]}", 38 | bothside_pass=bothside_pass, 39 | background_fail=background_fail 40 | } 41 | } 42 | 43 | #Merge filtered vcfs across chromosomes 44 | call concat_vcfs { 45 | input: 46 | vcfs=bp_overlap_filter.bp_filtered_vcf, 47 | prefix="${prefix}.non_redundant" 48 | } 49 | 50 | output { 51 | File filtered_vcf = concat_vcfs.concat_vcf 52 | File filtered_vcf_idx = concat_vcfs.concat_vcf_idx 53 | } 54 | } 55 | 56 | 57 | #Remote tabix a single chromosome per VCFs 58 | task subset_vcf { 59 | String vcf 60 | File vcf_idx 61 | String contig 62 | String prefix 63 | File svc_acct_key 64 | 65 | command <<< 66 | #Remote tabix to chromosome of interest 67 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 68 | echo $url; 69 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 70 | bgzip -f "${prefix}.${contig}.vcf" 71 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 72 | >>> 73 | 74 | output { 75 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 76 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 77 | } 78 | 79 | runtime { 80 | docker: 
"talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 81 | preemptible: 1 82 | disks: "local-disk 50 SSD" 83 | } 84 | } 85 | 86 | 87 | # Run Harrison's overlapping breakpoint filter prior to complex resolution 88 | task bp_overlap_filter { 89 | File vcf 90 | String prefix 91 | File bothside_pass 92 | File background_fail 93 | 94 | command <<< 95 | /opt/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh \ 96 | ${vcf} \ 97 | ${background_fail} \ 98 | ${bothside_pass}; 99 | mv non_redundant.vcf.gz "${prefix}.non_redundant.vcf.gz" 100 | >>> 101 | 102 | output { 103 | File bp_filtered_vcf = "${prefix}.non_redundant.vcf.gz" 104 | } 105 | 106 | runtime { 107 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 108 | preemptible: 1 109 | memory: "4 GB" 110 | disks: "local-disk 250 SSD" 111 | } 112 | } 113 | 114 | 115 | #Merge multiple vcfs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String prefix 119 | 120 | command <<< 121 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 122 | tabix -f -p vcf ${prefix}.vcf.gz 123 | >>> 124 | 125 | output { 126 | File concat_vcf = "${prefix}.vcf.gz" 127 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 128 | } 129 | 130 | runtime { 131 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 132 | preemptible: 1 133 | disks: "local-disk 1000 SSD" 134 | } 135 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_genotype_CPX_CNVs_perBatch.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to perform depth-based genotyping per batch 9 | # on predicted CPX CNVs from 04b 10 | 11 | workflow genotype_CPX_CNVs_perBatch { 12 | File cpx_bed 13 | File RD_depth_sepcutoff 14 | Int n_per_split_small 15 | Int n_per_split_large 16 | Int n_RdTest_bins 17 | String batch 18 | File medianfile 19 | File famfile 20 | File svc_acct_key 21 | File sampleslist 22 | String coveragefile 23 | File coveragefile_idx 24 | 25 | Array[String] samples = read_lines(sampleslist) 26 | 27 | call shard_bed { 28 | input: 29 | bed=cpx_bed, 30 | n_per_split_small=n_per_split_small, 31 | n_per_split_large=n_per_split_large, 32 | sampleslist=sampleslist 33 | } 34 | 35 | scatter (lt5kb_bed in shard_bed.lt5kb_beds) { 36 | call RdTest_genotype as RD_genotype_lt5kb { 37 | input: 38 | bed=lt5kb_bed, 39 | coveragefile=coveragefile, 40 | coveragefile_idx=coveragefile_idx, 41 | svc_acct_key=svc_acct_key, 42 | medianfile=medianfile, 43 | famfile=famfile, 44 | samples=samples, 45 | gt_cutoffs=RD_depth_sepcutoff, 46 | n_bins=n_RdTest_bins, 47 | prefix=basename(lt5kb_bed, ".bed") 48 | } 49 | } 50 | 51 | scatter (gt5kb_bed in shard_bed.gt5kb_beds) { 52 | call RdTest_genotype as RD_genotype_gt5kb { 53 | input: 54 | bed=gt5kb_bed, 55 | coveragefile=coveragefile, 56 | coveragefile_idx=coveragefile_idx, 57 | svc_acct_key=svc_acct_key, 58 | medianfile=medianfile, 59 | famfile=famfile, 60 | samples=samples, 61 | gt_cutoffs=RD_depth_sepcutoff, 62 | n_bins=n_RdTest_bins, 63 | prefix=basename(gt5kb_bed) 64 | } 65 | } 66 | 67 | call concat_melted_genotypes { 68 | input: 69 | lt5kb_genos=RD_genotype_lt5kb.melted_genotypes, 70 | gt5kb_genos=RD_genotype_gt5kb.melted_genotypes, 71 | batch=batch 
72 | } 73 | 74 | output { 75 | File genotypes = concat_melted_genotypes.genotypes 76 | } 77 | } 78 | 79 | task shard_bed { 80 | File bed 81 | Int n_per_split_small 82 | Int n_per_split_large 83 | File sampleslist 84 | 85 | command <<< 86 | set -euo pipefail 87 | if [ $( zcat ${bed} | fgrep -v "#" | wc -l ) -gt 0 ]; then 88 | #First, repace samples in input bed with full list of all samples in batch 89 | zcat ${bed} \ 90 | | fgrep -v "#" \ 91 | | awk -v OFS="\t" -v samples=$( cat ${sampleslist} | paste -s -d, ) \ 92 | '{ print $1, $2, $3, $4, samples, "DUP" }' \ 93 | | sort -Vk1,1 -k2,2n -k3,3n \ 94 | | bgzip -c \ 95 | > newBed_wSamples.bed.gz || true 96 | #Second, split by small vs large CNVs 97 | zcat newBed_wSamples.bed.gz \ 98 | | awk -v OFS="\t" '($3-$2<5000) {print $0}' \ 99 | | split -l ${n_per_split_small} -a 6 - lt5kb. || true 100 | zcat newBed_wSamples.bed.gz \ 101 | | awk -v OFS="\t" '($3-$2>=5000) {print $0}' \ 102 | | split -l ${n_per_split_large} -a 6 - gt5kb. || true 103 | fi 104 | if [ $( find ./ -name "lt5kb.*" | wc -l ) -eq 0 ]; then 105 | touch lt5kb.aaaaaa 106 | fi 107 | if [ $( find ./ -name "gt5kb.*" | wc -l ) -eq 0 ]; then 108 | touch gt5kb.aaaaaa 109 | fi 110 | >>> 111 | 112 | output { 113 | Array[File] lt5kb_beds = glob("lt5kb.*") 114 | Array[File] gt5kb_beds = glob("gt5kb.*") 115 | } 116 | 117 | runtime { 118 | preemptible: 3 119 | maxRetries: 1 120 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 121 | disks: "local-disk 50 HDD" 122 | } 123 | } 124 | 125 | 126 | # Run depth-based genotyping 127 | task RdTest_genotype { 128 | File bed 129 | String coveragefile 130 | File medianfile 131 | File svc_acct_key 132 | File coveragefile_idx 133 | File famfile 134 | Array[String] samples 135 | File gt_cutoffs 136 | Int n_bins 137 | String prefix 138 | 139 | command <<< 140 | set -euo pipefail 141 | /opt/RdTest/localize_bincov.sh \ 142 | ${bed} \ 143 | ${coveragefile} \ 144 | ${coveragefile_idx} \ 145 | ${svc_acct_key}; 146 | Rscript /opt/RdTest/RdTest.R \ 147 | -b ${bed} \ 148 | -c local_coverage.bed.gz \ 149 | -m ${medianfile} \ 150 | -f ${famfile} \ 151 | -n ${prefix} \ 152 | -w ${write_tsv(samples)} \ 153 | -i ${n_bins} \ 154 | -r ${gt_cutoffs} \ 155 | -y /opt/RdTest/bin_exclude.bed.gz \ 156 | -g TRUE; 157 | /opt/sv-pipeline/04_variant_resolution/scripts/merge_RdTest_genotypes.py \ 158 | ${prefix}.geno \ 159 | ${prefix}.gq \ 160 | rd.geno.cnv.bed; 161 | sort -k1,1V -k2,2n rd.geno.cnv.bed | uniq | bgzip -c > rd.geno.cnv.bed.gz 162 | >>> 163 | 164 | output { 165 | # File genotypes = "${prefix}.geno" 166 | # File copy_states = "${prefix}.median_geno" 167 | # File metrics = "${prefix}.metrics" 168 | # File gq = "${prefix}.gq" 169 | # File varGQ = "${prefix}.vargq" 170 | File melted_genotypes = "rd.geno.cnv.bed.gz" 171 | } 172 | 173 | runtime { 174 | preemptible: 3 175 | docker: "talkowski/sv-pipeline-rdtest@sha256:0393ca5260e523f8646a72a2a739863384de73670383d3f0b32c6ccceba010e8" 176 | disks: "local-disk 100 HDD" 177 | bootDiskSizeGb: "30" 178 | memory: "8 GB" 179 | maxRetries: 1 180 | } 181 | } 182 | 183 | 184 | # Merge melted genotype files 185 | task concat_melted_genotypes { 186 | Array[File] lt5kb_genos 187 | Array[File] gt5kb_genos 188 | String batch 189 | 190 | command <<< 191 | zcat ${sep=' ' lt5kb_genos} ${sep=' ' gt5kb_genos} \ 192 | | sort -Vk1,1 -k2,2n -k3,3n \ 193 | | bgzip -c \ 194 | > ${batch}.rd_genos.bed.gz 195 | >>> 196 | 197 | output { 198 | File genotypes = "${batch}.rd_genos.bed.gz" 199 | } 200 | 
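# Merges the per-shard melted RD genotype tables (lt5kb and gt5kb splits) into a single
# coordinate-sorted, bgzipped BED for the batch.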
201 | runtime { 202 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 203 | preemptible: 3 204 | maxRetries: 1 205 | memory: "16 GB" 206 | disks: "local-disk 250 HDD" 207 | } 208 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_integrate_resolved_vcfs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize integration of all-variant and inv-only svtk resolve results per chromosome 9 | workflow integrate_invonly_allvars { 10 | String inv_res_vcf 11 | String all_res_vcf 12 | File inv_res_vcf_idx 13 | File all_res_vcf_idx 14 | String prefix 15 | File contiglist 16 | File svc_acct_key 17 | File bothside_pass 18 | File background_fail 19 | 20 | Array[Array[String]] contigs = read_tsv(contiglist) 21 | 22 | #Merge, scattered by chromosome 23 | scatter (contig in contigs) { 24 | 25 | #Remote tabix each vcf 26 | call subset_vcf as subset_inv { 27 | input: 28 | vcf=inv_res_vcf, 29 | vcf_idx=inv_res_vcf_idx, 30 | contig=contig[0], 31 | prefix="${prefix}.inv_only.${contig[0]}", 32 | svc_acct_key=svc_acct_key 33 | } 34 | call subset_vcf as subset_all { 35 | input: 36 | vcf=all_res_vcf, 37 | vcf_idx=all_res_vcf_idx, 38 | contig=contig[0], 39 | prefix="${prefix}.all_variants.${contig[0]}", 40 | svc_acct_key=svc_acct_key 41 | } 42 | 43 | #Run integration per chromosome 44 | call integrate_resolved_vcfs { 45 | input: 46 | inv_res_vcf=subset_inv.subsetted_vcf, 47 | all_res_vcf=subset_all.subsetted_vcf, 48 | prefix="${prefix}.resolved.${contig[0]}" 49 | } 50 | } 51 | 52 | #Merge integrated vcfs across chromosomes 53 | call concat_vcfs { 54 | input: 55 | vcfs=integrate_resolved_vcfs.integrated_vcf, 56 | prefix="${prefix}.resolved" 57 | } 58 | 59 | output { 60 | File integrated_vcf = concat_vcfs.concat_vcf 61 | File integrated_vcf_idx = concat_vcfs.concat_vcf_idx 62 | } 63 | } 64 | 65 | 66 | #Remote tabix a single chromosome per VCFs 67 | task subset_vcf { 68 | String vcf 69 | File vcf_idx 70 | String contig 71 | String prefix 72 | File svc_acct_key 73 | 74 | command <<< 75 | #Remote tabix to chromosome of interest 76 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 77 | echo $url; 78 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 79 | bgzip -f "${prefix}.${contig}.vcf" 80 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 81 | >>> 82 | 83 | output { 84 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 85 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 86 | } 87 | 88 | runtime { 89 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 90 | preemptible: 1 91 | disks: "local-disk 50 SSD" 92 | } 93 | } 94 | 95 | 96 | # Merge inversion-only and all-variant cpx-resolved outputs 97 | task integrate_resolved_vcfs { 98 | File inv_res_vcf 99 | File all_res_vcf 100 | String prefix 101 | 102 | command <<< 103 | /opt/sv-pipeline/04_variant_resolution/scripts/Complex_Inversion_Integration.sh \ 104 | ${inv_res_vcf} \ 105 | ${all_res_vcf} \ 106 | ${prefix}.integrated_resolved.vcf.gz 107 | >>> 108 | 109 | output { 110 | File integrated_vcf = "${prefix}.integrated_resolved.vcf.gz" 111 | } 112 | 113 | runtime { 114 | docker: 
"talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 115 | preemptible: 1 116 | memory: "4 GB" 117 | disks: "local-disk 250 SSD" 118 | } 119 | } 120 | 121 | 122 | #Merge multiple vcfs 123 | task concat_vcfs { 124 | Array[File] vcfs 125 | String prefix 126 | 127 | command <<< 128 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 129 | tabix -f -p vcf ${prefix}.vcf.gz 130 | >>> 131 | 132 | output { 133 | File concat_vcf = "${prefix}.vcf.gz" 134 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 135 | } 136 | 137 | runtime { 138 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 139 | preemptible: 1 140 | disks: "local-disk 1000 SSD" 141 | } 142 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_merge_allvar_invonly_vcfs.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize same-bp overlap filter per chromosome 9 | workflow same_bp_filter { 10 | String vcf 11 | File vcf_idx 12 | String prefix 13 | File contiglist 14 | File svc_acct_key 15 | File bothside_pass 16 | File background_fail 17 | 18 | Array[Array[String]] contigs = read_tsv(contiglist) 19 | 20 | #Run same-bp overlap filter, scattered by chromosome 21 | scatter (contig in contigs) { 22 | 23 | #Remote tabix each vcf & join into a single vcf 24 | call subset_vcf { 25 | input: 26 | vcf=vcf, 27 | vcf_idx=vcf_idx, 28 | contig=contig[0], 29 | prefix=prefix, 30 | svc_acct_key=svc_acct_key 31 | } 32 | 33 | #Run same-bp overlap filter per chromosome 34 | call bp_overlap_filter { 35 | input: 36 | vcf=subset_vcf.subsetted_vcf, 37 | prefix="${prefix}.${contig[0]}", 38 | bothside_pass=bothside_pass, 39 | background_fail=background_fail 40 | } 41 | } 42 | 43 | #Merge filtered vcfs across chromosomes 44 | call concat_vcfs { 45 | input: 46 | vcfs=bp_overlap_filter.bp_filtered_vcf, 47 | prefix="${prefix}.non_redundant" 48 | } 49 | 50 | output { 51 | File filtered_vcf = concat_vcfs.concat_vcf 52 | File filtered_vcf_idx = concat_vcfs.concat_vcf_idx 53 | } 54 | } 55 | 56 | 57 | #Remote tabix a single chromosome per VCFs 58 | task subset_vcf { 59 | String vcf 60 | File vcf_idx 61 | String contig 62 | String prefix 63 | File svc_acct_key 64 | 65 | command <<< 66 | #Remote tabix to chromosome of interest 67 | url=$( gsutil signurl -d 24h ${svc_acct_key} "$vcf" | sed '1d' | cut -f 4 ); 68 | echo $url; 69 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" > "${prefix}.${contig}.vcf" 70 | bgzip -f "${prefix}.${contig}.vcf" 71 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 72 | >>> 73 | 74 | output { 75 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 76 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 77 | } 78 | 79 | runtime { 80 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:9fd37fb64e28e54d53172dd30d68c36f0815f21af465381dac281d53755edd86" 81 | preemptible: 1 82 | disks: "local-disk 50 SSD" 83 | } 84 | } 85 | 86 | 87 | # Run Harrison's overlapping breakpoint filter prior to complex resolution 88 | task bp_overlap_filter { 89 | File vcf 90 | String prefix 91 | File bothside_pass 92 | File background_fail 93 | 94 | command <<< 95 | /opt/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh \ 96 | ${vcf} \ 97 | ${background_fail} \ 98 | 
${bothside_pass}; 99 | mv non_redundant.vcf.gz "${prefix}.non_redundant.vcf.gz" 100 | >>> 101 | 102 | output { 103 | File bp_filtered_vcf = "${prefix}.non_redundant.vcf.gz" 104 | } 105 | 106 | runtime { 107 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 108 | preemptible: 1 109 | memory: "4 GB" 110 | disks: "local-disk 250 SSD" 111 | } 112 | } 113 | 114 | 115 | #Merge multiple vcfs 116 | task concat_vcfs { 117 | Array[File] vcfs 118 | String prefix 119 | 120 | command <<< 121 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 122 | tabix -f -p vcf ${prefix}.vcf.gz 123 | >>> 124 | 125 | output { 126 | File concat_vcf = "${prefix}.vcf.gz" 127 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 128 | } 129 | 130 | runtime { 131 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 132 | preemptible: 1 133 | disks: "local-disk 1000 SSD" 134 | } 135 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_pesr_depth_overlap.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to parallelize vcf clustering per chromosome 9 | workflow pesr_depth_overlap { 10 | String pesr_vcf 11 | File pesr_vcf_idx 12 | String depth_vcf 13 | File depth_vcf_idx 14 | File contigs 15 | Array[String] samples 16 | File svc_acct_key 17 | 18 | Array[Array[String]] contiglist = read_tsv(contigs) 19 | 20 | scatter (contig in contiglist) { 21 | call subset_vcf as subset_pesr_vcf { 22 | input: 23 | vcf=pesr_vcf, 24 | vcf_idx=pesr_vcf_idx, 25 | contig=contig[0], 26 | prefix="all_batches.pesr", 27 | svc_acct_key=svc_acct_key 28 | } 29 | 30 | call subset_vcf as subset_depth_vcf { 31 | input: 32 | vcf=depth_vcf, 33 | vcf_idx=depth_vcf_idx, 34 | contig=contig[0], 35 | prefix="all_batches.depth", 36 | svc_acct_key=svc_acct_key 37 | } 38 | 39 | call merge_pesr_depth { 40 | input: 41 | pesr_vcf=subset_pesr_vcf.subsetted_vcf, 42 | depth_vcf=subset_depth_vcf.subsetted_vcf, 43 | contig=contig[0] 44 | } 45 | } 46 | 47 | call concat_vcfs { 48 | input: 49 | vcfs=merge_pesr_depth.merged_vcf, 50 | prefix="all_batches.pesr_depth" 51 | } 52 | 53 | output { 54 | File merged_vcf = concat_vcfs.concat_vcf 55 | File merged_vcf_idx = concat_vcfs.concat_vcf_idx 56 | } 57 | } 58 | 59 | task subset_vcf { 60 | String vcf 61 | File vcf_idx 62 | String contig 63 | String prefix 64 | File svc_acct_key 65 | 66 | command <<< 67 | # tabix -p vcf ${vcf}; 68 | # tabix -h ${vcf} ${contig} | bgzip -c > ${prefix}.${contig}.vcf.gz 69 | url=$( gsutil signurl -d 24h ${svc_acct_key} ${vcf} | sed '1d' | cut -f 4 ); 70 | echo $url; 71 | svtk remote_tabix --header "$url" "${vcf_idx}" "$contig" \ 72 | | bgzip -c \ 73 | > "${prefix}.${contig}.vcf.gz" 74 | >>> 75 | 76 | output { 77 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 78 | } 79 | 80 | runtime { 81 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:0c21137179665254ca0d9ebe4d21251ae2ff6679337fd9b3e9d6e6ab808db6a8" 82 | preemptible: 3 83 | disks: "local-disk 100 SSD" 84 | } 85 | } 86 | 87 | task merge_pesr_depth { 88 | File pesr_vcf 89 | File depth_vcf 90 | String contig 91 | 92 | command <<< 93 | /opt/sv-pipeline/04_variant_resolution/scripts/PESR_RD_merge_wrapper.sh \ 94 | ${pesr_vcf} \ 95 | ${depth_vcf} \ 96 | ${contig} \ 97 | 
all_batches.pesr_depth.${contig}.vcf.gz 98 | >>> 99 | 100 | output { 101 | File merged_vcf = "all_batches.pesr_depth.${contig}.vcf.gz" 102 | } 103 | 104 | runtime { 105 | docker: "talkowski/sv-pipeline@sha256:3f9d99b8154dff67eb33b0da0a4358ac149461d65f819e7eb64958953d478900" 106 | preemptible: 1 107 | memory: "16 GB" 108 | disks: "local-disk 500 SSD" 109 | } 110 | } 111 | 112 | task concat_vcfs { 113 | Array[File] vcfs 114 | String prefix 115 | 116 | command <<< 117 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz; 118 | tabix -p vcf -f ${prefix}.vcf.gz 119 | >>> 120 | 121 | output { 122 | File concat_vcf = "${prefix}.vcf.gz" 123 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 124 | } 125 | 126 | runtime { 127 | docker: "talkowski/sv-pipeline@sha256:b359f2cb0c9d5f5a55eb4c41fd362f4e574bf3f8f0f395a2907837571b367ee0" 128 | preemptible: 1 129 | memory: "8 GB" 130 | disks: "local-disk 5000 SSD" 131 | } 132 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_resolve_complex_sv.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_resolve_complex_by_chrom/versions/63/plain-WDL/descriptor" as resolve_complex_by_chrom 2 | 3 | workflow resolve_complex_sv { 4 | File vcf 5 | File contigs 6 | Int max_shards_per_chrom 7 | Int min_variants_per_shard 8 | File cytobands 9 | File cytobands_idx 10 | File mei_bed 11 | File discfile_list 12 | File discfile_idx_list 13 | File pe_blacklist 14 | File pe_blacklist_idx 15 | File svc_acct_key 16 | File rf_cutoffs 17 | 18 | Array[Array[String]] contiglist = read_tsv(contigs) 19 | 20 | # Get SR count cutoff from RF metrics to use in single-ender rescan procedure 21 | call get_se_cutoff { 22 | input: 23 | rf_cutoffs=rf_cutoffs 24 | } 25 | 26 | 27 | scatter (contig in contiglist) { 28 | call subset_vcf { 29 | input: 30 | vcf=vcf, 31 | chrom=contig[0] 32 | } 33 | 34 | call resolve_complex_by_chrom.resolve_complex_by_chrom as resolve_perChrom { 35 | input: 36 | vcf=subset_vcf.single_chrom, 37 | vcf_idx=subset_vcf.single_chrom_idx, 38 | contig=contig[0], 39 | max_shards=max_shards_per_chrom, 40 | min_variants_per_shard=min_variants_per_shard, 41 | cytobands=cytobands, 42 | cytobands_idx=cytobands_idx, 43 | discfile_list=discfile_list, 44 | discfile_idx_list=discfile_idx_list, 45 | mei_bed=mei_bed, 46 | pe_blacklist=pe_blacklist, 47 | pe_blacklist_idx=pe_blacklist_idx, 48 | svc_acct_key=svc_acct_key, 49 | se_pe_cutoff=get_se_cutoff.median_PE_cutoff 50 | } 51 | } 52 | 53 | call resolve_complex_by_chrom.concat_vcfs as concat_resolved { 54 | input: 55 | vcfs=resolve_perChrom.res_vcf, 56 | vcftype="resolved" 57 | } 58 | 59 | # call resolve_complex_by_chrom.concat_vcfs as concat_unresolved { 60 | # input: 61 | # vcfs=resolve_perChrom.unres_vcf, 62 | # vcftype="unresolved" 63 | # } 64 | 65 | output { 66 | File resolved_vcf_merged = concat_resolved.concat_vcf 67 | File resolved_vcf_merged_idx = concat_resolved.concat_vcf_idx 68 | # File unresolved_vcf_merged = concat_unresolved.concat_vcf 69 | } 70 | } 71 | 72 | #Subset VCF per chromosome 73 | task subset_vcf { 74 | File vcf 75 | String chrom 76 | 77 | String prefix = basename(vcf, ".vcf.gz") 78 | 79 | command <<< 80 | tabix -p vcf ${vcf}; 81 | tabix --print-header ${vcf} ${chrom} | bgzip -c > ${prefix}.${chrom}.vcf.gz 82 | tabix -f ${prefix}.${chrom}.vcf.gz 83 | >>> 84 | 85 | output { 86 | File single_chrom = "${prefix}.${chrom}.vcf.gz" 87 
| File single_chrom_idx = "${prefix}.${chrom}.vcf.gz.tbi" 88 | } 89 | 90 | runtime { 91 | docker: "talkowski/sv-pipeline@sha256:96d07aa2c7c3e8bd12f2621a0644a5a8fca99f922926922724497ad2aad9364d" 92 | preemptible: 3 93 | disks: "local-disk 1000 SSD" 94 | } 95 | } 96 | 97 | # Get SE cutoff 98 | task get_se_cutoff { 99 | File rf_cutoffs 100 | 101 | command <<< 102 | mkdir rf_cutoff_files/ 103 | cat ${rf_cutoffs} | gsutil cp -I rf_cutoff_files/ 104 | while read file; do 105 | /opt/sv-pipeline/04_variant_resolution/scripts/convert_poisson_p.py \ 106 | $( awk -F '\t' '{if ( $5=="PE_log_pval") print $2 }' $file | head -n1 ) 107 | done < <( find rf_cutoff_files/ -name "*cutoffs" ) | \ 108 | Rscript -e "cat(floor(median(scan('stdin',quiet=T))),sep='\n')" > \ 109 | median_cutoff.txt 110 | >>> 111 | 112 | output { 113 | Int median_PE_cutoff = read_tsv("median_cutoff.txt")[0][0] 114 | } 115 | 116 | runtime { 117 | docker: "talkowski/sv-pipeline@sha256:96d07aa2c7c3e8bd12f2621a0644a5a8fca99f922926922724497ad2aad9364d" 118 | preemptible: 3 119 | } 120 | } 121 | 122 | # 123 | ##Combine multiple VCFs 124 | #task concat_vcfs { 125 | # Array[File] vcfs 126 | # 127 | # command <<< 128 | # vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > all_batches.vcf.gz 129 | # >>> 130 | # 131 | # output { 132 | # File concat_vcf = "all_batches.vcf.gz" 133 | # } 134 | # 135 | # runtime { 136 | # docker: "talkowski/sv-pipeline@sha256:b0455d30df2fbdbd4649466d968cada0a44d02a7159d94982308b629dd1aef78" 137 | # preemptible: 3 138 | # } 139 | #} 140 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04_sharded_vcfcluster.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | # Workflow to shard a filtered vcf & run vcfcluster (sub-sub-sub workflow) 8 | workflow sharded_cluster { 9 | File vcf 10 | Int dist 11 | Float frac 12 | Int max_shards 13 | Int min_per_shard 14 | String prefix 15 | String contig 16 | String svtype 17 | Float sample_overlap 18 | String do_blacklist 19 | File blacklist 20 | File blacklist_idx 21 | Int svsize 22 | Array[String] svtypes 23 | 24 | #New as of November 2, 2018: perform sharding and return list of variant IDs 25 | # for each shard, rather than VCF shards themselves, which should dramatically 26 | # improve speed of sharding task (previously took 1-6 hours for 14k samples in 27 | # gnomAD v2) 28 | call shard_vcf { 29 | input: 30 | vcf=vcf, 31 | dist=dist, 32 | frac=frac, 33 | max_shards=max_shards, 34 | min_per_shard=min_per_shard, 35 | prefix="${prefix}.${contig}.${svtype}" 36 | } 37 | 38 | #Run vcfcluster per shard 39 | scatter ( VIDs_list in shard_vcf.VID_list_shards ) { 40 | call vcfcluster { 41 | input: 42 | vcf=vcf, 43 | VIDs=VIDs_list, 44 | prefix="${prefix}.${contig}.${svtype}", 45 | dist=dist, 46 | frac=frac, 47 | sample_overlap=sample_overlap, 48 | do_blacklist=do_blacklist, 49 | blacklist=blacklist, 50 | blacklist_idx=blacklist_idx, 51 | svsize=svsize, 52 | svtypes=svtypes 53 | } 54 | } 55 | 56 | #Merge shards per svtype 57 | call concat_vcfs as concat_shards { 58 | input: 59 | vcfs=vcfcluster.clustered_vcf, 60 | prefix="${prefix}.${contig}.${svtype}" 61 | } 62 | 63 | #Output 64 | output { 65 | File clustered_vcf = concat_shards.concat_vcf 66 | } 67 | } 68 | 69 | 70 | #Intelligently shard a VCF for parallelized clustering 71 | task shard_vcf { 72 | File vcf 73 | 
Int dist 74 | Float frac 75 | Int max_shards 76 | Int min_per_shard 77 | String prefix 78 | 79 | command <<< 80 | set -eu -o pipefail 81 | 82 | tabix -f -p vcf ${vcf} 83 | /opt/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh \ 84 | -D ${dist} \ 85 | -R ${frac} \ 86 | -L ${min_per_shard} \ 87 | -S ${max_shards} \ 88 | -P ${prefix} \ 89 | ${vcf} 90 | >>> 91 | 92 | output { 93 | Array[File] VID_list_shards = glob("*.VIDs.list") 94 | } 95 | 96 | runtime { 97 | preemptible: 1 98 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 99 | disks: "local-disk 250 SSD" 100 | } 101 | } 102 | 103 | 104 | #Run svtk vcfcluster 105 | task vcfcluster { 106 | File vcf 107 | File VIDs 108 | String prefix 109 | Int dist 110 | Float frac 111 | Float sample_overlap 112 | String do_blacklist 113 | File blacklist 114 | File blacklist_idx 115 | Int svsize 116 | Array[String] svtypes 117 | 118 | command <<< 119 | set -eu -o pipefail 120 | 121 | # Don't generate random characters for vcf name, it produces problems with caching on cromwell 122 | # You *could* pass a seed like so: 123 | # INPUT_HASH=$(tr -d '/+' < <(openssl enc -a -aes-256-ctr -pass pass:"$SEED" -nosalt /dev/null) | head -c16) 124 | # But if you hash filtered input vcf, you accomplish the same goal of avoiding similar-named files in the loop, 125 | # without introducing randomness: 126 | INPUT_HASH=$(md5sum ${vcf} | awk '{print $1}') 127 | # concat prefix and hash to create unique vcf name: 128 | VCF_NAME="${prefix}-$INPUT_HASH" 129 | 130 | #Prep vcf 131 | zcat ${vcf} | sed -n '1,1000p' | fgrep "#" > header.vcf 132 | zcat ${vcf} | fgrep -v "#" | fgrep -wf ${VIDs} | cat header.vcf - | bgzip -c \ 133 | > input.vcf.gz 134 | #Run clustering 135 | echo "input.vcf.gz" > unclustered_vcfs.list; 136 | if [ ${do_blacklist} == "YES" ]; then 137 | svtk vcfcluster unclustered_vcfs.list $VCF_NAME.vcf \ 138 | -d ${dist} \ 139 | -f ${frac} \ 140 | -x ${blacklist} \ 141 | -z ${svsize} \ 142 | -p ${prefix} \ 143 | -t ${sep=',' svtypes} \ 144 | -o ${sample_overlap} \ 145 | --preserve-ids \ 146 | --preserve-genotypes \ 147 | --preserve-header 148 | else 149 | svtk vcfcluster unclustered_vcfs.list $VCF_NAME.vcf \ 150 | -d ${dist} \ 151 | -f ${frac} \ 152 | -z ${svsize} \ 153 | -p ${prefix} \ 154 | -t ${sep=',' svtypes} \ 155 | -o ${sample_overlap} \ 156 | --preserve-ids \ 157 | --preserve-genotypes \ 158 | --preserve-header 159 | fi 160 | bgzip -f $VCF_NAME.vcf 161 | >>> 162 | 163 | output { 164 | # need to use glob since cromwell will not be aware of the value of INPUT hash 165 | File clustered_vcf = glob("${prefix}*.vcf.gz")[0] 166 | } 167 | 168 | runtime { 169 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 170 | preemptible: 1 171 | maxRetries: 1 172 | memory: "8 GB" 173 | disks: "local-disk 20 SSD" 174 | } 175 | } 176 | 177 | 178 | #Merge multiple vcfs 179 | task concat_vcfs { 180 | Array[File] vcfs 181 | String prefix 182 | 183 | command <<< 184 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 185 | tabix -f -p vcf ${prefix}.vcf.gz 186 | >>> 187 | 188 | output { 189 | File concat_vcf = "${prefix}.vcf.gz" 190 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 191 | } 192 | 193 | runtime { 194 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 195 | preemptible: 1 196 | maxRetries: 1 197 | disks: "local-disk 500 SSD" 198 | } 199 | } 
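The shard_vcf task above hands the actual splitting off to shardVCF_preClustering_part1.sh, which is not included in this listing. As a rough illustration of the VID-list idea described in the workflow comment (shards are defined by variant-ID lists rather than physical VCF slices, and task vcfcluster later re-extracts each shard's records with fgrep -wf against the parent VCF), the standalone sketch below splits an ID list into at most max_shards chunks of at least min_per_shard records. It is an illustration only: the real script also takes the -D distance and -R reciprocal-overlap parameters, presumably so that variants likely to cluster together land in the same shard, which this sketch ignores.

#!/usr/bin/env bash
# Minimal sketch of VID-list sharding (illustration only; the real logic lives
# in shardVCF_preClustering_part1.sh inside the sv-pipeline docker image and is
# additionally aware of the -D / -R clustering parameters).
set -eu -o pipefail

vcf=$1             # block-gzipped input VCF
min_per_shard=$2   # corresponds to -L above
max_shards=$3      # corresponds to -S above
prefix=$4          # corresponds to -P above

# Collect every variant ID once
zcat "$vcf" | grep -v '^#' | cut -f3 > all_VIDs.list
n=$( wc -l < all_VIDs.list )

# Cap the shard count at max_shards, but never drop below min_per_shard records
per_shard=$(( (n + max_shards - 1) / max_shards ))
if [ "$per_shard" -lt "$min_per_shard" ]; then
  per_shard=$min_per_shard
fi

# Emit one VID list per shard, named to match the *.VIDs.list glob used above
split -d -l "$per_shard" all_VIDs.list "$prefix".shard_
for f in "$prefix".shard_*; do
  mv "$f" "$f.VIDs.list"
done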
-------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04b_genotype_CPX_CNVs.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04_genotype_CPX_CNVs_perBatch/versions/28/plain-WDL/descriptor" as rd_gt_perbatch 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to perform depth-based genotyping for a single vcf shard scattered 11 | # across batches on predicted CPX CNVs from 04b 12 | workflow genotype_CPX_CNVs { 13 | File vcf 14 | File gt_input_files 15 | Int n_per_split_small 16 | Int n_per_split_large 17 | Int n_RdTest_bins 18 | File svc_acct_key 19 | String prefix 20 | File famfile 21 | String contig 22 | 23 | Array[Array[String]] gt_input_array = read_tsv(gt_input_files) 24 | 25 | # Convert VCF to bed of CPX CNV intervals 26 | call get_cpx_cnv_intervals { 27 | input: 28 | vcf=vcf, 29 | prefix="${prefix}.${contig}" 30 | } 31 | 32 | # Scatter over each batch (row) in gt_input_files and run depth genotyping 33 | scatter (gt_inputs in gt_input_array) { 34 | call rd_gt_perbatch.genotype_CPX_CNVs_perBatch as gt_batch { 35 | input: 36 | cpx_bed=get_cpx_cnv_intervals.CPX_CNV_BED, 37 | batch=gt_inputs[0], 38 | coveragefile=gt_inputs[1], 39 | coveragefile_idx=gt_inputs[2], 40 | RD_depth_sepcutoff=gt_inputs[3], 41 | sampleslist=gt_inputs[4], 42 | famfile=gt_inputs[5], 43 | medianfile=gt_inputs[6], 44 | n_per_split_small=n_per_split_small, 45 | n_per_split_large=n_per_split_large, 46 | n_RdTest_bins=n_RdTest_bins, 47 | svc_acct_key=svc_acct_key 48 | } 49 | } 50 | 51 | # Merge melted genotypes across all batches 52 | call merge_melted_gts { 53 | input: 54 | melted_gts=gt_batch.genotypes, 55 | prefix="${prefix}.${contig}" 56 | } 57 | 58 | # Parse genotyping results 59 | call parse_gts { 60 | input: 61 | vcf=vcf, 62 | intervals=get_cpx_cnv_intervals.CPX_CNV_BED, 63 | genotypes=merge_melted_gts.merged_genotypes, 64 | prefix="${prefix}.${contig}", 65 | famfile=famfile, 66 | contig=contig 67 | } 68 | 69 | # Final output 70 | output { 71 | File cpx_depth_gt_resolved_vcf = parse_gts.cpx_depth_gt_resolved_vcf 72 | File reclassification_table = parse_gts.reclassification_table 73 | File interval_genotype_counts_table = parse_gts.gt_counts_table 74 | } 75 | } 76 | 77 | 78 | # Get CNV intervals from complex SV for depth genotyping 79 | task get_cpx_cnv_intervals { 80 | File vcf 81 | String prefix 82 | 83 | command <<< 84 | /opt/sv-pipeline/04_variant_resolution/scripts/gather_cpx_intervals_for_rd_gt.sh \ 85 | ${vcf} \ 86 | ${prefix}.complex_CNV_intervals_to_test.bed.gz 87 | >>> 88 | 89 | output { 90 | File CPX_CNV_BED = "${prefix}.complex_CNV_intervals_to_test.bed.gz" 91 | } 92 | 93 | runtime { 94 | docker: "talkowski/sv-pipeline@sha256:5ff4bd3264cc61fc69e37cd2e307e3b5ab8458fec2606e1b57d4b1f73fecead0" 95 | preemptible: 1 96 | maxRetries: 1 97 | memory: "8 GB" 98 | disks: "local-disk 100 HDD" 99 | } 100 | } 101 | 102 | 103 | # Merge output from per-batch genotyping 104 | task merge_melted_gts { 105 | Array[File] melted_gts 106 | String prefix 107 | 108 | command <<< 109 | while read file; do 110 | zcat $file 111 | done < ${write_tsv(melted_gts)} \ 112 | | sort -Vk1,1 -k2,2n -k3,3n -k4,4V -k5,5V \ 113 | | bgzip -c \ 114 | > ${prefix}.CPX_intervals.merged_rd_genos.bed.gz 115 | >>> 116 | 117 | output { 118 | File merged_genotypes = 
"${prefix}.CPX_intervals.merged_rd_genos.bed.gz" 119 | } 120 | 121 | runtime { 122 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 123 | preemptible: 1 124 | maxRetries: 1 125 | disks: "local-disk 100 HDD" 126 | } 127 | } 128 | 129 | 130 | # Parse genotyping results 131 | task parse_gts { 132 | File vcf 133 | File intervals 134 | File genotypes 135 | File famfile 136 | String prefix 137 | String contig 138 | 139 | command <<< 140 | /opt/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh \ 141 | -R ${prefix}.CPXregenotyping_reclassification_table.${contig}.txt \ 142 | -G ${prefix}.CPXregenotyping_raw_genotype_counts_table.${contig}.txt \ 143 | ${vcf} \ 144 | ${intervals} \ 145 | ${genotypes} \ 146 | ${famfile} \ 147 | ${prefix}.postCPXregenotyping.${contig}.vcf.gz 148 | >>> 149 | 150 | output { 151 | File cpx_depth_gt_resolved_vcf = "${prefix}.postCPXregenotyping.${contig}.vcf.gz" 152 | File reclassification_table = "${prefix}.CPXregenotyping_reclassification_table.${contig}.txt" 153 | File gt_counts_table = "${prefix}.CPXregenotyping_raw_genotype_counts_table.${contig}.txt" 154 | } 155 | 156 | runtime { 157 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 158 | preemptible: 1 159 | maxRetries: 1 160 | disks: "local-disk 100 HDD" 161 | } 162 | } 163 | 164 | 165 | # Combine multiple VCFs 166 | task concat_vcfs { 167 | Array[File] vcfs 168 | String prefix 169 | 170 | command <<< 171 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.mod04b_final.vcf.gz 172 | >>> 173 | 174 | output { 175 | File concat_vcf = "${prefix}.mod04b_final.vcf.gz" 176 | } 177 | 178 | runtime { 179 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 180 | preemptible: 1 181 | maxRetries: 1 182 | disks: "local-disk 300 HDD" 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_05/04b_scatter_CPX_genotyping.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:04b_genotype_CPX_CNVs/versions/25/plain-WDL/descriptor" as cpx_gt 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to perform depth-based genotyping for a single vcf shard scattered 11 | # across batches on predicted CPX CNVs from 04b 12 | workflow scatter_CPX_genotyping { 13 | File vcf 14 | File vcf_idx 15 | Int n_master_vcf_shards 16 | Int n_master_min_vars_per_vcf_shard 17 | File gt_input_files 18 | Int n_per_split_small 19 | Int n_per_split_large 20 | Int n_RdTest_bins 21 | File svc_acct_key 22 | String prefix 23 | File famfile 24 | String contig 25 | 26 | # Shard VCF into even slices 27 | call shard_vcf { 28 | input: 29 | vcf=vcf, 30 | vcf_idx=vcf_idx, 31 | prefix="${prefix}.${contig}", 32 | n_shards=n_master_vcf_shards, 33 | min_vars_per_shard=n_master_min_vars_per_vcf_shard 34 | } 35 | 36 | # Scatter genotyping over shards 37 | scatter ( shard in shard_vcf.vcf_shards ) { 38 | # Run genotyping 39 | call cpx_gt.genotype_CPX_CNVs as genotype_shard { 40 | input: 41 | vcf=shard, 42 | gt_input_files=gt_input_files, 43 | n_per_split_large=n_per_split_large, 44 | n_per_split_small=n_per_split_small, 45 | n_RdTest_bins=n_RdTest_bins, 46 | svc_acct_key=svc_acct_key, 47 | 
prefix=prefix, 48 | famfile=famfile, 49 | contig=contig 50 | } 51 | } 52 | 53 | # Merge VCF shards 54 | call concat_vcfs { 55 | input: 56 | vcfs=genotype_shard.cpx_depth_gt_resolved_vcf, 57 | outfile_prefix="${prefix}.${contig}.resolved" 58 | } 59 | 60 | # Output merged VCF 61 | output { 62 | File cpx_depth_gt_resolved_vcf = concat_vcfs.concat_vcf 63 | File cpx_depth_gt_resolved_vcf_idx = concat_vcfs.concat_vcf_idx 64 | } 65 | } 66 | 67 | 68 | #Shard a vcf into even chunks 69 | task shard_vcf { 70 | File vcf 71 | File vcf_idx 72 | String prefix 73 | Int n_shards 74 | Int min_vars_per_shard 75 | 76 | command <<< 77 | tabix -H ${vcf} > header.vcf; 78 | zcat ${vcf} | grep -ve '^#' | cut -f3 > all_VIDs.list; 79 | nrecords=$( cat all_VIDs.list | wc -l ); 80 | rec_per_shard=$( echo "$(( $nrecords / ${n_shards} ))" | cut -f1 -d\. ); 81 | if [ $rec_per_shard -lt ${min_vars_per_shard} ]; then 82 | rec_per_shard=${min_vars_per_shard} 83 | fi; 84 | /opt/sv-pipeline/04_variant_resolution/scripts/evenSplitter.R \ 85 | -L $rec_per_shard \ 86 | all_VIDs.list \ 87 | VIDs_split_ 88 | max_suf=$( find `pwd` -name "VIDs_split_*" | awk -v FS="_" '{ print $NF }' | sort -nrk1,1 | sed -n '1p' ) 89 | for i in $( seq 1 "$max_suf" ); do 90 | zcat ${vcf} \ 91 | | fgrep -wf VIDs_split_"$i" \ 92 | | cat header.vcf - \ 93 | | bgzip -c \ 94 | > ${prefix}.shard_"$i".vcf.gz 95 | rm VIDs_split_"$i" 96 | done 97 | >>> 98 | 99 | output { 100 | Array[File] vcf_shards = glob("${prefix}.shard_*.vcf.gz") 101 | } 102 | 103 | runtime { 104 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 105 | preemptible: 1 106 | maxRetries: 1 107 | memory: "4 GB" 108 | disks: "local-disk 500 HDD" 109 | } 110 | } 111 | 112 | 113 | #General task to combine multiple VCFs 114 | task concat_vcfs { 115 | Array[File] vcfs 116 | String outfile_prefix 117 | 118 | command <<< 119 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${outfile_prefix}.vcf.gz; 120 | tabix -p vcf -f "${outfile_prefix}.vcf.gz" 121 | >>> 122 | 123 | output { 124 | File concat_vcf = "${outfile_prefix}.vcf.gz" 125 | File concat_vcf_idx = "${outfile_prefix}.vcf.gz.tbi" 126 | } 127 | 128 | runtime { 129 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 130 | preemptible: 1 131 | maxRetries: 1 132 | memory: "4 GB" 133 | disks: "local-disk 500 HDD" 134 | } 135 | } 136 | 137 | 138 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_part2.wdl: -------------------------------------------------------------------------------- 1 | workflow Clean { 2 | 3 | Array[File] whitelists 4 | File normal_revise_vcf 5 | File multi_cnvs 6 | File vcftools_idx 7 | 8 | scatter ( white in whitelists ){ 9 | call cleanvcf2{ 10 | input: 11 | normal_revise_vcf=normal_revise_vcf, 12 | whitelist=white, 13 | multi_cnvs=multi_cnvs, 14 | vcftools_idx=vcftools_idx 15 | } 16 | } 17 | 18 | call combine{ 19 | input: 20 | shards=cleanvcf2.out 21 | } 22 | 23 | output { 24 | File out=combine.out 25 | } 26 | } 27 | 28 | 29 | task cleanvcf2 { 30 | 31 | File normal_revise_vcf 32 | File whitelist 33 | File multi_cnvs 34 | File vcftools_idx 35 | 36 | command { 37 | bash /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh ${normal_revise_vcf} ${whitelist} ${multi_cnvs} "output.txt" 38 | } 39 | 40 | runtime { 41 | preemptible: 1 42 | docker: 
"talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 43 | disks: "local-disk 250 SSD" 44 | bootDiskSizeGb: 30 45 | memory: "32 GB" 46 | } 47 | 48 | output { 49 | File out="output.txt" 50 | } 51 | } 52 | 53 | 54 | task combine { 55 | 56 | Array[File] shards 57 | 58 | command { 59 | cat ${sep=" " shards} > output.txt 60 | } 61 | 62 | runtime { 63 | preemptible: 1 64 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 65 | disks: "local-disk 200 SSD" 66 | memory: "4 GB" 67 | } 68 | 69 | output { 70 | File out="output.txt" 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_part4.wdl: -------------------------------------------------------------------------------- 1 | workflow Clean4{ 2 | 3 | Array[File] RD_CN_revises 4 | File normal_revise_vcf 5 | 6 | scatter ( RD_CN_revise in RD_CN_revises ){ 7 | call cleanvcf4 { 8 | input: 9 | RD_CN_revise=RD_CN_revise, 10 | normal_revise_vcf=normal_revise_vcf, 11 | } 12 | } 13 | 14 | call combine as combine_revised { 15 | input: 16 | shards=cleanvcf4.out, 17 | outfile="revise.vcf.lines.txt.gz" 18 | } 19 | 20 | call combine as combine_multi_IDs { 21 | input: 22 | shards=cleanvcf4.multi_IDs, 23 | outfile="multi.geno.ids.txt.gz" 24 | } 25 | 26 | output { 27 | File out=combine_revised.out 28 | File multi_IDs=combine_multi_IDs.out 29 | } 30 | } 31 | 32 | task cleanvcf4 { 33 | File RD_CN_revise 34 | File normal_revise_vcf 35 | 36 | command { 37 | bash /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh ${RD_CN_revise} ${normal_revise_vcf} 38 | } 39 | 40 | runtime { 41 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 42 | disks: "local-disk 200 SSD" 43 | memory: "16 GB" 44 | } 45 | 46 | output { 47 | File out="revise.vcf.lines.txt.gz" 48 | File multi_IDs="multi.geno.ids.txt.gz" 49 | } 50 | } 51 | 52 | task combine { 53 | Array[File] shards 54 | String outfile 55 | 56 | command { 57 | zcat ${sep=" " shards} | bgzip -c > ${outfile} 58 | } 59 | 60 | runtime { 61 | preemptible: 1 62 | docker: "talkowski/sv-pipeline@sha256:703a19f84f498989ba8ffde110a3462cfecfbd7ade1084a151fac5fff742c266" 63 | disks: "local-disk 250 SSD" 64 | memory: "8 GB" 65 | } 66 | 67 | output { 68 | File out="${outfile}" 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_06/05_cleanVCF_scatter.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:05_CleanVCF/versions/93/plain-WDL/descriptor" as CleanVCF_chr 2 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:master_SV_VCF_QC/versions/75/plain-WDL/descriptor" as QC 3 | 4 | workflow CleanVCF_scatter{ 5 | 6 | File vcf 7 | File chrlist 8 | File backgroundlist 9 | File famfile 10 | Int max_shards_per_chrom_step1 11 | Int min_records_per_shard_step1 12 | Int samples_per_step2_shard 13 | File trio_famfile 14 | String ref_build 15 | String prefix 16 | File Sanders_2015_tarball 17 | File Collins_2017_tarball 18 | File Werling_2018_tarball 19 | File? 
outlier_samples_list 20 | 21 | 22 | Array[Array[String]] chrs=read_tsv(chrlist) 23 | 24 | 25 | scatter ( chr in chrs ){ 26 | call CleanVCF_chr.CleanVCF { 27 | input: 28 | vcf=vcf, 29 | Chr=chr[0], 30 | backgroundlist=backgroundlist, 31 | famfile=famfile, 32 | prefix=prefix, 33 | max_shards_per_chrom_step1=max_shards_per_chrom_step1, 34 | min_records_per_shard_step1=min_records_per_shard_step1, 35 | samples_per_step2_shard=samples_per_step2_shard, 36 | outlier_samples_list=outlier_samples_list 37 | } 38 | } 39 | 40 | call combine { 41 | input: 42 | vcfs=CleanVCF.out 43 | } 44 | 45 | call QC.master_vcf_qc as QC_all { 46 | input: 47 | vcf=combine.out, 48 | vcf_idx=combine.idx, 49 | famfile=trio_famfile, 50 | ref_build=ref_build, 51 | prefix="${prefix}_cleanedVCF", 52 | sv_per_shard=10000, 53 | samples_per_shard=50, 54 | Sanders_2015_tarball=Sanders_2015_tarball, 55 | Collins_2017_tarball=Collins_2017_tarball, 56 | Werling_2018_tarball=Werling_2018_tarball, 57 | contiglist=chrlist 58 | } 59 | 60 | # call subset_pass { 61 | # input: 62 | # vcf=combine.out, 63 | # prefix=prefix 64 | # } 65 | 66 | # call QC.master_vcf_qc as QC_pass { 67 | # input: 68 | # vcf=subset_pass.filtered_vcf, 69 | # famfile=trio_famfile, 70 | # ref_build=ref_build, 71 | # prefix="${prefix}_cleanedVCF_filterPass", 72 | # sv_per_shard=10000, 73 | # samples_per_shard=100, 74 | # Sanders_2015_tarball=Sanders_2015_tarball, 75 | # Collins_2017_tarball=Collins_2017_tarball, 76 | # Werling_2018_tarball=Werling_2018_tarball 77 | # } 78 | 79 | # call subset_fail { 80 | # input: 81 | # vcf=combine.out, 82 | # prefix=prefix 83 | # } 84 | 85 | # call QC.master_vcf_qc as QC_fail { 86 | # input: 87 | # vcf=subset_fail.filtered_vcf, 88 | # famfile=trio_famfile, 89 | # ref_build=ref_build, 90 | # prefix="${prefix}_cleanedVCF_filterFail", 91 | # sv_per_shard=10000, 92 | # samples_per_shard=100, 93 | # Sanders_2015_tarball=Sanders_2015_tarball, 94 | # Collins_2017_tarball=Collins_2017_tarball, 95 | # Werling_2018_tarball=Werling_2018_tarball 96 | # } 97 | 98 | output { 99 | File cleaned_vcf = combine.out 100 | File cleaned_vcf_idx = combine.idx 101 | File all_variants_QC = QC_all.sv_vcf_qc_output 102 | # File passing_variants_QC = QC_pass.sv_vcf_qc_output 103 | # File failing_variants_QC = QC_fail.sv_vcf_qc_output 104 | } 105 | } 106 | 107 | 108 | # Merge per-chromosome VCF shards 109 | task combine { 110 | 111 | Array[File] vcfs 112 | String prefix 113 | 114 | command { 115 | vcf-concat ${sep=" " vcfs} | vcf-sort | bgzip -c > ${prefix}.cleanedvcf.vcf.gz; 116 | tabix -p vcf ${prefix}.cleanedvcf.vcf.gz 117 | } 118 | 119 | runtime { 120 | preemptible: 1 121 | docker : "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 122 | disks: "local-disk 500 SSD" 123 | memory: "8 GB" 124 | } 125 | 126 | output { 127 | File out="${prefix}.cleanedvcf.vcf.gz" 128 | File idx="${prefix}.cleanedvcf.vcf.gz.tbi" 129 | } 130 | } 131 | 132 | 133 | # Task to sunset variants with VCF FILTER = PASS | MULTIALLELIC 134 | task subset_pass { 135 | File vcf 136 | String prefix 137 | 138 | command <<< 139 | zcat ${vcf} \ 140 | | awk -v FS="\t" -v OFS="\t" \ 141 | '{ if ($1~"#" || $7=="PASS" || $7=="MULTIALLELIC") print $0 }' \ 142 | | vcf-sort \ 143 | | bgzip -c \ 144 | > ${prefix}.passing_variants.vcf.gz 145 | >>> 146 | 147 | runtime { 148 | docker: "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 149 | preemptible: 1 150 | disks: "local-disk 500 SSD" 151 | } 152 | 153 | output { 
154 | File filtered_vcf = "${prefix}.passing_variants.vcf.gz" 155 | } 156 | } 157 | 158 | 159 | # Task to sunset variants with VCF FILTER != PASS | MULTIALLELIC 160 | task subset_fail { 161 | File vcf 162 | String prefix 163 | 164 | command <<< 165 | zcat ${vcf} \ 166 | | awk -v FS="\t" -v OFS="\t" \ 167 | '{ if ($1~"#" || ($7!="PASS" && $7!="MULTIALLELIC") ) print $0 }' \ 168 | | vcf-sort \ 169 | | bgzip -c \ 170 | > ${prefix}.failing_variants.vcf.gz 171 | >>> 172 | 173 | runtime { 174 | docker: "talkowski/sv-pipeline@sha256:facb963613f57bf6c70072c9356241e3ffe47c5d0550beaf9b21f805315846b0" 175 | preemptible: 1 176 | disks: "local-disk 500 SSD" 177 | } 178 | 179 | output { 180 | File filtered_vcf = "${prefix}.failing_variants.vcf.gz" 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_07/06_annotate.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/Talkowski-SV:06_annotate_per_chrom/versions/18/plain-WDL/descriptor" as annotate_by_chrom 2 | 3 | # Copyright (c) 2018 Talkowski Lab 4 | 5 | # Contact Ryan Collins 6 | 7 | # Distributed under terms of the MIT License 8 | 9 | 10 | # Workflow to parallelize VCF annotation by chromosome 11 | workflow parallelized_annotation { 12 | String vcf 13 | File vcf_idx 14 | String prefix 15 | File contiglist 16 | File protein_coding_gtf 17 | # File antisense_gtf 18 | File lincRNA_gtf 19 | # File processed_transcript_gtf 20 | # File pseudogene_gtf 21 | File promoter_bed 22 | File noncoding_bed 23 | File svc_acct_key 24 | 25 | Array[Array[String]] contigs = read_tsv(contiglist) 26 | 27 | #Annotate, scattered by chromosome 28 | scatter (contig in contigs) { 29 | 30 | #Remote tabix each chromosome 31 | call subset_vcf { 32 | input: 33 | vcf=vcf, 34 | vcf_idx=vcf_idx, 35 | contig=contig[0], 36 | prefix="${prefix}.${contig[0]}", 37 | svc_acct_key=svc_acct_key 38 | } 39 | 40 | #Annotate per chromosome 41 | call annotate_by_chrom.annotate as annotate { 42 | input: 43 | vcf=subset_vcf.subsetted_vcf, 44 | prefix="${prefix}.${contig[0]}", 45 | protein_coding_gtf=protein_coding_gtf, 46 | lincRNA_gtf=lincRNA_gtf, 47 | promoter_bed=promoter_bed, 48 | noncoding_bed=noncoding_bed 49 | } 50 | } 51 | 52 | #Merge integrated vcfs across chromosomes 53 | call concat_vcfs { 54 | input: 55 | vcfs=annotate.annotated_vcf, 56 | prefix="${prefix}.annotated" 57 | } 58 | 59 | output { 60 | File annotated_vcf = concat_vcfs.concat_vcf 61 | File annotated_vcf_idx = concat_vcfs.concat_vcf_idx 62 | } 63 | } 64 | 65 | 66 | #Remote tabix a single chromosome per VCFs 67 | task subset_vcf { 68 | String vcf 69 | File vcf_idx 70 | String contig 71 | String prefix 72 | File svc_acct_key 73 | 74 | command <<< 75 | #Remote tabix to chromosome of interest 76 | url=$( gsutil signurl -d 24h ${svc_acct_key} ${vcf} | sed '1d' | cut -f 4 ); 77 | echo "$url"; 78 | svtk remote_tabix --header "$url" ${vcf_idx} "${contig}:0-300000000" \ 79 | | bgzip -c > "${prefix}.${contig}.vcf.gz" 80 | tabix -p vcf -f "${prefix}.${contig}.vcf.gz" 81 | >>> 82 | 83 | output { 84 | File subsetted_vcf = "${prefix}.${contig}.vcf.gz" 85 | File subsetted_vcf_idx = "${prefix}.${contig}.vcf.gz.tbi" 86 | } 87 | 88 | runtime { 89 | docker: "talkowski/sv-pipeline-remote-pysam@sha256:13da9601b97e08ce2abb1aca494551dc7c09920e46dcca11768cd6aff3db37e5" 90 | preemptible: 1 91 | maxRetries: 1 92 | disks: "local-disk 50 SSD" 93 | } 94 | } 95 | 96 | 97 | #Merge multiple vcfs 
98 | task concat_vcfs { 99 | Array[File] vcfs 100 | String prefix 101 | 102 | command <<< 103 | vcf-concat ${sep=' ' vcfs} | vcf-sort -c | bgzip -c > ${prefix}.vcf.gz 104 | tabix -f -p vcf ${prefix}.vcf.gz 105 | >>> 106 | 107 | output { 108 | File concat_vcf = "${prefix}.vcf.gz" 109 | File concat_vcf_idx = "${prefix}.vcf.gz.tbi" 110 | } 111 | 112 | runtime { 113 | docker: "talkowski/sv-pipeline@sha256:6727434a18800d0453a973ca2386325b6b75330b6d05dd014ddb4bcd91dba31b" 114 | preemptible: 1 115 | maxRetries: 1 116 | disks: "local-disk 1000 SSD" 117 | } 118 | } -------------------------------------------------------------------------------- /gnomad_sv_pipeline_wdls/module_07/06_annotate_per_chrom.wdl: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Talkowski Lab 2 | 3 | # Contact Ryan Collins 4 | 5 | # Distributed under terms of the MIT License 6 | 7 | 8 | # Workflow to annotate output of cleanVCF 9 | workflow annotate { 10 | File vcf 11 | String prefix 12 | File protein_coding_gtf 13 | # File antisense_gtf 14 | File lincRNA_gtf 15 | # File processed_transcript_gtf 16 | # File pseudogene_gtf 17 | File promoter_bed 18 | File noncoding_bed 19 | 20 | call annotate_coding as annotate_protein_coding { 21 | input: 22 | vcf=vcf, 23 | gtf=protein_coding_gtf, 24 | prefix=prefix, 25 | gene_set="protein_coding" 26 | } 27 | 28 | # call annotate_coding as annotate_antisense { 29 | # input: 30 | # vcf=vcf, 31 | # gtf=antisense_gtf, 32 | # prefix=prefix, 33 | # gene_set="antisense" 34 | # } 35 | 36 | call annotate_coding as annotate_lincRNA { 37 | input: 38 | vcf=vcf, 39 | gtf=lincRNA_gtf, 40 | prefix=prefix, 41 | gene_set="lincRNA" 42 | } 43 | 44 | # call annotate_coding as annotate_processed_transcript { 45 | # input: 46 | # vcf=vcf, 47 | # gtf=processed_transcript_gtf, 48 | # prefix=prefix, 49 | # gene_set="processed_transcript" 50 | # } 51 | 52 | # call annotate_coding as annotate_pseudogene { 53 | # input: 54 | # vcf=vcf, 55 | # gtf=pseudogene_gtf, 56 | # prefix=prefix, 57 | # gene_set="pseudogene" 58 | # } 59 | 60 | call annotate_noncoding as annotate_promoter { 61 | input: 62 | vcf=vcf, 63 | bed=promoter_bed, 64 | prefix=prefix, 65 | noncoding_set="promoter" 66 | } 67 | 68 | call annotate_noncoding as annotate_noncoding_elements { 69 | input: 70 | vcf=vcf, 71 | bed=noncoding_bed, 72 | prefix=prefix, 73 | noncoding_set="noncoding" 74 | } 75 | 76 | call merge_annotations { 77 | input: 78 | vcf=vcf, 79 | protein_coding_vcf=annotate_protein_coding.annotated_vcf, 80 | lincRNA_vcf=annotate_lincRNA.annotated_vcf, 81 | promoter_vcf=annotate_promoter.annotated_vcf, 82 | noncoding_vcf=annotate_noncoding_elements.annotated_vcf, 83 | prefix=prefix 84 | } 85 | 86 | output { 87 | File annotated_vcf = merge_annotations.annotated_vcf 88 | } 89 | } 90 | 91 | task annotate_coding { 92 | File vcf 93 | File gtf 94 | String prefix 95 | String gene_set 96 | 97 | command <<< 98 | set -euo pipefail 99 | # Note: as of BEDTools 2.28, there are issues with reading bgzip-compressed files 100 | # directly into intersect/coverage, so GTF needs to be decompressed first 101 | zcat ${gtf} > decompressed.gtf 102 | svtk annotate \ 103 | --gencode decompressed.gtf \ 104 | ${vcf} \ 105 | ${prefix}.${gene_set}.vcf 106 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 107 | new=$( cut -f1 ${prefix}.${gene_set}.vcf | fgrep -v "#" | wc -l ) 108 | if [ "$new" -ne "$orig" ]; then 109 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 110 | 
exit 1 111 | fi 112 | bgzip -f ${prefix}.${gene_set}.vcf 113 | >>> 114 | 115 | output { 116 | File annotated_vcf = "${prefix}.${gene_set}.vcf.gz" 117 | } 118 | 119 | runtime { 120 | preemptible: 1 121 | maxRetries: 1 122 | disks: "local-disk 50 SSD" 123 | memory: "4 GB" 124 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 125 | } 126 | } 127 | 128 | task annotate_noncoding { 129 | File vcf 130 | File bed 131 | String prefix 132 | String noncoding_set 133 | 134 | command <<< 135 | set -euo pipefail 136 | svtk annotate \ 137 | --noncoding ${bed} \ 138 | ${vcf} \ 139 | ${prefix}.${noncoding_set}.vcf 140 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 141 | new=$( cut -f1 ${prefix}.${noncoding_set}.vcf | fgrep -v "#" | wc -l ) 142 | if [ "$new" -ne "$orig" ]; then 143 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 144 | exit 1 145 | fi 146 | bgzip -f ${prefix}.${noncoding_set}.vcf 147 | >>> 148 | 149 | output { 150 | File annotated_vcf = "${prefix}.${noncoding_set}.vcf.gz" 151 | } 152 | 153 | runtime { 154 | preemptible: 1 155 | maxRetries: 1 156 | disks: "local-disk 50 SSD" 157 | memory: "4 GB" 158 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 159 | } 160 | } 161 | 162 | task merge_annotations { 163 | File vcf 164 | File protein_coding_vcf 165 | # File antisense_vcf 166 | File lincRNA_vcf 167 | # File processed_transcript_vcf 168 | # File pseudogene_vcf 169 | File promoter_vcf 170 | File noncoding_vcf 171 | String prefix 172 | 173 | command <<< 174 | set -euo pipefail 175 | /opt/sv-pipeline/05_annotation/scripts/merge_annotations.py \ 176 | ${vcf} \ 177 | ${protein_coding_vcf} \ 178 | ${lincRNA_vcf} \ 179 | ${promoter_vcf} \ 180 | ${noncoding_vcf} \ 181 | ${prefix}.annotated.vcf 182 | bgzip ${prefix}.annotated.vcf 183 | orig=$( zcat ${vcf} | cut -f1 | fgrep -v "#" | wc -l ) 184 | new=$( zcat ${prefix}.annotated.vcf.gz | cut -f1 | fgrep -v "#" | wc -l ) 185 | if [ "$new" -ne "$orig" ]; then 186 | echo "ANNOTATED VCF DOES NOT HAVE THE SAME NUMBER OF RECORDS AS INPUT VCF ($new vs $orig)" 187 | exit 1 188 | fi 189 | >>> 190 | 191 | output { 192 | File annotated_vcf = "${prefix}.annotated.vcf.gz" 193 | } 194 | 195 | runtime { 196 | preemptible: 1 197 | maxRetries: 1 198 | disks: "local-disk 250 SSD" 199 | memory: "8 GB" 200 | docker: "talkowski/sv-pipeline@sha256:e98cd2ffd787240a0fe4a075d35ffc3f6107310b881f646d5340de34910a7510" 201 | } 202 | } --------------------------------------------------------------------------------
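The module 05-07 workflows above all follow the same scatter/gather shape: subset the cohort VCF to a single contig (remotely via gsutil signurl plus svtk remote_tabix when the VCF lives in a bucket, or locally with tabix), run the per-contig task, then stitch the shards back together with vcf-concat, vcf-sort, bgzip, and tabix. A minimal local sketch of that loop is shown below, assuming a one-contig-per-line contig list and a placeholder process_one_contig.sh standing in for any of the per-contig tasks; the helper name is illustrative and not part of the pipeline.

#!/usr/bin/env bash
# Local sketch of the per-chromosome scatter/gather pattern used throughout
# modules 05-07 (illustration only; in production this is driven by Cromwell
# scatter blocks rather than a serial shell loop).
set -eu -o pipefail

vcf=$1         # block-gzipped, tabix-indexed cohort VCF
contiglist=$2  # one contig per line
prefix=$3

while read -r contig; do
  # Scatter: subset to a single contig (remote runs replace this with
  # gsutil signurl + svtk remote_tabix, as in the subset_vcf tasks above)
  tabix -h "$vcf" "$contig" | bgzip -c > "$prefix.$contig.vcf.gz"

  # Per-contig work goes here; process_one_contig.sh is a hypothetical
  # stand-in for the resolve/integrate/annotate steps shown above
  ./process_one_contig.sh "$prefix.$contig.vcf.gz" "$prefix.$contig.done.vcf.gz"
done < "$contiglist"

# Gather: concatenate, sort, compress, and index, as in the concat_vcfs tasks
vcf-concat "$prefix".*.done.vcf.gz | vcf-sort -c | bgzip -c > "$prefix.merged.vcf.gz"
tabix -p vcf -f "$prefix.merged.vcf.gz"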