├── .gitignore ├── images └── workflow.wdl.v04.low-01.png ├── scripts ├── generic.options.json ├── Pre_Merge_QC_per_sample.wdl ├── Pre_Merge_SV.inputs.json ├── Merge_SV.inputs.json ├── SV_Pipeline_Full.inputs.json ├── sort_same.py ├── Merge_SV.wdl ├── Pre_Merge_SV_per_sample.wdl ├── Post_Merge_SV.inputs.json ├── SV_Pipeline_Full.wdl ├── Pre_Merge_SV.wdl ├── Post_Merge_SV.wdl ├── jes.conf └── SV_Tasks.wdl ├── test ├── cnvnator │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── lumpy │ ├── cromwell_cmd.sh │ ├── test.wdl │ └── test.inputs.json ├── svtools │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── svtyper │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── extract-sv-reads │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── config │ └── test.wdl └── jes.conf ├── docker ├── samtools │ └── Dockerfile ├── bcftools │ └── Dockerfile ├── extract-sv-reads │ └── Dockerfile ├── cromwell_mysql │ ├── mysql.cnf.template │ ├── Dockerfile │ ├── run_pipeline.sh │ └── application.conf.template ├── lumpy │ └── Dockerfile ├── cnvnator │ └── Dockerfile ├── vcf_bed_utils │ └── Dockerfile ├── manta │ └── Dockerfile ├── svtyper │ └── Dockerfile ├── svtools │ └── Dockerfile ├── manta_samtools │ ├── Dockerfile │ └── doctor_manta.1.py └── smoove │ └── Dockerfile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *~ -------------------------------------------------------------------------------- /images/workflow.wdl.v04.low-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hall-lab/sv-pipeline/HEAD/images/workflow.wdl.v04.low-01.png -------------------------------------------------------------------------------- /scripts/generic.options.json: -------------------------------------------------------------------------------- 1 | { 2 | "read_from_cache": true, 3 | "default_runtime_attributes": { 4 | "zones": "us-central1-a us-central1-b us-central1-c us-central1-f" 5 | } 6 | } 7 | 8 | -------------------------------------------------------------------------------- /test/cnvnator/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/lumpy/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/svtools/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | 
test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/svtyper/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/extract-sv-reads/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/config/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Simple_Workflow { 4 | File input_cram 5 | 6 | Int disk_size 7 | Int preemptible_tries 8 | 9 | call SV.Get_Sample_Name { 10 | input: 11 | input_cram = input_cram, 12 | disk_size = disk_size, 13 | preemptible_tries = preemptible_tries 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /test/extract-sv-reads/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Extract_Reads.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 3 | "Test_Extract_Reads.basename": "H_IJ-NA12878-NA12878_K10", 4 | "Test_Extract_Reads.ref_cache": "gs://human-b38/cache.tar.gz", 5 | "Test_Extract_Reads.preemptible_tries": 3, 6 | "Test_Extract_Reads.disk_size": 50 7 | } 8 | -------------------------------------------------------------------------------- /test/extract-sv-reads/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Extract_Reads { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | 8 | # reference inputs 9 | File ref_cache 10 | 11 | # system inputs 12 | Int disk_size 13 | Int preemptible_tries 14 | 15 | call SV.Extract_Reads { 16 | input: 17 | input_cram = input_cram, 18 | basename = basename, 19 | ref_cache = ref_cache, 20 | disk_size = disk_size, 21 | preemptible_tries = preemptible_tries 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docker/samtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | 4 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 5 | 6 | ENV PATH=/opt/hall-lab/samtools-1.9/bin:$PATH 7 | 8 | RUN apt-get update -qq \ 9 | && apt-get install -y --no-install-recommends \ 10 | libssl1.1 \ 11 | libcurl3 \ 12 | libncurses5 \ 13 | libbz2-1.0 \ 14 | liblzma5 \ 15 | libssl1.0.2 \ 16 | zlib1g 17 | 18 | CMD ["/bin/bash"] 19 | 
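The runtime images in this repository copy prebuilt binaries out of halllab builder images, so they can be rebuilt and sanity-checked locally with nothing more than Docker. A minimal, hedged sketch for the samtools image above (the local tag is arbitrary, and it assumes the halllab/samtools-1.9-build:v1 builder image referenced in the COPY --from line is pullable from Docker Hub):
# From the repository root: build the image, then confirm samtools is on PATH inside it.
docker build -t samtools-1.9-local docker/samtools
docker run --rm samtools-1.9-local samtools --version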
-------------------------------------------------------------------------------- /test/svtyper/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Genotype.basename": "H_IJ-NA12878-NA12878_K10", 3 | "Test_Genotype.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 4 | "Test_Genotype.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 5 | "Test_Genotype.input_vcf": "gs://mgi-wdl-test/data/call-L_Merge_VCF_Variants/pilot-01.lmerge.vcf.gz", 6 | "Test_Genotype.ref_cache": "gs://human-b38/cache.tar.gz", 7 | "Test_Genotype.preemptible_tries": 3, 8 | "Test_Genotype.disk_size": 50 9 | } 10 | -------------------------------------------------------------------------------- /test/svtyper/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Genotype { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | File input_cram_index 8 | File input_vcf 9 | 10 | # reference inputs 11 | File ref_cache 12 | 13 | # system inputs 14 | Int disk_size 15 | Int preemptible_tries 16 | 17 | call SV.Genotype as Genotype_Merged { 18 | input: 19 | basename = basename, 20 | input_cram = input_cram, 21 | input_cram_index = input_cram_index, 22 | input_vcf = input_vcf, 23 | ref_cache = ref_cache, 24 | disk_size = disk_size, 25 | preemptible_tries = preemptible_tries 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /docker/bcftools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | 4 | COPY --from=halllab/bcftools-1.9-build:v1 /build/deb-build/opt/hall-lab/bcftools-1.9 /opt/hall-lab/bcftools-1.9 5 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 6 | 7 | ENV PATH=/opt/hall-lab/bcftools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:$PATH 8 | 9 | RUN apt-get update -qq \ 10 | && apt-get install -y --no-install-recommends \ 11 | libssl1.1 \ 12 | libcurl3 \ 13 | libbz2-1.0 \ 14 | liblzma5 \ 15 | libssl1.0.2 \ 16 | zlib1g 17 | 18 | CMD ["/bin/bash"] 19 | -------------------------------------------------------------------------------- /docker/extract-sv-reads/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | LABEL maintainer "Dave Larson " 3 | 4 | # Build dependencies 5 | RUN export EXTRACT_SV_READS_VERSION=1.1.0 \ 6 | && apt-get update -qq \ 7 | && apt-get -y install apt-transport-https \ 8 | && echo "deb [trusted=yes] https://gitlab.com/hall-lab/ccdg-apt-repo/raw/master ccdg main" | tee -a /etc/apt/sources.list \ 9 | && runDeps=' \ 10 | libcurl3 \ 11 | ca-certificates \ 12 | zlib1g \ 13 | libncurses5 \ 14 | ccdg-samtools-1.3.1 \ 15 | extract-sv-reads1.1 \ 16 | ' \ 17 | && apt-get update -qq \ 18 | && apt-get -y install \ 19 | --no-install-recommends \ 20 | $runDeps \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | ENV PATH=/opt/ccdg/samtools-1.3.1/bin:${PATH} 24 | 25 | CMD ["/bin/bash"] 26 | -------------------------------------------------------------------------------- /test/svtools/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_SVTools.cohort_name": "test-01", 3 | "Test_SVTools.final_vcf_name": 
"test-01.vcf.gz", 4 | "Test_SVTools.input_pre_merged_vcfs": [ 5 | "gs://mgi-wdl-test/data/call-SV_Genotype_Unmerged/shard-0/H_IJ-NA12878-NA12878_K10.gt.vcf", 6 | "gs://mgi-wdl-test/data/call-SV_Genotype_Unmerged/shard-1/H_IJ-NA12891-NA12891_D2.gt.vcf" 7 | ], 8 | "Test_SVTools.input_post_merged_vcfs": [ 9 | "gs://mgi-wdl-test/data/call-SV_Copy_Number/shard-0/H_IJ-NA12878-NA12878_K10.cn.vcf", 10 | "gs://mgi-wdl-test/data/call-SV_Copy_Number/shard-1/H_IJ-NA12891-NA12891_D2.cn.vcf" 11 | ], 12 | "Test_SVTools.pedigree_file": "gs://mgi-wdl-test/data/call-Make_Pedigree_File/pilot-01.ped", 13 | "Test_SVTools.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 14 | "Test_SVTools.disk_size": 50, 15 | "Test_SVTools.preemptible_tries": 3 16 | } 17 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/mysql.cnf.template: -------------------------------------------------------------------------------- 1 | [client] 2 | port = 3306 3 | socket = /tmp/mysqld.sock 4 | 5 | [mysqld_safe] 6 | socket = /tmp/mysqld.sock 7 | nice = 0 8 | 9 | [mysqld] 10 | user = mysql 11 | pid-file = %%SHARED_FS_DIRECTORY%%/db/run/mysqld/mysqld.pid 12 | socket = /tmp/mysqld.sock 13 | port = 3306 14 | basedir = /usr 15 | datadir = %%SHARED_FS_DIRECTORY%%/db/lib/mysql 16 | tmpdir = /tmp 17 | skip-external-locking 18 | bind-address = 127.0.0.1 19 | key_buffer = 16M 20 | max_allowed_packet = 16M 21 | thread_stack = 192K 22 | thread_cache_size = 8 23 | myisam-recover = BACKUP 24 | query_cache_limit = 1M 25 | query_cache_size = 16M 26 | log_error = %%SHARED_FS_DIRECTORY%%/db/log/mysql/error.log 27 | expire_logs_days = 10 28 | max_binlog_size = 100M 29 | 30 | [mysqldump] 31 | quick 32 | quote-names 33 | max_allowed_packet = 16M 34 | 35 | [mysql] 36 | 37 | [isamchk] 38 | key_buffer = 16M 39 | 40 | !includedir /etc/mysql/conf.d/ 41 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_QC_per_sample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Pre_Merge_QC_Per_Sample { 5 | input { 6 | # data inputs 7 | File manta_vcf 8 | File lumpy_vcf 9 | File cnvnator_vcf 10 | String cohort 11 | String center 12 | 13 | # system inputs 14 | Int preemptible_tries 15 | String basename = sub(sub(lumpy_vcf, "^.*/", ""), ".vcf.gz" + "$", "") 16 | } 17 | 18 | call SV.Count_Lumpy { 19 | input: 20 | cohort = cohort, 21 | center = center, 22 | basename = basename, 23 | input_vcf = lumpy_vcf, 24 | preemptible_tries = preemptible_tries 25 | } 26 | 27 | call SV.Count_Manta { 28 | input: 29 | cohort = cohort, 30 | center = center, 31 | basename = basename, 32 | input_vcf = manta_vcf, 33 | preemptible_tries = preemptible_tries 34 | } 35 | 36 | output { 37 | File lumpy_counts = Count_Lumpy.output_counts 38 | File manta_counts = Count_Manta.output_counts 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test/lumpy/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Lumpy { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | File input_cram_index 8 | File input_splitters_bam 9 | File input_splitters_bam_index 10 | File input_discordants_bam 11 | File input_discordants_bam_index 12 | 13 | # reference inputs 14 | File ref_cache 15 | 
File exclude_regions 16 | 17 | # system inputs 18 | Int disk_size 19 | Int preemptible_tries 20 | 21 | call SV.Lumpy { 22 | input: 23 | basename = basename, 24 | input_cram = input_cram, 25 | input_cram_index = input_cram_index, 26 | input_splitters_bam = input_splitters_bam, 27 | input_splitters_bam_index = input_splitters_bam_index, 28 | input_discordants_bam = input_discordants_bam, 29 | input_discordants_bam_index = input_discordants_bam_index, 30 | ref_cache = ref_cache, 31 | exclude_regions = exclude_regions, 32 | disk_size = disk_size, 33 | preemptible_tries = preemptible_tries 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Pre_Merge_SV.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "Pre_Merge_SV.aligned_cram_suffix": ".cram", 7 | "Pre_Merge_SV.cohort": "Cohort", 8 | "Pre_Merge_SV.center": "WashU", 9 | 10 | "Pre_Merge_SV.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 11 | "Pre_Merge_SV.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 12 | "Pre_Merge_SV.ref_cache": "gs://human-b38/cache.tar.gz", 13 | "Pre_Merge_SV.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 14 | "Pre_Merge_SV.call_regions_bed": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz", 15 | "Pre_Merge_SV.call_regions_bed_index": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz.tbi", 16 | "Pre_Merge_SV.preemptible_tries": 3 17 | } 18 | -------------------------------------------------------------------------------- /scripts/Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Merge_SV.smoove_input_vcfs": [ 3 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-Smoove/attempt-2/NA12878.final-smoove.genotyped.vcf.gz", 4 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Smoove/NA19238.final-smoove.genotyped.vcf.gz" 5 | ], 6 | "Merge_SV.manta_input_vcfs": [ 7 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/e1715d86-4359-44ed-8115-8cd47a008311/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/cadf1cd8-bcd8-43b7-8038-9eccf0e2f2ca/call-Manta/attempt-2/NA12878.final.doctored.vcf.gz", 8 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Manta/attempt-2/NA19238.final.doctored.vcf.gz" 9 | ], 10 | "Merge_SV.cohort_name": "Cohort", 11 | 12 | "Merge_SV.preemptible_tries": 3 13 | } 14 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM broadinstitute/cromwell:29 2 | LABEL maintainer "Dave Larson " 3 | 4 | # Build dependencies 5 | RUN apt-get update -qq \ 6 
| && runDeps=' \ 7 | libnss-sss \ 8 | mysql-server \ 9 | ' \ 10 | && apt-get update -qq \ 11 | && DEBIAN_FRONTEND=noninteractive apt-get -y install \ 12 | --no-install-recommends \ 13 | $runDeps \ 14 | && mkdir /var/run/mysqld \ 15 | && chmod -R 777 /var/lib/mysql /var/run/mysqld /var/log/mysql && rm -fr /var/lib/mysql/mysql /var/lib/mysql/performance_schema \ 16 | && mkdir -p /opt/ccdg/cromwell/resources \ 17 | && ln -sf /usr/share/zoneinfo/America/Chicago /etc/localtime && echo "America/Chicago" > /etc/timezone && dpkg-reconfigure --frontend noninteractive tzdata \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | ADD application.conf.template /opt/ccdg/cromwell/resources 21 | ADD mysql.cnf.template /opt/ccdg/cromwell/resources 22 | ADD run_pipeline.sh /opt/ccdg/cromwell/resources 23 | 24 | # Reset entrypoint so container doesn't try to run cromwell directly 25 | ENTRYPOINT ["/opt/ccdg/cromwell/resources/run_pipeline.sh"] 26 | -------------------------------------------------------------------------------- /test/lumpy/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Lumpy.basename": "H_IJ-NA12878-NA12878_K10", 3 | "Test_Lumpy.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 4 | "Test_Lumpy.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 5 | "Test_Lumpy.input_discordants_bam": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.discordants.bam", 6 | "Test_Lumpy.input_discordants_bam_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.discordants.bam.bai", 7 | "Test_Lumpy.input_splitters_bam": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.splitters.bam", 8 | "Test_Lumpy.input_splitters_bam_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.splitters.bam.bai", 9 | "Test_Lumpy.disk_size": 50, 10 | "Test_Lumpy.ref_cache": "gs://human-b38/cache.tar.gz", 11 | "Test_Lumpy.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 12 | "Test_Lumpy.preemptible_tries": 3 13 | } 14 | -------------------------------------------------------------------------------- /docker/lumpy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | LABEL maintainer "Colby Chiang " 3 | 4 | # Build dependencies 5 | RUN apt-get update -qq \ 6 | && apt-get -y install \ 7 | apt-transport-https \ 8 | g++ \ 9 | gawk \ 10 | libcurl4-gnutls-dev \ 11 | autoconf \ 12 | libssl-dev \ 13 | git \ 14 | && echo "deb [trusted=yes] https://gitlab.com/hall-lab/ccdg-apt-repo/raw/master ccdg main" | tee -a /etc/apt/sources.list \ 15 | && runDeps=' \ 16 | ccdg-python-2.7.12 \ 17 | ccdg-samtools-1.3.1 \ 18 | ' \ 19 | && apt-get update -qq \ 20 | && apt-get -y install \ 21 | --no-install-recommends \ 22 | $runDeps \ 23 | && /opt/ccdg/python-2.7.12/bin/pip install --upgrade pip numpy scipy pysam \ 24 | && rm -rf /var/lib/apt/lists/* 25 | 26 | ENV PATH /opt/ccdg/samtools-1.3.1/bin:${PATH} 27 | ENV PATH /opt/ccdg/python-2.7.12/bin:${PATH} 28 | 29 | # Install LUMPY 30 | RUN cd /opt \ 31 | && git clone https://github.com/hall-lab/lumpy-sv.git \ 32 | && cd /opt/lumpy-sv \ 33 | && git checkout 0.2.13_cram_support \ 34 | && git submodule sync \ 35 | && git submodule update --init \ 36 | && cd /opt/lumpy-sv 
\ 37 | && make 38 | 39 | ENV PATH /opt/lumpy-sv/bin:${PATH} 40 | ENV SHELL /bin/bash 41 | 42 | CMD ["/bin/bash"] 43 | -------------------------------------------------------------------------------- /scripts/SV_Pipeline_Full.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "SV_Pipeline_Full.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "SV_Pipeline_Full.aligned_cram_suffix": ".cram", 7 | "SV_Pipeline_Full.cohort": "Cohort", 8 | "SV_Pipeline_Full.center": "WashU", 9 | 10 | "SV_Pipeline_Full.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 11 | "SV_Pipeline_Full.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 12 | "SV_Pipeline_Full.ref_cache": "gs://human-b38/cache.tar.gz", 13 | "SV_Pipeline_Full.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 14 | "SV_Pipeline_Full.call_regions_bed": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz", 15 | "SV_Pipeline_Full.call_regions_bed_index": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz.tbi", 16 | "SV_Pipeline_Full.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 17 | "SV_Pipeline_Full.preemptible_tries": 3, 18 | "SV_Pipeline_Full.final_vcf_name": "merged_genotyped.vcf.gz", 19 | } 20 | -------------------------------------------------------------------------------- /scripts/sort_same.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parse_name(line): 4 | list1 = re.split("/", line) 5 | list2 = re.split("\.", list1[len(list1)-1]) 6 | return list2[0] 7 | 8 | cram_list = "cram_list.txt" 9 | cn_list = "cn_hist_roots.txt" 10 | index_list = "indices.txt" 11 | original_manta_list = "original_manta_vcfs.txt" 12 | 13 | crams = open(cram_list, "r") 14 | cram_list = [] 15 | for line in crams: 16 | cram_list.append(line.rstrip()) 17 | 18 | cns = open(cn_list, "r") 19 | cns_dict = {} 20 | for line in cns: 21 | sample_name = parse_name(line) 22 | cns_dict[sample_name] = line.rstrip() 23 | 24 | indices = open(index_list, "r") 25 | index_dict = {} 26 | for line in indices: 27 | sample_name = parse_name(line) 28 | index_dict[sample_name] = line.rstrip() 29 | 30 | manta = open(original_manta_list, "r") 31 | manta_dict = {} 32 | for line in manta: 33 | sample_name = parse_name(line) 34 | manta_dict[sample_name] = line.rstrip() 35 | 36 | cns_out = open("cn_hist_roots_ordered.txt", "w") 37 | index_out = open("indices_ordered.txt", "w") 38 | manta_out = open("manta_ordered.txt", "w") 39 | for cram in cram_list: 40 | sample_name = parse_name(cram) 41 | cns_out.write(cns_dict[sample_name]) 42 | cns_out.write("\n") 43 | index_out.write(index_dict[sample_name]) 44 | index_out.write("\n") 45 | manta_out.write(manta_dict[sample_name]) 46 | manta_out.write("\n") 47 | 48 | cns_out.close() 49 | index_out.close() 50 | -------------------------------------------------------------------------------- /test/cnvnator/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Copy_Number.input_cram": 
"gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 3 | "Test_Copy_Number.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 4 | "Test_Copy_Number.basename": "H_IJ-NA12878-NA12878_K10", 5 | "Test_Copy_Number.sample": "H_IJ-NA12878-NA12878_K10", 6 | "Test_Copy_Number.input_vcf": "gs://mgi-wdl-test/data/call-SV_Genotype_Merged/shard-0/H_IJ-NA12878-NA12878_K10.gt.vcf", 7 | "Test_Copy_Number.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 8 | "Test_Copy_Number.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 9 | "Test_Copy_Number.ref_cache": "gs://human-b38/cache.tar.gz", 10 | "Test_Copy_Number.disk_size": 50, 11 | "Test_Copy_Number.preemptible_tries": 3, 12 | 13 | "Test_Copy_Number.cohort_name": "test-01", 14 | "Test_Copy_Number.aligned_crams": [ 15 | "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 16 | "gs://mgi-wdl-test/data/call-Extract_Reads/shard-1/attempt-4/H_IJ-NA12891-NA12891_D2.cram" 17 | ], 18 | "Test_Copy_Number.cn_hist_roots": [ 19 | "gs://mgi-wdl-test/data/call-CNVnator_Histogram/shard-0/attempt-2/cnvnator.out/H_IJ-NA12878-NA12878_K10.cram.hist.root", 20 | "gs://mgi-wdl-test/data/call-CNVnator_Histogram/shard-1/cnvnator.out/H_IJ-NA12891-NA12891_D2.cram.hist.root" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /docker/cnvnator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 4 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 5 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 6 | COPY --from=halllab/cnvnator-0.3.3-build:v1 /opt/hall-lab/cnvnator-0.3.3/deb-build/opt/hall-lab/cnvnator-0.3.3 /opt/hall-lab/cnvnator-0.3.3 7 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/cnvnator-0.3.3/bin:$PATH 8 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 9 | 10 | # Build dependencies 11 | RUN apt-get update -qq \ 12 | && apt-get -y install \ 13 | libssl1.1 \ 14 | libcurl3 \ 15 | libncurses5 \ 16 | libbz2-1.0 \ 17 | liblzma5 \ 18 | libssl1.0.2 \ 19 | zlib1g \ 20 | libgomp1 \ 21 | libstdc++6 \ 22 | libstdc++-6-dev \ 23 | libgcc1 \ 24 | g++ \ 25 | libxpm4 \ 26 | git-core \ 27 | && git clone https://github.com/hall-lab/speedseq.git \ 28 | && cd speedseq \ 29 | && git checkout 4e60002 \ 30 | && cp bin/cnvnator_wrapper.py /opt/hall-lab/cnvnator-0.3.3/bin/ \ 31 | && cd .. 
\ 32 | && rm -rf speedseq \ 33 | && apt-get purge -y git-core \ 34 | && apt autoremove -y 35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /test/jes.conf: -------------------------------------------------------------------------------- 1 | # Minimal Cromwell template for using JES 2 | 3 | webservice { 4 | port = 8000 5 | interface = 0.0.0.0 6 | instance.name = "cromwell-for-wdl-runner" 7 | } 8 | 9 | akka { 10 | loggers = ["akka.event.slf4j.Slf4jLogger"] 11 | } 12 | 13 | spray.can { 14 | server { 15 | request-timeout = 40s 16 | } 17 | client { 18 | request-timeout = 40s 19 | connecting-timeout = 40s 20 | } 21 | } 22 | 23 | backend { 24 | default = "JES" 25 | providers { 26 | JES { 27 | actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" 28 | config { 29 | project = "washu-genome-inh-dis-analysis" 30 | root = "gs://mgi-wdl-test/workspace" 31 | 32 | genomics { 33 | # A reference to an auth defined in the 'google' stanza at the top. This auth is used to create 34 | # Pipelines and manipulate auth JSONs. 35 | auth = "application-default" 36 | endpoint-url = "https://genomics.googleapis.com/" 37 | } 38 | 39 | filesystems = { 40 | gcs { 41 | # A reference to a potentially different auth for manipulating files via engine functions. 42 | auth = "application-default" 43 | } 44 | } 45 | 46 | } 47 | } 48 | } 49 | } 50 | 51 | google { 52 | applicationName = "cromwell" 53 | cromwellAuthenticationScheme = "application_default" 54 | } 55 | 56 | database { 57 | driver = "slick.driver.HsqldbDriver$" 58 | 59 | db { 60 | driver = "org.hsqldb.jdbcDriver" 61 | url = "jdbc:hsqldb:mem:${slick.uniqueSchema};shutdown=false;hsqldb.tx=mvcc" 62 | connectionTimeout = 1000 63 | } 64 | } 65 | 66 | instrumentation { 67 | use-kamon = false 68 | } 69 | 70 | call-caching { 71 | enabled = true 72 | invalidate-bad-cache-results = true 73 | } 74 | -------------------------------------------------------------------------------- /docker/vcf_bed_utils/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Allison Regier " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | FROM builder-base as vawk-build 23 | LABEL maintainer "Allison Regier " 24 | 25 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 26 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 27 | RUN git clone https://github.com/cc2qe/vawk 28 | RUN git clone https://github.com/hall-lab/io 29 | RUN curl -kL https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools > bedtools 30 | RUN chmod a+x bedtools 31 | 32 | FROM debian:stretch-slim 33 | LABEL maintainer "Allison Regier " 34 | 35 | COPY --from=vawk-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 36 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 37 | COPY --from=vawk-build vawk /opt/hall-lab/vawk 38 | COPY --from=vawk-build io /opt/hall-lab/io 39 | COPY --from=vawk-build bedtools /opt/hall-lab/bin/bedtools 40 | 41 | RUN apt-get update -qq \ 42 | && apt-get install -y --no-install-recommends \ 
43 | libssl1.1 \ 44 | libcurl3 \ 45 | libncurses5 \ 46 | libbz2-1.0 \ 47 | liblzma5 \ 48 | libssl1.0.2 \ 49 | zlib1g \ 50 | less \ 51 | gawk 52 | 53 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 54 | 55 | CMD ["/bin/bash"] 56 | -------------------------------------------------------------------------------- /test/svtools/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_SVTools { 4 | # data inputs 5 | Array[File] input_pre_merged_vcfs 6 | Array[File] input_post_merged_vcfs 7 | File pedigree_file 8 | String cohort_name 9 | String final_vcf_name 10 | 11 | # reference inputs 12 | File mei_annotation_bed 13 | 14 | # system inputs 15 | Int disk_size 16 | Int preemptible_tries 17 | 18 | call SV.L_Sort_VCF_Variants { 19 | input: 20 | input_vcfs = input_pre_merged_vcfs, 21 | output_vcf_basename = cohort_name + ".lsort", 22 | disk_size = disk_size, 23 | preemptible_tries = preemptible_tries 24 | } 25 | 26 | call SV.L_Merge_VCF_Variants { 27 | input: 28 | input_vcf_gz = L_Sort_VCF_Variants.output_vcf_gz, 29 | output_vcf_basename = cohort_name + ".lmerge", 30 | disk_size = disk_size, 31 | preemptible_tries = preemptible_tries 32 | } 33 | 34 | call SV.Paste_VCF { 35 | input: 36 | input_vcfs = input_post_merged_vcfs, 37 | output_vcf_basename = cohort_name + ".merged.gt.cn", 38 | disk_size = disk_size, 39 | preemptible_tries = preemptible_tries 40 | } 41 | 42 | call SV.Prune_VCF { 43 | input: 44 | input_vcf_gz = Paste_VCF.output_vcf_gz, 45 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned", 46 | disk_size = disk_size, 47 | preemptible_tries = preemptible_tries 48 | } 49 | 50 | call SV.Classify { 51 | input: 52 | input_vcf_gz = Prune_VCF.output_vcf_gz, 53 | input_ped = pedigree_file, 54 | mei_annotation_bed = mei_annotation_bed, 55 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class", 56 | disk_size = disk_size, 57 | preemptible_tries = preemptible_tries 58 | } 59 | 60 | call SV.Sort_Index_VCF { 61 | input: 62 | input_vcf_gz = Classify.output_vcf_gz, 63 | output_vcf_name = final_vcf_name, 64 | disk_size = disk_size, 65 | preemptible_tries = preemptible_tries 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /docker/manta/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim AS manta-build 2 | LABEL maintainer "Dave Larson " 3 | ARG MANTA_VERSION=1.4.0 4 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 5 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 6 | RUN apt-get update -qq \ 7 | && apt-get -y install \ 8 | --no-install-recommends \ 9 | build-essential \ 10 | bzip2 \ 11 | zlib1g-dev \ 12 | curl \ 13 | ca-certificates 14 | RUN curl -O -L https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.release_src.tar.bz2 \ 15 | && tar -xjf manta-${MANTA_VERSION}.release_src.tar.bz2 \ 16 | && mkdir build \ 17 | && cd build \ 18 | && ../manta-${MANTA_VERSION}.release_src/configure --prefix=/opt/hall-lab/manta-${MANTA_VERSION} \ 19 | && make -j 4 install 20 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 21 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 22 | 23 | FROM 
debian:stretch-slim 24 | LABEL maintainer "Dave Larson " 25 | ARG MANTA_VERSION=1.4.0 26 | 27 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/bin /opt/hall-lab/manta-${MANTA_VERSION}/bin 28 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/lib /opt/hall-lab/manta-${MANTA_VERSION}/lib 29 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/libexec /opt/hall-lab/manta-${MANTA_VERSION}/libexec 30 | COPY --from=manta-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 31 | 32 | # Run dependencies 33 | RUN apt-get update -qq \ 34 | && apt-get -y install \ 35 | --no-install-recommends \ 36 | libssl1.1 \ 37 | libcurl3 \ 38 | libbz2-1.0 \ 39 | liblzma5 \ 40 | libssl1.0.2 \ 41 | zlib1g 42 | 43 | ENV PATH=/opt/hall-lab/manta-${MANTA_VERSION}/bin:/opt/hall-lab/python-2.7.15/bin/:$PATH 44 | 45 | CMD ["/bin/bash"] 46 | -------------------------------------------------------------------------------- /test/cnvnator/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Copy_Number { 4 | # data inputs 5 | String basename 6 | String sample 7 | File input_cram 8 | File input_cram_index 9 | File input_vcf 10 | 11 | # reference inputs 12 | File ref_fasta 13 | File ref_fasta_index 14 | File ref_cache 15 | 16 | # system inputs 17 | Int disk_size 18 | Int preemptible_tries 19 | 20 | # ----------------------------------- 21 | # test CNVnator 22 | call SV.CNVnator_Histogram { 23 | input: 24 | basename = basename, 25 | input_cram = input_cram, 26 | input_cram_index = input_cram_index, 27 | ref_fasta = ref_fasta, 28 | ref_fasta_index = ref_fasta_index, 29 | ref_cache = ref_cache, 30 | disk_size = disk_size, 31 | preemptible_tries = preemptible_tries 32 | } 33 | 34 | call SV.Copy_Number { 35 | input: 36 | basename = basename, 37 | sample = sample, 38 | input_vcf = input_vcf, 39 | input_cn_hist_root = CNVnator_Histogram.output_cn_hist_root, 40 | ref_cache = ref_cache, 41 | disk_size = disk_size, 42 | preemptible_tries = preemptible_tries 43 | } 44 | 45 | # ------------------------------------ 46 | # generate .ped file 47 | 48 | Array[File] aligned_crams 49 | Array[File] cn_hist_roots 50 | String cohort_name 51 | 52 | scatter (i in range(length(aligned_crams))) { 53 | File aligned_cram = aligned_crams[i] 54 | File cn_hist_root = cn_hist_roots[i] 55 | 56 | call SV.Get_Sample_Name { 57 | input: 58 | input_cram = aligned_cram, 59 | disk_size = disk_size, 60 | preemptible_tries = preemptible_tries 61 | } 62 | 63 | call SV.Get_Sex { 64 | input: 65 | input_cn_hist_root = cn_hist_root, 66 | ref_fasta_index = ref_fasta_index, 67 | disk_size = disk_size, 68 | preemptible_tries = preemptible_tries 69 | } 70 | } 71 | 72 | call SV.Make_Pedigree_File { 73 | input: 74 | sample_array = Get_Sample_Name.sample, 75 | sex_array = Get_Sex.sex, 76 | output_ped_basename = cohort_name, 77 | disk_size = 1 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /docker/svtyper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | 
ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | FROM builder-base as svtyper-0.7.1-build 23 | LABEL maintainer "Dave Larson " 24 | 25 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 26 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 27 | RUN SVTYPER_VERSION=0.7.1 \ 28 | && git clone https://github.com/hall-lab/svtyper \ 29 | && cd svtyper \ 30 | && git checkout v$SVTYPER_VERSION \ 31 | && sed -i '/numpy/d' setup.py \ 32 | && sed -i '/scipy/d' setup.py \ 33 | && pip install . 34 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 35 | #RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 36 | 37 | FROM debian:stretch-slim 38 | LABEL maintainer "Dave Larson " 39 | 40 | COPY --from=svtyper-0.7.1-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 41 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 42 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 43 | 44 | RUN apt-get update -qq \ 45 | && apt-get install -y --no-install-recommends \ 46 | libssl1.1 \ 47 | libcurl3 \ 48 | libncurses5 \ 49 | libbz2-1.0 \ 50 | liblzma5 \ 51 | libssl1.0.2 \ 52 | zlib1g 53 | 54 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:${PATH} 55 | 56 | CMD ["/bin/bash"] 57 | -------------------------------------------------------------------------------- /scripts/Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Merge_SV { 5 | input { 6 | # data inputs 7 | Array[File] manta_input_vcfs 8 | Array[File] smoove_input_vcfs 9 | String cohort_name 10 | 11 | # system inputs 12 | Int preemptible_tries 13 | } 14 | 15 | 16 | call SV.L_Sort_VCF_Variants as lsort_manta { 17 | input: 18 | input_vcfs = manta_input_vcfs, 19 | output_vcf_basename = cohort_name + ".manta.lsort", 20 | preemptible_tries = preemptible_tries 21 | } 22 | 23 | call SV.Filter_Pass as filter_manta { 24 | input: 25 | input_vcf_gz = lsort_manta.output_vcf_gz, 26 | output_vcf_basename = cohort_name + ".manta.filter", 27 | preemptible_tries = preemptible_tries 28 | } 29 | 30 | call SV.L_Merge_VCF_Variants as lmerge_manta { 31 | input: 32 | input_vcf_gz = filter_manta.output_vcf_gz, 33 | output_vcf_basename = cohort_name + ".manta.lmerge", 34 | preemptible_tries = preemptible_tries 35 | } 36 | 37 | call SV.L_Sort_VCF_Variants as lsort_smoove { 38 | input: 39 | input_vcfs = smoove_input_vcfs, 40 | output_vcf_basename = cohort_name + ".smoove.lsort", 41 | preemptible_tries = preemptible_tries 42 | } 43 | 44 | call SV.Filter_Del as filter_smoove { 45 | input: 46 | input_vcf_gz = lsort_smoove.output_vcf_gz, 47 | output_vcf_basename = cohort_name + ".smoove.filter", 48 | preemptible_tries = preemptible_tries 49 | } 50 | 51 | call SV.L_Merge_VCF_Variants as lmerge_smoove { 52 | input: 53 | input_vcf_gz = filter_smoove.output_vcf_gz, 54 | output_vcf_basename = cohort_name + ".smoove.lmerge", 55 | preemptible_tries = preemptible_tries 56 | } 57 | 58 | call SV.L_Sort_VCF_Variants as lsort_manta_smoove { 59 | input: 60 | input_vcfs = [lmerge_manta.output_vcf_gz, lmerge_smoove.output_vcf_gz], 61 | output_vcf_basename = cohort_name + 
".manta_smoove.lsort", 62 | preemptible_tries = preemptible_tries 63 | } 64 | 65 | call SV.L_Merge_VCF_Variants_weighted as lmerge_manta_smoove { 66 | input: 67 | input_vcf_gz = lsort_manta_smoove.output_vcf_gz, 68 | output_vcf_basename = cohort_name + ".manta_smoove.lmerge", 69 | preemptible_tries = preemptible_tries 70 | } 71 | 72 | output { 73 | File output_vcf = lmerge_manta_smoove.output_vcf_gz 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV_per_sample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Pre_Merge_SV_Per_Sample { 5 | input { 6 | # data inputs 7 | File aligned_cram 8 | 9 | # reference inputs 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_cache 13 | File? call_regions_bed 14 | File? call_regions_bed_index 15 | File exclude_regions 16 | 17 | String aligned_cram_suffix 18 | 19 | # system inputs 20 | Int preemptible_tries 21 | 22 | String basename = sub(sub(aligned_cram, "^.*/", ""), aligned_cram_suffix + "$", "") 23 | } 24 | 25 | call SV.Index_Cram { 26 | input: 27 | basename = basename, 28 | input_cram = aligned_cram, 29 | ref_cache = ref_cache, 30 | preemptible_tries = preemptible_tries 31 | } 32 | 33 | call SV.Manta { 34 | input: 35 | basename = basename, 36 | input_cram = aligned_cram, 37 | input_cram_index = Index_Cram.output_cram_index, 38 | ref_fasta = ref_fasta, 39 | ref_fasta_index = ref_fasta_index, 40 | call_regions_bed = call_regions_bed, 41 | call_regions_bed_index = call_regions_bed_index, 42 | ref_cache = ref_cache, 43 | preemptible_tries = preemptible_tries 44 | } 45 | 46 | call SV.CNVnator_Histogram { 47 | input: 48 | basename = basename, 49 | input_cram = aligned_cram, 50 | input_cram_index = Index_Cram.output_cram_index, 51 | ref_fasta = ref_fasta, 52 | ref_fasta_index = ref_fasta_index, 53 | ref_cache = ref_cache, 54 | preemptible_tries = preemptible_tries 55 | } 56 | 57 | call SV.Smoove { 58 | input: 59 | basename = basename, 60 | input_cram = aligned_cram, 61 | input_cram_index = Index_Cram.output_cram_index, 62 | ref_fasta = ref_fasta, 63 | ref_fasta_index = ref_fasta_index, 64 | ref_cache = ref_cache, 65 | exclude_regions = exclude_regions, 66 | preemptible_tries = preemptible_tries 67 | } 68 | 69 | output { 70 | File cram_index = Index_Cram.output_cram_index 71 | File manta_vcf = Manta.output_vcf 72 | File manta_tbi = Manta.output_tbi 73 | File manta_original_vcf = Manta.original_vcf 74 | File manta_original_tbi = Manta.original_tbi 75 | File cnvnator_cn_hist_root = CNVnator_Histogram.output_cn_hist_root 76 | File cnvnator_output_cn_txt = CNVnator_Histogram.output_cn_txt 77 | File cnvnator_cn_bed = CNVnator_Histogram.output_cn_bed 78 | File smoove_vcf = Smoove.output_vcf 79 | File smoove_csi = Smoove.output_csi 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /scripts/Post_Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Post_Merge_SV.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "Post_Merge_SV.aligned_cram_indices": [ 7 | 
"gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram.crai", 8 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram.crai" 9 | ], 10 | "Post_Merge_SV.manta_vcfs": [ 11 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-Manta/NA12878.final.vcf.gz", 12 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Manta/attempt-2/NA19238.final.vcf.gz" 13 | ], 14 | "Post_Merge_SV.cn_hist_roots": [ 15 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-CNVnator_Histogram/cnvnator.out/NA12878.final.cram.hist.root", 16 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-CNVnator_Histogram/cnvnator.out/NA19238.final.cram.hist.root" 17 | ], 18 | "Post_Merge_SV.merged_vcf": "gs://mgi-wdl-test/2019-09-06/Merge_SV/39dfbd20-14df-427e-afe2-78296cf09798/call-lmerge_manta_smoove/Cohort.manta_smoove.lmerge.vcf.gz", 19 | "Post_Merge_SV.cohort_name": "Cohort", 20 | "Post_Merge_SV.aligned_cram_suffix": ".cram", 21 | "Post_Merge_SV.final_vcf_name": "merged_genotyped.vcf.gz", 22 | 23 | "Post_Merge_SV.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 24 | "Post_Merge_SV.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 25 | "Post_Merge_SV.ref_cache": "gs://human-b38/cache.tar.gz", 26 | "Post_Merge_SV.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 27 | 28 | "Post_Merge_SV.preemptible_tries": 3 29 | } 30 | -------------------------------------------------------------------------------- /docker/svtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | libblas-dev \ 20 | libatlas-base-dev \ 21 | liblapack-dev \ 22 | curl \ 23 | zlib1g-dev 24 | 25 | FROM builder-base as svtools-0.5.1-build 26 | LABEL maintainer "Dave Larson " 27 | 28 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 29 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 30 | RUN SVTOOLS_VERSION=0.5.1 \ 31 | && pip install svtools==${SVTOOLS_VERSION} 32 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 33 | #RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 34 | 35 | FROM debian:stretch-slim 36 | LABEL maintainer "Dave Larson " 37 | 38 | COPY --from=svtools-0.5.1-build 
/opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 39 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 40 | COPY --from=halllab/cnvnator-0.3.3-build:v1 /opt/hall-lab/cnvnator-0.3.3/deb-build/opt/hall-lab/cnvnator-0.3.3 /opt/hall-lab/cnvnator-0.3.3 41 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 42 | 43 | RUN apt-get update -qq \ 44 | && apt-get install -y --no-install-recommends \ 45 | libssl1.1 \ 46 | libcurl3 \ 47 | libncurses5 \ 48 | libbz2-1.0 \ 49 | liblzma5 \ 50 | libssl1.0.2 \ 51 | zlib1g \ 52 | libblas3 \ 53 | libatlas3-base \ 54 | liblapack3 \ 55 | libgomp1 \ 56 | libstdc++6 \ 57 | libstdc++-6-dev \ 58 | libgcc1 \ 59 | g++ \ 60 | libxpm4 \ 61 | gzip \ 62 | less 63 | 64 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/cnvnator-0.3.3/bin:${PATH} 65 | 66 | CMD ["/bin/bash"] 67 | 68 | -------------------------------------------------------------------------------- /scripts/SV_Pipeline_Full.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "Pre_Merge_SV.wdl" as premerge 3 | import "Merge_SV.wdl" as merge 4 | import "Post_Merge_SV.wdl" as postmerge 5 | 6 | workflow SV_Pipeline_Full { 7 | input { 8 | Array[File] aligned_crams 9 | String aligned_cram_suffix 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_cache 13 | File? call_regions_bed 14 | File? call_regions_bed_index 15 | File exclude_regions 16 | File mei_annotation_bed 17 | String cohort 18 | String center 19 | String final_vcf_name 20 | Int preemptible_tries 21 | } 22 | 23 | call premerge.Pre_Merge_SV { 24 | input: 25 | aligned_crams = aligned_crams, 26 | aligned_cram_suffix = aligned_cram_suffix, 27 | ref_fasta = ref_fasta, 28 | ref_fasta_index = ref_fasta_index, 29 | ref_cache = ref_cache, 30 | call_regions_bed = call_regions_bed, 31 | call_regions_bed_index = call_regions_bed_index, 32 | exclude_regions = exclude_regions, 33 | cohort = cohort, 34 | center = center, 35 | preemptible_tries = preemptible_tries 36 | } 37 | 38 | call merge.Merge_SV { 39 | input: 40 | manta_input_vcfs = Pre_Merge_SV.manta_vcfs, 41 | smoove_input_vcfs = Pre_Merge_SV.smoove_vcfs, 42 | cohort_name = cohort, 43 | preemptible_tries = preemptible_tries 44 | } 45 | 46 | call postmerge.Post_Merge_SV { 47 | input: 48 | aligned_crams = aligned_crams, 49 | aligned_cram_indices = Pre_Merge_SV.cram_indices, 50 | cn_hist_roots = Pre_Merge_SV.cnvnator_cn_hist_roots, 51 | manta_vcfs = Pre_Merge_SV.manta_original_vcfs, 52 | aligned_cram_suffix = aligned_cram_suffix, 53 | merged_vcf = Merge_SV.output_vcf, 54 | cohort_name = cohort, 55 | final_vcf_name = final_vcf_name, 56 | ref_fasta = ref_fasta, 57 | ref_fasta_index = ref_fasta_index, 58 | ref_cache = ref_cache, 59 | mei_annotation_bed = mei_annotation_bed, 60 | preemptible_tries = preemptible_tries 61 | } 62 | 63 | output { 64 | File output_ped = Post_Merge_SV.output_ped 65 | File output_vcf_bnd = Post_Merge_SV.output_vcf_bnd 66 | File output_vcf_index_bnd = Post_Merge_SV.output_vcf_index_bnd 67 | File output_vcf_del = Post_Merge_SV.output_vcf_del 68 | File output_vcf_ins = Post_Merge_SV.output_vcf_ins 69 | File output_vcf_index_other = Post_Merge_SV.output_vcf_index_other 70 | File output_vcf_other = Post_Merge_SV.output_vcf_other 71 | File output_vcf_index_del = Post_Merge_SV.output_vcf_index_del 72 | File output_vcf_index_ins = 
Post_Merge_SV.output_vcf_index_ins 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /docker/manta_samtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim AS manta-build 2 | LABEL maintainer "Dave Larson " 3 | ARG MANTA_VERSION=1.4.0 4 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 5 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 6 | RUN apt-get update -qq \ 7 | && apt-get -y install \ 8 | --no-install-recommends \ 9 | build-essential \ 10 | bzip2 \ 11 | zlib1g-dev \ 12 | curl \ 13 | ca-certificates \ 14 | tabix \ 15 | libbz2-dev \ 16 | liblzma-dev \ 17 | && pip install pandas \ 18 | && pip install scipy \ 19 | && pip install pysam \ 20 | && pip install svtools 21 | RUN curl -O -L https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.release_src.tar.bz2 \ 22 | && tar -xjf manta-${MANTA_VERSION}.release_src.tar.bz2 \ 23 | && mkdir build \ 24 | && cd build \ 25 | && ../manta-${MANTA_VERSION}.release_src/configure --prefix=/opt/hall-lab/manta-${MANTA_VERSION} \ 26 | && make -j 4 install 27 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 28 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 29 | 30 | FROM debian:stretch-slim 31 | LABEL maintainer "Dave Larson " 32 | LABEL description "Manta v1.4.0 with samtools v1.9 alongside" 33 | ARG MANTA_VERSION=1.4.0 34 | 35 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/bin /opt/hall-lab/manta-${MANTA_VERSION}/bin 36 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/lib /opt/hall-lab/manta-${MANTA_VERSION}/lib 37 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/libexec /opt/hall-lab/manta-${MANTA_VERSION}/libexec 38 | COPY --from=manta-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 39 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 40 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 41 | ADD doctor_manta.1.py /opt/hall-lab/ 42 | 43 | # Run dependencies 44 | RUN apt-get update -qq \ 45 | && apt-get -y install \ 46 | --no-install-recommends \ 47 | libssl1.1 \ 48 | libcurl3 \ 49 | libncurses5 \ 50 | libbz2-1.0 \ 51 | liblzma5 \ 52 | libssl1.0.2 \ 53 | zlib1g 54 | 55 | ENV PATH=/opt/hall-lab/manta-${MANTA_VERSION}/bin:/opt/hall-lab/python-2.7.15/bin/:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:$PATH 56 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 57 | 58 | CMD ["/bin/bash"] 59 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "Pre_Merge_SV_per_sample.wdl" as per_sample 3 | import "Pre_Merge_QC_per_sample.wdl" as qc 4 | import "SV_Tasks.wdl" as SV 5 | 6 | workflow Pre_Merge_SV { 7 | input { 8 | Array[File] aligned_crams 9 | String aligned_cram_suffix 10 | 11 | # reference inputs 12 | File ref_fasta 13 | File ref_fasta_index 14 | File ref_cache 15 | File? call_regions_bed 16 | File? 
call_regions_bed_index 17 | File exclude_regions 18 | String cohort 19 | String center 20 | 21 | # system inputs 22 | Int preemptible_tries 23 | } 24 | 25 | 26 | scatter (i in range(length(aligned_crams))) { 27 | File aligned_cram = aligned_crams[i] 28 | 29 | call per_sample.Pre_Merge_SV_Per_Sample { 30 | input: 31 | aligned_cram = aligned_cram, 32 | aligned_cram_suffix = aligned_cram_suffix, 33 | ref_fasta = ref_fasta, 34 | ref_fasta_index = ref_fasta_index, 35 | call_regions_bed = call_regions_bed, 36 | call_regions_bed_index = call_regions_bed_index, 37 | ref_cache = ref_cache, 38 | exclude_regions = exclude_regions, 39 | preemptible_tries = preemptible_tries 40 | } 41 | 42 | call qc.Pre_Merge_QC_Per_Sample { 43 | input: 44 | manta_vcf = Pre_Merge_SV_Per_Sample.manta_vcf, 45 | lumpy_vcf = Pre_Merge_SV_Per_Sample.smoove_vcf, 46 | cnvnator_vcf = Pre_Merge_SV_Per_Sample.cnvnator_output_cn_txt, 47 | cohort = cohort, 48 | center = center, 49 | preemptible_tries = preemptible_tries 50 | } 51 | } 52 | 53 | #scatter (p in [("manta", Pre_Merge_QC_Per_Sample.manta_counts), ("lumpy", Pre_Merge_QC_Per_Sample.lumpy_counts)]) { 54 | # call SV.Make_Count_Plot { 55 | # input: 56 | # name=p.left, 57 | # count_files=p.right 58 | # } 59 | #} 60 | 61 | output { 62 | Array[File] cram_indices = Pre_Merge_SV_Per_Sample.cram_index 63 | Array[File] manta_vcfs = Pre_Merge_SV_Per_Sample.manta_vcf 64 | Array[File] manta_tbis = Pre_Merge_SV_Per_Sample.manta_tbi 65 | Array[File] manta_original_vcfs = Pre_Merge_SV_Per_Sample.manta_original_vcf 66 | Array[File] manta_original_tbis = Pre_Merge_SV_Per_Sample.manta_original_tbi 67 | Array[File] cnvnator_cn_hist_roots = Pre_Merge_SV_Per_Sample.cnvnator_cn_hist_root 68 | Array[File] cnvnator_output_cn_txt_files = Pre_Merge_SV_Per_Sample.cnvnator_output_cn_txt 69 | Array[File] cnvnator_cn_bed_files = Pre_Merge_SV_Per_Sample.cnvnator_cn_bed 70 | Array[File] smoove_vcfs = Pre_Merge_SV_Per_Sample.smoove_vcf 71 | Array[File] smoove_csis = Pre_Merge_SV_Per_Sample.smoove_csi 72 | Array[File] lumpy_counts = Pre_Merge_QC_Per_Sample.lumpy_counts 73 | Array[File] manta_counts = Pre_Merge_QC_Per_Sample.manta_counts 74 | #Array[File] count_plots = Make_Count_Plot.counts_plot 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Cohort SV detection pipeline 3 | 4 | # Table of contents 5 | 1. [Overview](#overview) 6 | 2. [WDL scripts](#wdl-scripts) 7 | 3. [Docker images](#docker-images) 8 | 9 | # Overview 10 | This repository contains pipeline scripts for structural variation detection in large cohorts. The pipeline is designed for Illumina paired-end whole genome sequencing data, preferably with at least 30x sequence coverage. Data inputs should be a set of sorted CRAM files, aligned with BWA-MEM. 11 | 12 | This pipeline detects structural variation based on breakpoint sequence evidence using both the LUMPY and Manta algorithms. Structural variant (SV) breakpoints are then unified and merged using the [SVTools](https://github.com/hall-lab/svtools) workflow, followed by re-genotyping with [SVTyper](https://github.com/hall-lab/svtyper) and read-depth annotation with [CNVnator](https://github.com/abyzovlab/CNVnator). Finally, SV types are reclassified based on the concordance between read-depth and breakpoint genotype. 
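At the command-line level, the merging step corresponds roughly to the `svtools lsort`/`svtools lmerge` pattern from the SVTools documentation. The sketch below is illustrative only (sample file names and the `-f` slop parameter are placeholders); the exact invocations used by this pipeline live in the tasks of [scripts/SV_Tasks.wdl](scripts/SV_Tasks.wdl).

```bash
# Sort the per-sample VCFs together, then merge breakpoints into a single cohort-level VCF.
svtools lsort sampleA.vcf.gz sampleB.vcf.gz | bgzip -c > cohort.lsort.vcf.gz
zcat cohort.lsort.vcf.gz | svtools lmerge -i /dev/stdin -f 20 | bgzip -c > cohort.lmerge.vcf.gz
# Downstream (after per-sample SVTyper re-genotyping and CNVnator read-depth annotation):
# svtools vcfpaste -> svtools prune -> svtools classify, as wrapped by Post_Merge_SV.wdl.
```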
13 | 14 | Additional details on the SVTools pipeline are available in the [SVTools tutorial](https://github.com/hall-lab/svtools/blob/master/Tutorial.md). 15 | 16 | ![Workflow](images/workflow.wdl.v04.low-01.png?raw=true "Workflow") 17 | 18 | # WDL scripts 19 | 20 | Pipeline scripts (in [WDL format](https://software.broadinstitute.org/wdl/)) are available in the [scripts](scripts) directory. These scripts can be launched using [Cromwell](https://github.com/broadinstitute/cromwell) (version 25 or later). 21 | 22 | While the SV pipeline can be run in its entirety via the [SV_Pipeline_Full.wdl](scripts/SV_Pipeline_Full.wdl) script, we recommend running the pipeline in three stages to enable intermediate quality control checkpoints. 23 | 24 | ## 1. [Pre_Merge_SV.wdl](scripts/Pre_Merge_SV.wdl) 25 | 26 | For each sample: 27 | - SV discovery with LUMPY using the [smoove](https://github.com/brentp/smoove) wrapper 28 | - Preliminary SV genotyping with SVTyper (also done within the smoove wrapper) 29 | - SV discovery with [Manta](https://github.com/Illumina/manta), including insertions 30 | - Generate [CNVnator](https://github.com/abyzovlab/CNVnator) histogram files 31 | 32 | After this step, we recommend performing quality control checks on each sample before merging them into the cohort-level VCF (step 2). To help with this, per-sample variant counts are generated for both LUMPY and Manta outputs. 33 | 34 | ## 2. [Merge_SV.wdl](scripts/Merge_SV.wdl) 35 | 36 | This step merges the sample-level VCF files from step 1 using the LUMPY breakpoint probability curves to produce a single cohort-level VCF. 37 | 38 | ## 3. [Post_Merge_SV.wdl](scripts/Post_Merge_SV.wdl) 39 | 40 | This step re-genotypes each sample at the sites in the cohort-level VCF file from step 2, and then combines the results into a set of final VCFs, split by variant type for efficiency (deletions, insertions, breakends, and other:duplications+inversions). 41 | 42 | For each sample: 43 | - Re-genotype each SV using SVTyper (note that insertion calls from Manta are taken from the per-sample genotypes and not processed with SVTyper) 44 | - Annotate the read-depth at each SV using CNVnator 45 | - Generate a .ped file of sample names and sexes 46 | 47 | For the cohort: 48 | - Combine the re-genotyped VCFs into a single cohort-level VCF 49 | - Prune overlapping SVs 50 | - Classify SV type based on the concordance between variant genotypes and read-depths 51 | - Sort and index the VCF 52 | 53 | # Docker images 54 | 55 | - Docker images for this pipeline are available at https://hub.docker.com/u/halllab. 56 | - Dockerfiles for these containers are available in the [docker](docker) directory. 57 | - WDL test scripts for each of these Docker containers are available in the [test](test) directory. 
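Putting the pieces together, a typical stage-1 launch with a recent Cromwell release might look like the sketch below. The jar location and backend configuration are placeholders to adapt to your environment; note that older Cromwell releases (pre-30) take the inputs and options JSONs as positional arguments rather than flags.

```bash
# Placeholder paths: point these at your Cromwell jar, backend config, and a filled-in inputs JSON.
CROMWELL_JAR=/path/to/cromwell.jar
BACKEND_CONF=scripts/jes.conf              # Google Cloud (JES) backend; substitute your own config for other backends
OPTIONS=scripts/generic.options.json

java -Dconfig.file="$BACKEND_CONF" -jar "$CROMWELL_JAR" run \
  scripts/Pre_Merge_SV.wdl \
  --inputs scripts/Pre_Merge_SV.inputs.json \
  --options "$OPTIONS"
```

The same pattern applies to the Merge_SV.wdl and Post_Merge_SV.wdl stages, each with its corresponding inputs JSON in the [scripts](scripts) directory.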
58 | -------------------------------------------------------------------------------- /docker/smoove/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | 23 | FROM builder-base as lumpy-2f3fccb-build 24 | LABEL maintainer "Dave Larson " 25 | RUN LUMPY_COMMIT=2f3fccb0e6ef8732ff2f5c4e2c12a7a0b8ae2784 \ 26 | && git clone --single-branch --recursive --depth 5 https://github.com/arq5x/lumpy-sv \ 27 | && cd lumpy-sv \ 28 | && git checkout $LUMPY_COMMIT \ 29 | && make -j 3 \ 30 | && mkdir -p /opt/hall-lab/lumpy-2f3fccb/bin \ 31 | && cp ./bin/* /opt/hall-lab/lumpy-2f3fccb/bin 32 | 33 | FROM builder-base as svtyper-0.7.0-build 34 | LABEL maintainer "Dave Larson " 35 | 36 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 37 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 38 | RUN SVTYPER_VERSION=0.7.0 \ 39 | && git clone https://github.com/hall-lab/svtyper \ 40 | && cd svtyper \ 41 | && git checkout v$SVTYPER_VERSION \ 42 | && sed -i '/numpy/d' setup.py \ 43 | && sed -i '/scipy/d' setup.py \ 44 | && pip install . 45 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 46 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 47 | 48 | # Smoove build... 
49 | FROM builder-base as smoove-0.2.2-build 50 | WORKDIR /opt/hall-lab/smoove-0.2.2/bin 51 | RUN SMOOVE_VERSION=0.2.2 \ 52 | && MOSDEPTH_VERSION=0.2.4 \ 53 | && GSORT_VERSION=0.0.6 \ 54 | && curl -L -o mosdepth https://github.com/brentp/mosdepth/releases/download/v$MOSDEPTH_VERSION/mosdepth \ 55 | && chmod a+x mosdepth \ 56 | && curl -L -o gsort https://github.com/brentp/gsort/releases/download/v$GSORT_VERSION/gsort_linux_amd64 \ 57 | && chmod a+x gsort \ 58 | && curl -L -o smoove https://github.com/brentp/smoove/releases/download/v$SMOOVE_VERSION/smoove \ 59 | && chmod +x smoove 60 | 61 | FROM debian:stretch-slim 62 | LABEL maintainer "Dave Larson " 63 | 64 | COPY --from=lumpy-2f3fccb-build /opt/hall-lab/lumpy-2f3fccb/bin /opt/hall-lab/lumpy-2f3fccb/bin 65 | COPY --from=svtyper-0.7.0-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 66 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 67 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 68 | COPY --from=halllab/bcftools-1.9-build:v1 /build/deb-build/opt/hall-lab/bcftools-1.9 /opt/hall-lab/bcftools-1.9 69 | COPY --from=smoove-0.2.2-build /opt/hall-lab/smoove-0.2.2/bin /opt/hall-lab/smoove-0.2.2/bin 70 | 71 | ENV PATH=/opt/hall-lab/smoove-0.2.2/bin:/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/lumpy-2f3fccb/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/bcftools-1.9/bin:$PATH 72 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 73 | 74 | RUN apt-get update -qq \ 75 | && apt-get install -y --no-install-recommends \ 76 | libssl1.1 \ 77 | libcurl3 \ 78 | libncurses5 \ 79 | libbz2-1.0 \ 80 | liblzma5 \ 81 | libssl1.0.2 \ 82 | zlib1g 83 | 84 | CMD ["/bin/bash"] 85 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" 6 | RESOURCE_DIR=/opt/ccdg/cromwell/resources 7 | #RESOURCE_DIR='.' 8 | CROMWELL_CONF_TEMPLATE=$RESOURCE_DIR/application.conf.template 9 | MYSQL_CONF_TEMPLATE=$RESOURCE_DIR/mysql.cnf.template 10 | 11 | MYSQLD_PID='' 12 | LOCK_ACQUIRED=false 13 | 14 | MAIN_DIR="$1" 15 | 16 | function clean_directory { 17 | local cleaned=$(echo "$1" | sed 's|/*$||') 18 | local abspath=$(cd "$cleaned" && pwd -P) 19 | echo "$abspath" 20 | } 21 | 22 | function wait_for_file { 23 | local file="$1" 24 | for i in `seq 1 28`; do 25 | if [[ ! 
-e "$file" ]]; then 26 | sleep $i 27 | fi 28 | done 29 | } 30 | 31 | function cromwell_conf { 32 | local dir="$1" 33 | echo "$dir/application.conf" 34 | } 35 | 36 | 37 | function set_up_conf { 38 | local dir="$1" 39 | local conf_file=$(cromwell_conf "$dir") 40 | if [[ -s "$conf_file" ]]; then 41 | echo "Using existing cromwell config at $conf_file" >&2 42 | else 43 | cat $CROMWELL_CONF_TEMPLATE | sed "s|%%SHARED_FS_DIRECTORY%%|$dir|" > "$conf_file" 44 | echo "Created cromwell config $conf_file" >&2 45 | fi 46 | } 47 | 48 | function has_db { 49 | local dir="$1" 50 | if [[ -d "$dir/db/run/mysqld" && -d "$dir/db/lib/mysql" && -d "$dir/db/log/mysql" && -e "$dir/db/lib/mysql/cromwell" ]]; then 51 | true 52 | else 53 | false 54 | fi 55 | } 56 | 57 | function create_mysql_directories { 58 | local dir="$1" 59 | for new_dir in "$dir/db/run/mysqld" "$dir/db/lib/mysql" "$dir/db/log/mysql" 60 | do 61 | mkdir -p "$new_dir" 62 | done 63 | # TODO Are we really sure we need/want to do this? 64 | touch "$dir/db/log/mysql/error.log" 65 | chmod -R 777 "$dir/db" 66 | } 67 | 68 | function setup_new_database { 69 | echo "create database cromwell; create user 'cromwell'@'localhost' identified by 'test4cromwell'; grant all privileges on *.* to 'cromwell'@localhost;" | mysql -u root --socket=/tmp/mysqld.sock 70 | } 71 | 72 | function start_mysql { 73 | local dir="$1" 74 | local mysql_cnf_file=$(mysql_conf "$dir") 75 | mysqld_safe --defaults-file="$mysql_cnf_file" & 76 | MYSQLD_PID="$!" 77 | wait_for_file "/tmp/mysqld.sock" 78 | } 79 | 80 | function shutdown_mysql { 81 | echo "Shutting down mysql" >&2 82 | /usr/bin/mysqladmin -u root --socket /tmp/mysqld.sock shutdown 83 | MYSQLD_PID='' 84 | } 85 | 86 | function install_db { 87 | local dir="$1" 88 | local ldata="$dir/db/lib/mysql" 89 | create_mysql_directories "$dir" 90 | mysql_install_db --user=$USER --basedir=/usr/ --ldata=$ldata 91 | start_mysql "$dir" 92 | setup_new_database "$dir" 93 | } 94 | 95 | function mysql_conf { 96 | local dir="$1" 97 | echo "$dir/mysql.cnf" 98 | } 99 | 100 | function set_up_mysql_cnf { 101 | local dir="$1" 102 | local conf=$(mysql_conf "$dir") 103 | if [[ -s "$conf" ]]; then 104 | echo "Using existing mysql config at $conf" >&2 105 | else 106 | cat $MYSQL_CONF_TEMPLATE | sed "s|%%SHARED_FS_DIRECTORY%%|$dir|" > "$conf" 107 | echo "Created mysql config $conf" >&2 108 | fi 109 | } 110 | 111 | function is_locked { 112 | local dir="$1" 113 | if [[ -d "$dir/.lock" ]]; then 114 | true 115 | else 116 | false 117 | fi 118 | } 119 | 120 | function lock { 121 | local dir="$1" 122 | if mkdir "$dir/.lock"; then 123 | LOCK_ACQUIRED=true 124 | echo "Locked $dir" >&2 125 | else 126 | echo "Unable to lock $dir" >&2 127 | exit 1 128 | fi 129 | } 130 | 131 | function unlock { 132 | local dir="$1" 133 | if rmdir "$dir/.lock"; then 134 | LOCK_ACQUIRED=false 135 | echo "Unlocked $dir" >&2 136 | else 137 | echo "Unable to unlock $dir" >&2 138 | exit 1 139 | fi 140 | } 141 | 142 | function run_cromwell { 143 | local dir="$1" 144 | local cromwell_conf=$(cromwell_conf "$dir") 145 | /usr/bin/java -Xmx31G -Xms16G -Dconfig.file="$cromwell_conf" -jar /app/cromwell.jar run "${@:2}" 146 | } 147 | 148 | function cleanup { 149 | local dir="$1" 150 | if [[ $MYSQLD_PID ]]; then 151 | shutdown_mysql 152 | fi 153 | if $LOCK_ACQUIRED; then 154 | unlock "$dir" 155 | fi 156 | } 157 | 158 | function main { 159 | local dir="$1" 160 | if [[ -d "$dir" ]] 161 | then 162 | local clean_dir=$(clean_directory "$dir") 163 | trap 'cleanup $(clean_directory "$MAIN_DIR")' EXIT SIGTERM 
SIGINT 164 | lock "$clean_dir" 165 | set_up_conf "$clean_dir" 166 | set_up_mysql_cnf "$clean_dir" 167 | if ! $(has_db "$clean_dir"); then 168 | # note that this also starts mysqld 169 | install_db "$clean_dir" 170 | else 171 | start_mysql "$clean_dir" 172 | fi 173 | echo "${@:2}" >&2 174 | run_cromwell "$clean_dir" "${@:2}" 175 | else 176 | echo "$dir is not a directory" >&2 177 | exit 1 178 | fi 179 | } 180 | 181 | main "${@}"; 182 | -------------------------------------------------------------------------------- /docker/manta_samtools/doctor_manta.1.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, StringIO 2 | import pandas as pd 3 | import numpy as np 4 | import scipy.spatial.distance as ssd 5 | import pysam 6 | sys.path.insert(1,'/gscmnt/gc2802/halllab/abelhj/svtools') 7 | from svtools.vcf.file import Vcf 8 | from svtools.vcf.variant import Variant 9 | from collections import namedtuple 10 | import svtools.utils as su 11 | 12 | 13 | def add_arguments_to_parser(parser): 14 | parser.add_argument('-i', '--vcf', metavar='', dest='manta_vcf', help="manta input vcf") 15 | parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') 16 | parser.add_argument('-s', '--slop', dest='slop', default=0, required=False, help='padding to either side') 17 | parser.add_argument('-m', '--max_ins', dest='max_ins', default=1000, type=int, required=False, help='maximum insert size') 18 | 19 | def command_parser(): 20 | parser = argparse.ArgumentParser(description="cross-cohort cnv caller") 21 | add_arguments_to_parser(parser) 22 | return parser 23 | 24 | def convert_variant(v, max_ins): 25 | set_read_counts(v) 26 | set_cis_prs(v) 27 | if v.get_info('SVTYPE')=='DEL': 28 | convert_del(v) 29 | elif v.get_info('SVTYPE')=='DUP': 30 | convert_dup(v) 31 | elif v.get_info('SVTYPE')=='INV': 32 | convert_inv(v) 33 | elif v.get_info('SVTYPE')=='INS': 34 | convert_ins(v, max_ins) 35 | elif v.get_info('SVTYPE')=='BND': 36 | convert_bnd(v) 37 | 38 | def split_ci(ci): 39 | return[int(ci.split(',')[0]), int(ci.split(',')[1])] 40 | 41 | def uniform_pr(length): 42 | pr=np.ones(length, dtype='float64')/length 43 | pr1=','.join( map(str, pr)) 44 | return pr1 45 | 46 | def set_read_counts(var): 47 | 48 | sample=var.sample_list[0] 49 | gt=var.genotype(sample) 50 | pe=0 51 | sr=0 52 | if 'PR' in var.format_dict: 53 | pe=int(gt.get_format('PR').split(',')[1]) 54 | if 'SR' in var.format_dict: 55 | sr=int(gt.get_format('SR').split(',')[1]) 56 | var.info['PE']=pe 57 | var.info['SR']=sr 58 | var.info['SU']=pe+sr 59 | 60 | def set_cis_prs(v): 61 | imprec=False 62 | cipos='0,0' 63 | ciend='0,0' 64 | prpos=1.0 65 | prend=1.0 66 | if 'CIPOS' in v.info: 67 | cipos=v.get_info('CIPOS') 68 | [start, stop]=split_ci(cipos) 69 | prpos=uniform_pr(stop-start+1) 70 | imprec=True 71 | if 'CIEND' in v.info: 72 | ciend=v.get_info('CIEND') 73 | [start, stop]=split_ci(ciend) 74 | prend=uniform_pr(stop-start+1) 75 | imprec=True 76 | v.info['CIPOS']=cipos 77 | v.info['CIEND']=ciend 78 | v.info['CIPOS95']=cipos 79 | v.info['CIEND95']=ciend 80 | v.info['PRPOS']=prpos 81 | v.info['PREND']=prend 82 | v.set_info('IMPRECISE', imprec) 83 | 84 | def convert_del(var): 85 | var.alt='' 86 | var.info['STRANDS']='+-:'+str(var.info['SU']) 87 | var.ref='N' 88 | 89 | def convert_dup(var): 90 | var.alt='' 91 | var.info['STRANDS']='-+:'+str(var.info['SU']) 92 | var.ref='N' 93 | 94 | #def convert_inv(var): 95 | # var.ref='N' 96 | # var.alt='' 97 | # if 'INV3' in var.info: 98 | # 
var.info['STRANDS']='++:'+var.info['SU'] 99 | # else: 100 | # var.info['STRANDS']='--:'+var.info['SU'] 101 | 102 | def convert_inv(var): 103 | var.ref='N' 104 | strands='' 105 | if 'INV3' in var.info: 106 | strands='++:' 107 | var.alt='N]'+var.chrom+':'+str(var.info['END'])+']' 108 | else: 109 | strands='--:' 110 | var.alt='['+var.chrom+':'+str(var.info['END'])+'[' 111 | var.info['SVTYPE']='BND' 112 | var.info['STRANDS']=strands+str(var.info['SU']) 113 | 114 | 115 | def convert_ins(var, max_ins): 116 | var.ref='N' 117 | var.alt='' 118 | var.info['STRANDS']='+.:'+str(var.info['SU']) 119 | orig_len='.' 120 | new_len=max_ins 121 | if 'SVLEN' in var.info: 122 | svlen=int(var.get_info('SVLEN')) 123 | orig_len=svlen 124 | if svlen0: 143 | strands="+-:" 144 | newalt='N'+alt[ff::] 145 | else: 146 | ff=alt.find("]") 147 | if ff==0: 148 | strands="-+:" 149 | ff1=alt.find("]", 1) 150 | newalt=alt[0:(ff1+1)]+'N' 151 | else: 152 | strands="++:" 153 | newalt='N'+alt[ff::] 154 | var.alt=newalt 155 | var.info['STRANDS']=strands+str(var.info['SU']) 156 | 157 | 158 | def run_from_args(args): 159 | 160 | vcf = Vcf() 161 | vcf_out=sys.stdout 162 | in_header = True 163 | header_lines = list() 164 | with su.InputStream(args.manta_vcf) as input_stream: 165 | for line in input_stream: 166 | if in_header: 167 | header_lines.append(line) 168 | if line[0:6] == '#CHROM': 169 | in_header=False 170 | vcf.add_header(header_lines) 171 | vcf.add_info('PRPOS', '1', 'String', 'Breakpoint probability dist') 172 | vcf.add_info('PREND', '1', 'String', 'Breakpoint probability dist') 173 | vcf.add_info('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--') 174 | vcf.add_info('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples') 175 | vcf.add_info('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples') 176 | vcf.add_info('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples') 177 | vcf.add_info('INSLEN_ORIG', '.', 'Integer', 'Original insertion length') 178 | vcf.add_info('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants') 179 | vcf.add_info('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants') 180 | vcf.add_info('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant') 181 | vcf_out.write(vcf.get_header()+'\n') 182 | else: 183 | v = Variant(line.rstrip().split('\t'), vcf) 184 | convert_variant(v, args.max_ins) 185 | vcf_out.write(v.get_var_string()+"\n") 186 | 187 | 188 | parser=command_parser() 189 | args=parser.parse_args() 190 | run_from_args(args) 191 | -------------------------------------------------------------------------------- /scripts/Post_Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Post_Merge_SV { 5 | # data inputs 6 | input { 7 | Array[File] aligned_crams 8 | Array[File] aligned_cram_indices 9 | Array[File] cn_hist_roots 10 | Array[File] manta_vcfs 11 | String aligned_cram_suffix 12 | File merged_vcf 13 | String cohort_name 14 | String final_vcf_name 15 | 16 | # reference inputs 17 | File ref_fasta 18 | File ref_fasta_index 19 | File ref_cache 20 | File mei_annotation_bed 21 | 22 | # system inputs 23 | Int preemptible_tries 24 | } 25 | 26 | call SV.Split_By_Type { 27 | input: 28 | input_vcf = merged_vcf, 29 | output_vcf_prefix = cohort_name + ".merged", 
30 | preemptible_tries = preemptible_tries 31 | } 32 | 33 | # Re-genotype and call copy number for each sample on the merged SV VCF 34 | scatter (i in range(length(aligned_crams))) { 35 | 36 | File aligned_cram = aligned_crams[i] 37 | File aligned_cram_index = aligned_cram_indices[i] 38 | File cn_hist_root = cn_hist_roots[i] 39 | String basename = sub(sub(aligned_cram, "^.*/", ""), aligned_cram_suffix + "$", "") 40 | 41 | call SV.Get_Sample_Name { 42 | input: 43 | input_cram = aligned_cram, 44 | preemptible_tries = preemptible_tries 45 | } 46 | 47 | call SV.Get_Sex { 48 | input: 49 | input_cn_hist_root = cn_hist_root, 50 | ref_fasta_index = ref_fasta_index, 51 | preemptible_tries = preemptible_tries 52 | } 53 | 54 | call SV.Genotype as Genotype_Merged_BND { 55 | input: 56 | basename = basename + ".bnd", 57 | input_cram = aligned_cram, 58 | input_cram_index = aligned_cram_index, 59 | input_vcf = Split_By_Type.bnd_vcf, 60 | ref_cache = ref_cache, 61 | preemptible_tries = preemptible_tries 62 | } 63 | 64 | call SV.Genotype as Genotype_Merged_DEL { 65 | input: 66 | basename = basename + ".del", 67 | input_cram = aligned_cram, 68 | input_cram_index = aligned_cram_index, 69 | input_vcf = Split_By_Type.del_vcf, 70 | ref_cache = ref_cache, 71 | preemptible_tries = preemptible_tries 72 | } 73 | 74 | call SV.Take_Original_Genotypes as Genotype_Merged_INS { 75 | input: 76 | sample_name = Get_Sample_Name.sample, 77 | original_per_sample_vcf = manta_vcfs[i], 78 | basename = basename + ".ins", 79 | input_vcf = Split_By_Type.ins_vcf, 80 | input_variant_to_sname_mapping = Split_By_Type.ins_split, 81 | preemptible_tries = preemptible_tries 82 | } 83 | 84 | call SV.Genotype as Genotype_Merged_OTHER { 85 | input: 86 | basename = basename + ".other", 87 | input_cram = aligned_cram, 88 | input_cram_index = aligned_cram_index, 89 | input_vcf = Split_By_Type.other_vcf, 90 | ref_cache = ref_cache, 91 | preemptible_tries = preemptible_tries 92 | } 93 | 94 | call SV.Copy_Number as Copy_Number_DEL { 95 | input: 96 | basename = basename + ".del", 97 | sample = Get_Sample_Name.sample, 98 | input_vcf = Genotype_Merged_DEL.output_vcf, 99 | input_cn_hist_root = cn_hist_root, 100 | ref_cache = ref_cache, 101 | preemptible_tries = preemptible_tries 102 | } 103 | 104 | call SV.Copy_Number as Copy_Number_OTHER { 105 | input: 106 | basename = basename + ".other", 107 | sample = Get_Sample_Name.sample, 108 | input_vcf = Genotype_Merged_OTHER.output_vcf, 109 | input_cn_hist_root = cn_hist_root, 110 | ref_cache = ref_cache, 111 | preemptible_tries = preemptible_tries 112 | } 113 | } 114 | 115 | call SV.Make_Pedigree_File { 116 | input: 117 | sample_array = Get_Sample_Name.sample, 118 | sex_array = Get_Sex.sex, 119 | output_ped_basename = cohort_name, 120 | } 121 | 122 | call SV.Paste_VCF as Paste_VCF_BND { 123 | input: 124 | input_vcfs = Genotype_Merged_BND.output_vcf, 125 | output_vcf_basename = cohort_name + ".merged.gt.bnd", 126 | preemptible_tries = preemptible_tries 127 | } 128 | 129 | call SV.Paste_VCF as Paste_VCF_DEL { 130 | input: 131 | input_vcfs = Copy_Number_DEL.output_vcf, 132 | output_vcf_basename = cohort_name + ".merged.gt.cn.del", 133 | preemptible_tries = preemptible_tries 134 | } 135 | 136 | call SV.Paste_VCF as Paste_VCF_INS { 137 | input: 138 | input_vcfs = Genotype_Merged_INS.output_vcf, 139 | output_vcf_basename = cohort_name + ".merged.gt.ins", 140 | preemptible_tries = preemptible_tries 141 | } 142 | 143 | call SV.Paste_VCF as Paste_VCF_OTHER { 144 | input: 145 | input_vcfs = 
Copy_Number_OTHER.output_vcf, 146 | output_vcf_basename = cohort_name + ".merged.gt.cn.other", 147 | preemptible_tries = preemptible_tries 148 | } 149 | 150 | call SV.Prune_VCF as Prune_VCF_BND{ 151 | input: 152 | input_vcf_gz = Paste_VCF_BND.output_vcf_gz, 153 | output_vcf_basename = cohort_name + ".merged.gt.pruned.bnd", 154 | preemptible_tries = preemptible_tries 155 | } 156 | 157 | call SV.Prune_VCF as Prune_VCF_DEL{ 158 | input: 159 | input_vcf_gz = Paste_VCF_DEL.output_vcf_gz, 160 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.del", 161 | preemptible_tries = preemptible_tries 162 | } 163 | 164 | call SV.Prune_VCF as Prune_VCF_INS{ 165 | input: 166 | input_vcf_gz = Paste_VCF_INS.output_vcf_gz, 167 | output_vcf_basename = cohort_name + ".merged.gt.pruned.ins", 168 | preemptible_tries = preemptible_tries 169 | } 170 | 171 | call SV.Prune_VCF as Prune_VCF_OTHER{ 172 | input: 173 | input_vcf_gz = Paste_VCF_OTHER.output_vcf_gz, 174 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.other", 175 | preemptible_tries = preemptible_tries 176 | } 177 | 178 | call SV.Classify as Classify_DEL{ 179 | input: 180 | input_vcf_gz = Prune_VCF_DEL.output_vcf_gz, 181 | input_ped = Make_Pedigree_File.output_ped, 182 | mei_annotation_bed = mei_annotation_bed, 183 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class.del", 184 | preemptible_tries = preemptible_tries 185 | } 186 | 187 | call SV.Classify as Classify_OTHER{ 188 | input: 189 | input_vcf_gz = Prune_VCF_OTHER.output_vcf_gz, 190 | input_ped = Make_Pedigree_File.output_ped, 191 | mei_annotation_bed = mei_annotation_bed, 192 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class.other", 193 | preemptible_tries = preemptible_tries 194 | } 195 | 196 | call SV.Sort_Index_VCF as Sort_Index_VCF_BND { 197 | input: 198 | input_vcf_gz = Prune_VCF_BND.output_vcf_gz, 199 | output_vcf_name = final_vcf_name + ".bnd.vcf.gz", 200 | preemptible_tries = preemptible_tries 201 | } 202 | 203 | call SV.Sort_Index_VCF as Sort_Index_VCF_DEL { 204 | input: 205 | input_vcf_gz = Classify_DEL.output_vcf_gz, 206 | output_vcf_name = final_vcf_name + ".del.vcf.gz", 207 | preemptible_tries = preemptible_tries 208 | } 209 | 210 | call SV.Sort_Index_VCF as Sort_Index_VCF_INS { 211 | input: 212 | input_vcf_gz = Prune_VCF_INS.output_vcf_gz, 213 | output_vcf_name = final_vcf_name + ".ins.vcf.gz", 214 | preemptible_tries = preemptible_tries 215 | } 216 | 217 | call SV.Sort_Index_VCF as Sort_Index_VCF_OTHER { 218 | input: 219 | input_vcf_gz = Classify_OTHER.output_vcf_gz, 220 | output_vcf_name = final_vcf_name + ".other.vcf.gz", 221 | preemptible_tries = preemptible_tries 222 | } 223 | 224 | call SV.Filter_Index as Filter_Index_BND { 225 | input: 226 | input_vcf_gz = Sort_Index_VCF_BND.output_vcf_gz, 227 | output_vcf_name = final_vcf_name + ".bnd.vcf.gz", 228 | preemptible_tries = preemptible_tries 229 | } 230 | 231 | call SV.Filter_Index as Filter_Index_DEL { 232 | input: 233 | input_vcf_gz = Sort_Index_VCF_DEL.output_vcf_gz, 234 | output_vcf_name = final_vcf_name + ".del.vcf.gz", 235 | preemptible_tries = preemptible_tries 236 | } 237 | 238 | call SV.Filter_Index as Filter_Index_INS { 239 | input: 240 | input_vcf_gz = Sort_Index_VCF_INS.output_vcf_gz, 241 | output_vcf_name = final_vcf_name + ".ins.vcf.gz", 242 | preemptible_tries = preemptible_tries 243 | } 244 | 245 | call SV.Filter_Index as Filter_Index_OTHER { 246 | input: 247 | input_vcf_gz = Sort_Index_VCF_OTHER.output_vcf_gz, 248 | output_vcf_name = final_vcf_name + ".other.vcf.gz", 
249 | preemptible_tries = preemptible_tries 250 | } 251 | 252 | output { 253 | File output_ped = Make_Pedigree_File.output_ped 254 | File output_vcf_bnd = Filter_Index_BND.output_vcf_gz 255 | File output_vcf_index_bnd = Filter_Index_BND.output_vcf_gz_index 256 | File output_vcf_del = Filter_Index_DEL.output_vcf_gz 257 | File output_vcf_ins = Filter_Index_INS.output_vcf_gz 258 | File output_vcf_index_other = Filter_Index_OTHER.output_vcf_gz_index 259 | File output_vcf_other = Filter_Index_OTHER.output_vcf_gz 260 | File output_vcf_index_del = Filter_Index_DEL.output_vcf_gz_index 261 | File output_vcf_index_ins = Filter_Index_INS.output_vcf_gz_index 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /scripts/jes.conf: -------------------------------------------------------------------------------- 1 | # Updated Cromwell template for JES 2 | 3 | webservice { 4 | port = 8000 5 | interface = 0.0.0.0 6 | binding-timeout = 5s 7 | instance.name = "cromwell-for-wdl-runner" 8 | } 9 | 10 | akka { 11 | 12 | dispatchers { 13 | io-dispatcher { 14 | type = Dispatcher 15 | executor = "fork-join-executor" 16 | } 17 | 18 | api-dispatcher { 19 | type = Dispatcher 20 | executor = "fork-join-executor" 21 | } 22 | engine-dispatcher { 23 | type = Dispatcher 24 | executor = "fork-join-executor" 25 | } 26 | 27 | backend-dispatcher { 28 | type = Dispatcher 29 | executor = "fork-join-executor" 30 | } 31 | 32 | service-dispatcher { 33 | type = Dispatcher 34 | executor = "fork-join-executor" 35 | } 36 | 37 | } 38 | } 39 | 40 | system { 41 | # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting 42 | #abort-jobs-on-terminate = false 43 | 44 | # If 'true' then when Cromwell starts up, it tries to restart incomplete workflows 45 | workflow-restart = true 46 | 47 | # Cromwell will cap the number of running workflows at N 48 | max-concurrent-workflows = 5000 49 | 50 | # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist 51 | max-workflow-launch-count = 50 52 | 53 | # Number of seconds between workflow launches 54 | new-workflow-poll-rate = 20 55 | 56 | # Since the WorkflowLogCopyRouter is initialized in code, this is the number of workers 57 | number-of-workflow-log-copy-workers = 10 58 | 59 | # Default number of cache read workers 60 | number-of-cache-read-workers = 25 61 | 62 | io { 63 | # Global Throttling - This is mostly useful for GCS and can be adjusted to match 64 | # the quota availble on the GCS API 65 | number-of-requests = 100000 66 | per = 100 seconds 67 | 68 | # Number of times an I/O operation should be attempted before giving up and failing it. 69 | number-of-attempts = 5 70 | } 71 | } 72 | 73 | workflow-options { 74 | encrypted-fields: [] 75 | 76 | base64-encryption-key: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" 77 | 78 | workflow-log-dir: "cromwell-workflow-logs" 79 | 80 | workflow-log-temporary: true 81 | } 82 | 83 | # Optional call-caching configuration. 
84 | call-caching { 85 | enabled = true 86 | invalidate-bad-cache-results = true 87 | } 88 | 89 | google { 90 | 91 | application-name = "cromwell" 92 | 93 | auths = [ 94 | { 95 | name = "application-default" 96 | scheme = "application_default" 97 | }, 98 | ] 99 | } 100 | 101 | docker { 102 | hash-lookup { 103 | // Set this to match your available quota against the Google Container Engine API 104 | gcr-api-queries-per-100-seconds = 1000 105 | // Time in minutes before an entry expires from the docker hashes cache and needs to be fetched again 106 | cache-entry-ttl = "20 minutes" 107 | // Maximum number of elements to be kept in the cache. If the limit is reached, old elements will be removed from the cache 108 | cache-size = 200 109 | // How should docker hashes be looked up. Possible values are "local" and "remote" 110 | // "local": Lookup hashes on the local docker daemon using the cli 111 | // "remote": Lookup hashes on docker hub and gcr 112 | method = "remote" 113 | } 114 | } 115 | 116 | engine { 117 | # This instructs the engine which filesystems are at its disposal to perform any IO operation that it might need. 118 | # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here. 119 | # If you intend to be able to run workflows with this kind of declarations: 120 | # workflow { 121 | # String str = read_string("gs://bucket/my-file.txt") 122 | # } 123 | # You will need to provide the engine with a gcs filesystem 124 | # Note that the default filesystem (local) is always available. 125 | filesystems { 126 | gcs { 127 | auth = "application-default" 128 | } 129 | local { 130 | enabled: true 131 | } 132 | } 133 | } 134 | 135 | backend { 136 | default = "JES" 137 | providers { 138 | JES { 139 | actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" 140 | config { 141 | # Google project 142 | project = "washu-genome-inh-dis-analysis" 143 | root = "gs://ccdg-100-samples-trios-pilot-crams-mgi/workspace" 144 | 145 | # Set this to the lower of the two values "Queries per 100 seconds" and "Queries per 100 seconds per user" for 146 | # your project. 147 | # 148 | # Used to help determine maximum throughput to the Google Genomics API. Setting this value too low will 149 | # cause a drop in performance. Setting this value too high will cause QPS based locks from Google. 150 | # 1000 is the default "Queries per 100 seconds per user", 50000 is the default "Queries per 100 seconds" 151 | # See https://cloud.google.com/genomics/quotas for more information 152 | genomics-api-queries-per-100-seconds = 1000 153 | 154 | # Polling for completion backs-off gradually for slower-running jobs. 155 | # This is the maximum polling interval (in seconds): 156 | maximum-polling-interval = 600 157 | 158 | genomics { 159 | # A reference to an auth defined in the `google` stanza at the top. This auth is used to create 160 | # Pipelines and manipulate auth JSONs. 161 | auth = "application-default" 162 | 163 | 164 | // alternative service account to use on the launched compute instance 165 | // NOTE: If combined with service account authorization, both that serivce account and this service account 166 | // must be able to read and write to the 'root' GCS path 167 | compute-service-account = "default" 168 | 169 | # Endpoint for APIs, no reason to change this unless directed by Google. 
170 | endpoint-url = "https://genomics.googleapis.com/" 171 | } 172 | 173 | filesystems { 174 | gcs { 175 | # A reference to a potentially different auth for manipulating files via engine functions. 176 | auth = "application-default" 177 | } 178 | } 179 | 180 | } 181 | } 182 | } 183 | } 184 | 185 | services { 186 | KeyValue { 187 | class = "cromwell.services.keyvalue.impl.SqlKeyValueServiceActor" 188 | } 189 | MetadataService { 190 | class = "cromwell.services.metadata.impl.MetadataServiceActor" 191 | config { 192 | # Set this value to "Inf" to turn off metadata summary refresh. The default value is currently "2 seconds". 193 | # metadata-summary-refresh-interval = "Inf" 194 | # For higher scale environments, e.g. many workflows and/or jobs, DB write performance for metadata events 195 | # can improved by writing to the database in batches. Increasing this value can dramatically improve overall 196 | # performance but will both lead to a higher memory usage as well as increase the risk that metadata events 197 | # might not have been persisted in the event of a Cromwell crash. 198 | # 199 | # For normal usage the default value of 1 (effectively no batching) should be fine but for larger/production 200 | # environments we recommend a value of at least 500. There'll be no one size fits all number here so we recommend 201 | # benchmarking performance and tuning the value to match your environment 202 | # db-batch-size = 1 203 | # 204 | # Periodically the stored metadata events will be forcibly written to the DB regardless of if the batch size 205 | # has been reached. This is to prevent situations where events wind up never being written to an incomplete batch 206 | # with no new events being generated. The default value is currently 5 seconds 207 | # db-flush-rate = 5 seconds 208 | } 209 | } 210 | } 211 | 212 | database { 213 | # hsql default 214 | profile = "slick.jdbc.HsqldbProfile$" 215 | db { 216 | driver = "org.hsqldb.jdbcDriver" 217 | url = "jdbc:hsqldb:mem:${uniqueSchema};shutdown=false;hsqldb.tx=mvcc" 218 | connectionTimeout = 3000 219 | } 220 | 221 | # mysql example 222 | #driver = "slick.driver.MySQLDriver$" 223 | #db { 224 | # driver = "com.mysql.jdbc.Driver" 225 | # url = "jdbc:mysql://host/cromwell?rewriteBatchedStatements=true" 226 | # user = "user" 227 | # password = "pass" 228 | # connectionTimeout = 5000 229 | #} 230 | 231 | # For batch inserts the number of inserts to send to the DB at a time 232 | # insert-batch-size = 2000 233 | 234 | migration { 235 | # For databases with a very large number of symbols, selecting all the rows at once can generate a variety of 236 | # problems. In order to avoid any issue, the selection is paginated. This value sets how many rows should be 237 | # retrieved and processed at a time, before asking for the next chunk. 238 | read-batch-size = 100000 239 | 240 | # Because a symbol row can contain any arbitrary wdl value, the amount of metadata rows to insert from a single 241 | # symbol row can vary from 1 to several thousands (or more). To keep the size of the insert batch from growing out 242 | # of control we monitor its size and execute/commit when it reaches or exceeds writeBatchSize. 
243 | write-batch-size = 100000 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/application.conf.template: -------------------------------------------------------------------------------- 1 | webservice { 2 | port = 8000 3 | interface = 0.0.0.0 4 | binding-timeout = 5s 5 | instance.name = "reference" 6 | } 7 | 8 | akka { 9 | actor.default-dispatcher.fork-join-executor { 10 | # Number of threads = min(parallelism-factor * cpus, parallelism-max) 11 | # Below are the default values set by Akka, uncomment to tune these 12 | 13 | #parallelism-factor = 3.0 14 | #parallelism-max = 64 15 | } 16 | 17 | dispatchers { 18 | # A dispatcher for actors performing blocking io operations 19 | # Prevents the whole system from being slowed down when waiting for responses from external resources for instance 20 | io-dispatcher { 21 | type = Dispatcher 22 | executor = "fork-join-executor" 23 | # Using the forkjoin defaults, this can be tuned if we wish 24 | } 25 | 26 | # A dispatcher for actors handling API operations 27 | # Keeps the API responsive regardless of the load of workflows being run 28 | api-dispatcher { 29 | type = Dispatcher 30 | executor = "fork-join-executor" 31 | } 32 | 33 | # A dispatcher for engine actors 34 | # Because backends behaviour is unpredictable (potentially blocking, slow) the engine runs 35 | # on its own dispatcher to prevent backends from affecting its performance. 36 | engine-dispatcher { 37 | type = Dispatcher 38 | executor = "fork-join-executor" 39 | } 40 | 41 | # A dispatcher used by supported backend actors 42 | backend-dispatcher { 43 | type = Dispatcher 44 | executor = "fork-join-executor" 45 | } 46 | 47 | # A dispatcher used for the service registry 48 | service-dispatcher { 49 | type = Dispatcher 50 | executor = "fork-join-executor" 51 | } 52 | # Note that without further configuration, all other actors run on the default dispatcher 53 | } 54 | } 55 | 56 | system { 57 | # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting 58 | abort-jobs-on-terminate = false 59 | 60 | # Max number of retries per job that the engine will attempt in case of a retryable failure received from the backend 61 | max-retries = 10 62 | 63 | # If 'true' then when Cromwell starts up, it tries to restart incomplete workflows 64 | workflow-restart = true 65 | 66 | # Cromwell will cap the number of running workflows at N 67 | max-concurrent-workflows = 5000 68 | 69 | # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist 70 | max-workflow-launch-count = 50 71 | 72 | # Number of seconds between workflow launches 73 | new-workflow-poll-rate = 20 74 | 75 | # Since the WorkflowLogCopyRouter is initialized in code, this is the number of workers 76 | number-of-workflow-log-copy-workers = 10 77 | 78 | # Default number of cache read workers 79 | number-of-cache-read-workers = 25 80 | 81 | io { 82 | # Global Throttling - This is mostly useful for GCS and can be adjusted to match 83 | # the quota availble on the GCS API 84 | number-of-requests = 100000 85 | per = 100 seconds 86 | 87 | # Number of times an I/O operation should be attempted before giving up and failing it. 
88 | number-of-attempts = 5 89 | } 90 | } 91 | 92 | workflow-options { 93 | # These workflow options will be encrypted when stored in the database 94 | encrypted-fields: [] 95 | 96 | # AES-256 key to use to encrypt the values in `encrypted-fields` 97 | base64-encryption-key: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" 98 | 99 | # Directory where to write per workflow logs 100 | workflow-log-dir: "cromwell-workflow-logs" 101 | 102 | # When true, per workflow logs will be deleted after copying 103 | workflow-log-temporary: true 104 | 105 | # Workflow-failure-mode determines what happens to other calls when a call fails. Can be either ContinueWhilePossible or NoNewCalls. 106 | # Can also be overridden in workflow options. Defaults to NoNewCalls. Uncomment to change: 107 | #workflow-failure-mode: "ContinueWhilePossible" 108 | } 109 | 110 | // Optional call-caching configuration. 111 | call-caching { 112 | enabled = true 113 | invalidate-bad-cache-results = true 114 | } 115 | 116 | engine { 117 | # This instructs the engine which filesystems are at its disposal to perform any IO operation that it might need. 118 | # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here. 119 | # If you intend to be able to run workflows with this kind of declarations: 120 | # workflow { 121 | # String str = read_string("gs://bucket/my-file.txt") 122 | # } 123 | # You will need to provide the engine with a gcs filesystem 124 | # Note that the default filesystem (local) is always available. 125 | filesystems { 126 | # gcs { 127 | # auth = "application-default" 128 | # } 129 | local { 130 | caching { 131 | # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below: 132 | duplication-strategy: [ 133 | "soft-link" 134 | ] 135 | 136 | # Possible values: file, path 137 | # "file" will compute an md5 hash of the file content. 138 | # "path" will compute an md5 hash of the file path. This strategy will only be effective if the duplication-strategy (above) is set to "soft-link", 139 | # in order to allow for the original file path to be hashed. 140 | # Default: file 141 | hashing-strategy: "path" 142 | 143 | # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. 144 | # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. 145 | # Default: false 146 | check-sibling-md5: false 147 | } 148 | } 149 | } 150 | } 151 | 152 | backend { 153 | default = "LSF" 154 | providers { 155 | Local { 156 | actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" 157 | config { 158 | run-in-background = true 159 | runtime-attributes = "String? docker" 160 | submit = "/bin/bash ${script}" 161 | submit-docker = "docker run --rm -v ${cwd}:${docker_cwd} -i ${docker} /bin/bash < ${script}" 162 | 163 | # Root directory where Cromwell writes job results. This directory must be 164 | # visible and writeable by the Cromwell process as well as the jobs that Cromwell 165 | # launches. 166 | root = "%%SHARED_FS_DIRECTORY%%/cromwell-executions" // Change this to your directory that contains this application.conf file. 
167 | 168 | filesystems { 169 | local { 170 | localization: [ 171 | "hard-link", "soft-link", "copy" 172 | ] 173 | } 174 | } 175 | default-runtime-attributes { 176 | failOnStderr: false 177 | continueOnReturnCode: 0 178 | } 179 | } 180 | } 181 | 182 | 183 | LSF { 184 | actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" 185 | config { 186 | runtime-attributes = """ 187 | Int cpu = 1 188 | Int? memory_mb = 4000 189 | String? queue = 'research-hpc' 190 | String? project 191 | String? docker 192 | String? resource 193 | String? job_group 194 | String? priority 195 | """ 196 | 197 | submit = """ 198 | bsub \ 199 | -J ${job_name} \ 200 | -cwd ${cwd} \ 201 | -o ${out} \ 202 | -e ${err} \ 203 | ${"-P " + project} \ 204 | ${"-q " + queue} \ 205 | -M ${memory_mb}000 \ 206 | -R 'select[mem>${memory_mb}] rusage[mem=${memory_mb}] span[hosts=1]' \ 207 | ${"-n " + cpu} \ 208 | ${"-R \"" + resource + "\""} \ 209 | ${"-g \"" + job_group + "\""} \ 210 | ${"-sp " + priority} \ 211 | /bin/bash ${script} 212 | """ 213 | 214 | submit-docker = """ 215 | LSF_DOCKER_PRESERVE_ENVIRONMENT='false' \ 216 | LSF_DOCKER_VOLUMES='${cwd}:${docker_cwd}' \ 217 | bsub \ 218 | -J ${job_name} \ 219 | -cwd ${cwd} \ 220 | -a 'docker(${docker})' \ 221 | ${"-P " + project} \ 222 | ${"-q " + queue} \ 223 | -M ${memory_mb}000 \ 224 | -R 'select[mem>${memory_mb}] rusage[mem=${memory_mb}] span[hosts=1]' \ 225 | ${"-n " + cpu} \ 226 | ${"-R \"" + resource + "\""} \ 227 | ${"-g \"" + job_group + "\""} \ 228 | ${"-sp " + priority} \ 229 | /bin/bash -c '/bin/bash ${script} >${out} 2>${err}' 230 | """ 231 | 232 | kill = "bkill ${job_id}" 233 | check-alive = "bjobs -noheader -o \"stat\" ${job_id} | /bin/grep 'PEND\\|RUN'" 234 | job-id-regex = "Job <(\\d+)>.*" 235 | root = "%%SHARED_FS_DIRECTORY%%/cromwell-executions" 236 | filesystems { 237 | localization: [ 238 | "soft-link" 239 | ] 240 | hashing-strategy: "path" 241 | } 242 | default-runtime-attributes { 243 | failOnStderr: false 244 | continueOnReturnCode: 0 245 | } 246 | } 247 | } 248 | } 249 | } 250 | 251 | services { 252 | KeyValue { 253 | class = "cromwell.services.keyvalue.impl.SqlKeyValueServiceActor" 254 | } 255 | MetadataService { 256 | class = "cromwell.services.metadata.impl.MetadataServiceActor" 257 | config { 258 | # Set this value to "Inf" to turn off metadata summary refresh. The default value is currently "2 seconds". 259 | # metadata-summary-refresh-interval = "Inf" 260 | # For higher scale environments, e.g. many workflows and/or jobs, DB write performance for metadata events 261 | # can improved by writing to the database in batches. Increasing this value can dramatically improve overall 262 | # performance but will both lead to a higher memory usage as well as increase the risk that metadata events 263 | # might not have been persisted in the event of a Cromwell crash. 264 | # 265 | # For normal usage the default value of 1 (effectively no batching) should be fine but for larger/production 266 | # environments we recommend a value of at least 500. There'll be no one size fits all number here so we recommend 267 | # benchmarking performance and tuning the value to match your environment 268 | # db-batch-size = 1 269 | # 270 | # Periodically the stored metadata events will be forcibly written to the DB regardless of if the batch size 271 | # has been reached. This is to prevent situations where events wind up never being written to an incomplete batch 272 | # with no new events being generated. 
The default value is currently 5 seconds 273 | # db-flush-rate = 5 seconds 274 | } 275 | } 276 | } 277 | 278 | database { 279 | # mysql example 280 | profile = "slick.jdbc.MySQLProfile$" 281 | db { 282 | driver = "com.mysql.jdbc.Driver" 283 | url = "jdbc:mysql://localhost:3306/cromwell?socket=/tmp/mysqld.sock" 284 | user = "cromwell" 285 | password = "test4cromwell" 286 | connectionTimeout = 10000 287 | } 288 | 289 | migration { 290 | # For databases with a very large number of symbols, selecting all the rows at once can generate a variety of 291 | # problems. In order to avoid any issue, the selection is paginated. This value sets how many rows should be 292 | # retrieved and processed at a time, before asking for the next chunk. 293 | read-batch-size = 100000 294 | 295 | # Because a symbol row can contain any arbitrary wdl value, the amount of metadata rows to insert from a single 296 | # symbol row can vary from 1 to several thousands (or more). To keep the size of the insert batch from growing out 297 | # of control we monitor its size and execute/commit when it reaches or exceeds writeBatchSize. 298 | write-batch-size = 100000 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /scripts/SV_Tasks.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | # get the sample (SM) field from a CRAM file 3 | task Split_By_Type { 4 | input { 5 | File input_vcf 6 | String output_vcf_prefix 7 | Int preemptible_tries 8 | } 9 | command <<< 10 | set -eo pipefail 11 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=BND --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.bnd.vcf.gz 12 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=DEL --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.del.vcf.gz 13 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=INS --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.ins.vcf.gz 14 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk --header '{if(I$SVTYPE!="DEL" && I$SVTYPE!="BND" && I$SVTYPE!="INS") print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.other.vcf.gz 15 | zcat ~{output_vcf_prefix}.ins.vcf.gz | \ 16 | /opt/hall-lab/vawk/vawk '{ct=split(I$SNAME, spl, ","); for(ii=1; ii<=ct; ii++) print $3, spl[ii], $9}' | \ 17 | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.ins_split.txt.gz 18 | >>> 19 | runtime { 20 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 21 | cpu: "1" 22 | memory: "1 GB" 23 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 24 | preemptible: preemptible_tries 25 | } 26 | output { 27 | File bnd_vcf = "${output_vcf_prefix}.bnd.vcf.gz" 28 | File del_vcf = "${output_vcf_prefix}.del.vcf.gz" 29 | File ins_vcf = "${output_vcf_prefix}.ins.vcf.gz" 30 | File other_vcf = "${output_vcf_prefix}.other.vcf.gz" 31 | File ins_split = 
"${output_vcf_prefix}.ins_split.txt.gz" 32 | } 33 | } 34 | 35 | task Get_Sample_Name { 36 | input { 37 | File input_cram 38 | Int preemptible_tries 39 | } 40 | 41 | command { 42 | set -eo pipefail 43 | samtools view -H ${input_cram} \ 44 | | grep -m 1 '^@RG' | tr '\t' '\n' \ 45 | | grep '^SM:' | sed 's/^SM://g' 46 | } 47 | 48 | runtime { 49 | docker: "halllab/extract-sv-reads@sha256:192090f72afaeaaafa104d50890b2fc23935c8dc98988a9b5c80ddf4ec50f70c" 50 | cpu: "1" 51 | memory: "1 GB" 52 | disks: "local-disk " + ceil( size(input_cram, "GB") + 2.0) + " HDD" 53 | preemptible: preemptible_tries 54 | } 55 | 56 | output { 57 | String sample = read_string(stdout()) 58 | } 59 | } 60 | 61 | # infer the sex of a sample based on chrom X copy number 62 | task Get_Sex { 63 | input { 64 | File input_cn_hist_root 65 | File ref_fasta_index 66 | Int preemptible_tries 67 | } 68 | 69 | command <<< 70 | set -eo pipefail 71 | cat ~{ref_fasta_index} \ 72 | | awk '$1=="chrX" { print $1":0-"$2 } END { print "exit"}' \ 73 | | cnvnator -root ~{input_cn_hist_root} -genotype 100 \ 74 | | grep -v "^Assuming male" \ 75 | | awk '{ printf("%.0f\n",$4); }' 76 | >>> 77 | 78 | runtime { 79 | docker: "halllab/cnvnator@sha256:8bf4fa64a288c5647a9a6b1ea90d14e76f48a3e16c5bf98c63419bb7d81c8938" 80 | cpu: "1" 81 | memory: "1 GB" 82 | disks: "local-disk 4 HDD" 83 | preemptible: preemptible_tries 84 | } 85 | 86 | output { 87 | String sex = read_string(stdout()) 88 | } 89 | } 90 | 91 | # Create pedigree file from samples, with sex inferred from 92 | # CNVnator X chrom copy number 93 | task Make_Pedigree_File { 94 | input { 95 | Array[String] sample_array 96 | Array[String] sex_array 97 | String output_ped_basename 98 | File sample_file = write_lines(sample_array) 99 | File sex_file = write_lines(sex_array) 100 | } 101 | 102 | command <<< 103 | set -eo pipefail 104 | paste ~{sample_file} ~{sex_file} \ 105 | | awk '{ print $1,$1,-9,-9,$2,-9 }' OFS='\t' \ 106 | > ~{output_ped_basename}.ped 107 | >>> 108 | 109 | runtime { 110 | docker: "ubuntu@sha256:edf05697d8ea17028a69726b4b450ad48da8b29884cd640fec950c904bfb50ce" 111 | cpu: "1" 112 | memory: "1 GB" 113 | disks: "local-disk 4 HDD" 114 | } 115 | 116 | output { 117 | File output_ped = "${output_ped_basename}.ped" 118 | } 119 | } 120 | 121 | # index a CRAM 122 | task Index_Cram { 123 | input { 124 | File input_cram 125 | String basename 126 | File ref_cache 127 | Int preemptible_tries 128 | } 129 | 130 | command { 131 | set -eo pipefail 132 | ln -s ${input_cram} ${basename}.cram 133 | 134 | # build the reference sequence cache 135 | tar -zxf ${ref_cache} 136 | export REF_PATH=./cache/%2s/%2s/%s 137 | export REF_CACHE=./cache/%2s/%2s/%s 138 | 139 | # index the CRAM 140 | samtools index ${basename}.cram 141 | } 142 | 143 | runtime { 144 | docker: "halllab/samtools@sha256:5e6b0430a7ad25f68e5c46a9fa9c0ebba0f9af8ebf5aebe94242954d812a4e68" 145 | cpu: "1" 146 | memory: "1 GB" 147 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(ref_cache, "GB") * 5 + 1.0) + " HDD" 148 | preemptible: preemptible_tries 149 | } 150 | 151 | output { 152 | File output_cram_index = "${basename}.cram.crai" 153 | } 154 | } 155 | 156 | task Filter_Index { 157 | input { 158 | File input_vcf_gz 159 | String output_vcf_name 160 | Int preemptible_tries 161 | } 162 | 163 | command <<< 164 | set -eo pipefail 165 | FILTERLINE='##FILTER=' 166 | zcat ~{input_vcf_gz} | \ 167 | /opt/hall-lab/vawk/vawk '{ \ 168 | split(I$STRANDS,x,","); \ 169 | split(x[1],y,":"); \ 170 | split(x[2],z,":"); \ 171 | if (I$SVTYPE=="INS" && 
I$NSAMP>0) { \ 172 | I$MSQ=QUAL/I$NSAMP; \ 173 | gsub("MSQ=0.00", "MSQ="I$MSQ, $8) \ 174 | } \ 175 | if ((I$SVTYPE=="DEL" || I$SVTYPE=="DUP" || I$SVTYPE=="MEI") && \ 176 | I$MSQ>=100 && sqrt((I$SVLEN)*(I$SVLEN))>=50){ \ 177 | $7="PASS"; print $0; \ 178 | } else if ( I$SVTYPE=="INV" && $6>=100 && (I$SR/I$SU)>=0.1 && (I$PE/I$SU)>=0.1 && (y[2]/I$SU)>0.1 && (z[2]/I$SU)>0.1 && sqrt((I$SVLEN)*(I$SVLEN))>=50){ \ 179 | $7="PASS"; print $0; \ 180 | } else if ( I$SVTYPE=="BND" && $9 !~ /CN/ && I$MSQ>=500){ \ 181 | $7="PASS"; print $0; \ 182 | } else if ( I$SVTYPE=="BND" && $9 ~ /CN/ && I$MSQ>=250){ \ 183 | $7="PASS"; print $0; \ 184 | } else if ( I$SVTYPE=="INS" && I$MSQ>=100 && I$SVLEN >=50) { \ 185 | $7="PASS"; print $0; \ 186 | } else { \ 187 | $7="LOW"; print $0; \ 188 | } \ 189 | }' | cat <(zcat ~{input_vcf_gz} | sed -n '/^#[^#]/q;p') <(echo $FILTERLINE) <(zgrep -m 1 '^#CHROM' ~{input_vcf_gz}) - | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_name} 190 | /opt/hall-lab/htslib-1.9/bin/tabix -p vcf -f ~{output_vcf_name} 191 | >>> 192 | 193 | runtime { 194 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 195 | cpu: "1" 196 | memory: "1 GB" 197 | disks: "local-disk " + ceil( size(input_vcf_gz, "GB") * 2) + " HDD" 198 | preemptible: preemptible_tries 199 | } 200 | 201 | output { 202 | File output_vcf_gz = "${output_vcf_name}" 203 | File output_vcf_gz_index = "${output_vcf_name}.tbi" 204 | } 205 | 206 | } 207 | 208 | task Count_Lumpy { 209 | input { 210 | String basename 211 | File input_vcf 212 | Int preemptible_tries 213 | String cohort 214 | String center 215 | } 216 | 217 | command <<< 218 | set -eo pipefail 219 | 220 | bcftools query -f "[%CHROM\t~{cohort}\t~{center}\t%FILTER\t%INFO/SVTYPE\t%INFO/SVLEN\t%INFO/SR\t%SAMPLE\t%GT\n]" ~{input_vcf} \ 221 | | awk 'BEGIN{OFS="\t"}{if($1~/chr[1-9]+/ && $1!~/_/) { 222 | svlen=$6; 223 | if($6<0 && $6!=".") svlen=-1*$6; 224 | len_bin=">=1kb" 225 | if(svlen<1000) len_bin="<1kb"; 226 | if($7>0) $7="SR>=1"; 227 | else $7="SR=0"; 228 | print $1, $2, $3, $4, $5, len_bin, $7, $8, $9;}}' \ 229 | | sort -k1,9 \ 230 | | uniq -c \ 231 | | awk 'BEGIN{OFS="\t"}{print $2, $3, $4, $5, $6, $7, $8, $9, $1}' > ~{basename}.lumpy.counts.1.txt 232 | >>> 233 | 234 | runtime { 235 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 236 | cpu: "1" 237 | memory: "1 GB" 238 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 239 | preemptible: preemptible_tries 240 | } 241 | 242 | output { 243 | File output_counts = "${basename}.lumpy.counts.1.txt" 244 | } 245 | } 246 | 247 | task Count_Manta { 248 | input { 249 | String basename 250 | File input_vcf 251 | Int preemptible_tries 252 | String cohort 253 | String center 254 | } 255 | 256 | command <<< 257 | set -eo pipefail 258 | 259 | bcftools query -f "[%CHROM\t~{cohort}\t~{center}\t%FILTER\t%INFO/SVTYPE\t%SAMPLE\t%GT\n]" ~{input_vcf} \ 260 | | awk 'BEGIN{OFS="\t"}{if($1~/chr[1-9]+/ && $1!~/_/ && $4=="PASS") print $0;}' \ 261 | | sort -k1,7 \ 262 | | uniq -c \ 263 | | awk 'BEGIN{OGS="\t"}{print $2, $3, $4, $5, $6, $7, $8, $1}' > ~{basename}.manta.counts.1.txt 264 | >>> 265 | 266 | runtime { 267 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 268 | cpu: "1" 269 | memory: "1 GB" 270 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 271 | preemptible: preemptible_tries 272 | } 273 | 274 | output { 275 | File output_counts = 
"${basename}.manta.counts.1.txt" 276 | } 277 | } 278 | 279 | task Manta { 280 | input { 281 | File input_cram 282 | File input_cram_index 283 | File ref_fasta 284 | File ref_fasta_index 285 | File ref_cache 286 | File? call_regions_bed 287 | File? call_regions_bed_index 288 | String basename 289 | Int preemptible_tries 290 | } 291 | 292 | # Manta requires 2GB per thread for scheduling, but in typical cases uses less than this 293 | # see https://github.com/Illumina/manta/issues/38 294 | # Setting below derives CPU count from machine 295 | # Sets RAM to unlimited to jobs are scheduled only 296 | # with respect to cores 297 | # If a task starts to fail then we can adjust the machine resources to get it 298 | # to succeed without adjusting the command 299 | # Note that we are converting to BAM on the fly as CRAM is showing extreme memory usage in some situations. See https://github.com/Illumina/manta/issues/154. 300 | # Note also that we are specifying an inflation factor of 4, but padding with 20GB of data. This is aimed to get us over 100GB of SSD for better performance on small samples. 301 | 302 | command { 303 | set -eo pipefail 304 | ln -s ${input_cram} ${basename}.cram 305 | ln -s ${input_cram_index} ${basename}.cram.crai 306 | 307 | tar -zxf ${ref_cache} 308 | export REF_PATH=./cache/%2s/%2s/%s 309 | export REF_CACHE=./cache/%2s/%2s/%s 310 | 311 | ${"touch " + call_regions_bed_index} 312 | 313 | samtools view -hb -@8 ${basename}.cram -o ${basename}.bam 314 | samtools index -@8 ${basename}.bam 315 | 316 | configManta.py \ 317 | --referenceFasta=${ref_fasta} \ 318 | --runDir=MantaWorkflow \ 319 | --bam=${basename}.bam ${"--callRegions=" + call_regions_bed} 320 | MantaWorkflow/runWorkflow.py -m local -g "unlimited" 321 | mv MantaWorkflow/results/variants/diploidSV.vcf.gz ${basename}.vcf.gz 322 | mv MantaWorkflow/results/variants/diploidSV.vcf.gz.tbi ${basename}.vcf.gz.tbi 323 | zcat ${basename}.vcf.gz | /opt/hall-lab/python-2.7.15/bin/python /opt/hall-lab/doctor_manta.1.py -m 700 | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ${basename}.doctored.vcf.gz 324 | /opt/hall-lab/htslib-1.9/bin/tabix -p vcf ${basename}.doctored.vcf.gz 325 | tar -czvf ${basename}.MantaWorkflow.tgz MantaWorkflow 326 | } 327 | runtime { 328 | docker: "halllab/manta_samtools@sha256:d39fac59a2c06f808d115c65b9c191baf5f249769d317263ae3cd19e2c74d20e" 329 | cpu: "8" 330 | memory: "16 GiB" 331 | disks: "local-disk " + ceil( size(input_cram, "GB") * 4 + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_cache, "GB") * 5 + 20.0) + " SSD" 332 | preemptible: preemptible_tries 333 | } 334 | output { 335 | File output_vcf = "${basename}.doctored.vcf.gz" 336 | File output_tbi = "${basename}.doctored.vcf.gz.tbi" 337 | File original_vcf = "${basename}.vcf.gz" 338 | File original_tbi = "${basename}.vcf.gz.tbi" 339 | File workflow_tgz = "${basename}.MantaWorkflow.tgz" 340 | } 341 | } 342 | 343 | # Smoove wrapper 344 | task Smoove { 345 | input { 346 | String basename 347 | File input_cram 348 | File input_cram_index 349 | 350 | File ref_fasta 351 | File ref_fasta_index 352 | File ref_cache 353 | File exclude_regions 354 | 355 | Int preemptible_tries 356 | } 357 | 358 | command { 359 | set -eo pipefail 360 | ln -s ${input_cram} ${basename}.cram 361 | ln -s ${input_cram_index} ${basename}.cram.crai 362 | 363 | tar -zxf ${ref_cache} 364 | export REF_PATH=./cache/%2s/%2s/%s 365 | export REF_CACHE=./cache/%2s/%2s/%s 366 | 367 | export SMOOVE_NO_MAX_CI=TRUE 368 | 369 | smoove call \ 370 | --name 
${basename} \ 371 | --exclude ${exclude_regions} \ 372 | --fasta ${ref_fasta} \ 373 | --noextrafilters \ 374 | --genotype \ 375 | ${basename}.cram 376 | 377 | if [ ! -e ${basename}.histo ]; then 378 | mv *.histo ${basename}.histo 379 | mv *.split.bam ${basename}.split.bam 380 | mv *.split.bam.bai ${basename}.split.bam.bai 381 | mv *.disc.bam ${basename}.disc.bam 382 | mv *.disc.bam.bai ${basename}.disc.bam.bai 383 | fi 384 | } 385 | 386 | runtime { 387 | docker: "brentp/smoove@sha256:c839ed223462a1c1ae26e7acc27f28f0f67b4581d80a06823895f295ad2bdaf4" 388 | cpu: "1" 389 | memory: "2.5 GiB" 390 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(exclude_regions, "GB") + size(input_cram, "GB") * 0.30 + size(ref_cache, "GB") * 5) + " HDD" 391 | preemptible: preemptible_tries 392 | } 393 | 394 | output { 395 | File output_vcf = "${basename}-smoove.genotyped.vcf.gz" 396 | File output_csi = "${basename}-smoove.genotyped.vcf.gz.csi" 397 | File output_histogram = "${basename}.histo" 398 | File lumpy_script = "${basename}-lumpy-cmd.sh" 399 | File splitters = "${basename}.split.bam" 400 | File splitters_index = "${basename}.split.bam.bai" 401 | File discordants = "${basename}.disc.bam" 402 | File discordants_index = "${basename}.disc.bam.bai" 403 | } 404 | } 405 | 406 | task Genotype { 407 | input { 408 | String basename 409 | File input_cram 410 | File input_cram_index 411 | File input_vcf 412 | File ref_cache 413 | Int preemptible_tries 414 | } 415 | 416 | command { 417 | set -eo pipefail 418 | ln -s ${input_cram} ${basename}.cram 419 | ln -s ${input_cram_index} ${basename}.cram.crai 420 | 421 | # build the reference sequence cache 422 | tar -zxf ${ref_cache} 423 | export REF_PATH=./cache/%2s/%2s/%s 424 | export REF_CACHE=./cache/%2s/%2s/%s 425 | 426 | rm -f ${basename}.cram.json 427 | zcat ${input_vcf} \ 428 | | svtyper \ 429 | -B ${basename}.cram \ 430 | -l ${basename}.cram.json \ 431 | | bgzip -c > ${basename}.gt.vcf.gz 432 | } 433 | 434 | runtime { 435 | docker: "halllab/svtyper@sha256:8ebb0508bc63a2a32d22b4a3e55453222560daa30b7cc14a4f1189cb311d5922" 436 | cpu: "1" 437 | memory: "15 GB" 438 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_vcf, "GB") + size(ref_cache, "GB") * 5 + 20.0) + " HDD" 439 | preemptible: preemptible_tries 440 | } 441 | 442 | output { 443 | File output_vcf = "${basename}.gt.vcf.gz" 444 | File output_lib = "${basename}.cram.json" 445 | } 446 | } 447 | 448 | task Take_Original_Genotypes { 449 | input { 450 | String sample_name 451 | String basename 452 | File input_vcf 453 | File input_variant_to_sname_mapping 454 | File original_per_sample_vcf 455 | Int preemptible_tries 456 | } 457 | 458 | command <<< 459 | set -eo pipefail 460 | zcat ~{input_variant_to_sname_mapping} \ 461 | | /opt/hall-lab/vawk/vawk -v sname="~{sample_name}" 'BEGIN{OFS="\t"}{ \ 462 | split($2, spl, ":"); \ 463 | if(spl[1]==sname) { \ 464 | print $1, spl[1], spl[2]":"spl[3]":"spl[4]":"spl[5]":"spl[6]":"spl[7]":"spl[8]; \ 465 | } \ 466 | }' \ 467 | | /opt/hall-lab/io/zjoin -a stdin -b <(paste -d ":" <(zcat ~{original_per_sample_vcf} | grep -v "^#" | cut -f 3,9-) <(zcat ~{original_per_sample_vcf} | grep -v "^#" | cut -f 4,5 | tr "\t" ":") <(zcat ~{original_per_sample_vcf} | /opt/hall-lab/vawk/vawk '{svlen=I$SVLEN; if(svlen==""){svlen="."} print svlen}') | sed 's/:SR/:SR:OREF:OALT:OSVLEN/') -1 3 -2 1 \ 468 | | cut -f 1,5- \ 469 | | awk -v sname="~{sample_name}" 'BEGIN{OFS="\t"; print "ID", 
"FORMAT", sname;}{ \ 470 | print $0; \ 471 | }' \ 472 | | /opt/hall-lab/htslib-1.9/bin/bgzip -c > temp 473 | 474 | zcat ~{input_vcf} \ 475 | | /opt/hall-lab/io/zjoin -r -p "##" -a stdin -b <(zcat temp | sort -k1,1 | /opt/hall-lab/bin/bedtools groupby -g 1 -c 2,3 -o first,first ) -1 3 -2 1 \ 476 | | cut -f -8,10- \ 477 | | /opt/hall-lab/vawk/vawk --header 'BEGIN{OFS="\t"}{if($9=="NA") {$9="GT:FT:GQ:PL:PR:SR:OREF:OALT:OSVLEN"; $10="0/0:.:.:.:.:.:.:.:.";} print $0;}' \ 478 | | sed 's/^#CHROM/##FORMAT=\n##FORMAT=\n##FORMAT=\n#CHROM/' \ 479 | | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{basename}.gt.vcf.gz 480 | >>> 481 | 482 | runtime { 483 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 484 | cpu: "1" 485 | memory: "15 GB" 486 | disks: "local-disk " + ceil( size(original_per_sample_vcf, "GB") + size(input_vcf, "GB") + size(input_variant_to_sname_mapping, "GB") + 20.0) + " HDD" 487 | preemptible: preemptible_tries 488 | } 489 | 490 | output { 491 | File output_vcf = "${basename}.gt.vcf.gz" 492 | } 493 | } 494 | 495 | task Copy_Number { 496 | input { 497 | String basename 498 | String sample 499 | File input_vcf 500 | File input_cn_hist_root 501 | File ref_cache 502 | Int preemptible_tries 503 | } 504 | 505 | command { 506 | set -eo pipefail 507 | zcat ${input_vcf} \ 508 | | create_coordinates \ 509 | -o coordinates.txt 510 | 511 | svtools copynumber \ 512 | -i ${input_vcf} \ 513 | -s ${sample} \ 514 | --cnvnator cnvnator \ 515 | -w 100 \ 516 | -r ${input_cn_hist_root} \ 517 | -c coordinates.txt \ 518 | | bgzip -c \ 519 | > ${basename}.cn.vcf.gz 520 | } 521 | 522 | runtime { 523 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 524 | cpu: "1" 525 | memory: "4 GB" 526 | disks: "local-disk " + 35 + " HDD" 527 | preemptible: preemptible_tries 528 | } 529 | 530 | output { 531 | File output_vcf = "${basename}.cn.vcf.gz" 532 | } 533 | } 534 | 535 | task CNVnator_Histogram { 536 | input { 537 | String basename 538 | File input_cram 539 | File input_cram_index 540 | File ref_fasta 541 | File ref_fasta_index 542 | File ref_cache 543 | String ref_chrom_dir = "cnvnator_chroms" 544 | Int preemptible_tries 545 | Int threads = 4 546 | # Add 7G of pad of the chromosome directory and ~2-3 GB of output files 547 | } 548 | 549 | command <<< 550 | set -eo pipefail 551 | ln -s ~{input_cram} ~{basename}.cram 552 | ln -s ~{input_cram_index} ~{basename}.cram.crai 553 | 554 | # build the reference sequence cache 555 | tar -zxf ~{ref_cache} 556 | export REF_PATH=./cache/%2s/%2s/%s 557 | export REF_CACHE=./cache/%2s/%2s/%s 558 | 559 | # Create directory of chromosome FASTA files for CNVnator 560 | mkdir -p ~{ref_chrom_dir} 561 | awk -v CHROM_DIR=~{ref_chrom_dir} 'BEGIN { CHROM="" } { if ($1~"^>") CHROM=substr($1,2); print $0 > CHROM_DIR"/"CHROM".fa" }' ~{ref_fasta} 562 | 563 | cnvnator_wrapper.py \ 564 | -T cnvnator.out \ 565 | -o ~{basename}.cn \ 566 | -t ~{threads} \ 567 | -w 100 \ 568 | -b ~{basename}.cram \ 569 | -c ~{ref_chrom_dir} \ 570 | -g GRCh38 \ 571 | --cnvnator cnvnator 572 | >>> 573 | 574 | runtime { 575 | docker: "halllab/cnvnator@sha256:8bf4fa64a288c5647a9a6b1ea90d14e76f48a3e16c5bf98c63419bb7d81c8938" 576 | cpu: threads 577 | memory: "16 GB" 578 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_cache, "GB") * 5 + 7.0 ) + " HDD" 579 | preemptible: preemptible_tries 580 | } 581 | 582 | output { 583 | 
File output_cn_hist_root = "cnvnator.out/${basename}.cram.hist.root" 584 | File output_cn_txt = "${basename}.cn.txt" 585 | File output_cn_bed = "${basename}.cn.bed" 586 | } 587 | } 588 | 589 | task L_Sort_VCF_Variants { 590 | input { 591 | Array[File] input_vcfs 592 | File input_vcfs_file = write_lines(input_vcfs) 593 | String output_vcf_basename 594 | Int preemptible_tries 595 | } 596 | 597 | parameter_meta { 598 | input_vcfs: { 599 | description: "vcf files to sort together", 600 | localization_optional: true 601 | } 602 | } 603 | 604 | command { 605 | set -eo pipefail 606 | # strip the "gs://" prefix from the file paths 607 | cat ${input_vcfs_file} \ 608 | | sed 's/^gs:\/\//\.\//g' \ 609 | > ${input_vcfs_file}.local_map.txt 610 | sleep 1 611 | 612 | svtools lsort \ 613 | -b 200 \ 614 | -f ${input_vcfs_file} \ 615 | -t /cromwell_root/bulk_download \ 616 | | bgzip -c \ 617 | > ${output_vcf_basename}.vcf.gz 618 | } 619 | 620 | runtime { 621 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 622 | cpu: "1" 623 | memory: "3.75 GB" 624 | disks: "local-disk " + 2*ceil(size(input_vcfs, "GB")) +10 + " HDD" 625 | bootDiskSizeGb: 30 626 | preemptible: preemptible_tries 627 | } 628 | 629 | output { 630 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 631 | } 632 | } 633 | 634 | task L_Merge_VCF_Variants { 635 | input { 636 | File input_vcf_gz 637 | String output_vcf_basename 638 | Int preemptible_tries 639 | } 640 | 641 | command { 642 | set -eo pipefail 643 | zcat ${input_vcf_gz} \ 644 | | svtools lmerge \ 645 | -i /dev/stdin \ 646 | -f 20 \ 647 | | bgzip -c \ 648 | > ${output_vcf_basename}.vcf.gz 649 | } 650 | 651 | runtime { 652 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 653 | cpu: "1" 654 | memory: "3.75 GB" 655 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 656 | preemptible: preemptible_tries 657 | } 658 | 659 | output { 660 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 661 | } 662 | } 663 | 664 | task L_Merge_VCF_Variants_weighted { 665 | input { 666 | File input_vcf_gz 667 | String output_vcf_basename 668 | Int preemptible_tries 669 | } 670 | 671 | command { 672 | set -eo pipefail 673 | zcat ${input_vcf_gz} \ 674 | | svtools lmerge \ 675 | -i /dev/stdin \ 676 | -f 20 \ 677 | -w carrier_wt \ 678 | | bgzip -c \ 679 | > ${output_vcf_basename}.vcf.gz 680 | } 681 | 682 | runtime { 683 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 684 | cpu: "1" 685 | memory: "3.75 GB" 686 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 687 | preemptible: preemptible_tries 688 | } 689 | 690 | output { 691 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 692 | } 693 | } 694 | 695 | task Filter_Del { 696 | input { 697 | File input_vcf_gz 698 | String output_vcf_basename 699 | Int preemptible_tries 700 | } 701 | 702 | command <<< 703 | set -eo pipefail 704 | 705 | bcftools view -i '(SVTYPE!="DEL" || SVLEN>1000 || SVLEN<-1000 || INFO/SR>0)' ~{input_vcf_gz} | bgzip -c > ~{output_vcf_basename}.vcf.gz 706 | >>> 707 | 708 | runtime { 709 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 710 | cpu: "1" 711 | memory: "3.75 GB" 712 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 713 | preemptible: preemptible_tries 714 | } 715 | 716 | output { 717 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 718 | } 719 | } 720 | 721 | 
task Filter_Pass { 722 | input { 723 | File input_vcf_gz 724 | String output_vcf_basename 725 | Int preemptible_tries 726 | } 727 | 728 | command <<< 729 | set -eo pipefail 730 | 731 | bcftools view -f .,PASS ~{input_vcf_gz} | bgzip -c > ~{output_vcf_basename}.vcf.gz 732 | >>> 733 | 734 | runtime { 735 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 736 | cpu: "1" 737 | memory: "3.75 GB" 738 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 739 | preemptible: preemptible_tries 740 | } 741 | 742 | output { 743 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 744 | } 745 | } 746 | 747 | task Paste_VCF { 748 | input { 749 | Array[File] input_vcfs 750 | File input_vcfs_file = write_lines(input_vcfs) 751 | String output_vcf_basename 752 | Int preemptible_tries 753 | } 754 | parameter_meta { 755 | input_vcfs: { 756 | description: "vcf files to paste together", 757 | localization_optional: true 758 | } 759 | } 760 | 761 | command { 762 | set -eo pipefail 763 | svtools vcfpaste \ 764 | -f ${input_vcfs_file} \ 765 | -q \ 766 | -t /cromwell_root/bulk_download \ 767 | | bgzip -c \ 768 | > ${output_vcf_basename}.vcf.gz 769 | } 770 | 771 | runtime { 772 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 773 | cpu: "1" 774 | memory: "12 GB" 775 | disks: "local-disk " + 2*ceil(size(input_vcfs, "GB")) + " HDD" 776 | preemptible: 0 777 | } 778 | 779 | output { 780 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 781 | } 782 | } 783 | 784 | task Remove_INS { 785 | input { 786 | File input_vcf_gz 787 | String output_vcf_basename 788 | Int preemptible_tries 789 | } 790 | 791 | command <<< 792 | set -eo pipefail 793 | zcat ~{input_vcf_gz} \ 794 | | awk '{if($5!="<INS>") print $0}' \ 795 | | bgzip -c \ 796 | > ~{output_vcf_basename}.vcf.gz 797 | >>> 798 | 799 | runtime { 800 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 801 | cpu: "1" 802 | memory: "3 GB" 803 | disks: "local-disk " + 2*ceil( size(input_vcf_gz, "GB")) + " HDD" 804 | preemptible: preemptible_tries 805 | } 806 | 807 | output { 808 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 809 | } 810 | } 811 | 812 | task Prune_VCF { 813 | input { 814 | File input_vcf_gz 815 | String output_vcf_basename 816 | Int preemptible_tries 817 | } 818 | 819 | command { 820 | set -eo pipefail 821 | zcat ${input_vcf_gz} \ 822 | | svtools afreq \ 823 | | svtools vcftobedpe \ 824 | | svtools bedpesort \ 825 | | svtools prune -s -d 100 -e 'AF' \ 826 | | svtools bedpetovcf \ 827 | | bgzip -c \ 828 | > ${output_vcf_basename}.vcf.gz 829 | } 830 | 831 | runtime { 832 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 833 | cpu: "1" 834 | memory: "3 GB" 835 | disks: "local-disk " + 3*ceil( size(input_vcf_gz, "GB")) + " HDD" 836 | preemptible: preemptible_tries 837 | } 838 | 839 | output { 840 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 841 | } 842 | } 843 | 844 | task Classify { 845 | input { 846 | File input_vcf_gz 847 | File input_ped 848 | String output_vcf_basename 849 | File mei_annotation_bed 850 | Int preemptible_tries 851 | } 852 | 853 | command { 854 | set -eo pipefail 855 | cat ${input_ped} \ 856 | | cut -f 2,5 \ 857 | > sex.txt 858 | 859 | zcat ${input_vcf_gz} \ 860 | | svtools classify \ 861 | -g sex.txt \ 862 | -a ${mei_annotation_bed} \ 863 | -m large_sample \ 864 | | bgzip -c \ 865 | > ${output_vcf_basename}.vcf.gz
866 | } 867 | 868 | runtime { 869 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 870 | cpu: "1" 871 | memory: "3 GB" 872 | disks: "local-disk " + 10*ceil( size(input_vcf_gz, "GB")) + " HDD" 873 | preemptible: preemptible_tries 874 | } 875 | 876 | output { 877 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 878 | } 879 | } 880 | 881 | task Sort_Index_VCF { 882 | input { 883 | File input_vcf_gz 884 | String output_vcf_name 885 | Int preemptible_tries 886 | } 887 | 888 | command { 889 | set -eo pipefail 890 | zcat ${input_vcf_gz} \ 891 | | svtools vcfsort \ 892 | | bgzip -c \ 893 | > ${output_vcf_name} 894 | 895 | tabix -p vcf -f ${output_vcf_name} 896 | } 897 | 898 | runtime { 899 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 900 | cpu: "1" 901 | memory: "3 GB" 902 | disks: "local-disk " + 20*ceil( size(input_vcf_gz, "GB")) + " HDD" 903 | preemptible: preemptible_tries 904 | } 905 | 906 | output { 907 | File output_vcf_gz = "${output_vcf_name}" 908 | File output_vcf_gz_index = "${output_vcf_name}.tbi" 909 | } 910 | } 911 | 912 | --------------------------------------------------------------------------------
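
Notes on selected one-liners in scripts/SV_Tasks.wdl.

The PASS/LOW filter (the vawk one-liner around lines 172-188 above) is easier to follow when written out as a plain function. The sketch below is a reading aid only, not part of the pipeline: the function and argument names are illustrative stand-ins for the fields the one-liner reads (QUAL, INFO/NSAMP, SVTYPE, SVLEN, SU, PE, SR, and whether the FORMAT column contains CN), and the two extra strand-support ratios the inversion branch takes from the y[] and z[] arrays defined earlier in that script are collapsed into a single inv_strand_ratios_ok flag.

# Hedged Python sketch of the PASS/LOW thresholds in the vawk one-liner.
# All names are illustrative; the pipeline itself does this in vawk.
def classify_filter(svtype, qual, nsamp, svlen, su, pe, sr,
                    format_has_cn, inv_strand_ratios_ok):
    """Return the FILTER value ("PASS" or "LOW") the one-liner would assign."""
    msq = qual / nsamp if nsamp > 0 else 0.0  # mean sample quality (MSQ)
    abs_svlen = abs(svlen)                    # sqrt(SVLEN*SVLEN) in the original
    if svtype in ("DEL", "DUP", "MEI") and msq >= 100 and abs_svlen >= 50:
        return "PASS"
    if (svtype == "INV" and qual >= 100 and su > 0
            and sr / su >= 0.1 and pe / su >= 0.1
            and inv_strand_ratios_ok and abs_svlen >= 50):
        return "PASS"
    if svtype == "BND" and not format_has_cn and msq >= 500:
        return "PASS"
    if svtype == "BND" and format_has_cn and msq >= 250:
        return "PASS"
    if svtype == "INS" and msq >= 100 and svlen >= 50:
        return "PASS"
    return "LOW"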
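
The awk step inside the Count_Lumpy task bins each record before the sort | uniq -c tally. Below is a hedged Python sketch of that binning; the function and argument names are invented for illustration and correspond to the bcftools query columns (chromosome, cohort, center, FILTER, SVTYPE, SVLEN, SR, sample, genotype). Count_Manta performs the same kind of tally but keeps only PASS records and does not bin by length or split-read support.

import re

# Hedged sketch of the per-record binning in Count_Lumpy's awk step.
# Names are illustrative; the pipeline does this in awk on bcftools query output.
def lumpy_count_key(chrom, cohort, center, filt, svtype, svlen, sr, sample, gt):
    """Return the tuple Count_Lumpy tallies, or None if the record is skipped."""
    # Keep primary chromosomes only (chr followed by a digit, no alt/random contigs)
    if not re.search(r"chr[1-9]", chrom) or "_" in chrom:
        return None
    # Bin by absolute SVLEN: "<1kb" vs ">=1kb" (missing SVLEN falls into "<1kb")
    try:
        length = abs(float(svlen))
    except (TypeError, ValueError):
        length = 0.0
    len_bin = "<1kb" if length < 1000 else ">=1kb"
    # Bin by split-read support
    sr_bin = "SR>=1" if str(sr).isdigit() and int(sr) > 0 else "SR=0"
    return (chrom, cohort, center, filt, svtype, len_bin, sr_bin, sample, gt)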
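
CNVnator_Histogram builds the per-chromosome FASTA directory CNVnator needs with a single awk command. The Python sketch below mirrors that line for readers who find awk's output redirection hard to scan; it is illustrative only, and split_fasta_by_chrom is not a function used anywhere in the pipeline.

import os

# Hedged Python equivalent of the awk line in CNVnator_Histogram that writes
# one FASTA file per chromosome (named CHROM.fa) into ref_chrom_dir.
def split_fasta_by_chrom(ref_fasta, chrom_dir="cnvnator_chroms"):
    os.makedirs(chrom_dir, exist_ok=True)  # the WDL command runs mkdir -p first
    out = None
    with open(ref_fasta) as fasta:
        for line in fasta:
            if line.startswith(">"):
                chrom = line[1:].split()[0]  # substr($1, 2) in the awk version
                if out:
                    out.close()
                out = open(os.path.join(chrom_dir, chrom + ".fa"), "w")
            if out:  # the header line itself goes into the new file, as in awk
                out.write(line)
    if out:
        out.close()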