├── .gitignore ├── images └── workflow.wdl.v04.low-01.png ├── scripts ├── generic.options.json ├── Pre_Merge_QC_per_sample.wdl ├── Pre_Merge_SV.inputs.json ├── Merge_SV.inputs.json ├── SV_Pipeline_Full.inputs.json ├── sort_same.py ├── Merge_SV.wdl ├── Pre_Merge_SV_per_sample.wdl ├── Post_Merge_SV.inputs.json ├── SV_Pipeline_Full.wdl ├── Pre_Merge_SV.wdl ├── Post_Merge_SV.wdl ├── jes.conf └── SV_Tasks.wdl ├── test ├── cnvnator │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── lumpy │ ├── cromwell_cmd.sh │ ├── test.wdl │ └── test.inputs.json ├── svtools │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── svtyper │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── extract-sv-reads │ ├── cromwell_cmd.sh │ ├── test.inputs.json │ └── test.wdl ├── config │ └── test.wdl └── jes.conf ├── docker ├── samtools │ └── Dockerfile ├── bcftools │ └── Dockerfile ├── extract-sv-reads │ └── Dockerfile ├── cromwell_mysql │ ├── mysql.cnf.template │ ├── Dockerfile │ ├── run_pipeline.sh │ └── application.conf.template ├── lumpy │ └── Dockerfile ├── cnvnator │ └── Dockerfile ├── vcf_bed_utils │ └── Dockerfile ├── manta │ └── Dockerfile ├── svtyper │ └── Dockerfile ├── svtools │ └── Dockerfile ├── manta_samtools │ ├── Dockerfile │ └── doctor_manta.1.py └── smoove │ └── Dockerfile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *~ -------------------------------------------------------------------------------- /images/workflow.wdl.v04.low-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hall-lab/sv-pipeline/HEAD/images/workflow.wdl.v04.low-01.png -------------------------------------------------------------------------------- /scripts/generic.options.json: -------------------------------------------------------------------------------- 1 | { 2 | "read_from_cache": true, 3 | "default_runtime_attributes": { 4 | "zones": "us-central1-a us-central1-b us-central1-c us-central1-f" 5 | } 6 | } 7 | 8 | -------------------------------------------------------------------------------- /test/cnvnator/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/lumpy/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/svtools/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | 
test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/svtyper/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/extract-sv-reads/cromwell_cmd.sh: -------------------------------------------------------------------------------- 1 | CROMWELL=/home/cchiang/src/cromwell/target/scala-2.11/cromwell-26-22fe860-SNAP.jar 2 | JES_CONF=../jes.conf 3 | OPTIONS=../../scripts/generic.options.json 4 | 5 | java \ 6 | -Dconfig.file=$JES_CONF \ 7 | -jar $CROMWELL \ 8 | run \ 9 | test.wdl \ 10 | test.inputs.json \ 11 | $OPTIONS \ 12 | test.metadata.json 13 | -------------------------------------------------------------------------------- /test/config/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Simple_Workflow { 4 | File input_cram 5 | 6 | Int disk_size 7 | Int preemptible_tries 8 | 9 | call SV.Get_Sample_Name { 10 | input: 11 | input_cram = input_cram, 12 | disk_size = disk_size, 13 | preemptible_tries = preemptible_tries 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /test/extract-sv-reads/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Extract_Reads.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 3 | "Test_Extract_Reads.basename": "H_IJ-NA12878-NA12878_K10", 4 | "Test_Extract_Reads.ref_cache": "gs://human-b38/cache.tar.gz", 5 | "Test_Extract_Reads.preemptible_tries": 3, 6 | "Test_Extract_Reads.disk_size": 50 7 | } 8 | -------------------------------------------------------------------------------- /test/extract-sv-reads/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Extract_Reads { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | 8 | # reference inputs 9 | File ref_cache 10 | 11 | # system inputs 12 | Int disk_size 13 | Int preemptible_tries 14 | 15 | call SV.Extract_Reads { 16 | input: 17 | input_cram = input_cram, 18 | basename = basename, 19 | ref_cache = ref_cache, 20 | disk_size = disk_size, 21 | preemptible_tries = preemptible_tries 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docker/samtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | 4 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 5 | 6 | ENV PATH=/opt/hall-lab/samtools-1.9/bin:$PATH 7 | 8 | RUN apt-get update -qq \ 9 | && apt-get install -y --no-install-recommends \ 10 | libssl1.1 \ 11 | libcurl3 \ 12 | libncurses5 \ 13 | libbz2-1.0 \ 14 | liblzma5 \ 15 | libssl1.0.2 \ 16 | zlib1g 17 | 18 | CMD ["/bin/bash"] 19 | 
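The runtime images in this repository copy prebuilt binaries out of halllab builder images, so they can be rebuilt and sanity-checked locally with nothing more than Docker. A minimal, hedged sketch for the samtools image above (the local tag is arbitrary, and it assumes the halllab/samtools-1.9-build:v1 builder image referenced in the COPY --from line is pullable from Docker Hub):
# From the repository root: build the image, then confirm samtools is on PATH inside it.
docker build -t samtools-1.9-local docker/samtools
docker run --rm samtools-1.9-local samtools --version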
-------------------------------------------------------------------------------- /test/svtyper/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Genotype.basename": "H_IJ-NA12878-NA12878_K10", 3 | "Test_Genotype.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 4 | "Test_Genotype.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 5 | "Test_Genotype.input_vcf": "gs://mgi-wdl-test/data/call-L_Merge_VCF_Variants/pilot-01.lmerge.vcf.gz", 6 | "Test_Genotype.ref_cache": "gs://human-b38/cache.tar.gz", 7 | "Test_Genotype.preemptible_tries": 3, 8 | "Test_Genotype.disk_size": 50 9 | } 10 | -------------------------------------------------------------------------------- /test/svtyper/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Genotype { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | File input_cram_index 8 | File input_vcf 9 | 10 | # reference inputs 11 | File ref_cache 12 | 13 | # system inputs 14 | Int disk_size 15 | Int preemptible_tries 16 | 17 | call SV.Genotype as Genotype_Merged { 18 | input: 19 | basename = basename, 20 | input_cram = input_cram, 21 | input_cram_index = input_cram_index, 22 | input_vcf = input_vcf, 23 | ref_cache = ref_cache, 24 | disk_size = disk_size, 25 | preemptible_tries = preemptible_tries 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /docker/bcftools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | 4 | COPY --from=halllab/bcftools-1.9-build:v1 /build/deb-build/opt/hall-lab/bcftools-1.9 /opt/hall-lab/bcftools-1.9 5 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 6 | 7 | ENV PATH=/opt/hall-lab/bcftools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:$PATH 8 | 9 | RUN apt-get update -qq \ 10 | && apt-get install -y --no-install-recommends \ 11 | libssl1.1 \ 12 | libcurl3 \ 13 | libbz2-1.0 \ 14 | liblzma5 \ 15 | libssl1.0.2 \ 16 | zlib1g 17 | 18 | CMD ["/bin/bash"] 19 | -------------------------------------------------------------------------------- /docker/extract-sv-reads/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | LABEL maintainer "Dave Larson " 3 | 4 | # Build dependencies 5 | RUN export EXTRACT_SV_READS_VERSION=1.1.0 \ 6 | && apt-get update -qq \ 7 | && apt-get -y install apt-transport-https \ 8 | && echo "deb [trusted=yes] https://gitlab.com/hall-lab/ccdg-apt-repo/raw/master ccdg main" | tee -a /etc/apt/sources.list \ 9 | && runDeps=' \ 10 | libcurl3 \ 11 | ca-certificates \ 12 | zlib1g \ 13 | libncurses5 \ 14 | ccdg-samtools-1.3.1 \ 15 | extract-sv-reads1.1 \ 16 | ' \ 17 | && apt-get update -qq \ 18 | && apt-get -y install \ 19 | --no-install-recommends \ 20 | $runDeps \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | ENV PATH=/opt/ccdg/samtools-1.3.1/bin:${PATH} 24 | 25 | CMD ["/bin/bash"] 26 | -------------------------------------------------------------------------------- /test/svtools/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_SVTools.cohort_name": "test-01", 3 | "Test_SVTools.final_vcf_name": 
"test-01.vcf.gz", 4 | "Test_SVTools.input_pre_merged_vcfs": [ 5 | "gs://mgi-wdl-test/data/call-SV_Genotype_Unmerged/shard-0/H_IJ-NA12878-NA12878_K10.gt.vcf", 6 | "gs://mgi-wdl-test/data/call-SV_Genotype_Unmerged/shard-1/H_IJ-NA12891-NA12891_D2.gt.vcf" 7 | ], 8 | "Test_SVTools.input_post_merged_vcfs": [ 9 | "gs://mgi-wdl-test/data/call-SV_Copy_Number/shard-0/H_IJ-NA12878-NA12878_K10.cn.vcf", 10 | "gs://mgi-wdl-test/data/call-SV_Copy_Number/shard-1/H_IJ-NA12891-NA12891_D2.cn.vcf" 11 | ], 12 | "Test_SVTools.pedigree_file": "gs://mgi-wdl-test/data/call-Make_Pedigree_File/pilot-01.ped", 13 | "Test_SVTools.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 14 | "Test_SVTools.disk_size": 50, 15 | "Test_SVTools.preemptible_tries": 3 16 | } 17 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/mysql.cnf.template: -------------------------------------------------------------------------------- 1 | [client] 2 | port = 3306 3 | socket = /tmp/mysqld.sock 4 | 5 | [mysqld_safe] 6 | socket = /tmp/mysqld.sock 7 | nice = 0 8 | 9 | [mysqld] 10 | user = mysql 11 | pid-file = %%SHARED_FS_DIRECTORY%%/db/run/mysqld/mysqld.pid 12 | socket = /tmp/mysqld.sock 13 | port = 3306 14 | basedir = /usr 15 | datadir = %%SHARED_FS_DIRECTORY%%/db/lib/mysql 16 | tmpdir = /tmp 17 | skip-external-locking 18 | bind-address = 127.0.0.1 19 | key_buffer = 16M 20 | max_allowed_packet = 16M 21 | thread_stack = 192K 22 | thread_cache_size = 8 23 | myisam-recover = BACKUP 24 | query_cache_limit = 1M 25 | query_cache_size = 16M 26 | log_error = %%SHARED_FS_DIRECTORY%%/db/log/mysql/error.log 27 | expire_logs_days = 10 28 | max_binlog_size = 100M 29 | 30 | [mysqldump] 31 | quick 32 | quote-names 33 | max_allowed_packet = 16M 34 | 35 | [mysql] 36 | 37 | [isamchk] 38 | key_buffer = 16M 39 | 40 | !includedir /etc/mysql/conf.d/ 41 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_QC_per_sample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Pre_Merge_QC_Per_Sample { 5 | input { 6 | # data inputs 7 | File manta_vcf 8 | File lumpy_vcf 9 | File cnvnator_vcf 10 | String cohort 11 | String center 12 | 13 | # system inputs 14 | Int preemptible_tries 15 | String basename = sub(sub(lumpy_vcf, "^.*/", ""), ".vcf.gz" + "$", "") 16 | } 17 | 18 | call SV.Count_Lumpy { 19 | input: 20 | cohort = cohort, 21 | center = center, 22 | basename = basename, 23 | input_vcf = lumpy_vcf, 24 | preemptible_tries = preemptible_tries 25 | } 26 | 27 | call SV.Count_Manta { 28 | input: 29 | cohort = cohort, 30 | center = center, 31 | basename = basename, 32 | input_vcf = manta_vcf, 33 | preemptible_tries = preemptible_tries 34 | } 35 | 36 | output { 37 | File lumpy_counts = Count_Lumpy.output_counts 38 | File manta_counts = Count_Manta.output_counts 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test/lumpy/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Lumpy { 4 | # data inputs 5 | String basename 6 | File input_cram 7 | File input_cram_index 8 | File input_splitters_bam 9 | File input_splitters_bam_index 10 | File input_discordants_bam 11 | File input_discordants_bam_index 12 | 13 | # reference inputs 14 | File ref_cache 15 | 
File exclude_regions 16 | 17 | # system inputs 18 | Int disk_size 19 | Int preemptible_tries 20 | 21 | call SV.Lumpy { 22 | input: 23 | basename = basename, 24 | input_cram = input_cram, 25 | input_cram_index = input_cram_index, 26 | input_splitters_bam = input_splitters_bam, 27 | input_splitters_bam_index = input_splitters_bam_index, 28 | input_discordants_bam = input_discordants_bam, 29 | input_discordants_bam_index = input_discordants_bam_index, 30 | ref_cache = ref_cache, 31 | exclude_regions = exclude_regions, 32 | disk_size = disk_size, 33 | preemptible_tries = preemptible_tries 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Pre_Merge_SV.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "Pre_Merge_SV.aligned_cram_suffix": ".cram", 7 | "Pre_Merge_SV.cohort": "Cohort", 8 | "Pre_Merge_SV.center": "WashU", 9 | 10 | "Pre_Merge_SV.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 11 | "Pre_Merge_SV.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 12 | "Pre_Merge_SV.ref_cache": "gs://human-b38/cache.tar.gz", 13 | "Pre_Merge_SV.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 14 | "Pre_Merge_SV.call_regions_bed": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz", 15 | "Pre_Merge_SV.call_regions_bed_index": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz.tbi", 16 | "Pre_Merge_SV.preemptible_tries": 3 17 | } 18 | -------------------------------------------------------------------------------- /scripts/Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Merge_SV.smoove_input_vcfs": [ 3 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-Smoove/attempt-2/NA12878.final-smoove.genotyped.vcf.gz", 4 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Smoove/NA19238.final-smoove.genotyped.vcf.gz" 5 | ], 6 | "Merge_SV.manta_input_vcfs": [ 7 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/e1715d86-4359-44ed-8115-8cd47a008311/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/cadf1cd8-bcd8-43b7-8038-9eccf0e2f2ca/call-Manta/attempt-2/NA12878.final.doctored.vcf.gz", 8 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Manta/attempt-2/NA19238.final.doctored.vcf.gz" 9 | ], 10 | "Merge_SV.cohort_name": "Cohort", 11 | 12 | "Merge_SV.preemptible_tries": 3 13 | } 14 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM broadinstitute/cromwell:29 2 | LABEL maintainer "Dave Larson " 3 | 4 | # Build dependencies 5 | RUN apt-get update -qq \ 6 
| && runDeps=' \ 7 | libnss-sss \ 8 | mysql-server \ 9 | ' \ 10 | && apt-get update -qq \ 11 | && DEBIAN_FRONTEND=noninteractive apt-get -y install \ 12 | --no-install-recommends \ 13 | $runDeps \ 14 | && mkdir /var/run/mysqld \ 15 | && chmod -R 777 /var/lib/mysql /var/run/mysqld /var/log/mysql && rm -fr /var/lib/mysql/mysql /var/lib/mysql/performance_schema \ 16 | && mkdir -p /opt/ccdg/cromwell/resources \ 17 | && ln -sf /usr/share/zoneinfo/America/Chicago /etc/localtime && echo "America/Chicago" > /etc/timezone && dpkg-reconfigure --frontend noninteractive tzdata \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | ADD application.conf.template /opt/ccdg/cromwell/resources 21 | ADD mysql.cnf.template /opt/ccdg/cromwell/resources 22 | ADD run_pipeline.sh /opt/ccdg/cromwell/resources 23 | 24 | # Reset entrypoint so container doesn't try to run cromwell directly 25 | ENTRYPOINT ["/opt/ccdg/cromwell/resources/run_pipeline.sh"] 26 | -------------------------------------------------------------------------------- /test/lumpy/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Lumpy.basename": "H_IJ-NA12878-NA12878_K10", 3 | "Test_Lumpy.input_cram": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 4 | "Test_Lumpy.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 5 | "Test_Lumpy.input_discordants_bam": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.discordants.bam", 6 | "Test_Lumpy.input_discordants_bam_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.discordants.bam.bai", 7 | "Test_Lumpy.input_splitters_bam": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.splitters.bam", 8 | "Test_Lumpy.input_splitters_bam_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.splitters.bam.bai", 9 | "Test_Lumpy.disk_size": 50, 10 | "Test_Lumpy.ref_cache": "gs://human-b38/cache.tar.gz", 11 | "Test_Lumpy.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 12 | "Test_Lumpy.preemptible_tries": 3 13 | } 14 | -------------------------------------------------------------------------------- /docker/lumpy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | LABEL maintainer "Colby Chiang " 3 | 4 | # Build dependencies 5 | RUN apt-get update -qq \ 6 | && apt-get -y install \ 7 | apt-transport-https \ 8 | g++ \ 9 | gawk \ 10 | libcurl4-gnutls-dev \ 11 | autoconf \ 12 | libssl-dev \ 13 | git \ 14 | && echo "deb [trusted=yes] https://gitlab.com/hall-lab/ccdg-apt-repo/raw/master ccdg main" | tee -a /etc/apt/sources.list \ 15 | && runDeps=' \ 16 | ccdg-python-2.7.12 \ 17 | ccdg-samtools-1.3.1 \ 18 | ' \ 19 | && apt-get update -qq \ 20 | && apt-get -y install \ 21 | --no-install-recommends \ 22 | $runDeps \ 23 | && /opt/ccdg/python-2.7.12/bin/pip install --upgrade pip numpy scipy pysam \ 24 | && rm -rf /var/lib/apt/lists/* 25 | 26 | ENV PATH /opt/ccdg/samtools-1.3.1/bin:${PATH} 27 | ENV PATH /opt/ccdg/python-2.7.12/bin:${PATH} 28 | 29 | # Install LUMPY 30 | RUN cd /opt \ 31 | && git clone https://github.com/hall-lab/lumpy-sv.git \ 32 | && cd /opt/lumpy-sv \ 33 | && git checkout 0.2.13_cram_support \ 34 | && git submodule sync \ 35 | && git submodule update --init \ 36 | && cd /opt/lumpy-sv 
\ 37 | && make 38 | 39 | ENV PATH /opt/lumpy-sv/bin:${PATH} 40 | ENV SHELL /bin/bash 41 | 42 | CMD ["/bin/bash"] 43 | -------------------------------------------------------------------------------- /scripts/SV_Pipeline_Full.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "SV_Pipeline_Full.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "SV_Pipeline_Full.aligned_cram_suffix": ".cram", 7 | "SV_Pipeline_Full.cohort": "Cohort", 8 | "SV_Pipeline_Full.center": "WashU", 9 | 10 | "SV_Pipeline_Full.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 11 | "SV_Pipeline_Full.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 12 | "SV_Pipeline_Full.ref_cache": "gs://human-b38/cache.tar.gz", 13 | "SV_Pipeline_Full.exclude_regions": "gs://human-b38/GRCh38DH/annotations/exclude.cnvnator_100bp.GRCh38.20170403.bed", 14 | "SV_Pipeline_Full.call_regions_bed": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz", 15 | "SV_Pipeline_Full.call_regions_bed_index": "gs://human-b38/GRCh38DH/annotations/canonical_chromosome.bed.gz.tbi", 16 | "SV_Pipeline_Full.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 17 | "SV_Pipeline_Full.preemptible_tries": 3, 18 | "SV_Pipeline_Full.final_vcf_name": "merged_genotyped.vcf.gz", 19 | } 20 | -------------------------------------------------------------------------------- /scripts/sort_same.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parse_name(line): 4 | list1 = re.split("/", line) 5 | list2 = re.split("\.", list1[len(list1)-1]) 6 | return list2[0] 7 | 8 | cram_list = "cram_list.txt" 9 | cn_list = "cn_hist_roots.txt" 10 | index_list = "indices.txt" 11 | original_manta_list = "original_manta_vcfs.txt" 12 | 13 | crams = open(cram_list, "r") 14 | cram_list = [] 15 | for line in crams: 16 | cram_list.append(line.rstrip()) 17 | 18 | cns = open(cn_list, "r") 19 | cns_dict = {} 20 | for line in cns: 21 | sample_name = parse_name(line) 22 | cns_dict[sample_name] = line.rstrip() 23 | 24 | indices = open(index_list, "r") 25 | index_dict = {} 26 | for line in indices: 27 | sample_name = parse_name(line) 28 | index_dict[sample_name] = line.rstrip() 29 | 30 | manta = open(original_manta_list, "r") 31 | manta_dict = {} 32 | for line in manta: 33 | sample_name = parse_name(line) 34 | manta_dict[sample_name] = line.rstrip() 35 | 36 | cns_out = open("cn_hist_roots_ordered.txt", "w") 37 | index_out = open("indices_ordered.txt", "w") 38 | manta_out = open("manta_ordered.txt", "w") 39 | for cram in cram_list: 40 | sample_name = parse_name(cram) 41 | cns_out.write(cns_dict[sample_name]) 42 | cns_out.write("\n") 43 | index_out.write(index_dict[sample_name]) 44 | index_out.write("\n") 45 | manta_out.write(manta_dict[sample_name]) 46 | manta_out.write("\n") 47 | 48 | cns_out.close() 49 | index_out.close() 50 | -------------------------------------------------------------------------------- /test/cnvnator/test.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Test_Copy_Number.input_cram": 
"gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 3 | "Test_Copy_Number.input_cram_index": "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram.crai", 4 | "Test_Copy_Number.basename": "H_IJ-NA12878-NA12878_K10", 5 | "Test_Copy_Number.sample": "H_IJ-NA12878-NA12878_K10", 6 | "Test_Copy_Number.input_vcf": "gs://mgi-wdl-test/data/call-SV_Genotype_Merged/shard-0/H_IJ-NA12878-NA12878_K10.gt.vcf", 7 | "Test_Copy_Number.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 8 | "Test_Copy_Number.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 9 | "Test_Copy_Number.ref_cache": "gs://human-b38/cache.tar.gz", 10 | "Test_Copy_Number.disk_size": 50, 11 | "Test_Copy_Number.preemptible_tries": 3, 12 | 13 | "Test_Copy_Number.cohort_name": "test-01", 14 | "Test_Copy_Number.aligned_crams": [ 15 | "gs://mgi-wdl-test/data/call-Extract_Reads/shard-0/attempt-2/H_IJ-NA12878-NA12878_K10.cram", 16 | "gs://mgi-wdl-test/data/call-Extract_Reads/shard-1/attempt-4/H_IJ-NA12891-NA12891_D2.cram" 17 | ], 18 | "Test_Copy_Number.cn_hist_roots": [ 19 | "gs://mgi-wdl-test/data/call-CNVnator_Histogram/shard-0/attempt-2/cnvnator.out/H_IJ-NA12878-NA12878_K10.cram.hist.root", 20 | "gs://mgi-wdl-test/data/call-CNVnator_Histogram/shard-1/cnvnator.out/H_IJ-NA12891-NA12891_D2.cram.hist.root" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /docker/cnvnator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim 2 | LABEL maintainer "Dave Larson " 3 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 4 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 5 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 6 | COPY --from=halllab/cnvnator-0.3.3-build:v1 /opt/hall-lab/cnvnator-0.3.3/deb-build/opt/hall-lab/cnvnator-0.3.3 /opt/hall-lab/cnvnator-0.3.3 7 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/cnvnator-0.3.3/bin:$PATH 8 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 9 | 10 | # Build dependencies 11 | RUN apt-get update -qq \ 12 | && apt-get -y install \ 13 | libssl1.1 \ 14 | libcurl3 \ 15 | libncurses5 \ 16 | libbz2-1.0 \ 17 | liblzma5 \ 18 | libssl1.0.2 \ 19 | zlib1g \ 20 | libgomp1 \ 21 | libstdc++6 \ 22 | libstdc++-6-dev \ 23 | libgcc1 \ 24 | g++ \ 25 | libxpm4 \ 26 | git-core \ 27 | && git clone https://github.com/hall-lab/speedseq.git \ 28 | && cd speedseq \ 29 | && git checkout 4e60002 \ 30 | && cp bin/cnvnator_wrapper.py /opt/hall-lab/cnvnator-0.3.3/bin/ \ 31 | && cd .. 
\ 32 | && rm -rf speedseq \ 33 | && apt-get purge -y git-core \ 34 | && apt autoremove -y 35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /test/jes.conf: -------------------------------------------------------------------------------- 1 | # Minimal Cromwell template for using JES 2 | 3 | webservice { 4 | port = 8000 5 | interface = 0.0.0.0 6 | instance.name = "cromwell-for-wdl-runner" 7 | } 8 | 9 | akka { 10 | loggers = ["akka.event.slf4j.Slf4jLogger"] 11 | } 12 | 13 | spray.can { 14 | server { 15 | request-timeout = 40s 16 | } 17 | client { 18 | request-timeout = 40s 19 | connecting-timeout = 40s 20 | } 21 | } 22 | 23 | backend { 24 | default = "JES" 25 | providers { 26 | JES { 27 | actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" 28 | config { 29 | project = "washu-genome-inh-dis-analysis" 30 | root = "gs://mgi-wdl-test/workspace" 31 | 32 | genomics { 33 | # A reference to an auth defined in the 'google' stanza at the top. This auth is used to create 34 | # Pipelines and manipulate auth JSONs. 35 | auth = "application-default" 36 | endpoint-url = "https://genomics.googleapis.com/" 37 | } 38 | 39 | filesystems = { 40 | gcs { 41 | # A reference to a potentially different auth for manipulating files via engine functions. 42 | auth = "application-default" 43 | } 44 | } 45 | 46 | } 47 | } 48 | } 49 | } 50 | 51 | google { 52 | applicationName = "cromwell" 53 | cromwellAuthenticationScheme = "application_default" 54 | } 55 | 56 | database { 57 | driver = "slick.driver.HsqldbDriver$" 58 | 59 | db { 60 | driver = "org.hsqldb.jdbcDriver" 61 | url = "jdbc:hsqldb:mem:${slick.uniqueSchema};shutdown=false;hsqldb.tx=mvcc" 62 | connectionTimeout = 1000 63 | } 64 | } 65 | 66 | instrumentation { 67 | use-kamon = false 68 | } 69 | 70 | call-caching { 71 | enabled = true 72 | invalidate-bad-cache-results = true 73 | } 74 | -------------------------------------------------------------------------------- /docker/vcf_bed_utils/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Allison Regier " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | FROM builder-base as vawk-build 23 | LABEL maintainer "Allison Regier " 24 | 25 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 26 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 27 | RUN git clone https://github.com/cc2qe/vawk 28 | RUN git clone https://github.com/hall-lab/io 29 | RUN curl -kL https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools > bedtools 30 | RUN chmod a+x bedtools 31 | 32 | FROM debian:stretch-slim 33 | LABEL maintainer "Allison Regier " 34 | 35 | COPY --from=vawk-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 36 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 37 | COPY --from=vawk-build vawk /opt/hall-lab/vawk 38 | COPY --from=vawk-build io /opt/hall-lab/io 39 | COPY --from=vawk-build bedtools /opt/hall-lab/bin/bedtools 40 | 41 | RUN apt-get update -qq \ 42 | && apt-get install -y --no-install-recommends \ 
43 | libssl1.1 \ 44 | libcurl3 \ 45 | libncurses5 \ 46 | libbz2-1.0 \ 47 | liblzma5 \ 48 | libssl1.0.2 \ 49 | zlib1g \ 50 | less \ 51 | gawk 52 | 53 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 54 | 55 | CMD ["/bin/bash"] 56 | -------------------------------------------------------------------------------- /test/svtools/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_SVTools { 4 | # data inputs 5 | Array[File] input_pre_merged_vcfs 6 | Array[File] input_post_merged_vcfs 7 | File pedigree_file 8 | String cohort_name 9 | String final_vcf_name 10 | 11 | # reference inputs 12 | File mei_annotation_bed 13 | 14 | # system inputs 15 | Int disk_size 16 | Int preemptible_tries 17 | 18 | call SV.L_Sort_VCF_Variants { 19 | input: 20 | input_vcfs = input_pre_merged_vcfs, 21 | output_vcf_basename = cohort_name + ".lsort", 22 | disk_size = disk_size, 23 | preemptible_tries = preemptible_tries 24 | } 25 | 26 | call SV.L_Merge_VCF_Variants { 27 | input: 28 | input_vcf_gz = L_Sort_VCF_Variants.output_vcf_gz, 29 | output_vcf_basename = cohort_name + ".lmerge", 30 | disk_size = disk_size, 31 | preemptible_tries = preemptible_tries 32 | } 33 | 34 | call SV.Paste_VCF { 35 | input: 36 | input_vcfs = input_post_merged_vcfs, 37 | output_vcf_basename = cohort_name + ".merged.gt.cn", 38 | disk_size = disk_size, 39 | preemptible_tries = preemptible_tries 40 | } 41 | 42 | call SV.Prune_VCF { 43 | input: 44 | input_vcf_gz = Paste_VCF.output_vcf_gz, 45 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned", 46 | disk_size = disk_size, 47 | preemptible_tries = preemptible_tries 48 | } 49 | 50 | call SV.Classify { 51 | input: 52 | input_vcf_gz = Prune_VCF.output_vcf_gz, 53 | input_ped = pedigree_file, 54 | mei_annotation_bed = mei_annotation_bed, 55 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class", 56 | disk_size = disk_size, 57 | preemptible_tries = preemptible_tries 58 | } 59 | 60 | call SV.Sort_Index_VCF { 61 | input: 62 | input_vcf_gz = Classify.output_vcf_gz, 63 | output_vcf_name = final_vcf_name, 64 | disk_size = disk_size, 65 | preemptible_tries = preemptible_tries 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /docker/manta/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim AS manta-build 2 | LABEL maintainer "Dave Larson " 3 | ARG MANTA_VERSION=1.4.0 4 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 5 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 6 | RUN apt-get update -qq \ 7 | && apt-get -y install \ 8 | --no-install-recommends \ 9 | build-essential \ 10 | bzip2 \ 11 | zlib1g-dev \ 12 | curl \ 13 | ca-certificates 14 | RUN curl -O -L https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.release_src.tar.bz2 \ 15 | && tar -xjf manta-${MANTA_VERSION}.release_src.tar.bz2 \ 16 | && mkdir build \ 17 | && cd build \ 18 | && ../manta-${MANTA_VERSION}.release_src/configure --prefix=/opt/hall-lab/manta-${MANTA_VERSION} \ 19 | && make -j 4 install 20 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 21 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 22 | 23 | FROM 
debian:stretch-slim 24 | LABEL maintainer "Dave Larson " 25 | ARG MANTA_VERSION=1.4.0 26 | 27 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/bin /opt/hall-lab/manta-${MANTA_VERSION}/bin 28 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/lib /opt/hall-lab/manta-${MANTA_VERSION}/lib 29 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/libexec /opt/hall-lab/manta-${MANTA_VERSION}/libexec 30 | COPY --from=manta-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 31 | 32 | # Run dependencies 33 | RUN apt-get update -qq \ 34 | && apt-get -y install \ 35 | --no-install-recommends \ 36 | libssl1.1 \ 37 | libcurl3 \ 38 | libbz2-1.0 \ 39 | liblzma5 \ 40 | libssl1.0.2 \ 41 | zlib1g 42 | 43 | ENV PATH=/opt/hall-lab/manta-${MANTA_VERSION}/bin:/opt/hall-lab/python-2.7.15/bin/:$PATH 44 | 45 | CMD ["/bin/bash"] 46 | -------------------------------------------------------------------------------- /test/cnvnator/test.wdl: -------------------------------------------------------------------------------- 1 | import "../../scripts/SV_Tasks.wdl" as SV 2 | 3 | workflow Test_Copy_Number { 4 | # data inputs 5 | String basename 6 | String sample 7 | File input_cram 8 | File input_cram_index 9 | File input_vcf 10 | 11 | # reference inputs 12 | File ref_fasta 13 | File ref_fasta_index 14 | File ref_cache 15 | 16 | # system inputs 17 | Int disk_size 18 | Int preemptible_tries 19 | 20 | # ----------------------------------- 21 | # test CNVnator 22 | call SV.CNVnator_Histogram { 23 | input: 24 | basename = basename, 25 | input_cram = input_cram, 26 | input_cram_index = input_cram_index, 27 | ref_fasta = ref_fasta, 28 | ref_fasta_index = ref_fasta_index, 29 | ref_cache = ref_cache, 30 | disk_size = disk_size, 31 | preemptible_tries = preemptible_tries 32 | } 33 | 34 | call SV.Copy_Number { 35 | input: 36 | basename = basename, 37 | sample = sample, 38 | input_vcf = input_vcf, 39 | input_cn_hist_root = CNVnator_Histogram.output_cn_hist_root, 40 | ref_cache = ref_cache, 41 | disk_size = disk_size, 42 | preemptible_tries = preemptible_tries 43 | } 44 | 45 | # ------------------------------------ 46 | # generate .ped file 47 | 48 | Array[File] aligned_crams 49 | Array[File] cn_hist_roots 50 | String cohort_name 51 | 52 | scatter (i in range(length(aligned_crams))) { 53 | File aligned_cram = aligned_crams[i] 54 | File cn_hist_root = cn_hist_roots[i] 55 | 56 | call SV.Get_Sample_Name { 57 | input: 58 | input_cram = aligned_cram, 59 | disk_size = disk_size, 60 | preemptible_tries = preemptible_tries 61 | } 62 | 63 | call SV.Get_Sex { 64 | input: 65 | input_cn_hist_root = cn_hist_root, 66 | ref_fasta_index = ref_fasta_index, 67 | disk_size = disk_size, 68 | preemptible_tries = preemptible_tries 69 | } 70 | } 71 | 72 | call SV.Make_Pedigree_File { 73 | input: 74 | sample_array = Get_Sample_Name.sample, 75 | sex_array = Get_Sex.sex, 76 | output_ped_basename = cohort_name, 77 | disk_size = 1 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /docker/svtyper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | 
ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | FROM builder-base as svtyper-0.7.1-build 23 | LABEL maintainer "Dave Larson " 24 | 25 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 26 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 27 | RUN SVTYPER_VERSION=0.7.1 \ 28 | && git clone https://github.com/hall-lab/svtyper \ 29 | && cd svtyper \ 30 | && git checkout v$SVTYPER_VERSION \ 31 | && sed -i '/numpy/d' setup.py \ 32 | && sed -i '/scipy/d' setup.py \ 33 | && pip install . 34 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 35 | #RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 36 | 37 | FROM debian:stretch-slim 38 | LABEL maintainer "Dave Larson " 39 | 40 | COPY --from=svtyper-0.7.1-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 41 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 42 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 43 | 44 | RUN apt-get update -qq \ 45 | && apt-get install -y --no-install-recommends \ 46 | libssl1.1 \ 47 | libcurl3 \ 48 | libncurses5 \ 49 | libbz2-1.0 \ 50 | liblzma5 \ 51 | libssl1.0.2 \ 52 | zlib1g 53 | 54 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:${PATH} 55 | 56 | CMD ["/bin/bash"] 57 | -------------------------------------------------------------------------------- /scripts/Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Merge_SV { 5 | input { 6 | # data inputs 7 | Array[File] manta_input_vcfs 8 | Array[File] smoove_input_vcfs 9 | String cohort_name 10 | 11 | # system inputs 12 | Int preemptible_tries 13 | } 14 | 15 | 16 | call SV.L_Sort_VCF_Variants as lsort_manta { 17 | input: 18 | input_vcfs = manta_input_vcfs, 19 | output_vcf_basename = cohort_name + ".manta.lsort", 20 | preemptible_tries = preemptible_tries 21 | } 22 | 23 | call SV.Filter_Pass as filter_manta { 24 | input: 25 | input_vcf_gz = lsort_manta.output_vcf_gz, 26 | output_vcf_basename = cohort_name + ".manta.filter", 27 | preemptible_tries = preemptible_tries 28 | } 29 | 30 | call SV.L_Merge_VCF_Variants as lmerge_manta { 31 | input: 32 | input_vcf_gz = filter_manta.output_vcf_gz, 33 | output_vcf_basename = cohort_name + ".manta.lmerge", 34 | preemptible_tries = preemptible_tries 35 | } 36 | 37 | call SV.L_Sort_VCF_Variants as lsort_smoove { 38 | input: 39 | input_vcfs = smoove_input_vcfs, 40 | output_vcf_basename = cohort_name + ".smoove.lsort", 41 | preemptible_tries = preemptible_tries 42 | } 43 | 44 | call SV.Filter_Del as filter_smoove { 45 | input: 46 | input_vcf_gz = lsort_smoove.output_vcf_gz, 47 | output_vcf_basename = cohort_name + ".smoove.filter", 48 | preemptible_tries = preemptible_tries 49 | } 50 | 51 | call SV.L_Merge_VCF_Variants as lmerge_smoove { 52 | input: 53 | input_vcf_gz = filter_smoove.output_vcf_gz, 54 | output_vcf_basename = cohort_name + ".smoove.lmerge", 55 | preemptible_tries = preemptible_tries 56 | } 57 | 58 | call SV.L_Sort_VCF_Variants as lsort_manta_smoove { 59 | input: 60 | input_vcfs = [lmerge_manta.output_vcf_gz, lmerge_smoove.output_vcf_gz], 61 | output_vcf_basename = cohort_name + 
".manta_smoove.lsort", 62 | preemptible_tries = preemptible_tries 63 | } 64 | 65 | call SV.L_Merge_VCF_Variants_weighted as lmerge_manta_smoove { 66 | input: 67 | input_vcf_gz = lsort_manta_smoove.output_vcf_gz, 68 | output_vcf_basename = cohort_name + ".manta_smoove.lmerge", 69 | preemptible_tries = preemptible_tries 70 | } 71 | 72 | output { 73 | File output_vcf = lmerge_manta_smoove.output_vcf_gz 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV_per_sample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Pre_Merge_SV_Per_Sample { 5 | input { 6 | # data inputs 7 | File aligned_cram 8 | 9 | # reference inputs 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_cache 13 | File? call_regions_bed 14 | File? call_regions_bed_index 15 | File exclude_regions 16 | 17 | String aligned_cram_suffix 18 | 19 | # system inputs 20 | Int preemptible_tries 21 | 22 | String basename = sub(sub(aligned_cram, "^.*/", ""), aligned_cram_suffix + "$", "") 23 | } 24 | 25 | call SV.Index_Cram { 26 | input: 27 | basename = basename, 28 | input_cram = aligned_cram, 29 | ref_cache = ref_cache, 30 | preemptible_tries = preemptible_tries 31 | } 32 | 33 | call SV.Manta { 34 | input: 35 | basename = basename, 36 | input_cram = aligned_cram, 37 | input_cram_index = Index_Cram.output_cram_index, 38 | ref_fasta = ref_fasta, 39 | ref_fasta_index = ref_fasta_index, 40 | call_regions_bed = call_regions_bed, 41 | call_regions_bed_index = call_regions_bed_index, 42 | ref_cache = ref_cache, 43 | preemptible_tries = preemptible_tries 44 | } 45 | 46 | call SV.CNVnator_Histogram { 47 | input: 48 | basename = basename, 49 | input_cram = aligned_cram, 50 | input_cram_index = Index_Cram.output_cram_index, 51 | ref_fasta = ref_fasta, 52 | ref_fasta_index = ref_fasta_index, 53 | ref_cache = ref_cache, 54 | preemptible_tries = preemptible_tries 55 | } 56 | 57 | call SV.Smoove { 58 | input: 59 | basename = basename, 60 | input_cram = aligned_cram, 61 | input_cram_index = Index_Cram.output_cram_index, 62 | ref_fasta = ref_fasta, 63 | ref_fasta_index = ref_fasta_index, 64 | ref_cache = ref_cache, 65 | exclude_regions = exclude_regions, 66 | preemptible_tries = preemptible_tries 67 | } 68 | 69 | output { 70 | File cram_index = Index_Cram.output_cram_index 71 | File manta_vcf = Manta.output_vcf 72 | File manta_tbi = Manta.output_tbi 73 | File manta_original_vcf = Manta.original_vcf 74 | File manta_original_tbi = Manta.original_tbi 75 | File cnvnator_cn_hist_root = CNVnator_Histogram.output_cn_hist_root 76 | File cnvnator_output_cn_txt = CNVnator_Histogram.output_cn_txt 77 | File cnvnator_cn_bed = CNVnator_Histogram.output_cn_bed 78 | File smoove_vcf = Smoove.output_vcf 79 | File smoove_csi = Smoove.output_csi 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /scripts/Post_Merge_SV.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Post_Merge_SV.aligned_crams": [ 3 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram", 4 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram" 5 | ], 6 | "Post_Merge_SV.aligned_cram_indices": [ 7 | 
"gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA12878/analysis/NA12878.final.cram.crai", 8 | "gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/CCDG_13607/Project_CCDG_13607_B01_GRM_WGS.cram.2019-02-06/Sample_NA19238/analysis/NA19238.final.cram.crai" 9 | ], 10 | "Post_Merge_SV.manta_vcfs": [ 11 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-Manta/NA12878.final.vcf.gz", 12 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-Manta/attempt-2/NA19238.final.vcf.gz" 13 | ], 14 | "Post_Merge_SV.cn_hist_roots": [ 15 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-0/Pre_Merge_SV_Per_Sample/30546f8c-c09f-4873-b77e-641f194cacb5/call-CNVnator_Histogram/cnvnator.out/NA12878.final.cram.hist.root", 16 | "gs://mgi-wdl-test/2019-09-06/Pre_Merge_SV/1e43c1ff-befd-4bf1-834e-6aab76b976d5/call-Pre_Merge_SV_Per_Sample/shard-1/Pre_Merge_SV_Per_Sample/928868b2-a748-43f6-a938-517f784eff54/call-CNVnator_Histogram/cnvnator.out/NA19238.final.cram.hist.root" 17 | ], 18 | "Post_Merge_SV.merged_vcf": "gs://mgi-wdl-test/2019-09-06/Merge_SV/39dfbd20-14df-427e-afe2-78296cf09798/call-lmerge_manta_smoove/Cohort.manta_smoove.lmerge.vcf.gz", 19 | "Post_Merge_SV.cohort_name": "Cohort", 20 | "Post_Merge_SV.aligned_cram_suffix": ".cram", 21 | "Post_Merge_SV.final_vcf_name": "merged_genotyped.vcf.gz", 22 | 23 | "Post_Merge_SV.ref_fasta": "gs://human-b38/GRCh38DH/all_sequences.fa", 24 | "Post_Merge_SV.ref_fasta_index": "gs://human-b38/GRCh38DH/all_sequences.fa.fai", 25 | "Post_Merge_SV.ref_cache": "gs://human-b38/cache.tar.gz", 26 | "Post_Merge_SV.mei_annotation_bed": "gs://human-b38/GRCh38DH/annotations/repeatMasker.recent.lt200millidiv.LINE_SINE_SVA.GRCh38.sorted.bed.gz", 27 | 28 | "Post_Merge_SV.preemptible_tries": 3 29 | } 30 | -------------------------------------------------------------------------------- /docker/svtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | libblas-dev \ 20 | libatlas-base-dev \ 21 | liblapack-dev \ 22 | curl \ 23 | zlib1g-dev 24 | 25 | FROM builder-base as svtools-0.5.1-build 26 | LABEL maintainer "Dave Larson " 27 | 28 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 29 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 30 | RUN SVTOOLS_VERSION=0.5.1 \ 31 | && pip install svtools==${SVTOOLS_VERSION} 32 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 33 | #RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 34 | 35 | FROM debian:stretch-slim 36 | LABEL maintainer "Dave Larson " 37 | 38 | COPY --from=svtools-0.5.1-build 
/opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 39 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 40 | COPY --from=halllab/cnvnator-0.3.3-build:v1 /opt/hall-lab/cnvnator-0.3.3/deb-build/opt/hall-lab/cnvnator-0.3.3 /opt/hall-lab/cnvnator-0.3.3 41 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 42 | 43 | RUN apt-get update -qq \ 44 | && apt-get install -y --no-install-recommends \ 45 | libssl1.1 \ 46 | libcurl3 \ 47 | libncurses5 \ 48 | libbz2-1.0 \ 49 | liblzma5 \ 50 | libssl1.0.2 \ 51 | zlib1g \ 52 | libblas3 \ 53 | libatlas3-base \ 54 | liblapack3 \ 55 | libgomp1 \ 56 | libstdc++6 \ 57 | libstdc++-6-dev \ 58 | libgcc1 \ 59 | g++ \ 60 | libxpm4 \ 61 | gzip \ 62 | less 63 | 64 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/cnvnator-0.3.3/bin:${PATH} 65 | 66 | CMD ["/bin/bash"] 67 | 68 | -------------------------------------------------------------------------------- /scripts/SV_Pipeline_Full.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "Pre_Merge_SV.wdl" as premerge 3 | import "Merge_SV.wdl" as merge 4 | import "Post_Merge_SV.wdl" as postmerge 5 | 6 | workflow SV_Pipeline_Full { 7 | input { 8 | Array[File] aligned_crams 9 | String aligned_cram_suffix 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_cache 13 | File? call_regions_bed 14 | File? call_regions_bed_index 15 | File exclude_regions 16 | File mei_annotation_bed 17 | String cohort 18 | String center 19 | String final_vcf_name 20 | Int preemptible_tries 21 | } 22 | 23 | call premerge.Pre_Merge_SV { 24 | input: 25 | aligned_crams = aligned_crams, 26 | aligned_cram_suffix = aligned_cram_suffix, 27 | ref_fasta = ref_fasta, 28 | ref_fasta_index = ref_fasta_index, 29 | ref_cache = ref_cache, 30 | call_regions_bed = call_regions_bed, 31 | call_regions_bed_index = call_regions_bed_index, 32 | exclude_regions = exclude_regions, 33 | cohort = cohort, 34 | center = center, 35 | preemptible_tries = preemptible_tries 36 | } 37 | 38 | call merge.Merge_SV { 39 | input: 40 | manta_input_vcfs = Pre_Merge_SV.manta_vcfs, 41 | smoove_input_vcfs = Pre_Merge_SV.smoove_vcfs, 42 | cohort_name = cohort, 43 | preemptible_tries = preemptible_tries 44 | } 45 | 46 | call postmerge.Post_Merge_SV { 47 | input: 48 | aligned_crams = aligned_crams, 49 | aligned_cram_indices = Pre_Merge_SV.cram_indices, 50 | cn_hist_roots = Pre_Merge_SV.cnvnator_cn_hist_roots, 51 | manta_vcfs = Pre_Merge_SV.manta_original_vcfs, 52 | aligned_cram_suffix = aligned_cram_suffix, 53 | merged_vcf = Merge_SV.output_vcf, 54 | cohort_name = cohort, 55 | final_vcf_name = final_vcf_name, 56 | ref_fasta = ref_fasta, 57 | ref_fasta_index = ref_fasta_index, 58 | ref_cache = ref_cache, 59 | mei_annotation_bed = mei_annotation_bed, 60 | preemptible_tries = preemptible_tries 61 | } 62 | 63 | output { 64 | File output_ped = Post_Merge_SV.output_ped 65 | File output_vcf_bnd = Post_Merge_SV.output_vcf_bnd 66 | File output_vcf_index_bnd = Post_Merge_SV.output_vcf_index_bnd 67 | File output_vcf_del = Post_Merge_SV.output_vcf_del 68 | File output_vcf_ins = Post_Merge_SV.output_vcf_ins 69 | File output_vcf_index_other = Post_Merge_SV.output_vcf_index_other 70 | File output_vcf_other = Post_Merge_SV.output_vcf_other 71 | File output_vcf_index_del = Post_Merge_SV.output_vcf_index_del 72 | File output_vcf_index_ins = 
Post_Merge_SV.output_vcf_index_ins 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /docker/manta_samtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim AS manta-build 2 | LABEL maintainer "Dave Larson " 3 | ARG MANTA_VERSION=1.4.0 4 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 5 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 6 | RUN apt-get update -qq \ 7 | && apt-get -y install \ 8 | --no-install-recommends \ 9 | build-essential \ 10 | bzip2 \ 11 | zlib1g-dev \ 12 | curl \ 13 | ca-certificates \ 14 | tabix \ 15 | libbz2-dev \ 16 | liblzma-dev \ 17 | && pip install pandas \ 18 | && pip install scipy \ 19 | && pip install pysam \ 20 | && pip install svtools 21 | RUN curl -O -L https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.release_src.tar.bz2 \ 22 | && tar -xjf manta-${MANTA_VERSION}.release_src.tar.bz2 \ 23 | && mkdir build \ 24 | && cd build \ 25 | && ../manta-${MANTA_VERSION}.release_src/configure --prefix=/opt/hall-lab/manta-${MANTA_VERSION} \ 26 | && make -j 4 install 27 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 28 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 29 | 30 | FROM debian:stretch-slim 31 | LABEL maintainer "Dave Larson " 32 | LABEL description "Manta v1.4.0 with samtools v1.9 alongside" 33 | ARG MANTA_VERSION=1.4.0 34 | 35 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/bin /opt/hall-lab/manta-${MANTA_VERSION}/bin 36 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/lib /opt/hall-lab/manta-${MANTA_VERSION}/lib 37 | COPY --from=manta-build /opt/hall-lab/manta-${MANTA_VERSION}/libexec /opt/hall-lab/manta-${MANTA_VERSION}/libexec 38 | COPY --from=manta-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 39 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 40 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 41 | ADD doctor_manta.1.py /opt/hall-lab/ 42 | 43 | # Run dependencies 44 | RUN apt-get update -qq \ 45 | && apt-get -y install \ 46 | --no-install-recommends \ 47 | libssl1.1 \ 48 | libcurl3 \ 49 | libncurses5 \ 50 | libbz2-1.0 \ 51 | liblzma5 \ 52 | libssl1.0.2 \ 53 | zlib1g 54 | 55 | ENV PATH=/opt/hall-lab/manta-${MANTA_VERSION}/bin:/opt/hall-lab/python-2.7.15/bin/:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/htslib-1.9/bin:$PATH 56 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 57 | 58 | CMD ["/bin/bash"] 59 | -------------------------------------------------------------------------------- /scripts/Pre_Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "Pre_Merge_SV_per_sample.wdl" as per_sample 3 | import "Pre_Merge_QC_per_sample.wdl" as qc 4 | import "SV_Tasks.wdl" as SV 5 | 6 | workflow Pre_Merge_SV { 7 | input { 8 | Array[File] aligned_crams 9 | String aligned_cram_suffix 10 | 11 | # reference inputs 12 | File ref_fasta 13 | File ref_fasta_index 14 | File ref_cache 15 | File? call_regions_bed 16 | File? 
call_regions_bed_index 17 | File exclude_regions 18 | String cohort 19 | String center 20 | 21 | # system inputs 22 | Int preemptible_tries 23 | } 24 | 25 | 26 | scatter (i in range(length(aligned_crams))) { 27 | File aligned_cram = aligned_crams[i] 28 | 29 | call per_sample.Pre_Merge_SV_Per_Sample { 30 | input: 31 | aligned_cram = aligned_cram, 32 | aligned_cram_suffix = aligned_cram_suffix, 33 | ref_fasta = ref_fasta, 34 | ref_fasta_index = ref_fasta_index, 35 | call_regions_bed = call_regions_bed, 36 | call_regions_bed_index = call_regions_bed_index, 37 | ref_cache = ref_cache, 38 | exclude_regions = exclude_regions, 39 | preemptible_tries = preemptible_tries 40 | } 41 | 42 | call qc.Pre_Merge_QC_Per_Sample { 43 | input: 44 | manta_vcf = Pre_Merge_SV_Per_Sample.manta_vcf, 45 | lumpy_vcf = Pre_Merge_SV_Per_Sample.smoove_vcf, 46 | cnvnator_vcf = Pre_Merge_SV_Per_Sample.cnvnator_output_cn_txt, 47 | cohort = cohort, 48 | center = center, 49 | preemptible_tries = preemptible_tries 50 | } 51 | } 52 | 53 | #scatter (p in [("manta", Pre_Merge_QC_Per_Sample.manta_counts), ("lumpy", Pre_Merge_QC_Per_Sample.lumpy_counts)]) { 54 | # call SV.Make_Count_Plot { 55 | # input: 56 | # name=p.left, 57 | # count_files=p.right 58 | # } 59 | #} 60 | 61 | output { 62 | Array[File] cram_indices = Pre_Merge_SV_Per_Sample.cram_index 63 | Array[File] manta_vcfs = Pre_Merge_SV_Per_Sample.manta_vcf 64 | Array[File] manta_tbis = Pre_Merge_SV_Per_Sample.manta_tbi 65 | Array[File] manta_original_vcfs = Pre_Merge_SV_Per_Sample.manta_original_vcf 66 | Array[File] manta_original_tbis = Pre_Merge_SV_Per_Sample.manta_original_tbi 67 | Array[File] cnvnator_cn_hist_roots = Pre_Merge_SV_Per_Sample.cnvnator_cn_hist_root 68 | Array[File] cnvnator_output_cn_txt_files = Pre_Merge_SV_Per_Sample.cnvnator_output_cn_txt 69 | Array[File] cnvnator_cn_bed_files = Pre_Merge_SV_Per_Sample.cnvnator_cn_bed 70 | Array[File] smoove_vcfs = Pre_Merge_SV_Per_Sample.smoove_vcf 71 | Array[File] smoove_csis = Pre_Merge_SV_Per_Sample.smoove_csi 72 | Array[File] lumpy_counts = Pre_Merge_QC_Per_Sample.lumpy_counts 73 | Array[File] manta_counts = Pre_Merge_QC_Per_Sample.manta_counts 74 | #Array[File] count_plots = Make_Count_Plot.counts_plot 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Cohort SV detection pipeline 3 | 4 | # Table of contents 5 | 1. [Overview](#overview) 6 | 2. [WDL scripts](#wdl-scripts) 7 | 3. [Docker images](#docker-images) 8 | 9 | # Overview 10 | This repository contains pipeline scripts for structural variation detection in large cohorts. The pipeline is designed for Illumina paired-end whole genome sequencing data, preferably with at least 30x sequence coverage. Data inputs should be a set of sorted CRAM files, aligned with BWA-MEM. 11 | 12 | This pipeline detects structural variation based on breakpoint sequence evidence using both the LUMPY and Manta algorithms. Structural variant (SV) breakpoints are then unified and merged using the [SVTools](https://github.com/hall-lab/svtools) workflow, followed by re-genotyping with [SVTyper](https://github.com/hall-lab/svtyper) and read-depth annotation with [CNVnator](https://github.com/abyzovlab/CNVnator). Finally, SV types are reclassified based on the concordance between read-depth and breakpoint genotype. 
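At the command-line level, the merging step corresponds roughly to the `svtools lsort`/`svtools lmerge` pattern from the SVTools documentation. The sketch below is illustrative only (sample file names and the `-f` slop parameter are placeholders); the exact invocations used by this pipeline live in the tasks of [scripts/SV_Tasks.wdl](scripts/SV_Tasks.wdl).

```bash
# Sort the per-sample VCFs together, then merge breakpoints into a single cohort-level VCF.
svtools lsort sampleA.vcf.gz sampleB.vcf.gz | bgzip -c > cohort.lsort.vcf.gz
zcat cohort.lsort.vcf.gz | svtools lmerge -i /dev/stdin -f 20 | bgzip -c > cohort.lmerge.vcf.gz
# Downstream (after per-sample SVTyper re-genotyping and CNVnator read-depth annotation):
# svtools vcfpaste -> svtools prune -> svtools classify, as wrapped by Post_Merge_SV.wdl.
```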
13 | 14 | Additional details on the SVTools pipeline are available in the [SVTools tutorial](https://github.com/hall-lab/svtools/blob/master/Tutorial.md). 15 | 16 | ![Workflow](images/workflow.wdl.v04.low-01.png?raw=true "Workflow") 17 | 18 | # WDL scripts 19 | 20 | Pipeline scripts (in [WDL format](https://software.broadinstitute.org/wdl/)) are available in the [scripts](scripts) directory. These scripts can be launched using [Cromwell](https://github.com/broadinstitute/cromwell) (version 25 or later). 21 | 22 | While the SV pipeline can be run in its entirety via the [SV_Pipeline_Full.wdl](scripts/SV_Pipeline_Full.wdl) script, we recommend running the pipeline in three stages to enable intermediate quality control checkpoints. 23 | 24 | ## 1. [Pre_Merge_SV.wdl](scripts/Pre_Merge_SV.wdl) 25 | 26 | For each sample: 27 | - SV discovery with LUMPY using the [smoove](https://github.com/brentp/smoove) wrapper 28 | - Preliminary SV genotyping with SVTyper (also done within the smoove wrapper) 29 | - SV discovery with [Manta](https://github.com/Illumina/manta), including insertions 30 | - Generate [CNVnator](https://github.com/abyzovlab/CNVnator) histogram files 31 | 32 | After this step, we recommend performing quality control checks on each sample before merging them into the cohort-level VCF (step 2). To help with this, per-sample variant counts are generated for both LUMPY and Manta outputs. 33 | 34 | ## 2. [Merge_SV.wdl](scripts/Merge_SV.wdl) 35 | 36 | This step merges the sample-level VCF files from step 1 using the LUMPY breakpoint probability curves to produce a single cohort-level VCF. 37 | 38 | ## 3. [Post_Merge_SV.wdl](scripts/Post_Merge_SV.wdl) 39 | 40 | This step re-genotypes each sample at the sites in the cohort-level VCF file from step 2, and then combines the results into a set of final VCFs, split by variant type for efficiency (deletions, insertions, breakends, and other:duplications+inversions). 41 | 42 | For each sample: 43 | - Re-genotype each SV using SVTyper (note that insertion calls from Manta are taken from the per-sample genotypes and not processed with SVTyper) 44 | - Annotate the read-depth at each SV using CNVnator 45 | - Generate a .ped file of sample names and sexes 46 | 47 | For the cohort: 48 | - Combine the re-genotyped VCFs into a single cohort-level VCF 49 | - Prune overlapping SVs 50 | - Classify SV type based on the concordance between variant genotypes and read-depths 51 | - Sort and index the VCF 52 | 53 | # Docker images 54 | 55 | - Docker images for this pipeline are available at https://hub.docker.com/u/halllab. 56 | - Dockerfiles for these containers are available in the [docker](docker) directory. 57 | - WDL test scripts for each of these Docker containers are available in the [test](test) directory. 
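Putting the pieces together, a typical stage-1 launch with a recent Cromwell release might look like the sketch below. The jar location and backend configuration are placeholders to adapt to your environment; note that older Cromwell releases (pre-30) take the inputs and options JSONs as positional arguments rather than flags.

```bash
# Placeholder paths: point these at your Cromwell jar, backend config, and a filled-in inputs JSON.
CROMWELL_JAR=/path/to/cromwell.jar
BACKEND_CONF=scripts/jes.conf              # Google Cloud (JES) backend; substitute your own config for other backends
OPTIONS=scripts/generic.options.json

java -Dconfig.file="$BACKEND_CONF" -jar "$CROMWELL_JAR" run \
  scripts/Pre_Merge_SV.wdl \
  --inputs scripts/Pre_Merge_SV.inputs.json \
  --options "$OPTIONS"
```

The same pattern applies to the Merge_SV.wdl and Post_Merge_SV.wdl stages, each with its corresponding inputs JSON in the [scripts](scripts) directory.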
58 | -------------------------------------------------------------------------------- /docker/smoove/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch-slim as builder-base 2 | LABEL maintainer "Dave Larson " 3 | RUN apt-get update -qq \ 4 | && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | make \ 7 | cmake \ 8 | autoconf \ 9 | automake \ 10 | libtool \ 11 | gawk \ 12 | git-core \ 13 | bzip2 \ 14 | libbz2-dev \ 15 | liblzma-dev \ 16 | libssl1.0-dev \ 17 | libcurl4-openssl-dev \ 18 | ca-certificates \ 19 | curl \ 20 | zlib1g-dev 21 | 22 | 23 | FROM builder-base as lumpy-2f3fccb-build 24 | LABEL maintainer "Dave Larson " 25 | RUN LUMPY_COMMIT=2f3fccb0e6ef8732ff2f5c4e2c12a7a0b8ae2784 \ 26 | && git clone --single-branch --recursive --depth 5 https://github.com/arq5x/lumpy-sv \ 27 | && cd lumpy-sv \ 28 | && git checkout $LUMPY_COMMIT \ 29 | && make -j 3 \ 30 | && mkdir -p /opt/hall-lab/lumpy-2f3fccb/bin \ 31 | && cp ./bin/* /opt/hall-lab/lumpy-2f3fccb/bin 32 | 33 | FROM builder-base as svtyper-0.7.0-build 34 | LABEL maintainer "Dave Larson " 35 | 36 | COPY --from=halllab/python2.7-build:v1 /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 37 | ENV PATH=/opt/hall-lab/python-2.7.15/bin:${PATH} 38 | RUN SVTYPER_VERSION=0.7.0 \ 39 | && git clone https://github.com/hall-lab/svtyper \ 40 | && cd svtyper \ 41 | && git checkout v$SVTYPER_VERSION \ 42 | && sed -i '/numpy/d' setup.py \ 43 | && sed -i '/scipy/d' setup.py \ 44 | && pip install . 45 | RUN find /opt/hall-lab/python-2.7.15/ -depth \( -name '*.pyo' -o -name '*.pyc' -o -name 'test' -o -name 'tests' \) -exec rm -rf '{}' + ; 46 | RUN find /opt/hall-lab/python-2.7.15/lib/python2.7/site-packages/ -name '*.so' -print -exec sh -c 'file "{}" | grep -q "not stripped" && strip -s "{}"' \; 47 | 48 | # Smoove build... 
49 | FROM builder-base as smoove-0.2.2-build 50 | WORKDIR /opt/hall-lab/smoove-0.2.2/bin 51 | RUN SMOOVE_VERSION=0.2.2 \ 52 | && MOSDEPTH_VERSION=0.2.4 \ 53 | && GSORT_VERSION=0.0.6 \ 54 | && curl -L -o mosdepth https://github.com/brentp/mosdepth/releases/download/v$MOSDEPTH_VERSION/mosdepth \ 55 | && chmod a+x mosdepth \ 56 | && curl -L -o gsort https://github.com/brentp/gsort/releases/download/v$GSORT_VERSION/gsort_linux_amd64 \ 57 | && chmod a+x gsort \ 58 | && curl -L -o smoove https://github.com/brentp/smoove/releases/download/v$SMOOVE_VERSION/smoove \ 59 | && chmod +x smoove 60 | 61 | FROM debian:stretch-slim 62 | LABEL maintainer "Dave Larson " 63 | 64 | COPY --from=lumpy-2f3fccb-build /opt/hall-lab/lumpy-2f3fccb/bin /opt/hall-lab/lumpy-2f3fccb/bin 65 | COPY --from=svtyper-0.7.0-build /opt/hall-lab/python-2.7.15 /opt/hall-lab/python-2.7.15 66 | COPY --from=halllab/htslib-1.9-build:v1 /build/deb-build/opt/hall-lab/htslib-1.9 /opt/hall-lab/htslib-1.9 67 | COPY --from=halllab/samtools-1.9-build:v1 /build/deb-build/opt/hall-lab/samtools-1.9 /opt/hall-lab/samtools-1.9 68 | COPY --from=halllab/bcftools-1.9-build:v1 /build/deb-build/opt/hall-lab/bcftools-1.9 /opt/hall-lab/bcftools-1.9 69 | COPY --from=smoove-0.2.2-build /opt/hall-lab/smoove-0.2.2/bin /opt/hall-lab/smoove-0.2.2/bin 70 | 71 | ENV PATH=/opt/hall-lab/smoove-0.2.2/bin:/opt/hall-lab/python-2.7.15/bin:/opt/hall-lab/lumpy-2f3fccb/bin:/opt/hall-lab/htslib-1.9/bin:/opt/hall-lab/samtools-1.9/bin:/opt/hall-lab/bcftools-1.9/bin:$PATH 72 | ENV LD_LIBRARY_PATH=/opt/hall-lab/htslib-1.9/lib:$LD_LIBRARY_PATH 73 | 74 | RUN apt-get update -qq \ 75 | && apt-get install -y --no-install-recommends \ 76 | libssl1.1 \ 77 | libcurl3 \ 78 | libncurses5 \ 79 | libbz2-1.0 \ 80 | liblzma5 \ 81 | libssl1.0.2 \ 82 | zlib1g 83 | 84 | CMD ["/bin/bash"] 85 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" 6 | RESOURCE_DIR=/opt/ccdg/cromwell/resources 7 | #RESOURCE_DIR='.' 8 | CROMWELL_CONF_TEMPLATE=$RESOURCE_DIR/application.conf.template 9 | MYSQL_CONF_TEMPLATE=$RESOURCE_DIR/mysql.cnf.template 10 | 11 | MYSQLD_PID='' 12 | LOCK_ACQUIRED=false 13 | 14 | MAIN_DIR="$1" 15 | 16 | function clean_directory { 17 | local cleaned=$(echo "$1" | sed 's|/*$||') 18 | local abspath=$(cd "$cleaned" && pwd -P) 19 | echo "$abspath" 20 | } 21 | 22 | function wait_for_file { 23 | local file="$1" 24 | for i in `seq 1 28`; do 25 | if [[ ! 
-e "$file" ]]; then 26 | sleep $i 27 | fi 28 | done 29 | } 30 | 31 | function cromwell_conf { 32 | local dir="$1" 33 | echo "$dir/application.conf" 34 | } 35 | 36 | 37 | function set_up_conf { 38 | local dir="$1" 39 | local conf_file=$(cromwell_conf "$dir") 40 | if [[ -s "$conf_file" ]]; then 41 | echo "Using existing cromwell config at $conf_file" >&2 42 | else 43 | cat $CROMWELL_CONF_TEMPLATE | sed "s|%%SHARED_FS_DIRECTORY%%|$dir|" > "$conf_file" 44 | echo "Created cromwell config $conf_file" >&2 45 | fi 46 | } 47 | 48 | function has_db { 49 | local dir="$1" 50 | if [[ -d "$dir/db/run/mysqld" && -d "$dir/db/lib/mysql" && -d "$dir/db/log/mysql" && -e "$dir/db/lib/mysql/cromwell" ]]; then 51 | true 52 | else 53 | false 54 | fi 55 | } 56 | 57 | function create_mysql_directories { 58 | local dir="$1" 59 | for new_dir in "$dir/db/run/mysqld" "$dir/db/lib/mysql" "$dir/db/log/mysql" 60 | do 61 | mkdir -p "$new_dir" 62 | done 63 | # TODO Are we really sure we need/want to do this? 64 | touch "$dir/db/log/mysql/error.log" 65 | chmod -R 777 "$dir/db" 66 | } 67 | 68 | function setup_new_database { 69 | echo "create database cromwell; create user 'cromwell'@'localhost' identified by 'test4cromwell'; grant all privileges on *.* to 'cromwell'@localhost;" | mysql -u root --socket=/tmp/mysqld.sock 70 | } 71 | 72 | function start_mysql { 73 | local dir="$1" 74 | local mysql_cnf_file=$(mysql_conf "$dir") 75 | mysqld_safe --defaults-file="$mysql_cnf_file" & 76 | MYSQLD_PID="$!" 77 | wait_for_file "/tmp/mysqld.sock" 78 | } 79 | 80 | function shutdown_mysql { 81 | echo "Shutting down mysql" >&2 82 | /usr/bin/mysqladmin -u root --socket /tmp/mysqld.sock shutdown 83 | MYSQLD_PID='' 84 | } 85 | 86 | function install_db { 87 | local dir="$1" 88 | local ldata="$dir/db/lib/mysql" 89 | create_mysql_directories "$dir" 90 | mysql_install_db --user=$USER --basedir=/usr/ --ldata=$ldata 91 | start_mysql "$dir" 92 | setup_new_database "$dir" 93 | } 94 | 95 | function mysql_conf { 96 | local dir="$1" 97 | echo "$dir/mysql.cnf" 98 | } 99 | 100 | function set_up_mysql_cnf { 101 | local dir="$1" 102 | local conf=$(mysql_conf "$dir") 103 | if [[ -s "$conf" ]]; then 104 | echo "Using existing mysql config at $conf" >&2 105 | else 106 | cat $MYSQL_CONF_TEMPLATE | sed "s|%%SHARED_FS_DIRECTORY%%|$dir|" > "$conf" 107 | echo "Created mysql config $conf" >&2 108 | fi 109 | } 110 | 111 | function is_locked { 112 | local dir="$1" 113 | if [[ -d "$dir/.lock" ]]; then 114 | true 115 | else 116 | false 117 | fi 118 | } 119 | 120 | function lock { 121 | local dir="$1" 122 | if mkdir "$dir/.lock"; then 123 | LOCK_ACQUIRED=true 124 | echo "Locked $dir" >&2 125 | else 126 | echo "Unable to lock $dir" >&2 127 | exit 1 128 | fi 129 | } 130 | 131 | function unlock { 132 | local dir="$1" 133 | if rmdir "$dir/.lock"; then 134 | LOCK_ACQUIRED=false 135 | echo "Unlocked $dir" >&2 136 | else 137 | echo "Unable to unlock $dir" >&2 138 | exit 1 139 | fi 140 | } 141 | 142 | function run_cromwell { 143 | local dir="$1" 144 | local cromwell_conf=$(cromwell_conf "$dir") 145 | /usr/bin/java -Xmx31G -Xms16G -Dconfig.file="$cromwell_conf" -jar /app/cromwell.jar run "${@:2}" 146 | } 147 | 148 | function cleanup { 149 | local dir="$1" 150 | if [[ $MYSQLD_PID ]]; then 151 | shutdown_mysql 152 | fi 153 | if $LOCK_ACQUIRED; then 154 | unlock "$dir" 155 | fi 156 | } 157 | 158 | function main { 159 | local dir="$1" 160 | if [[ -d "$dir" ]] 161 | then 162 | local clean_dir=$(clean_directory "$dir") 163 | trap 'cleanup $(clean_directory "$MAIN_DIR")' EXIT SIGTERM 
SIGINT 164 | lock "$clean_dir" 165 | set_up_conf "$clean_dir" 166 | set_up_mysql_cnf "$clean_dir" 167 | if ! $(has_db "$clean_dir"); then 168 | # note that this also starts mysqld 169 | install_db "$clean_dir" 170 | else 171 | start_mysql "$clean_dir" 172 | fi 173 | echo "${@:2}" >&2 174 | run_cromwell "$clean_dir" "${@:2}" 175 | else 176 | echo "$dir is not a directory" >&2 177 | exit 1 178 | fi 179 | } 180 | 181 | main "${@}"; 182 | -------------------------------------------------------------------------------- /docker/manta_samtools/doctor_manta.1.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, StringIO 2 | import pandas as pd 3 | import numpy as np 4 | import scipy.spatial.distance as ssd 5 | import pysam 6 | sys.path.insert(1,'/gscmnt/gc2802/halllab/abelhj/svtools') 7 | from svtools.vcf.file import Vcf 8 | from svtools.vcf.variant import Variant 9 | from collections import namedtuple 10 | import svtools.utils as su 11 | 12 | 13 | def add_arguments_to_parser(parser): 14 | parser.add_argument('-i', '--vcf', metavar='', dest='manta_vcf', help="manta input vcf") 15 | parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') 16 | parser.add_argument('-s', '--slop', dest='slop', default=0, required=False, help='padding to either side') 17 | parser.add_argument('-m', '--max_ins', dest='max_ins', default=1000, type=int, required=False, help='maximum insert size') 18 | 19 | def command_parser(): 20 | parser = argparse.ArgumentParser(description="cross-cohort cnv caller") 21 | add_arguments_to_parser(parser) 22 | return parser 23 | 24 | def convert_variant(v, max_ins): 25 | set_read_counts(v) 26 | set_cis_prs(v) 27 | if v.get_info('SVTYPE')=='DEL': 28 | convert_del(v) 29 | elif v.get_info('SVTYPE')=='DUP': 30 | convert_dup(v) 31 | elif v.get_info('SVTYPE')=='INV': 32 | convert_inv(v) 33 | elif v.get_info('SVTYPE')=='INS': 34 | convert_ins(v, max_ins) 35 | elif v.get_info('SVTYPE')=='BND': 36 | convert_bnd(v) 37 | 38 | def split_ci(ci): 39 | return[int(ci.split(',')[0]), int(ci.split(',')[1])] 40 | 41 | def uniform_pr(length): 42 | pr=np.ones(length, dtype='float64')/length 43 | pr1=','.join( map(str, pr)) 44 | return pr1 45 | 46 | def set_read_counts(var): 47 | 48 | sample=var.sample_list[0] 49 | gt=var.genotype(sample) 50 | pe=0 51 | sr=0 52 | if 'PR' in var.format_dict: 53 | pe=int(gt.get_format('PR').split(',')[1]) 54 | if 'SR' in var.format_dict: 55 | sr=int(gt.get_format('SR').split(',')[1]) 56 | var.info['PE']=pe 57 | var.info['SR']=sr 58 | var.info['SU']=pe+sr 59 | 60 | def set_cis_prs(v): 61 | imprec=False 62 | cipos='0,0' 63 | ciend='0,0' 64 | prpos=1.0 65 | prend=1.0 66 | if 'CIPOS' in v.info: 67 | cipos=v.get_info('CIPOS') 68 | [start, stop]=split_ci(cipos) 69 | prpos=uniform_pr(stop-start+1) 70 | imprec=True 71 | if 'CIEND' in v.info: 72 | ciend=v.get_info('CIEND') 73 | [start, stop]=split_ci(ciend) 74 | prend=uniform_pr(stop-start+1) 75 | imprec=True 76 | v.info['CIPOS']=cipos 77 | v.info['CIEND']=ciend 78 | v.info['CIPOS95']=cipos 79 | v.info['CIEND95']=ciend 80 | v.info['PRPOS']=prpos 81 | v.info['PREND']=prend 82 | v.set_info('IMPRECISE', imprec) 83 | 84 | def convert_del(var): 85 | var.alt='' 86 | var.info['STRANDS']='+-:'+str(var.info['SU']) 87 | var.ref='N' 88 | 89 | def convert_dup(var): 90 | var.alt='' 91 | var.info['STRANDS']='-+:'+str(var.info['SU']) 92 | var.ref='N' 93 | 94 | #def convert_inv(var): 95 | # var.ref='N' 96 | # var.alt='' 97 | # if 'INV3' in var.info: 98 | # 
var.info['STRANDS']='++:'+var.info['SU'] 99 | # else: 100 | # var.info['STRANDS']='--:'+var.info['SU'] 101 | 102 | def convert_inv(var): 103 | var.ref='N' 104 | strands='' 105 | if 'INV3' in var.info: 106 | strands='++:' 107 | var.alt='N]'+var.chrom+':'+str(var.info['END'])+']' 108 | else: 109 | strands='--:' 110 | var.alt='['+var.chrom+':'+str(var.info['END'])+'[' 111 | var.info['SVTYPE']='BND' 112 | var.info['STRANDS']=strands+str(var.info['SU']) 113 | 114 | 115 | def convert_ins(var, max_ins): 116 | var.ref='N' 117 | var.alt='' 118 | var.info['STRANDS']='+.:'+str(var.info['SU']) 119 | orig_len='.' 120 | new_len=max_ins 121 | if 'SVLEN' in var.info: 122 | svlen=int(var.get_info('SVLEN')) 123 | orig_len=svlen 124 | if svlen0: 143 | strands="+-:" 144 | newalt='N'+alt[ff::] 145 | else: 146 | ff=alt.find("]") 147 | if ff==0: 148 | strands="-+:" 149 | ff1=alt.find("]", 1) 150 | newalt=alt[0:(ff1+1)]+'N' 151 | else: 152 | strands="++:" 153 | newalt='N'+alt[ff::] 154 | var.alt=newalt 155 | var.info['STRANDS']=strands+str(var.info['SU']) 156 | 157 | 158 | def run_from_args(args): 159 | 160 | vcf = Vcf() 161 | vcf_out=sys.stdout 162 | in_header = True 163 | header_lines = list() 164 | with su.InputStream(args.manta_vcf) as input_stream: 165 | for line in input_stream: 166 | if in_header: 167 | header_lines.append(line) 168 | if line[0:6] == '#CHROM': 169 | in_header=False 170 | vcf.add_header(header_lines) 171 | vcf.add_info('PRPOS', '1', 'String', 'Breakpoint probability dist') 172 | vcf.add_info('PREND', '1', 'String', 'Breakpoint probability dist') 173 | vcf.add_info('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--') 174 | vcf.add_info('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples') 175 | vcf.add_info('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples') 176 | vcf.add_info('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples') 177 | vcf.add_info('INSLEN_ORIG', '.', 'Integer', 'Original insertion length') 178 | vcf.add_info('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants') 179 | vcf.add_info('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants') 180 | vcf.add_info('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant') 181 | vcf_out.write(vcf.get_header()+'\n') 182 | else: 183 | v = Variant(line.rstrip().split('\t'), vcf) 184 | convert_variant(v, args.max_ins) 185 | vcf_out.write(v.get_var_string()+"\n") 186 | 187 | 188 | parser=command_parser() 189 | args=parser.parse_args() 190 | run_from_args(args) 191 | -------------------------------------------------------------------------------- /scripts/Post_Merge_SV.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import "SV_Tasks.wdl" as SV 3 | 4 | workflow Post_Merge_SV { 5 | # data inputs 6 | input { 7 | Array[File] aligned_crams 8 | Array[File] aligned_cram_indices 9 | Array[File] cn_hist_roots 10 | Array[File] manta_vcfs 11 | String aligned_cram_suffix 12 | File merged_vcf 13 | String cohort_name 14 | String final_vcf_name 15 | 16 | # reference inputs 17 | File ref_fasta 18 | File ref_fasta_index 19 | File ref_cache 20 | File mei_annotation_bed 21 | 22 | # system inputs 23 | Int preemptible_tries 24 | } 25 | 26 | call SV.Split_By_Type { 27 | input: 28 | input_vcf = merged_vcf, 29 | output_vcf_prefix = cohort_name + ".merged", 
30 | preemptible_tries = preemptible_tries 31 | } 32 | 33 | # Re-genotype and call copy number for each sample on the merged SV VCF 34 | scatter (i in range(length(aligned_crams))) { 35 | 36 | File aligned_cram = aligned_crams[i] 37 | File aligned_cram_index = aligned_cram_indices[i] 38 | File cn_hist_root = cn_hist_roots[i] 39 | String basename = sub(sub(aligned_cram, "^.*/", ""), aligned_cram_suffix + "$", "") 40 | 41 | call SV.Get_Sample_Name { 42 | input: 43 | input_cram = aligned_cram, 44 | preemptible_tries = preemptible_tries 45 | } 46 | 47 | call SV.Get_Sex { 48 | input: 49 | input_cn_hist_root = cn_hist_root, 50 | ref_fasta_index = ref_fasta_index, 51 | preemptible_tries = preemptible_tries 52 | } 53 | 54 | call SV.Genotype as Genotype_Merged_BND { 55 | input: 56 | basename = basename + ".bnd", 57 | input_cram = aligned_cram, 58 | input_cram_index = aligned_cram_index, 59 | input_vcf = Split_By_Type.bnd_vcf, 60 | ref_cache = ref_cache, 61 | preemptible_tries = preemptible_tries 62 | } 63 | 64 | call SV.Genotype as Genotype_Merged_DEL { 65 | input: 66 | basename = basename + ".del", 67 | input_cram = aligned_cram, 68 | input_cram_index = aligned_cram_index, 69 | input_vcf = Split_By_Type.del_vcf, 70 | ref_cache = ref_cache, 71 | preemptible_tries = preemptible_tries 72 | } 73 | 74 | call SV.Take_Original_Genotypes as Genotype_Merged_INS { 75 | input: 76 | sample_name = Get_Sample_Name.sample, 77 | original_per_sample_vcf = manta_vcfs[i], 78 | basename = basename + ".ins", 79 | input_vcf = Split_By_Type.ins_vcf, 80 | input_variant_to_sname_mapping = Split_By_Type.ins_split, 81 | preemptible_tries = preemptible_tries 82 | } 83 | 84 | call SV.Genotype as Genotype_Merged_OTHER { 85 | input: 86 | basename = basename + ".other", 87 | input_cram = aligned_cram, 88 | input_cram_index = aligned_cram_index, 89 | input_vcf = Split_By_Type.other_vcf, 90 | ref_cache = ref_cache, 91 | preemptible_tries = preemptible_tries 92 | } 93 | 94 | call SV.Copy_Number as Copy_Number_DEL { 95 | input: 96 | basename = basename + ".del", 97 | sample = Get_Sample_Name.sample, 98 | input_vcf = Genotype_Merged_DEL.output_vcf, 99 | input_cn_hist_root = cn_hist_root, 100 | ref_cache = ref_cache, 101 | preemptible_tries = preemptible_tries 102 | } 103 | 104 | call SV.Copy_Number as Copy_Number_OTHER { 105 | input: 106 | basename = basename + ".other", 107 | sample = Get_Sample_Name.sample, 108 | input_vcf = Genotype_Merged_OTHER.output_vcf, 109 | input_cn_hist_root = cn_hist_root, 110 | ref_cache = ref_cache, 111 | preemptible_tries = preemptible_tries 112 | } 113 | } 114 | 115 | call SV.Make_Pedigree_File { 116 | input: 117 | sample_array = Get_Sample_Name.sample, 118 | sex_array = Get_Sex.sex, 119 | output_ped_basename = cohort_name, 120 | } 121 | 122 | call SV.Paste_VCF as Paste_VCF_BND { 123 | input: 124 | input_vcfs = Genotype_Merged_BND.output_vcf, 125 | output_vcf_basename = cohort_name + ".merged.gt.bnd", 126 | preemptible_tries = preemptible_tries 127 | } 128 | 129 | call SV.Paste_VCF as Paste_VCF_DEL { 130 | input: 131 | input_vcfs = Copy_Number_DEL.output_vcf, 132 | output_vcf_basename = cohort_name + ".merged.gt.cn.del", 133 | preemptible_tries = preemptible_tries 134 | } 135 | 136 | call SV.Paste_VCF as Paste_VCF_INS { 137 | input: 138 | input_vcfs = Genotype_Merged_INS.output_vcf, 139 | output_vcf_basename = cohort_name + ".merged.gt.ins", 140 | preemptible_tries = preemptible_tries 141 | } 142 | 143 | call SV.Paste_VCF as Paste_VCF_OTHER { 144 | input: 145 | input_vcfs = 
Copy_Number_OTHER.output_vcf, 146 | output_vcf_basename = cohort_name + ".merged.gt.cn.other", 147 | preemptible_tries = preemptible_tries 148 | } 149 | 150 | call SV.Prune_VCF as Prune_VCF_BND{ 151 | input: 152 | input_vcf_gz = Paste_VCF_BND.output_vcf_gz, 153 | output_vcf_basename = cohort_name + ".merged.gt.pruned.bnd", 154 | preemptible_tries = preemptible_tries 155 | } 156 | 157 | call SV.Prune_VCF as Prune_VCF_DEL{ 158 | input: 159 | input_vcf_gz = Paste_VCF_DEL.output_vcf_gz, 160 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.del", 161 | preemptible_tries = preemptible_tries 162 | } 163 | 164 | call SV.Prune_VCF as Prune_VCF_INS{ 165 | input: 166 | input_vcf_gz = Paste_VCF_INS.output_vcf_gz, 167 | output_vcf_basename = cohort_name + ".merged.gt.pruned.ins", 168 | preemptible_tries = preemptible_tries 169 | } 170 | 171 | call SV.Prune_VCF as Prune_VCF_OTHER{ 172 | input: 173 | input_vcf_gz = Paste_VCF_OTHER.output_vcf_gz, 174 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.other", 175 | preemptible_tries = preemptible_tries 176 | } 177 | 178 | call SV.Classify as Classify_DEL{ 179 | input: 180 | input_vcf_gz = Prune_VCF_DEL.output_vcf_gz, 181 | input_ped = Make_Pedigree_File.output_ped, 182 | mei_annotation_bed = mei_annotation_bed, 183 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class.del", 184 | preemptible_tries = preemptible_tries 185 | } 186 | 187 | call SV.Classify as Classify_OTHER{ 188 | input: 189 | input_vcf_gz = Prune_VCF_OTHER.output_vcf_gz, 190 | input_ped = Make_Pedigree_File.output_ped, 191 | mei_annotation_bed = mei_annotation_bed, 192 | output_vcf_basename = cohort_name + ".merged.gt.cn.pruned.class.other", 193 | preemptible_tries = preemptible_tries 194 | } 195 | 196 | call SV.Sort_Index_VCF as Sort_Index_VCF_BND { 197 | input: 198 | input_vcf_gz = Prune_VCF_BND.output_vcf_gz, 199 | output_vcf_name = final_vcf_name + ".bnd.vcf.gz", 200 | preemptible_tries = preemptible_tries 201 | } 202 | 203 | call SV.Sort_Index_VCF as Sort_Index_VCF_DEL { 204 | input: 205 | input_vcf_gz = Classify_DEL.output_vcf_gz, 206 | output_vcf_name = final_vcf_name + ".del.vcf.gz", 207 | preemptible_tries = preemptible_tries 208 | } 209 | 210 | call SV.Sort_Index_VCF as Sort_Index_VCF_INS { 211 | input: 212 | input_vcf_gz = Prune_VCF_INS.output_vcf_gz, 213 | output_vcf_name = final_vcf_name + ".ins.vcf.gz", 214 | preemptible_tries = preemptible_tries 215 | } 216 | 217 | call SV.Sort_Index_VCF as Sort_Index_VCF_OTHER { 218 | input: 219 | input_vcf_gz = Classify_OTHER.output_vcf_gz, 220 | output_vcf_name = final_vcf_name + ".other.vcf.gz", 221 | preemptible_tries = preemptible_tries 222 | } 223 | 224 | call SV.Filter_Index as Filter_Index_BND { 225 | input: 226 | input_vcf_gz = Sort_Index_VCF_BND.output_vcf_gz, 227 | output_vcf_name = final_vcf_name + ".bnd.vcf.gz", 228 | preemptible_tries = preemptible_tries 229 | } 230 | 231 | call SV.Filter_Index as Filter_Index_DEL { 232 | input: 233 | input_vcf_gz = Sort_Index_VCF_DEL.output_vcf_gz, 234 | output_vcf_name = final_vcf_name + ".del.vcf.gz", 235 | preemptible_tries = preemptible_tries 236 | } 237 | 238 | call SV.Filter_Index as Filter_Index_INS { 239 | input: 240 | input_vcf_gz = Sort_Index_VCF_INS.output_vcf_gz, 241 | output_vcf_name = final_vcf_name + ".ins.vcf.gz", 242 | preemptible_tries = preemptible_tries 243 | } 244 | 245 | call SV.Filter_Index as Filter_Index_OTHER { 246 | input: 247 | input_vcf_gz = Sort_Index_VCF_OTHER.output_vcf_gz, 248 | output_vcf_name = final_vcf_name + ".other.vcf.gz", 
249 | preemptible_tries = preemptible_tries 250 | } 251 | 252 | output { 253 | File output_ped = Make_Pedigree_File.output_ped 254 | File output_vcf_bnd = Filter_Index_BND.output_vcf_gz 255 | File output_vcf_index_bnd = Filter_Index_BND.output_vcf_gz_index 256 | File output_vcf_del = Filter_Index_DEL.output_vcf_gz 257 | File output_vcf_ins = Filter_Index_INS.output_vcf_gz 258 | File output_vcf_index_other = Filter_Index_OTHER.output_vcf_gz_index 259 | File output_vcf_other = Filter_Index_OTHER.output_vcf_gz 260 | File output_vcf_index_del = Filter_Index_DEL.output_vcf_gz_index 261 | File output_vcf_index_ins = Filter_Index_INS.output_vcf_gz_index 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /scripts/jes.conf: -------------------------------------------------------------------------------- 1 | # Updated Cromwell template for JES 2 | 3 | webservice { 4 | port = 8000 5 | interface = 0.0.0.0 6 | binding-timeout = 5s 7 | instance.name = "cromwell-for-wdl-runner" 8 | } 9 | 10 | akka { 11 | 12 | dispatchers { 13 | io-dispatcher { 14 | type = Dispatcher 15 | executor = "fork-join-executor" 16 | } 17 | 18 | api-dispatcher { 19 | type = Dispatcher 20 | executor = "fork-join-executor" 21 | } 22 | engine-dispatcher { 23 | type = Dispatcher 24 | executor = "fork-join-executor" 25 | } 26 | 27 | backend-dispatcher { 28 | type = Dispatcher 29 | executor = "fork-join-executor" 30 | } 31 | 32 | service-dispatcher { 33 | type = Dispatcher 34 | executor = "fork-join-executor" 35 | } 36 | 37 | } 38 | } 39 | 40 | system { 41 | # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting 42 | #abort-jobs-on-terminate = false 43 | 44 | # If 'true' then when Cromwell starts up, it tries to restart incomplete workflows 45 | workflow-restart = true 46 | 47 | # Cromwell will cap the number of running workflows at N 48 | max-concurrent-workflows = 5000 49 | 50 | # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist 51 | max-workflow-launch-count = 50 52 | 53 | # Number of seconds between workflow launches 54 | new-workflow-poll-rate = 20 55 | 56 | # Since the WorkflowLogCopyRouter is initialized in code, this is the number of workers 57 | number-of-workflow-log-copy-workers = 10 58 | 59 | # Default number of cache read workers 60 | number-of-cache-read-workers = 25 61 | 62 | io { 63 | # Global Throttling - This is mostly useful for GCS and can be adjusted to match 64 | # the quota availble on the GCS API 65 | number-of-requests = 100000 66 | per = 100 seconds 67 | 68 | # Number of times an I/O operation should be attempted before giving up and failing it. 69 | number-of-attempts = 5 70 | } 71 | } 72 | 73 | workflow-options { 74 | encrypted-fields: [] 75 | 76 | base64-encryption-key: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" 77 | 78 | workflow-log-dir: "cromwell-workflow-logs" 79 | 80 | workflow-log-temporary: true 81 | } 82 | 83 | # Optional call-caching configuration. 
84 | call-caching { 85 | enabled = true 86 | invalidate-bad-cache-results = true 87 | } 88 | 89 | google { 90 | 91 | application-name = "cromwell" 92 | 93 | auths = [ 94 | { 95 | name = "application-default" 96 | scheme = "application_default" 97 | }, 98 | ] 99 | } 100 | 101 | docker { 102 | hash-lookup { 103 | // Set this to match your available quota against the Google Container Engine API 104 | gcr-api-queries-per-100-seconds = 1000 105 | // Time in minutes before an entry expires from the docker hashes cache and needs to be fetched again 106 | cache-entry-ttl = "20 minutes" 107 | // Maximum number of elements to be kept in the cache. If the limit is reached, old elements will be removed from the cache 108 | cache-size = 200 109 | // How should docker hashes be looked up. Possible values are "local" and "remote" 110 | // "local": Lookup hashes on the local docker daemon using the cli 111 | // "remote": Lookup hashes on docker hub and gcr 112 | method = "remote" 113 | } 114 | } 115 | 116 | engine { 117 | # This instructs the engine which filesystems are at its disposal to perform any IO operation that it might need. 118 | # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here. 119 | # If you intend to be able to run workflows with this kind of declarations: 120 | # workflow { 121 | # String str = read_string("gs://bucket/my-file.txt") 122 | # } 123 | # You will need to provide the engine with a gcs filesystem 124 | # Note that the default filesystem (local) is always available. 125 | filesystems { 126 | gcs { 127 | auth = "application-default" 128 | } 129 | local { 130 | enabled: true 131 | } 132 | } 133 | } 134 | 135 | backend { 136 | default = "JES" 137 | providers { 138 | JES { 139 | actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" 140 | config { 141 | # Google project 142 | project = "washu-genome-inh-dis-analysis" 143 | root = "gs://ccdg-100-samples-trios-pilot-crams-mgi/workspace" 144 | 145 | # Set this to the lower of the two values "Queries per 100 seconds" and "Queries per 100 seconds per user" for 146 | # your project. 147 | # 148 | # Used to help determine maximum throughput to the Google Genomics API. Setting this value too low will 149 | # cause a drop in performance. Setting this value too high will cause QPS based locks from Google. 150 | # 1000 is the default "Queries per 100 seconds per user", 50000 is the default "Queries per 100 seconds" 151 | # See https://cloud.google.com/genomics/quotas for more information 152 | genomics-api-queries-per-100-seconds = 1000 153 | 154 | # Polling for completion backs-off gradually for slower-running jobs. 155 | # This is the maximum polling interval (in seconds): 156 | maximum-polling-interval = 600 157 | 158 | genomics { 159 | # A reference to an auth defined in the `google` stanza at the top. This auth is used to create 160 | # Pipelines and manipulate auth JSONs. 161 | auth = "application-default" 162 | 163 | 164 | // alternative service account to use on the launched compute instance 165 | // NOTE: If combined with service account authorization, both that serivce account and this service account 166 | // must be able to read and write to the 'root' GCS path 167 | compute-service-account = "default" 168 | 169 | # Endpoint for APIs, no reason to change this unless directed by Google. 
170 | endpoint-url = "https://genomics.googleapis.com/" 171 | } 172 | 173 | filesystems { 174 | gcs { 175 | # A reference to a potentially different auth for manipulating files via engine functions. 176 | auth = "application-default" 177 | } 178 | } 179 | 180 | } 181 | } 182 | } 183 | } 184 | 185 | services { 186 | KeyValue { 187 | class = "cromwell.services.keyvalue.impl.SqlKeyValueServiceActor" 188 | } 189 | MetadataService { 190 | class = "cromwell.services.metadata.impl.MetadataServiceActor" 191 | config { 192 | # Set this value to "Inf" to turn off metadata summary refresh. The default value is currently "2 seconds". 193 | # metadata-summary-refresh-interval = "Inf" 194 | # For higher scale environments, e.g. many workflows and/or jobs, DB write performance for metadata events 195 | # can improved by writing to the database in batches. Increasing this value can dramatically improve overall 196 | # performance but will both lead to a higher memory usage as well as increase the risk that metadata events 197 | # might not have been persisted in the event of a Cromwell crash. 198 | # 199 | # For normal usage the default value of 1 (effectively no batching) should be fine but for larger/production 200 | # environments we recommend a value of at least 500. There'll be no one size fits all number here so we recommend 201 | # benchmarking performance and tuning the value to match your environment 202 | # db-batch-size = 1 203 | # 204 | # Periodically the stored metadata events will be forcibly written to the DB regardless of if the batch size 205 | # has been reached. This is to prevent situations where events wind up never being written to an incomplete batch 206 | # with no new events being generated. The default value is currently 5 seconds 207 | # db-flush-rate = 5 seconds 208 | } 209 | } 210 | } 211 | 212 | database { 213 | # hsql default 214 | profile = "slick.jdbc.HsqldbProfile$" 215 | db { 216 | driver = "org.hsqldb.jdbcDriver" 217 | url = "jdbc:hsqldb:mem:${uniqueSchema};shutdown=false;hsqldb.tx=mvcc" 218 | connectionTimeout = 3000 219 | } 220 | 221 | # mysql example 222 | #driver = "slick.driver.MySQLDriver$" 223 | #db { 224 | # driver = "com.mysql.jdbc.Driver" 225 | # url = "jdbc:mysql://host/cromwell?rewriteBatchedStatements=true" 226 | # user = "user" 227 | # password = "pass" 228 | # connectionTimeout = 5000 229 | #} 230 | 231 | # For batch inserts the number of inserts to send to the DB at a time 232 | # insert-batch-size = 2000 233 | 234 | migration { 235 | # For databases with a very large number of symbols, selecting all the rows at once can generate a variety of 236 | # problems. In order to avoid any issue, the selection is paginated. This value sets how many rows should be 237 | # retrieved and processed at a time, before asking for the next chunk. 238 | read-batch-size = 100000 239 | 240 | # Because a symbol row can contain any arbitrary wdl value, the amount of metadata rows to insert from a single 241 | # symbol row can vary from 1 to several thousands (or more). To keep the size of the insert batch from growing out 242 | # of control we monitor its size and execute/commit when it reaches or exceeds writeBatchSize. 
243 | write-batch-size = 100000 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /docker/cromwell_mysql/application.conf.template: -------------------------------------------------------------------------------- 1 | webservice { 2 | port = 8000 3 | interface = 0.0.0.0 4 | binding-timeout = 5s 5 | instance.name = "reference" 6 | } 7 | 8 | akka { 9 | actor.default-dispatcher.fork-join-executor { 10 | # Number of threads = min(parallelism-factor * cpus, parallelism-max) 11 | # Below are the default values set by Akka, uncomment to tune these 12 | 13 | #parallelism-factor = 3.0 14 | #parallelism-max = 64 15 | } 16 | 17 | dispatchers { 18 | # A dispatcher for actors performing blocking io operations 19 | # Prevents the whole system from being slowed down when waiting for responses from external resources for instance 20 | io-dispatcher { 21 | type = Dispatcher 22 | executor = "fork-join-executor" 23 | # Using the forkjoin defaults, this can be tuned if we wish 24 | } 25 | 26 | # A dispatcher for actors handling API operations 27 | # Keeps the API responsive regardless of the load of workflows being run 28 | api-dispatcher { 29 | type = Dispatcher 30 | executor = "fork-join-executor" 31 | } 32 | 33 | # A dispatcher for engine actors 34 | # Because backends behaviour is unpredictable (potentially blocking, slow) the engine runs 35 | # on its own dispatcher to prevent backends from affecting its performance. 36 | engine-dispatcher { 37 | type = Dispatcher 38 | executor = "fork-join-executor" 39 | } 40 | 41 | # A dispatcher used by supported backend actors 42 | backend-dispatcher { 43 | type = Dispatcher 44 | executor = "fork-join-executor" 45 | } 46 | 47 | # A dispatcher used for the service registry 48 | service-dispatcher { 49 | type = Dispatcher 50 | executor = "fork-join-executor" 51 | } 52 | # Note that without further configuration, all other actors run on the default dispatcher 53 | } 54 | } 55 | 56 | system { 57 | # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting 58 | abort-jobs-on-terminate = false 59 | 60 | # Max number of retries per job that the engine will attempt in case of a retryable failure received from the backend 61 | max-retries = 10 62 | 63 | # If 'true' then when Cromwell starts up, it tries to restart incomplete workflows 64 | workflow-restart = true 65 | 66 | # Cromwell will cap the number of running workflows at N 67 | max-concurrent-workflows = 5000 68 | 69 | # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist 70 | max-workflow-launch-count = 50 71 | 72 | # Number of seconds between workflow launches 73 | new-workflow-poll-rate = 20 74 | 75 | # Since the WorkflowLogCopyRouter is initialized in code, this is the number of workers 76 | number-of-workflow-log-copy-workers = 10 77 | 78 | # Default number of cache read workers 79 | number-of-cache-read-workers = 25 80 | 81 | io { 82 | # Global Throttling - This is mostly useful for GCS and can be adjusted to match 83 | # the quota availble on the GCS API 84 | number-of-requests = 100000 85 | per = 100 seconds 86 | 87 | # Number of times an I/O operation should be attempted before giving up and failing it. 
88 | number-of-attempts = 5 89 | } 90 | } 91 | 92 | workflow-options { 93 | # These workflow options will be encrypted when stored in the database 94 | encrypted-fields: [] 95 | 96 | # AES-256 key to use to encrypt the values in `encrypted-fields` 97 | base64-encryption-key: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" 98 | 99 | # Directory where to write per workflow logs 100 | workflow-log-dir: "cromwell-workflow-logs" 101 | 102 | # When true, per workflow logs will be deleted after copying 103 | workflow-log-temporary: true 104 | 105 | # Workflow-failure-mode determines what happens to other calls when a call fails. Can be either ContinueWhilePossible or NoNewCalls. 106 | # Can also be overridden in workflow options. Defaults to NoNewCalls. Uncomment to change: 107 | #workflow-failure-mode: "ContinueWhilePossible" 108 | } 109 | 110 | // Optional call-caching configuration. 111 | call-caching { 112 | enabled = true 113 | invalidate-bad-cache-results = true 114 | } 115 | 116 | engine { 117 | # This instructs the engine which filesystems are at its disposal to perform any IO operation that it might need. 118 | # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here. 119 | # If you intend to be able to run workflows with this kind of declarations: 120 | # workflow { 121 | # String str = read_string("gs://bucket/my-file.txt") 122 | # } 123 | # You will need to provide the engine with a gcs filesystem 124 | # Note that the default filesystem (local) is always available. 125 | filesystems { 126 | # gcs { 127 | # auth = "application-default" 128 | # } 129 | local { 130 | caching { 131 | # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below: 132 | duplication-strategy: [ 133 | "soft-link" 134 | ] 135 | 136 | # Possible values: file, path 137 | # "file" will compute an md5 hash of the file content. 138 | # "path" will compute an md5 hash of the file path. This strategy will only be effective if the duplication-strategy (above) is set to "soft-link", 139 | # in order to allow for the original file path to be hashed. 140 | # Default: file 141 | hashing-strategy: "path" 142 | 143 | # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. 144 | # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. 145 | # Default: false 146 | check-sibling-md5: false 147 | } 148 | } 149 | } 150 | } 151 | 152 | backend { 153 | default = "LSF" 154 | providers { 155 | Local { 156 | actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" 157 | config { 158 | run-in-background = true 159 | runtime-attributes = "String? docker" 160 | submit = "/bin/bash ${script}" 161 | submit-docker = "docker run --rm -v ${cwd}:${docker_cwd} -i ${docker} /bin/bash < ${script}" 162 | 163 | # Root directory where Cromwell writes job results. This directory must be 164 | # visible and writeable by the Cromwell process as well as the jobs that Cromwell 165 | # launches. 166 | root = "%%SHARED_FS_DIRECTORY%%/cromwell-executions" // Change this to your directory that contains this application.conf file. 
167 | 168 | filesystems { 169 | local { 170 | localization: [ 171 | "hard-link", "soft-link", "copy" 172 | ] 173 | } 174 | } 175 | default-runtime-attributes { 176 | failOnStderr: false 177 | continueOnReturnCode: 0 178 | } 179 | } 180 | } 181 | 182 | 183 | LSF { 184 | actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" 185 | config { 186 | runtime-attributes = """ 187 | Int cpu = 1 188 | Int? memory_mb = 4000 189 | String? queue = 'research-hpc' 190 | String? project 191 | String? docker 192 | String? resource 193 | String? job_group 194 | String? priority 195 | """ 196 | 197 | submit = """ 198 | bsub \ 199 | -J ${job_name} \ 200 | -cwd ${cwd} \ 201 | -o ${out} \ 202 | -e ${err} \ 203 | ${"-P " + project} \ 204 | ${"-q " + queue} \ 205 | -M ${memory_mb}000 \ 206 | -R 'select[mem>${memory_mb}] rusage[mem=${memory_mb}] span[hosts=1]' \ 207 | ${"-n " + cpu} \ 208 | ${"-R \"" + resource + "\""} \ 209 | ${"-g \"" + job_group + "\""} \ 210 | ${"-sp " + priority} \ 211 | /bin/bash ${script} 212 | """ 213 | 214 | submit-docker = """ 215 | LSF_DOCKER_PRESERVE_ENVIRONMENT='false' \ 216 | LSF_DOCKER_VOLUMES='${cwd}:${docker_cwd}' \ 217 | bsub \ 218 | -J ${job_name} \ 219 | -cwd ${cwd} \ 220 | -a 'docker(${docker})' \ 221 | ${"-P " + project} \ 222 | ${"-q " + queue} \ 223 | -M ${memory_mb}000 \ 224 | -R 'select[mem>${memory_mb}] rusage[mem=${memory_mb}] span[hosts=1]' \ 225 | ${"-n " + cpu} \ 226 | ${"-R \"" + resource + "\""} \ 227 | ${"-g \"" + job_group + "\""} \ 228 | ${"-sp " + priority} \ 229 | /bin/bash -c '/bin/bash ${script} >${out} 2>${err}' 230 | """ 231 | 232 | kill = "bkill ${job_id}" 233 | check-alive = "bjobs -noheader -o \"stat\" ${job_id} | /bin/grep 'PEND\\|RUN'" 234 | job-id-regex = "Job <(\\d+)>.*" 235 | root = "%%SHARED_FS_DIRECTORY%%/cromwell-executions" 236 | filesystems { 237 | localization: [ 238 | "soft-link" 239 | ] 240 | hashing-strategy: "path" 241 | } 242 | default-runtime-attributes { 243 | failOnStderr: false 244 | continueOnReturnCode: 0 245 | } 246 | } 247 | } 248 | } 249 | } 250 | 251 | services { 252 | KeyValue { 253 | class = "cromwell.services.keyvalue.impl.SqlKeyValueServiceActor" 254 | } 255 | MetadataService { 256 | class = "cromwell.services.metadata.impl.MetadataServiceActor" 257 | config { 258 | # Set this value to "Inf" to turn off metadata summary refresh. The default value is currently "2 seconds". 259 | # metadata-summary-refresh-interval = "Inf" 260 | # For higher scale environments, e.g. many workflows and/or jobs, DB write performance for metadata events 261 | # can improved by writing to the database in batches. Increasing this value can dramatically improve overall 262 | # performance but will both lead to a higher memory usage as well as increase the risk that metadata events 263 | # might not have been persisted in the event of a Cromwell crash. 264 | # 265 | # For normal usage the default value of 1 (effectively no batching) should be fine but for larger/production 266 | # environments we recommend a value of at least 500. There'll be no one size fits all number here so we recommend 267 | # benchmarking performance and tuning the value to match your environment 268 | # db-batch-size = 1 269 | # 270 | # Periodically the stored metadata events will be forcibly written to the DB regardless of if the batch size 271 | # has been reached. This is to prevent situations where events wind up never being written to an incomplete batch 272 | # with no new events being generated. 
The default value is currently 5 seconds 273 | # db-flush-rate = 5 seconds 274 | } 275 | } 276 | } 277 | 278 | database { 279 | # mysql example 280 | profile = "slick.jdbc.MySQLProfile$" 281 | db { 282 | driver = "com.mysql.jdbc.Driver" 283 | url = "jdbc:mysql://localhost:3306/cromwell?socket=/tmp/mysqld.sock" 284 | user = "cromwell" 285 | password = "test4cromwell" 286 | connectionTimeout = 10000 287 | } 288 | 289 | migration { 290 | # For databases with a very large number of symbols, selecting all the rows at once can generate a variety of 291 | # problems. In order to avoid any issue, the selection is paginated. This value sets how many rows should be 292 | # retrieved and processed at a time, before asking for the next chunk. 293 | read-batch-size = 100000 294 | 295 | # Because a symbol row can contain any arbitrary wdl value, the amount of metadata rows to insert from a single 296 | # symbol row can vary from 1 to several thousands (or more). To keep the size of the insert batch from growing out 297 | # of control we monitor its size and execute/commit when it reaches or exceeds writeBatchSize. 298 | write-batch-size = 100000 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /scripts/SV_Tasks.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | # get the sample (SM) field from a CRAM file 3 | task Split_By_Type { 4 | input { 5 | File input_vcf 6 | String output_vcf_prefix 7 | Int preemptible_tries 8 | } 9 | command <<< 10 | set -eo pipefail 11 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=BND --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.bnd.vcf.gz 12 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=DEL --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.del.vcf.gz 13 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk -v svtype=INS --header '{if(I$SVTYPE==svtype) print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.ins.vcf.gz 14 | zcat ~{input_vcf} | grep -v "random " | grep -v "alt " | grep -v "decoy " | grep -v "EBV " | grep -v "^chrUn" | grep -v "^HLA"| /opt/hall-lab/vawk/vawk --header '{if(I$SVTYPE!="DEL" && I$SVTYPE!="BND" && I$SVTYPE!="INS") print $0;}' | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.other.vcf.gz 15 | zcat ~{output_vcf_prefix}.ins.vcf.gz | \ 16 | /opt/hall-lab/vawk/vawk '{ct=split(I$SNAME, spl, ","); for(ii=1; ii<=ct; ii++) print $3, spl[ii], $9}' | \ 17 | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_prefix}.ins_split.txt.gz 18 | >>> 19 | runtime { 20 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 21 | cpu: "1" 22 | memory: "1 GB" 23 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 24 | preemptible: preemptible_tries 25 | } 26 | output { 27 | File bnd_vcf = "${output_vcf_prefix}.bnd.vcf.gz" 28 | File del_vcf = "${output_vcf_prefix}.del.vcf.gz" 29 | File ins_vcf = "${output_vcf_prefix}.ins.vcf.gz" 30 | File other_vcf = "${output_vcf_prefix}.other.vcf.gz" 31 | File ins_split = 
"${output_vcf_prefix}.ins_split.txt.gz" 32 | } 33 | } 34 | 35 | task Get_Sample_Name { 36 | input { 37 | File input_cram 38 | Int preemptible_tries 39 | } 40 | 41 | command { 42 | set -eo pipefail 43 | samtools view -H ${input_cram} \ 44 | | grep -m 1 '^@RG' | tr '\t' '\n' \ 45 | | grep '^SM:' | sed 's/^SM://g' 46 | } 47 | 48 | runtime { 49 | docker: "halllab/extract-sv-reads@sha256:192090f72afaeaaafa104d50890b2fc23935c8dc98988a9b5c80ddf4ec50f70c" 50 | cpu: "1" 51 | memory: "1 GB" 52 | disks: "local-disk " + ceil( size(input_cram, "GB") + 2.0) + " HDD" 53 | preemptible: preemptible_tries 54 | } 55 | 56 | output { 57 | String sample = read_string(stdout()) 58 | } 59 | } 60 | 61 | # infer the sex of a sample based on chrom X copy number 62 | task Get_Sex { 63 | input { 64 | File input_cn_hist_root 65 | File ref_fasta_index 66 | Int preemptible_tries 67 | } 68 | 69 | command <<< 70 | set -eo pipefail 71 | cat ~{ref_fasta_index} \ 72 | | awk '$1=="chrX" { print $1":0-"$2 } END { print "exit"}' \ 73 | | cnvnator -root ~{input_cn_hist_root} -genotype 100 \ 74 | | grep -v "^Assuming male" \ 75 | | awk '{ printf("%.0f\n",$4); }' 76 | >>> 77 | 78 | runtime { 79 | docker: "halllab/cnvnator@sha256:8bf4fa64a288c5647a9a6b1ea90d14e76f48a3e16c5bf98c63419bb7d81c8938" 80 | cpu: "1" 81 | memory: "1 GB" 82 | disks: "local-disk 4 HDD" 83 | preemptible: preemptible_tries 84 | } 85 | 86 | output { 87 | String sex = read_string(stdout()) 88 | } 89 | } 90 | 91 | # Create pedigree file from samples, with sex inferred from 92 | # CNVnator X chrom copy number 93 | task Make_Pedigree_File { 94 | input { 95 | Array[String] sample_array 96 | Array[String] sex_array 97 | String output_ped_basename 98 | File sample_file = write_lines(sample_array) 99 | File sex_file = write_lines(sex_array) 100 | } 101 | 102 | command <<< 103 | set -eo pipefail 104 | paste ~{sample_file} ~{sex_file} \ 105 | | awk '{ print $1,$1,-9,-9,$2,-9 }' OFS='\t' \ 106 | > ~{output_ped_basename}.ped 107 | >>> 108 | 109 | runtime { 110 | docker: "ubuntu@sha256:edf05697d8ea17028a69726b4b450ad48da8b29884cd640fec950c904bfb50ce" 111 | cpu: "1" 112 | memory: "1 GB" 113 | disks: "local-disk 4 HDD" 114 | } 115 | 116 | output { 117 | File output_ped = "${output_ped_basename}.ped" 118 | } 119 | } 120 | 121 | # index a CRAM 122 | task Index_Cram { 123 | input { 124 | File input_cram 125 | String basename 126 | File ref_cache 127 | Int preemptible_tries 128 | } 129 | 130 | command { 131 | set -eo pipefail 132 | ln -s ${input_cram} ${basename}.cram 133 | 134 | # build the reference sequence cache 135 | tar -zxf ${ref_cache} 136 | export REF_PATH=./cache/%2s/%2s/%s 137 | export REF_CACHE=./cache/%2s/%2s/%s 138 | 139 | # index the CRAM 140 | samtools index ${basename}.cram 141 | } 142 | 143 | runtime { 144 | docker: "halllab/samtools@sha256:5e6b0430a7ad25f68e5c46a9fa9c0ebba0f9af8ebf5aebe94242954d812a4e68" 145 | cpu: "1" 146 | memory: "1 GB" 147 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(ref_cache, "GB") * 5 + 1.0) + " HDD" 148 | preemptible: preemptible_tries 149 | } 150 | 151 | output { 152 | File output_cram_index = "${basename}.cram.crai" 153 | } 154 | } 155 | 156 | task Filter_Index { 157 | input { 158 | File input_vcf_gz 159 | String output_vcf_name 160 | Int preemptible_tries 161 | } 162 | 163 | command <<< 164 | set -eo pipefail 165 | FILTERLINE='##FILTER=' 166 | zcat ~{input_vcf_gz} | \ 167 | /opt/hall-lab/vawk/vawk '{ \ 168 | split(I$STRANDS,x,","); \ 169 | split(x[1],y,":"); \ 170 | split(x[2],z,":"); \ 171 | if (I$SVTYPE=="INS" && 
I$NSAMP>0) { \ 172 | I$MSQ=QUAL/I$NSAMP; \ 173 | gsub("MSQ=0.00", "MSQ="I$MSQ, $8) \ 174 | } \ 175 | if ((I$SVTYPE=="DEL" || I$SVTYPE=="DUP" || I$SVTYPE=="MEI") && \ 176 | I$MSQ>=100 && sqrt((I$SVLEN)*(I$SVLEN))>=50){ \ 177 | $7="PASS"; print $0; \ 178 | } else if ( I$SVTYPE=="INV" && $6>=100 && (I$SR/I$SU)>=0.1 && (I$PE/I$SU)>=0.1 && (y[2]/I$SU)>0.1 && (z[2]/I$SU)>0.1 && sqrt((I$SVLEN)*(I$SVLEN))>=50){ \ 179 | $7="PASS"; print $0; \ 180 | } else if ( I$SVTYPE=="BND" && $9 !~ /CN/ && I$MSQ>=500){ \ 181 | $7="PASS"; print $0; \ 182 | } else if ( I$SVTYPE=="BND" && $9 ~ /CN/ && I$MSQ>=250){ \ 183 | $7="PASS"; print $0; \ 184 | } else if ( I$SVTYPE=="INS" && I$MSQ>=100 && I$SVLEN >=50) { \ 185 | $7="PASS"; print $0; \ 186 | } else { \ 187 | $7="LOW"; print $0; \ 188 | } \ 189 | }' | cat <(zcat ~{input_vcf_gz} | sed -n '/^#[^#]/q;p') <(echo $FILTERLINE) <(zgrep -m 1 '^#CHROM' ~{input_vcf_gz}) - | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{output_vcf_name} 190 | /opt/hall-lab/htslib-1.9/bin/tabix -p vcf -f ~{output_vcf_name} 191 | >>> 192 | 193 | runtime { 194 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 195 | cpu: "1" 196 | memory: "1 GB" 197 | disks: "local-disk " + ceil( size(input_vcf_gz, "GB") * 2) + " HDD" 198 | preemptible: preemptible_tries 199 | } 200 | 201 | output { 202 | File output_vcf_gz = "${output_vcf_name}" 203 | File output_vcf_gz_index = "${output_vcf_name}.tbi" 204 | } 205 | 206 | } 207 | 208 | task Count_Lumpy { 209 | input { 210 | String basename 211 | File input_vcf 212 | Int preemptible_tries 213 | String cohort 214 | String center 215 | } 216 | 217 | command <<< 218 | set -eo pipefail 219 | 220 | bcftools query -f "[%CHROM\t~{cohort}\t~{center}\t%FILTER\t%INFO/SVTYPE\t%INFO/SVLEN\t%INFO/SR\t%SAMPLE\t%GT\n]" ~{input_vcf} \ 221 | | awk 'BEGIN{OFS="\t"}{if($1~/chr[1-9]+/ && $1!~/_/) { 222 | svlen=$6; 223 | if($6<0 && $6!=".") svlen=-1*$6; 224 | len_bin=">=1kb" 225 | if(svlen<1000) len_bin="<1kb"; 226 | if($7>0) $7="SR>=1"; 227 | else $7="SR=0"; 228 | print $1, $2, $3, $4, $5, len_bin, $7, $8, $9;}}' \ 229 | | sort -k1,9 \ 230 | | uniq -c \ 231 | | awk 'BEGIN{OFS="\t"}{print $2, $3, $4, $5, $6, $7, $8, $9, $1}' > ~{basename}.lumpy.counts.1.txt 232 | >>> 233 | 234 | runtime { 235 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 236 | cpu: "1" 237 | memory: "1 GB" 238 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 239 | preemptible: preemptible_tries 240 | } 241 | 242 | output { 243 | File output_counts = "${basename}.lumpy.counts.1.txt" 244 | } 245 | } 246 | 247 | task Count_Manta { 248 | input { 249 | String basename 250 | File input_vcf 251 | Int preemptible_tries 252 | String cohort 253 | String center 254 | } 255 | 256 | command <<< 257 | set -eo pipefail 258 | 259 | bcftools query -f "[%CHROM\t~{cohort}\t~{center}\t%FILTER\t%INFO/SVTYPE\t%SAMPLE\t%GT\n]" ~{input_vcf} \ 260 | | awk 'BEGIN{OFS="\t"}{if($1~/chr[1-9]+/ && $1!~/_/ && $4=="PASS") print $0;}' \ 261 | | sort -k1,7 \ 262 | | uniq -c \ 263 | | awk 'BEGIN{OGS="\t"}{print $2, $3, $4, $5, $6, $7, $8, $1}' > ~{basename}.manta.counts.1.txt 264 | >>> 265 | 266 | runtime { 267 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 268 | cpu: "1" 269 | memory: "1 GB" 270 | disks: "local-disk " + ceil( size(input_vcf, "GB") * 2) + " HDD" 271 | preemptible: preemptible_tries 272 | } 273 | 274 | output { 275 | File output_counts = 
"${basename}.manta.counts.1.txt" 276 | } 277 | } 278 | 279 | task Manta { 280 | input { 281 | File input_cram 282 | File input_cram_index 283 | File ref_fasta 284 | File ref_fasta_index 285 | File ref_cache 286 | File? call_regions_bed 287 | File? call_regions_bed_index 288 | String basename 289 | Int preemptible_tries 290 | } 291 | 292 | # Manta requires 2GB per thread for scheduling, but in typical cases uses less than this 293 | # see https://github.com/Illumina/manta/issues/38 294 | # Setting below derives CPU count from machine 295 | # Sets RAM to unlimited to jobs are scheduled only 296 | # with respect to cores 297 | # If a task starts to fail then we can adjust the machine resources to get it 298 | # to succeed without adjusting the command 299 | # Note that we are converting to BAM on the fly as CRAM is showing extreme memory usage in some situations. See https://github.com/Illumina/manta/issues/154. 300 | # Note also that we are specifying an inflation factor of 4, but padding with 20GB of data. This is aimed to get us over 100GB of SSD for better performance on small samples. 301 | 302 | command { 303 | set -eo pipefail 304 | ln -s ${input_cram} ${basename}.cram 305 | ln -s ${input_cram_index} ${basename}.cram.crai 306 | 307 | tar -zxf ${ref_cache} 308 | export REF_PATH=./cache/%2s/%2s/%s 309 | export REF_CACHE=./cache/%2s/%2s/%s 310 | 311 | ${"touch " + call_regions_bed_index} 312 | 313 | samtools view -hb -@8 ${basename}.cram -o ${basename}.bam 314 | samtools index -@8 ${basename}.bam 315 | 316 | configManta.py \ 317 | --referenceFasta=${ref_fasta} \ 318 | --runDir=MantaWorkflow \ 319 | --bam=${basename}.bam ${"--callRegions=" + call_regions_bed} 320 | MantaWorkflow/runWorkflow.py -m local -g "unlimited" 321 | mv MantaWorkflow/results/variants/diploidSV.vcf.gz ${basename}.vcf.gz 322 | mv MantaWorkflow/results/variants/diploidSV.vcf.gz.tbi ${basename}.vcf.gz.tbi 323 | zcat ${basename}.vcf.gz | /opt/hall-lab/python-2.7.15/bin/python /opt/hall-lab/doctor_manta.1.py -m 700 | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ${basename}.doctored.vcf.gz 324 | /opt/hall-lab/htslib-1.9/bin/tabix -p vcf ${basename}.doctored.vcf.gz 325 | tar -czvf ${basename}.MantaWorkflow.tgz MantaWorkflow 326 | } 327 | runtime { 328 | docker: "halllab/manta_samtools@sha256:d39fac59a2c06f808d115c65b9c191baf5f249769d317263ae3cd19e2c74d20e" 329 | cpu: "8" 330 | memory: "16 GiB" 331 | disks: "local-disk " + ceil( size(input_cram, "GB") * 4 + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_cache, "GB") * 5 + 20.0) + " SSD" 332 | preemptible: preemptible_tries 333 | } 334 | output { 335 | File output_vcf = "${basename}.doctored.vcf.gz" 336 | File output_tbi = "${basename}.doctored.vcf.gz.tbi" 337 | File original_vcf = "${basename}.vcf.gz" 338 | File original_tbi = "${basename}.vcf.gz.tbi" 339 | File workflow_tgz = "${basename}.MantaWorkflow.tgz" 340 | } 341 | } 342 | 343 | # Smoove wrapper 344 | task Smoove { 345 | input { 346 | String basename 347 | File input_cram 348 | File input_cram_index 349 | 350 | File ref_fasta 351 | File ref_fasta_index 352 | File ref_cache 353 | File exclude_regions 354 | 355 | Int preemptible_tries 356 | } 357 | 358 | command { 359 | set -eo pipefail 360 | ln -s ${input_cram} ${basename}.cram 361 | ln -s ${input_cram_index} ${basename}.cram.crai 362 | 363 | tar -zxf ${ref_cache} 364 | export REF_PATH=./cache/%2s/%2s/%s 365 | export REF_CACHE=./cache/%2s/%2s/%s 366 | 367 | export SMOOVE_NO_MAX_CI=TRUE 368 | 369 | smoove call \ 370 | --name 
${basename} \ 371 | --exclude ${exclude_regions} \ 372 | --fasta ${ref_fasta} \ 373 | --noextrafilters \ 374 | --genotype \ 375 | ${basename}.cram 376 | 377 | if [ ! -e ${basename}.histo ]; then 378 | mv *.histo ${basename}.histo 379 | mv *.split.bam ${basename}.split.bam 380 | mv *.split.bam.bai ${basename}.split.bam.bai 381 | mv *.disc.bam ${basename}.disc.bam 382 | mv *.disc.bam.bai ${basename}.disc.bam.bai 383 | fi 384 | } 385 | 386 | runtime { 387 | docker: "brentp/smoove@sha256:c839ed223462a1c1ae26e7acc27f28f0f67b4581d80a06823895f295ad2bdaf4" 388 | cpu: "1" 389 | memory: "2.5 GiB" 390 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(exclude_regions, "GB") + size(input_cram, "GB") * 0.30 + size(ref_cache, "GB") * 5) + " HDD" 391 | preemptible: preemptible_tries 392 | } 393 | 394 | output { 395 | File output_vcf = "${basename}-smoove.genotyped.vcf.gz" 396 | File output_csi = "${basename}-smoove.genotyped.vcf.gz.csi" 397 | File output_histogram = "${basename}.histo" 398 | File lumpy_script = "${basename}-lumpy-cmd.sh" 399 | File splitters = "${basename}.split.bam" 400 | File splitters_index = "${basename}.split.bam.bai" 401 | File discordants = "${basename}.disc.bam" 402 | File discordants_index = "${basename}.disc.bam.bai" 403 | } 404 | } 405 | 406 | task Genotype { 407 | input { 408 | String basename 409 | File input_cram 410 | File input_cram_index 411 | File input_vcf 412 | File ref_cache 413 | Int preemptible_tries 414 | } 415 | 416 | command { 417 | set -eo pipefail 418 | ln -s ${input_cram} ${basename}.cram 419 | ln -s ${input_cram_index} ${basename}.cram.crai 420 | 421 | # build the reference sequence cache 422 | tar -zxf ${ref_cache} 423 | export REF_PATH=./cache/%2s/%2s/%s 424 | export REF_CACHE=./cache/%2s/%2s/%s 425 | 426 | rm -f ${basename}.cram.json 427 | zcat ${input_vcf} \ 428 | | svtyper \ 429 | -B ${basename}.cram \ 430 | -l ${basename}.cram.json \ 431 | | bgzip -c > ${basename}.gt.vcf.gz 432 | } 433 | 434 | runtime { 435 | docker: "halllab/svtyper@sha256:8ebb0508bc63a2a32d22b4a3e55453222560daa30b7cc14a4f1189cb311d5922" 436 | cpu: "1" 437 | memory: "15 GB" 438 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_vcf, "GB") + size(ref_cache, "GB") * 5 + 20.0) + " HDD" 439 | preemptible: preemptible_tries 440 | } 441 | 442 | output { 443 | File output_vcf = "${basename}.gt.vcf.gz" 444 | File output_lib = "${basename}.cram.json" 445 | } 446 | } 447 | 448 | task Take_Original_Genotypes { 449 | input { 450 | String sample_name 451 | String basename 452 | File input_vcf 453 | File input_variant_to_sname_mapping 454 | File original_per_sample_vcf 455 | Int preemptible_tries 456 | } 457 | 458 | command <<< 459 | set -eo pipefail 460 | zcat ~{input_variant_to_sname_mapping} \ 461 | | /opt/hall-lab/vawk/vawk -v sname="~{sample_name}" 'BEGIN{OFS="\t"}{ \ 462 | split($2, spl, ":"); \ 463 | if(spl[1]==sname) { \ 464 | print $1, spl[1], spl[2]":"spl[3]":"spl[4]":"spl[5]":"spl[6]":"spl[7]":"spl[8]; \ 465 | } \ 466 | }' \ 467 | | /opt/hall-lab/io/zjoin -a stdin -b <(paste -d ":" <(zcat ~{original_per_sample_vcf} | grep -v "^#" | cut -f 3,9-) <(zcat ~{original_per_sample_vcf} | grep -v "^#" | cut -f 4,5 | tr "\t" ":") <(zcat ~{original_per_sample_vcf} | /opt/hall-lab/vawk/vawk '{svlen=I$SVLEN; if(svlen==""){svlen="."} print svlen}') | sed 's/:SR/:SR:OREF:OALT:OSVLEN/') -1 3 -2 1 \ 468 | | cut -f 1,5- \ 469 | | awk -v sname="~{sample_name}" 'BEGIN{OFS="\t"; print "ID", 
"FORMAT", sname;}{ \ 470 | print $0; \ 471 | }' \ 472 | | /opt/hall-lab/htslib-1.9/bin/bgzip -c > temp 473 | 474 | zcat ~{input_vcf} \ 475 | | /opt/hall-lab/io/zjoin -r -p "##" -a stdin -b <(zcat temp | sort -k1,1 | /opt/hall-lab/bin/bedtools groupby -g 1 -c 2,3 -o first,first ) -1 3 -2 1 \ 476 | | cut -f -8,10- \ 477 | | /opt/hall-lab/vawk/vawk --header 'BEGIN{OFS="\t"}{if($9=="NA") {$9="GT:FT:GQ:PL:PR:SR:OREF:OALT:OSVLEN"; $10="0/0:.:.:.:.:.:.:.:.";} print $0;}' \ 478 | | sed 's/^#CHROM/##FORMAT=\n##FORMAT=\n##FORMAT=\n#CHROM/' \ 479 | | /opt/hall-lab/htslib-1.9/bin/bgzip -c > ~{basename}.gt.vcf.gz 480 | >>> 481 | 482 | runtime { 483 | docker: "halllab/vcf_bed_utils@sha256:09c18a5827d67891792ffc110627c7fa05b2262df4b91d6967ad6e544f41e8ec" 484 | cpu: "1" 485 | memory: "15 GB" 486 | disks: "local-disk " + ceil( size(original_per_sample_vcf, "GB") + size(input_vcf, "GB") + size(input_variant_to_sname_mapping, "GB") + 20.0) + " HDD" 487 | preemptible: preemptible_tries 488 | } 489 | 490 | output { 491 | File output_vcf = "${basename}.gt.vcf.gz" 492 | } 493 | } 494 | 495 | task Copy_Number { 496 | input { 497 | String basename 498 | String sample 499 | File input_vcf 500 | File input_cn_hist_root 501 | File ref_cache 502 | Int preemptible_tries 503 | } 504 | 505 | command { 506 | set -eo pipefail 507 | zcat ${input_vcf} \ 508 | | create_coordinates \ 509 | -o coordinates.txt 510 | 511 | svtools copynumber \ 512 | -i ${input_vcf} \ 513 | -s ${sample} \ 514 | --cnvnator cnvnator \ 515 | -w 100 \ 516 | -r ${input_cn_hist_root} \ 517 | -c coordinates.txt \ 518 | | bgzip -c \ 519 | > ${basename}.cn.vcf.gz 520 | } 521 | 522 | runtime { 523 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 524 | cpu: "1" 525 | memory: "4 GB" 526 | disks: "local-disk " + 35 + " HDD" 527 | preemptible: preemptible_tries 528 | } 529 | 530 | output { 531 | File output_vcf = "${basename}.cn.vcf.gz" 532 | } 533 | } 534 | 535 | task CNVnator_Histogram { 536 | input { 537 | String basename 538 | File input_cram 539 | File input_cram_index 540 | File ref_fasta 541 | File ref_fasta_index 542 | File ref_cache 543 | String ref_chrom_dir = "cnvnator_chroms" 544 | Int preemptible_tries 545 | Int threads = 4 546 | # Add 7G of pad of the chromosome directory and ~2-3 GB of output files 547 | } 548 | 549 | command <<< 550 | set -eo pipefail 551 | ln -s ~{input_cram} ~{basename}.cram 552 | ln -s ~{input_cram_index} ~{basename}.cram.crai 553 | 554 | # build the reference sequence cache 555 | tar -zxf ~{ref_cache} 556 | export REF_PATH=./cache/%2s/%2s/%s 557 | export REF_CACHE=./cache/%2s/%2s/%s 558 | 559 | # Create directory of chromosome FASTA files for CNVnator 560 | mkdir -p ~{ref_chrom_dir} 561 | awk -v CHROM_DIR=~{ref_chrom_dir} 'BEGIN { CHROM="" } { if ($1~"^>") CHROM=substr($1,2); print $0 > CHROM_DIR"/"CHROM".fa" }' ~{ref_fasta} 562 | 563 | cnvnator_wrapper.py \ 564 | -T cnvnator.out \ 565 | -o ~{basename}.cn \ 566 | -t ~{threads} \ 567 | -w 100 \ 568 | -b ~{basename}.cram \ 569 | -c ~{ref_chrom_dir} \ 570 | -g GRCh38 \ 571 | --cnvnator cnvnator 572 | >>> 573 | 574 | runtime { 575 | docker: "halllab/cnvnator@sha256:8bf4fa64a288c5647a9a6b1ea90d14e76f48a3e16c5bf98c63419bb7d81c8938" 576 | cpu: threads 577 | memory: "16 GB" 578 | disks: "local-disk " + ceil( size(input_cram, "GB") + size(input_cram_index, "GB") + size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_cache, "GB") * 5 + 7.0 ) + " HDD" 579 | preemptible: preemptible_tries 580 | } 581 | 582 | output { 583 | 
File output_cn_hist_root = "cnvnator.out/${basename}.cram.hist.root" 584 | File output_cn_txt = "${basename}.cn.txt" 585 | File output_cn_bed = "${basename}.cn.bed" 586 | } 587 | } 588 | 589 | task L_Sort_VCF_Variants { 590 | input { 591 | Array[File] input_vcfs 592 | File input_vcfs_file = write_lines(input_vcfs) 593 | String output_vcf_basename 594 | Int preemptible_tries 595 | } 596 | 597 | parameter_meta { 598 | input_vcfs: { 599 | description: "vcf files to sort together", 600 | localization_optional: true 601 | } 602 | } 603 | 604 | command { 605 | set -eo pipefail 606 | # strip the "gs://" prefix from the file paths 607 | cat ${input_vcfs_file} \ 608 | | sed 's/^gs:\/\//\.\//g' \ 609 | > ${input_vcfs_file}.local_map.txt 610 | sleep 1 611 | 612 | svtools lsort \ 613 | -b 200 \ 614 | -f ${input_vcfs_file} \ 615 | -t /cromwell_root/bulk_download \ 616 | | bgzip -c \ 617 | > ${output_vcf_basename}.vcf.gz 618 | } 619 | 620 | runtime { 621 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 622 | cpu: "1" 623 | memory: "3.75 GB" 624 | disks: "local-disk " + 2*ceil(size(input_vcfs, "GB")) +10 + " HDD" 625 | bootDiskSizeGb: 30 626 | preemptible: preemptible_tries 627 | } 628 | 629 | output { 630 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 631 | } 632 | } 633 | 634 | task L_Merge_VCF_Variants { 635 | input { 636 | File input_vcf_gz 637 | String output_vcf_basename 638 | Int preemptible_tries 639 | } 640 | 641 | command { 642 | set -eo pipefail 643 | zcat ${input_vcf_gz} \ 644 | | svtools lmerge \ 645 | -i /dev/stdin \ 646 | -f 20 \ 647 | | bgzip -c \ 648 | > ${output_vcf_basename}.vcf.gz 649 | } 650 | 651 | runtime { 652 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 653 | cpu: "1" 654 | memory: "3.75 GB" 655 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 656 | preemptible: preemptible_tries 657 | } 658 | 659 | output { 660 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 661 | } 662 | } 663 | 664 | task L_Merge_VCF_Variants_weighted { 665 | input { 666 | File input_vcf_gz 667 | String output_vcf_basename 668 | Int preemptible_tries 669 | } 670 | 671 | command { 672 | set -eo pipefail 673 | zcat ${input_vcf_gz} \ 674 | | svtools lmerge \ 675 | -i /dev/stdin \ 676 | -f 20 \ 677 | -w carrier_wt \ 678 | | bgzip -c \ 679 | > ${output_vcf_basename}.vcf.gz 680 | } 681 | 682 | runtime { 683 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 684 | cpu: "1" 685 | memory: "3.75 GB" 686 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 687 | preemptible: preemptible_tries 688 | } 689 | 690 | output { 691 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 692 | } 693 | } 694 | 695 | task Filter_Del { 696 | input { 697 | File input_vcf_gz 698 | String output_vcf_basename 699 | Int preemptible_tries 700 | } 701 | 702 | command <<< 703 | set -eo pipefail 704 | 705 | bcftools view -i '(SVTYPE!="DEL" || SVLEN>1000 || SVLEN<-1000 || INFO/SR>0)' ~{input_vcf_gz} | bgzip -c > ~{output_vcf_basename}.vcf.gz 706 | >>> 707 | 708 | runtime { 709 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 710 | cpu: "1" 711 | memory: "3.75 GB" 712 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 713 | preemptible: preemptible_tries 714 | } 715 | 716 | output { 717 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 718 | } 719 | } 720 | 721 | 
task Filter_Pass { 722 | input { 723 | File input_vcf_gz 724 | String output_vcf_basename 725 | Int preemptible_tries 726 | } 727 | 728 | command <<< 729 | set -eo pipefail 730 | 731 | bcftools view -f .,PASS ~{input_vcf_gz} | bgzip -c > ~{output_vcf_basename}.vcf.gz 732 | >>> 733 | 734 | runtime { 735 | docker: "halllab/bcftools@sha256:955cbf93e35e5ee6fdb60e34bb404b7433f816e03a202dfed9ceda542e0d8906" 736 | cpu: "1" 737 | memory: "3.75 GB" 738 | disks: "local-disk " + 2*ceil(size(input_vcf_gz, "GB"))+10 + " HDD" 739 | preemptible: preemptible_tries 740 | } 741 | 742 | output { 743 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 744 | } 745 | } 746 | 747 | task Paste_VCF { 748 | input { 749 | Array[File] input_vcfs 750 | File input_vcfs_file = write_lines(input_vcfs) 751 | String output_vcf_basename 752 | Int preemptible_tries 753 | } 754 | parameter_meta { 755 | input_vcfs: { 756 | description: "vcf files to paste together", 757 | localization_optional: true 758 | } 759 | } 760 | 761 | command { 762 | set -eo pipefail 763 | svtools vcfpaste \ 764 | -f ${input_vcfs_file} \ 765 | -q \ 766 | -t /cromwell_root/bulk_download \ 767 | | bgzip -c \ 768 | > ${output_vcf_basename}.vcf.gz 769 | } 770 | 771 | runtime { 772 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 773 | cpu: "1" 774 | memory: "12 GB" 775 | disks: "local-disk " + 2*ceil(size(input_vcfs, "GB")) + " HDD" 776 | preemptible: 0 777 | } 778 | 779 | output { 780 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 781 | } 782 | } 783 | 784 | task Remove_INS { 785 | input { 786 | File input_vcf_gz 787 | String output_vcf_basename 788 | Int preemptible_tries 789 | } 790 | 791 | command <<< 792 | set -eo pipefail 793 | zcat ~{input_vcf_gz} \ 794 | | awk '{if($5!="<INS>") print $0}' \ 795 | | bgzip -c \ 796 | > ~{output_vcf_basename}.vcf.gz 797 | >>> 798 | 799 | runtime { 800 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 801 | cpu: "1" 802 | memory: "3 GB" 803 | disks: "local-disk " + 2*ceil( size(input_vcf_gz, "GB")) + " HDD" 804 | preemptible: preemptible_tries 805 | } 806 | 807 | output { 808 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 809 | } 810 | } 811 | 812 | task Prune_VCF { 813 | input { 814 | File input_vcf_gz 815 | String output_vcf_basename 816 | Int preemptible_tries 817 | } 818 | 819 | command { 820 | set -eo pipefail 821 | zcat ${input_vcf_gz} \ 822 | | svtools afreq \ 823 | | svtools vcftobedpe \ 824 | | svtools bedpesort \ 825 | | svtools prune -s -d 100 -e 'AF' \ 826 | | svtools bedpetovcf \ 827 | | bgzip -c \ 828 | > ${output_vcf_basename}.vcf.gz 829 | } 830 | 831 | runtime { 832 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 833 | cpu: "1" 834 | memory: "3 GB" 835 | disks: "local-disk " + 3*ceil( size(input_vcf_gz, "GB")) + " HDD" 836 | preemptible: preemptible_tries 837 | } 838 | 839 | output { 840 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 841 | } 842 | } 843 | 844 | task Classify { 845 | input { 846 | File input_vcf_gz 847 | File input_ped 848 | String output_vcf_basename 849 | File mei_annotation_bed 850 | Int preemptible_tries 851 | } 852 | 853 | command { 854 | set -eo pipefail 855 | cat ${input_ped} \ 856 | | cut -f 2,5 \ 857 | > sex.txt 858 | 859 | zcat ${input_vcf_gz} \ 860 | | svtools classify \ 861 | -g sex.txt \ 862 | -a ${mei_annotation_bed} \ 863 | -m large_sample \ 864 | | bgzip -c \ 865 | > ${output_vcf_basename}.vcf.gz
866 | } 867 | 868 | runtime { 869 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 870 | cpu: "1" 871 | memory: "3 GB" 872 | disks: "local-disk " + 10*ceil( size(input_vcf_gz, "GB")) + " HDD" 873 | preemptible: preemptible_tries 874 | } 875 | 876 | output { 877 | File output_vcf_gz = "${output_vcf_basename}.vcf.gz" 878 | } 879 | } 880 | 881 | task Sort_Index_VCF { 882 | input { 883 | File input_vcf_gz 884 | String output_vcf_name 885 | Int preemptible_tries 886 | } 887 | 888 | command { 889 | set -eo pipefail 890 | zcat ${input_vcf_gz} \ 891 | | svtools vcfsort \ 892 | | bgzip -c \ 893 | > ${output_vcf_name} 894 | 895 | tabix -p vcf -f ${output_vcf_name} 896 | } 897 | 898 | runtime { 899 | docker: "halllab/svtools@sha256:38ac08a8685ff58329b72e2b9c366872086d41ef21da84278676e06ef7f1bfbb" 900 | cpu: "1" 901 | memory: "3 GB" 902 | disks: "local-disk " + 20*ceil( size(input_vcf_gz, "GB")) + " HDD" 903 | preemptible: preemptible_tries 904 | } 905 | 906 | output { 907 | File output_vcf_gz = "${output_vcf_name}" 908 | File output_vcf_gz_index = "${output_vcf_name}.tbi" 909 | } 910 | } 911 | 912 | --------------------------------------------------------------------------------
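
Notes on selected one-liners in scripts/SV_Tasks.wdl.

The PASS/LOW filter (the vawk one-liner around lines 172-188 above) is easier to follow when written out as a plain function. The sketch below is a reading aid only, not part of the pipeline: the function and argument names are illustrative stand-ins for the fields the one-liner reads (QUAL, INFO/NSAMP, SVTYPE, SVLEN, SU, PE, SR, and whether the FORMAT column contains CN), and the two extra strand-support ratios the inversion branch takes from the y[] and z[] arrays defined earlier in that script are collapsed into a single inv_strand_ratios_ok flag.

# Hedged Python sketch of the PASS/LOW thresholds in the vawk one-liner.
# All names are illustrative; the pipeline itself does this in vawk.
def classify_filter(svtype, qual, nsamp, svlen, su, pe, sr,
                    format_has_cn, inv_strand_ratios_ok):
    """Return the FILTER value ("PASS" or "LOW") the one-liner would assign."""
    msq = qual / nsamp if nsamp > 0 else 0.0  # mean sample quality (MSQ)
    abs_svlen = abs(svlen)                    # sqrt(SVLEN*SVLEN) in the original
    if svtype in ("DEL", "DUP", "MEI") and msq >= 100 and abs_svlen >= 50:
        return "PASS"
    if (svtype == "INV" and qual >= 100 and su > 0
            and sr / su >= 0.1 and pe / su >= 0.1
            and inv_strand_ratios_ok and abs_svlen >= 50):
        return "PASS"
    if svtype == "BND" and not format_has_cn and msq >= 500:
        return "PASS"
    if svtype == "BND" and format_has_cn and msq >= 250:
        return "PASS"
    if svtype == "INS" and msq >= 100 and svlen >= 50:
        return "PASS"
    return "LOW"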
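
The awk step inside the Count_Lumpy task bins each record before the sort | uniq -c tally. Below is a hedged Python sketch of that binning; the function and argument names are invented for illustration and correspond to the bcftools query columns (chromosome, cohort, center, FILTER, SVTYPE, SVLEN, SR, sample, genotype). Count_Manta performs the same kind of tally but keeps only PASS records and does not bin by length or split-read support.

import re

# Hedged sketch of the per-record binning in Count_Lumpy's awk step.
# Names are illustrative; the pipeline does this in awk on bcftools query output.
def lumpy_count_key(chrom, cohort, center, filt, svtype, svlen, sr, sample, gt):
    """Return the tuple Count_Lumpy tallies, or None if the record is skipped."""
    # Keep primary chromosomes only (chr followed by a digit, no alt/random contigs)
    if not re.search(r"chr[1-9]", chrom) or "_" in chrom:
        return None
    # Bin by absolute SVLEN: "<1kb" vs ">=1kb" (missing SVLEN falls into "<1kb")
    try:
        length = abs(float(svlen))
    except (TypeError, ValueError):
        length = 0.0
    len_bin = "<1kb" if length < 1000 else ">=1kb"
    # Bin by split-read support
    sr_bin = "SR>=1" if str(sr).isdigit() and int(sr) > 0 else "SR=0"
    return (chrom, cohort, center, filt, svtype, len_bin, sr_bin, sample, gt)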
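
CNVnator_Histogram builds the per-chromosome FASTA directory CNVnator needs with a single awk command. The Python sketch below mirrors that line for readers who find awk's output redirection hard to scan; it is illustrative only, and split_fasta_by_chrom is not a function used anywhere in the pipeline.

import os

# Hedged Python equivalent of the awk line in CNVnator_Histogram that writes
# one FASTA file per chromosome (named CHROM.fa) into ref_chrom_dir.
def split_fasta_by_chrom(ref_fasta, chrom_dir="cnvnator_chroms"):
    os.makedirs(chrom_dir, exist_ok=True)  # the WDL command runs mkdir -p first
    out = None
    with open(ref_fasta) as fasta:
        for line in fasta:
            if line.startswith(">"):
                chrom = line[1:].split()[0]  # substr($1, 2) in the awk version
                if out:
                    out.close()
                out = open(os.path.join(chrom_dir, chrom + ".fa"), "w")
            if out:  # the header line itself goes into the new file, as in awk
                out.write(line)
    if out:
        out.close()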