├── 20k_Tutorial_Docker ├── 20k_WF_ID.txt ├── 20k_submission_response.txt ├── step05_Single_Sample_20k_Workflow_Output.sh ├── step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh ├── step03_Cromwell_Run_20k_Single_Sample_Workflow.sh ├── step02_Download_Single_Sample_20k_Data.sh └── 16T_PairedSingleSampleWf_optimized.inputs.20k.json ├── images ├── Layer-Cake.png ├── SW-Arch-Diagram.png ├── Pipeline-Overview.png └── Directory-Quick-Reference.png ├── .gitignore ├── 20k_Tutorial ├── step05_Single_Sample_20k_Workflow_Output.sh ├── step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh ├── step03_Cromwell_Run_20k_Single_Sample_Workflow.sh ├── step02_Download_Single_Sample_20k_Data.sh └── 16T_PairedSingleSampleWf_optimized.inputs.20k.json ├── SECURITY.md ├── config_files ├── SLURM │ ├── slurmdbd.conf │ └── slurm.conf └── HTCondor │ ├── condor_config.app-node │ └── condor_config.comp-node ├── LICENSE ├── 20k_Throughput-run ├── configure ├── WDL │ ├── DNASeqStructs.wdl │ ├── BamToCram.wdl │ ├── DragenTasks.wdl │ ├── SplitLargeReadGroup.wdl │ ├── DragmapAlignment.wdl │ ├── AggregatedBamQC.wdl │ ├── Alignment.wdl │ ├── VariantCalling.wdl │ ├── Utilities.wdl │ ├── GermlineVariantDiscovery.wdl │ ├── UnmappedBamToAlignedBam.wdl │ ├── BamProcessing.wdl │ └── Qc.wdl ├── step01_Configure_20k_Throughput-run.sh ├── step04_Cromwell_Monitor_20k_Throughput-run.sh ├── step03_Cromwell_Run_20k_Throughput-run.sh ├── step05_Output_20k_Throughput-run.sh ├── README.md ├── WholeGenomeGermlineSingleSample_20k.json ├── step02_Download_20k_Data_Throughput-run.sh └── WholeGenomeGermlineSingleSample.wdl └── README.md /20k_Tutorial_Docker/20k_WF_ID.txt: -------------------------------------------------------------------------------- 1 | cd21d43f-302e-484f-8025-a5ebded0e6e5 2 | -------------------------------------------------------------------------------- /images/Layer-Cake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Layer-Cake.png -------------------------------------------------------------------------------- /images/SW-Arch-Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/SW-Arch-Diagram.png -------------------------------------------------------------------------------- /20k_Tutorial_Docker/20k_submission_response.txt: -------------------------------------------------------------------------------- 1 | {"id":"cd21d43f-302e-484f-8025-a5ebded0e6e5","status":"Submitted"} -------------------------------------------------------------------------------- /images/Pipeline-Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Pipeline-Overview.png -------------------------------------------------------------------------------- /images/Directory-Quick-Reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Directory-Quick-Reference.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 20k_Throughput-run/20k_WF_ID* 2 | *.zip 3 | 20k_Throughput-run/20k_submission_response.txt 4 | 20k_Throughput-run/JSON 5 | 20k_Throughput-run/cromwell-monitor 6 | 20k_Throughput-run/cromwell* 7 | 
20k_Throughput-run/data 8 | -------------------------------------------------------------------------------- /20k_Tutorial/step05_Single_Sample_20k_Workflow_Output.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ROOT_PATH="/cluster_share" 3 | echo "Go to $ROOT_PATH/cromwell-executions/SingleSample20k/$(cat 20k_WF_ID.txt) to view the output of workflow instance $(cat 20k_WF_ID.txt)" 4 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step05_Single_Sample_20k_Workflow_Output.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ROOT_PATH="/cluster_share" 3 | echo "Go to $ROOT_PATH/cromwell-executions/SingleSample20k/$(cat 20k_WF_ID.txt) to view the output of workflow instance $(cat 20k_WF_ID.txt)" 4 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Intel® Select Solutions for Genomics Analytics Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /20k_Tutorial/step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" \ 6 | #| awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 7 | #Contact your admin to obtain Cromwell port - 8000 by default 8 | 9 | curl -vXGET $CROMWELL_HOST:8000/api/workflows/v1/$(cat 20k_WF_ID.txt)/status 10 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" \ 6 | #| awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 7 | #Contact your admin to obtain Cromwell port - 8000 by default 8 | 9 | curl -vXGET $CROMWELL_HOST:8000/api/workflows/v1/$(cat 20k_WF_ID.txt)/status 10 | -------------------------------------------------------------------------------- /20k_Tutorial/step03_Cromwell_Run_20k_Single_Sample_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" | awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 6 | #Cromwell port is 8000 by default. 
Contact your admin if port is different 7 | 8 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@PairedSingleSampleWf_noqc_nocram_optimized.wdl \ 9 | -F workflowInputs=@16T_PairedSingleSampleWf_optimized.inputs.20k.json > 20k_submission_response.txt 10 | 11 | cat 20k_submission_response.txt | grep -o -E "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" \ 12 | > 20k_WF_ID.txt 13 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step03_Cromwell_Run_20k_Single_Sample_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" | awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 6 | #Cromwell port is 8000 by default. Contact your admin if port is different 7 | 8 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@PairedSingleSampleWf_noqc_nocram_optimized.wdl \ 9 | -F workflowInputs=@16T_PairedSingleSampleWf_optimized.inputs.20k.json > 20k_submission_response.txt 10 | 11 | cat 20k_submission_response.txt | grep -o -E "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" \ 12 | > 20k_WF_ID.txt 13 | -------------------------------------------------------------------------------- /config_files/SLURM/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Example slurmdbd.conf file. 3 | # 4 | # See the slurmdbd.conf man page for more information. 5 | # 6 | # Archive info 7 | #ArchiveJobs=yes 8 | #ArchiveDir="/tmp" 9 | #ArchiveSteps=yes 10 | #ArchiveScript= 11 | #JobPurge=12 12 | #StepPurge=1 13 | # 14 | # Authentication info 15 | AuthType=auth/munge 16 | #AuthInfo=/var/run/munge/munge.socket.2 17 | # 18 | # slurmDBD info 19 | DbdAddr=localhost 20 | DbdHost=localhost 21 | #DbdPort=7031 22 | SlurmUser=slurm 23 | #MessageTimeout=300 24 | DebugLevel=4 25 | #DefaultQOS=normal,standby 26 | LogFile=/var/log/slurmdbd.log 27 | PidFile=/var/run/slurmdbd.pid 28 | #PluginDir=/usr/lib/slurm 29 | #PrivateData=accounts,users,usage,jobs 30 | #TrackWCKey=yes 31 | # 32 | # Database info 33 | StorageType=accounting_storage/mysql 34 | #StorageHost=localhost 35 | #StoragePort=1234 36 | StoragePass=password 37 | StorageUser=slurm 38 | #StorageLoc=slurm_acct_db 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2018 Intel Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /20k_Throughput-run/configure: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | export http_proxy=http://proxy-jf.intel.com:911 15 | export https_proxy=http://proxy-jf.intel.com:912 16 | export no_proxy="localhost,intel.com" 17 | CROMWELL_HOST=$HOSTNAME 18 | 19 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 20 | GENOMICS_PATH="/mnt/lustre/genomics" 21 | CROMWELL_PATH="/fastdata/02/genomics/cromwell" 22 | 23 | #specify the path to tools directory.By default,script expects tools to be in the following path 24 | TOOLS_PATH="$GENOMICS_PATH/tools" 25 | 26 | #specify the path to data download directory.By default, data is downloaded to current folder 27 | DATA_PATH="$BASEDIR/data" 28 | 29 | #Enter the number of workflow for throughput run 30 | NUM_WORKFLOW=16 31 | 32 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 33 | 34 | # sudo yum install -y R 35 | # sudo yum install -y jq 36 | # Create generic symlinks for tools e.g. : 37 | # for tool in bwa samtools gatk; do export tool_version=`ls $GENOMICS_PATH/tools | grep ${tool}- | head -n1` && echo ${tool_version} && ln -sfn $GENOMICS_PATH/tools/$tool_version $GENOMICS_PATH/tools/$tool; done; 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PROJECT NOT UNDER ACTIVE MANAGEMENT 2 | 3 | This project will no longer be maintained by Intel. 4 | 5 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 6 | 7 | Intel no longer accepts patches to this project. 8 | 9 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 
10 | 11 | Contact: webadmin@linux.intel.com 12 | # Intel® Select Solutions for Genomics Analytics 13 | 14 | [Intel® Select Solutions for Genomics Analytics](https://www.intel.com/content/dam/www/public/us/en/documents/product-briefs/select-solutions-for-genomics-analytics-brief-v2.pdf) 15 | is an end-to-end, optimized hardware and software solution for analyzing 16 | genomic data. It provides a way to run pre-packaged, optimized workflows, including the Genome Analysis Toolkit* 17 | (GATK*) Best Practices workflows from the Broad Institute. 18 | 19 | This repo contains a simple smoketest benchmark for HPC clusters ("20k Throughput Run"). The test ensures your HPC system is configured correctly to run whole genome and whole exome samples. 20 | 21 | For an overview on how to set up an HPC cluster for running GATK, see the [Broad documentation here](https://gatk.broadinstitute.org/hc/en-us/articles/360035530872). An overview of the Intel Solution, including a HW reference design, can be found [here](https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/genomics-analytics-v2.html). 22 | 23 | For detailed, line-by-line instructions on how to configure an HPC system for running genomics workflows, please contact your Intel representative. 24 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DNASeqStructs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | struct SampleAndUnmappedBams { 4 | String base_file_name 5 | String? final_gvcf_base_name 6 | Array[File] flowcell_unmapped_bams 7 | String sample_name 8 | String unmapped_bam_suffix 9 | } 10 | 11 | struct ReferenceFasta { 12 | File ref_dict 13 | File ref_fasta 14 | File ref_fasta_index 15 | File ref_alt 16 | File ref_sa 17 | File ref_amb 18 | File ref_bwt 19 | File ref_ann 20 | File ref_pac 21 | File? ref_str 22 | } 23 | 24 | struct DragmapReference { 25 | File reference_bin 26 | File hash_table_cfg_bin 27 | File hash_table_cmp 28 | } 29 | 30 | struct DNASeqSingleSampleReferences { 31 | File contamination_sites_ud 32 | File contamination_sites_bed 33 | File contamination_sites_mu 34 | File calling_interval_list 35 | 36 | ReferenceFasta reference_fasta 37 | 38 | Array[File] known_indels_sites_vcfs 39 | Array[File] known_indels_sites_indices 40 | 41 | File dbsnp_vcf 42 | File dbsnp_vcf_index 43 | 44 | File evaluation_interval_list 45 | 46 | File haplotype_database_file 47 | } 48 | 49 | struct VariantCallingScatterSettings { 50 | Int haplotype_scatter_count 51 | Int break_bands_at_multiples_of 52 | } 53 | 54 | struct ExomeGermlineSingleSampleOligos { 55 | File target_interval_list 56 | File bait_interval_list 57 | String bait_set_name 58 | } 59 | 60 | struct CrossSpeciesContaminationReferences { 61 | File filter_bwa_image 62 | File kmer_file 63 | File meats_bwa_image 64 | File meats_fasta 65 | File meats_fasta_dict 66 | File meats_taxonomy_file 67 | File microbe_bwa_image 68 | File microbe_fasta 69 | File microbe_fasta_dict 70 | File microbe_taxonomy_file 71 | File normalization_file 72 | File metrics_script_file 73 | Float score_min_identity 74 | Int reads_after_downsampling 75 | } 76 | 77 | struct PapiSettings { 78 | } 79 | 80 | -------------------------------------------------------------------------------- /20k_Throughput-run/step01_Configure_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | ###########Editing JSON file############# 15 | source ./configure 16 | 17 | #datapath is the existing path specified in the WholeGenomeGermlineSingleSample_20k.json.Do not edit this path 18 | datapath=/mnt/lustre/genomics/data 19 | #toolspath is the existing path specified in the WholeGenomeGermlineSingleSample_20k.json.Do not edit this path 20 | toolspath=/mnt/lustre/genomics/tools 21 | 22 | mkdir -p $BASEDIR/JSON 23 | cd $BASEDIR/JSON 24 | cp $BASEDIR/WholeGenomeGermlineSingleSample_20k.json $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 25 | 26 | newdatapath=${DATA_PATH} 27 | newtoolspath=${TOOLS_PATH} 28 | 29 | #pointing the correct data path to wdl 30 | sed -i "s%$datapath%$newdatapath%g" $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 31 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 32 | 33 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/WDL/WholeGenomeGermlineSingleSample.wdl 34 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/WDL/*.wdl 35 | 36 | FILE="$BASEDIR/WDL/warp.zip" 37 | 38 | echo "Creating zip file for WDLS "$FILE" " 39 | zip -j $BASEDIR/WDL/warp.zip $BASEDIR/WDL/*.wdl 40 | 41 | -------------------------------------------------------------------------------- /20k_Throughput-run/step04_Cromwell_Monitor_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | source ./configure 15 | 16 | curl -s $CROMWELL_HOST:8000/api/workflows/v1/query 2>/dev/null | json_pp>cromwell_stop 17 | 18 | start_date=`cat cromwell_start_date` 19 | count=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Running&includeSubworkflows=false" | jq '.totalResultsCount'` 20 | finish=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Succeeded&includeSubworkflows=false" | jq '.totalResultsCount'` 21 | failed=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Failed&includeSubworkflows=false" | jq '.totalResultsCount'` 22 | echo running: $count finished: $finish failed: $failed 23 | 24 | echo "-----------------------------" 25 | echo ' workflow_id | status | start | end | name | parent_workflow_id' 26 | for WFID in `cat $BASEDIR/20k_WF_ID/*`; do 27 | echo "-----------------------------" 28 | curl -sXGET $CROMWELL_HOST:8000/api/workflows/v1/query?status={Submitted,Running,Aborting,Failed,Succeeded,Aborted} | jq ' .results | [.|= sort_by(.start)] | .[] | .[] | ( .id + " | " + .status + " | " + .start + " | "+ .end +" | " + .name + " | " + .rootWorkflowId )' | grep $WFID | tr '"' '|' 29 | done; 30 | -------------------------------------------------------------------------------- /20k_Throughput-run/step03_Cromwell_Run_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | source ./configure 15 | 16 | WDL=$BASEDIR/WholeGenomeGermlineSingleSample.wdl 17 | JSON=$BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 18 | 19 | limit=$NUM_WORKFLOW 20 | 21 | export DATE_WITH_TIME=`date "+%Y%m%d:%H-%M-%S"` 22 | mkdir "20k_WF_ID-"$DATE_WITH_TIME"" 23 | mkdir "cromwell-status-"$DATE_WITH_TIME"" 24 | #remove the temporary directories from previous runs. 
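# Optional sanity check (a sketch, assuming the Cromwell server referenced by ./configure is already up):
# probe the REST API before submitting, e.g.
#   curl -s -o /dev/null -w "%{http_code}\n" http://$CROMWELL_HOST:8000/engine/v1/version
# which should print 200. The directories below are recreated on each run so that
# workflow IDs left over from a previous submission are not picked up by the monitoring and output steps.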
25 | rm -rf cromwell-monitor 26 | rm -rf 20k_WF_ID 27 | #creating new temporary directories for monitoring and output results 28 | mkdir cromwell-monitor 29 | mkdir 20k_WF_ID 30 | 31 | curl localhost:8000/api/workflows/v1/query 2>/dev/null | json_pp>"cromwell-status-"$DATE_WITH_TIME""/cromwell_start 32 | cp "cromwell-status-"$DATE_WITH_TIME""/cromwell_start cromwell-monitor 33 | 34 | date -u +"%Y-%m-%dT%H:%M:%S.000Z"> cromwell_start_date 35 | echo Start time is `date` : `date +"%H:%M:%S"` 36 | 37 | 38 | for i in $(seq $limit) 39 | do 40 | echo $i 41 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@${WDL} -F workflowInputs=@${JSON} -F workflowDependencies=@$BASEDIR/WDL/warp.zip > 20k_submission_response.txt 42 | cat 20k_submission_response.txt | cut -d '"' -f4 >"20k_WF_ID-"$DATE_WITH_TIME""/20k_WF_ID_${i}.txt 43 | cp "20k_WF_ID-"$DATE_WITH_TIME""/20k_WF_ID_* 20k_WF_ID 44 | done 45 | 46 | 47 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/BamToCram.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "Utilities.wdl" as Utils 4 | import "Qc.wdl" as QC 5 | 6 | workflow BamToCram { 7 | 8 | input { 9 | File input_bam 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_dict 13 | File duplication_metrics 14 | File chimerism_metrics 15 | String base_file_name 16 | } 17 | 18 | 19 | # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation 20 | # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 21 | Float max_duplication_in_reasonable_sample = 0.30 22 | Float max_chimerism_in_reasonable_sample = 0.15 23 | 24 | # Convert the final merged recalibrated BAM file to CRAM format 25 | call Utils.ConvertToCram as ConvertToCram { 26 | input: 27 | input_bam = input_bam, 28 | ref_fasta = ref_fasta, 29 | ref_fasta_index = ref_fasta_index, 30 | output_basename = base_file_name, 31 | } 32 | 33 | # Check whether the data has massively high duplication or chimerism rates 34 | call QC.CheckPreValidation as CheckPreValidation { 35 | input: 36 | duplication_metrics = duplication_metrics, 37 | chimerism_metrics = chimerism_metrics, 38 | max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, 39 | max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, 40 | } 41 | 42 | # Validate the CRAM file 43 | call QC.ValidateSamFile as ValidateCram { 44 | input: 45 | input_bam = ConvertToCram.output_cram, 46 | input_bam_index = ConvertToCram.output_cram_index, 47 | report_filename = base_file_name + ".cram.validation_report", 48 | ref_dict = ref_dict, 49 | ref_fasta = ref_fasta, 50 | ref_fasta_index = ref_fasta_index, 51 | ignore = ["MISSING_TAG_NM"], 52 | max_output = 1000000000, 53 | is_outlier_data = CheckPreValidation.is_outlier_data, 54 | } 55 | 56 | output { 57 | File output_cram = ConvertToCram.output_cram 58 | File output_cram_index = ConvertToCram.output_cram_index 59 | File output_cram_md5 = ConvertToCram.output_cram_md5 60 | File validate_cram_file_report = ValidateCram.report 61 | } 62 | meta { 63 | allowNestedInputs: true 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /config_files/SLURM/slurm.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Example slurm.conf file. 
Please run configurator.html 3 | # (in doc/html) to build a configuration file customized 4 | # for your environment. 5 | # 6 | # 7 | # slurm.conf file generated by configurator.html. 8 | # 9 | # See the slurm.conf man page for more information. 10 | # 11 | ClusterName=linux 12 | ControlMachine=slurm-0-0 13 | #ControlAddr= 14 | #BackupController= 15 | #BackupAddr= 16 | # 17 | SlurmUser=slurm 18 | #SlurmdUser=root 19 | SlurmctldPort=6817 20 | SlurmdPort=6818 21 | AuthType=auth/munge 22 | CryptoType=crypto/munge 23 | #JobCredentialPrivateKey= 24 | #JobCredentialPublicCertificate= 25 | StateSaveLocation=/var/spool/slurm/ctld 26 | SlurmdSpoolDir=/var/spool/slurm/d 27 | SwitchType=switch/none 28 | MpiDefault=none 29 | SlurmctldPidFile=/var/run/slurmctld.pid 30 | SlurmdPidFile=/var/run/slurmd.pid 31 | ProctrackType=proctrack/pgid 32 | #PluginDir= 33 | #FirstJobId= 34 | ReturnToService=0 35 | #MaxJobCount= 36 | #PlugStackConfig= 37 | #PropagatePrioProcess= 38 | #PropagateResourceLimits= 39 | #PropagateResourceLimitsExcept= 40 | #Prolog= 41 | #Epilog= 42 | #SrunProlog= 43 | #SrunEpilog= 44 | #TaskProlog= 45 | #TaskEpilog= 46 | #TaskPlugin= 47 | #TrackWCKey=no 48 | #TreeWidth=50 49 | #TmpFS= 50 | #UsePAM= 51 | # 52 | # TIMERS 53 | SlurmctldTimeout=300 54 | SlurmdTimeout=300 55 | InactiveLimit=0 56 | MinJobAge=300 57 | KillWait=30 58 | Waittime=0 59 | # 60 | # SCHEDULING 61 | SchedulerType=sched/backfill 62 | #SchedulerAuth= 63 | #SelectType=select/linear 64 | FastSchedule=1 65 | #PriorityType=priority/multifactor 66 | #PriorityDecayHalfLife=14-0 67 | #PriorityUsageResetPeriod=14-0 68 | #PriorityWeightFairshare=100000 69 | #PriorityWeightAge=1000 70 | #PriorityWeightPartition=10000 71 | #PriorityWeightJobSize=1000 72 | #PriorityMaxAge=1-0 73 | # 74 | # LOGGING 75 | SlurmctldDebug=3 76 | SlurmctldLogFile=/var/log/slurmctld.log 77 | SlurmdDebug=3 78 | SlurmdLogFile=/var/log/slurmd.log 79 | JobCompType=jobcomp/none 80 | #JobCompLoc= 81 | # 82 | # ACCOUNTING 83 | #JobAcctGatherType=jobacct_gather/linux 84 | #JobAcctGatherFrequency=30 85 | # 86 | #AccountingStorageType=accounting_storage/slurmdbd 87 | #AccountingStorageHost= 88 | #AccountingStorageLoc= 89 | #AccountingStoragePass= 90 | #AccountingStorageUser= 91 | # 92 | # COMPUTE NODES 93 | NodeName=slurm-0-0 Sockets=2 CoresPerSocket=22 ThreadsPerCore=1 Procs=44 RealMemory=257671 State=IDLE 94 | PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP 95 | -------------------------------------------------------------------------------- /20k_Throughput-run/step05_Output_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | source ./configure 14 | 15 | curl localhost:8000/api/workflows/v1/query 2>/dev/null | json_pp>cromwell_stop 16 | 17 | start_date=`cat cromwell_start_date` 18 | count=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Running&includeSubworkflows=false" | jq '.totalResultsCount'` 19 | finish=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Succeeded&includeSubworkflows=false" | jq '.totalResultsCount'` 20 | failed=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Failed&includeSubworkflows=false" | jq '.totalResultsCount'` 21 | echo running: $count finished: $finish failed: $failed 22 | #grep suceeded runs to cromwell-times to calculate elapse time: 23 | 24 | sh step04_Cromwell_Monitor_20k_Throughput-run.sh | grep WholeGenomeGermlineSingleSample | sort>cromwell-times 25 | s=`cat cromwell-times | cut -d '|' -f4 | sort | head -1` 26 | e=`cat cromwell-times | cut -d '|' -f5 | sort | tail -n 1 ` 27 | 28 | if [ $count -gt 0 ] 29 | then 30 | echo workflow still in progress 31 | exit 32 | fi 33 | 34 | #echo $s $e 35 | 36 | s=`echo $s | tr 'T' ' ' | tr 'Z' '\n'` 37 | e=`echo $e | tr 'T' ' ' | tr 'Z' '\n'` 38 | #echo $s $e 39 | 40 | s=`date -d "$s" +%s` 41 | e=`date -d "$e" +%s` 42 | 43 | sec=`expr $e - $s` 44 | min=$(($sec / 60)) 45 | minsec=$(($sec % 60)) 46 | 47 | printf "Total Elapsed Time for $NUM_WORKFLOW workflows: $min minutes:%2d seconds \n " $minsec 48 | 49 | ########## Average elapse time taken for Mark Duplicates############# 50 | sum=0 51 | limit=$NUM_WORKFLOW 52 | 53 | for i in `cat 20k_WF_ID/20k_WF_ID_*`; 54 | do 55 | 56 | data=`grep "Elapsed time: " $CROMWELL_PATH/cromwell-slurm-exec/WholeGenomeGermlineSingleSample/$i/call-*/*/*/call-MarkDuplicates/execution/stderr | cut -d ':' -f 4 | cut -d " " -f 2` 57 | 58 | x=`echo $data | cut -d '.' -f 1` 59 | y=`echo $data | cut -d '.' -f 2` 60 | let "z= 10#$x*100 + 10#$y" 61 | 62 | let "sum= 10#$sum + 10#$z" 63 | 64 | done 65 | 66 | let "avg = sum / $limit" 67 | let "x = $avg / 100" 68 | let "y = $avg % 100" 69 | printf "Average Elapsed Time for Mark Duplicates: $x.%02d minutes\n" $y 70 | 71 | 72 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DragenTasks.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2021 4 | ## 5 | ## This WDL defines tasks to use Dragen's DRAGstr approach to STR sequencing artifacts 6 | ## Indel genotype priors in the DRAGEN-Gatk pipeline. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | task CalibrateDragstrModel { 20 | input { 21 | File ref_fasta 22 | File ref_fasta_idx 23 | File ref_dict 24 | File str_table_file 25 | File alignment ## can handle cram or bam. 
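# index for the alignment above (typically .bai for a BAM or .crai for a CRAM)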
26 | File alignment_index 27 | Int threads = 4 28 | Int? memory_mb 29 | Boolean use_ssd = true 30 | } 31 | 32 | # If CRAM, restrict threads to a maximum of 4 33 | Boolean is_cram = sub(alignment, "\\.cram$", "") != "" + alignment 34 | Int java_threads = if (threads < 1 ) then 1 35 | else if (is_cram && threads > 4) then 4 # more than 4 threads in cram is probrably contra-productive. 36 | else threads 37 | 38 | String base_name = basename(alignment) 39 | String out_file_name = base_name + ".dragstr" 40 | Int disk_size_gb = ceil(size([ref_fasta, ref_fasta_idx, ref_dict, alignment, alignment_index, str_table_file], "GiB")) + 41 | 40 # 40 for the rest of the fs. 42 | 43 | String parallel_args = if (java_threads <= 1) then "" else "--threads " + java_threads 44 | 45 | # If the input is a CRAM we need an additional 500MB of memory per thread 46 | Int recommended_memory_mb = ceil(2000 + (if (is_cram) then 500 else 100) * java_threads) 47 | Int selected_memory_mb = select_first([memory_mb, recommended_memory_mb]) 48 | Int runtime_memory_mb = if (selected_memory_mb < 1500) then 1500 else selected_memory_mb 49 | Int java_memory_mb = if (runtime_memory_mb < 2000) then 1000 else runtime_memory_mb - 1000 50 | 51 | command <<< 52 | set -x 53 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx~{java_memory_mb}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true -Dsamjdk.reference_fasta=~{ref_fasta}" \ 54 | CalibrateDragstrModel \ 55 | -R ~{ref_fasta} \ 56 | -I ~{alignment} \ 57 | -str ~{str_table_file} \ 58 | -O ~{out_file_name} \ 59 | ~{parallel_args} 60 | 61 | >>> 62 | 63 | runtime { 64 | memory: runtime_memory_mb + " MiB" 65 | cpu: java_threads 66 | } 67 | 68 | output { 69 | File dragstr_model = "~{out_file_name}" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /20k_Throughput-run/README.md: -------------------------------------------------------------------------------- 1 | # 20k Single Sample Workflow 2 | To submit, monitor, and receive output from these workflows, follow these steps: 3 | 4 | ## Prerequisites 5 | 6 | | Genomics Tools | Version | 7 | | :---: | --- | 8 | | **WARP** | v3.1.6 | 9 | | **GATK** | 4.2.6.1 | 10 | | **bwa** | 0.7.17 | 11 | | **cromwell** | 84 | 12 | | **samtools** | 1.11 | 13 | | **picard** | 2.27.4 | 14 | | **VerifyBamID2** | 2.0.1 | 15 | | **java** | java-11-openjdk - Cromwell
java-1.8.0-openjdk - GATK | 16 | 17 | Please refer to [WARP Requirement](https://broadinstitute.github.io/warp/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README#software-version-requirements) for more details. 18 | 19 | ## 1. Clone the repository 20 | To clone the repository, run these commands: 21 | 22 | git clone https://github.com/Intel-HLS/BIGstack.git 23 | 24 | cd 20k_Throughput-run 25 | 26 | ## 2. Configure and set up environment variables 27 | Edit the configure file to set the work-directory paths (GENOMICS_PATH, TOOLS_PATH, DATA_PATH) and the number of workflows to submit (NUM_WORKFLOW): 28 | 29 | ./configure 30 | 31 | ## 3. Configure JSON file 32 | 33 | ./step01_Configure_20k_Throughput-run.sh 34 | 35 | This step updates the tool and dataset paths in WholeGenomeGermlineSingleSample_20k.json and zips the WDL files into WDL/warp.zip. 36 | 37 | ## 4. Download datasets 38 | This step downloads the dataset into the 'data' directory under the working directory. 39 | 40 | ./step02_Download_20k_Data_Throughput-run.sh 41 | 42 | ## 5. Run the 20k Throughput workflow 43 | Submit the workflows to the Cromwell workflow engine using this script: 44 | 45 | ./step03_Cromwell_Run_20k_Throughput-run.sh 46 | 47 | After running this script, the HTTP response and workflow submission information are written to 20k_submission_response.txt in the working directory. Additionally, the workflow identifiers for the throughput run (for example: "id": "6ec0643c-1ea1-42bf-b60c-507cd1e3e96c") are written to files under 20k_WF_ID-<timestamp>/ and copied into 20k_WF_ID/, which are used by steps 6 and 7. 48 | 49 | ## 6. Monitor the workflow status - Running, Succeeded, Failed 50 | To monitor the 20k throughput-run workflows, execute: 51 | 52 | ./step04_Cromwell_Monitor_20k_Throughput-run.sh 53 | 54 | ## 7. View the 20k Throughput-run output 55 | This prints the total elapsed time and the average MarkDuplicates elapsed time. 56 | 57 | ./step05_Output_20k_Throughput-run.sh 58 | 59 | # Troubleshooting 60 | 61 | ## Install dependencies for Steps 3-5: 62 | sudo yum install R -y 63 | 64 | sudo yum install jq -y 65 | 66 | Make sure python2 and python3 are installed and symlinks are created. 67 | 68 | ## Create generic symlinks for tools to the latest/desired version - by default, tool paths in the WDL files use generic symlinks: 69 | for tool in bwa samtools gatk; 70 | do 71 | export tool_version=`ls $GENOMICS_PATH/tools | grep ${tool}- | head -n1` && echo ${tool_version} && ln -sfn $GENOMICS_PATH/tools/$tool_version $GENOMICS_PATH/tools/$tool; 72 | 73 | done; 74 | 75 | ## Java version 76 | 77 | Use Java 11 to compile and run Cromwell, but switch to Java 8 as the default to run the workflows. 78 | 79 | ``` 80 | sudo alternatives --config java 81 | ``` -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/SplitLargeReadGroup.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements a split of large readgroups for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses.
Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "Alignment.wdl" as Alignment 19 | import "DragmapAlignment.wdl" as DragmapAlignment 20 | import "BamProcessing.wdl" as Processing 21 | import "Utilities.wdl" as Utils 22 | import "DNASeqStructs.wdl" as Structs 23 | 24 | workflow SplitLargeReadGroup { 25 | 26 | input { 27 | File input_bam 28 | 29 | String bwa_commandline 30 | String output_bam_basename 31 | 32 | # reference_fasta.ref_alt is the .alt file from bwa-kit 33 | # (https://github.com/lh3/bwa/tree/master/bwakit), 34 | # listing the reference contigs that are "alternative". 35 | ReferenceFasta reference_fasta 36 | DragmapReference? dragmap_reference 37 | 38 | Int compression_level 39 | Int reads_per_file = 48000000 40 | Boolean hard_clip_reads = false 41 | Boolean unmap_contaminant_reads = true 42 | Boolean use_bwa_mem = true 43 | Boolean allow_empty_ref_alt = false 44 | } 45 | 46 | call Alignment.SamSplitter as SamSplitter { 47 | input : 48 | input_bam = input_bam, 49 | n_reads = reads_per_file, 50 | compression_level = compression_level 51 | } 52 | 53 | scatter(unmapped_bam in SamSplitter.split_bams) { 54 | Float current_unmapped_bam_size = size(unmapped_bam, "GiB") 55 | String current_name = basename(unmapped_bam, ".bam") 56 | 57 | if (use_bwa_mem) { 58 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 59 | input: 60 | input_bam = unmapped_bam, 61 | bwa_commandline = bwa_commandline, 62 | output_bam_basename = current_name, 63 | reference_fasta = reference_fasta, 64 | compression_level = compression_level, 65 | hard_clip_reads = hard_clip_reads, 66 | unmap_contaminant_reads = unmap_contaminant_reads, 67 | allow_empty_ref_alt = allow_empty_ref_alt 68 | } 69 | } 70 | if (!use_bwa_mem) { 71 | call DragmapAlignment.SamToFastqAndDragmapAndMba as SamToFastqAndDragmapAndMba { 72 | input: 73 | input_bam = unmapped_bam, 74 | output_bam_basename = current_name, 75 | reference_fasta = reference_fasta, 76 | dragmap_reference = select_first([dragmap_reference]), 77 | compression_level = compression_level, 78 | hard_clip_reads = hard_clip_reads, 79 | unmap_contaminant_reads = unmap_contaminant_reads 80 | } 81 | } 82 | 83 | File output_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SamToFastqAndDragmapAndMba.output_bam]) 84 | } 85 | 86 | call Processing.GatherUnsortedBamFiles as GatherMonolithicBamFile { 87 | input: 88 | input_bams = output_bam, 89 | total_input_size = size(output_bam, "GiB"), 90 | output_bam_basename = output_bam_basename, 91 | compression_level = compression_level 92 | } 93 | output { 94 | File aligned_bam = GatherMonolithicBamFile.output_bam 95 | } 96 | meta { 97 | allowNestedInputs: true 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /20k_Throughput-run/WholeGenomeGermlineSingleSample_20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": { 3 | "sample_name": "NA12878", 4 | "base_file_name": "NA1278", 5 | "flowcell_unmapped_bams": [ 6 | "/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 7 | 
"/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 8 | "/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 9 | ], 10 | "final_gvcf_base_name": "NA12878", 11 | "unmapped_bam_suffix": ".bam" 12 | }, 13 | 14 | "WholeGenomeGermlineSingleSample.scatter_settings": { 15 | "haplotype_scatter_count": 50, 16 | "break_bands_at_multiples_of": 100000 17 | }, 18 | 19 | "WholeGenomeGermlineSingleSample.references": { 20 | "contamination_sites_ud": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 21 | "contamination_sites_bed": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 22 | "contamination_sites_mu": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 23 | "haplotype_database_file": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", 24 | "calling_interval_list": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 25 | "evaluation_interval_list": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 26 | "reference_fasta" : { 27 | "ref_dict": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 28 | "ref_fasta": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 29 | "ref_fasta_index": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 30 | "ref_alt": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 31 | "ref_sa": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 32 | "ref_amb": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 33 | "ref_bwt": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 34 | "ref_ann": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 35 | "ref_pac": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac" 36 | }, 37 | "known_indels_sites_vcfs": [ 38 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 39 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 40 | ], 41 | "known_indels_sites_indices": [ 42 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 43 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 44 | ], 45 | "dbsnp_vcf": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 46 | "dbsnp_vcf_index": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" 47 | }, 48 | 49 | 50 | "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": 
"/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 51 | 52 | "WholeGenomeGermlineSingleSample.papi_settings": { 53 | "preemptible_tries": 3, 54 | "agg_preemptible_tries": 3 55 | }, 56 | 57 | "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true 58 | } 59 | -------------------------------------------------------------------------------- /config_files/HTCondor/condor_config.app-node: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## condor_config 4 | ## 5 | ## This is the global configuration file for condor. This is where 6 | ## you define where the local config file is. Any settings 7 | ## made here may potentially be overridden in the local configuration 8 | ## file. KEEP THAT IN MIND! To double-check that a variable is 9 | ## getting set from the configuration file that you expect, use 10 | ## condor_config_val -v 11 | ## 12 | ## condor_config.annotated is a more detailed sample config file 13 | ## 14 | ## Unless otherwise specified, settings that are commented out show 15 | ## the defaults that are used if you don't define a value. Settings 16 | ## that are defined here MUST BE DEFINED since they have no default 17 | ## value. 18 | ## 19 | ###################################################################### 20 | 21 | ## Where have you installed the bin, sbin and lib condor directories? 22 | RELEASE_DIR = /usr 23 | 24 | ## Where is the local condor directory for each host? This is where the local config file(s), logs and 25 | ## spool/execute directories are located. this is the default for Linux and Unix systems. 26 | LOCAL_DIR = /var 27 | 28 | ## Where is the machine-specific local config file for each host? 29 | LOCAL_CONFIG_FILE = /etc/condor/condor_config.local 30 | ## If your configuration is on a shared file system, then this might be a better default 31 | #LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local 32 | ## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.) 33 | REQUIRE_LOCAL_CONFIG_FILE = false 34 | 35 | ## The normal way to do configuration with RPMs is to read all of the 36 | ## files in a given directory that don't match a regex as configuration files. 37 | ## Config files are read in lexicographic order. 38 | LOCAL_CONFIG_DIR = /etc/condor/config.d 39 | #LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$ 40 | 41 | ## Use a host-based security policy. By default CONDOR_HOST and the local machine will be allowed 42 | use SECURITY : HOST_BASED 43 | ## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts 44 | #ALLOW_WRITE = *.cs.wisc.edu 45 | ## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool). 46 | #FLOCK_FROM = 47 | ## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd). 48 | #FLOCK_TO = condor.cs.wisc.edu, cm.example.edu 49 | 50 | ##-------------------------------------------------------------------- 51 | ## Values set by the rpm patch script: 52 | ##-------------------------------------------------------------------- 53 | 54 | ## For Unix machines, the path and file name of the file containing 55 | ## the pool password for password authentication. 
56 | #SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password 57 | 58 | ## Pathnames 59 | RUN = $(LOCAL_DIR)/run/condor 60 | LOG = $(LOCAL_DIR)/log/condor 61 | LOCK = $(LOCAL_DIR)/lock/condor 62 | SPOOL = $(LOCAL_DIR)/lib/condor/spool 63 | EXECUTE = $(LOCAL_DIR)/lib/condor/execute 64 | BIN = $(RELEASE_DIR)/bin 65 | LIB = $(RELEASE_DIR)/lib64/condor 66 | INCLUDE = $(RELEASE_DIR)/include/condor 67 | SBIN = $(RELEASE_DIR)/sbin 68 | LIBEXEC = $(RELEASE_DIR)/libexec/condor 69 | SHARE = $(RELEASE_DIR)/share/condor 70 | 71 | PROCD_ADDRESS = $(RUN)/procd_pipe 72 | 73 | JAVA_CLASSPATH_DEFAULT = $(SHARE) $(SHARE)/scimark2lib.jar . 74 | 75 | SSH_TO_JOB_SSHD_CONFIG_TEMPLATE = /etc/condor/condor_ssh_to_job_sshd_config_template 76 | 77 | ## What machine is your central manager? 78 | 79 | CONDOR_HOST = $(FULL_HOSTNAME) 80 | 81 | ## This macro determines what daemons the condor_master will start and keep its watchful eyes on. 82 | ## The list is a comma or space separated list of subsystem names 83 | 84 | DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD 85 | 86 | # domain 87 | # REPLACE WITH YOUR CLUSTER'S DOMAIN, e.g., iogs.yourorg.com 88 | UID_DOMAIN = 89 | FILESYSTEM_DOMAIN = $(UID_DOMAIN) 90 | 91 | # permissions 92 | ALLOW_READ = * 93 | ALLOW_WRITE = * 94 | 95 | # dedicated scheduler 96 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 97 | DedicatedScheduler="DedicatedScheduler@" 98 | STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler 99 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DragmapAlignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2021 4 | ## 5 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data using Illumina's DRAGEN open source mapper. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
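## Note: this copy of the task calls locally installed tools (e.g. samtools under
## /mnt/lustre/genomics/tools) rather than the Broad docker images referenced above;
## those paths are rewritten to TOOLS_PATH by configure/step01_Configure_20k_Throughput-run.sh.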
17 | 18 | import "DNASeqStructs.wdl" 19 | 20 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 21 | task SamToFastqAndDragmapAndMba { 22 | input { 23 | File input_bam 24 | String output_bam_basename 25 | 26 | ReferenceFasta reference_fasta 27 | DragmapReference dragmap_reference 28 | 29 | Int compression_level 30 | Boolean hard_clip_reads = false 31 | Boolean unmap_contaminant_reads = true 32 | 33 | Int cpu = 16 34 | Float disk_multiplier = 8 35 | Int memory_mb = 40960 36 | } 37 | 38 | Float unmapped_bam_size = size(input_bam, "GiB") 39 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 40 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 41 | Float dragmap_ref_size = size(dragmap_reference.reference_bin, "GiB") + size(dragmap_reference.hash_table_cfg_bin, "GiB") + size(dragmap_reference.hash_table_cmp, "GiB") 42 | Int disk_size_gb = ceil(unmapped_bam_size + bwa_ref_size + dragmap_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 43 | 44 | command <<< 45 | set -euxo pipefail 46 | 47 | DRAGMAP_VERSION=$(dragen-os --version) 48 | 49 | if [ -z ${DRAGMAP_VERSION} ]; then 50 | exit 1; 51 | fi 52 | 53 | mkdir dragen_reference 54 | mv ~{dragmap_reference.reference_bin} ~{dragmap_reference.hash_table_cfg_bin} ~{dragmap_reference.hash_table_cmp} dragen_reference 55 | 56 | dragen-os -b ~{input_bam} -r dragen_reference --interleaved=1 2> >(tee ~{output_bam_basename}.dragmap.stderr.log >&2) | /mnt/lustre/genomics/tools/samtools/samtools view -h -O BAM - > aligned.bam 57 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /picard/picard.jar \ 58 | MergeBamAlignment \ 59 | VALIDATION_STRINGENCY=SILENT \ 60 | EXPECTED_ORIENTATIONS=FR \ 61 | ATTRIBUTES_TO_RETAIN=X0 \ 62 | ATTRIBUTES_TO_REMOVE=RG \ 63 | ATTRIBUTES_TO_REMOVE=NM \ 64 | ATTRIBUTES_TO_REMOVE=MD \ 65 | ALIGNED_BAM=aligned.bam \ 66 | UNMAPPED_BAM=~{input_bam} \ 67 | OUTPUT=~{output_bam_basename}.bam \ 68 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 69 | PAIRED_RUN=true \ 70 | SORT_ORDER="unsorted" \ 71 | IS_BISULFITE_SEQUENCE=false \ 72 | ALIGNED_READS_ONLY=false \ 73 | CLIP_ADAPTERS=false \ 74 | ~{true='CLIP_OVERLAPPING_READS=true' false="" hard_clip_reads} \ 75 | ~{true='CLIP_OVERLAPPING_READS_OPERATOR=H' false="" hard_clip_reads} \ 76 | MAX_RECORDS_IN_RAM=2000000 \ 77 | ADD_MATE_CIGAR=true \ 78 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 79 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 80 | PROGRAM_RECORD_ID="dragen-os" \ 81 | PROGRAM_GROUP_VERSION="${DRAGMAP_VERSION}" \ 82 | PROGRAM_GROUP_COMMAND_LINE="dragen-os -b ~{input_bam} -r dragen_reference --interleaved=1" \ 83 | PROGRAM_GROUP_NAME="dragen-os" \ 84 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 85 | ALIGNER_PROPER_PAIR_FLAGS=true \ 86 | UNMAP_CONTAMINANT_READS=~{unmap_contaminant_reads} \ 87 | ADD_PG_TAG_TO_READS=false 88 | >>> 89 | runtime { 90 | memory: "${memory_mb} MiB" 91 | cpu: cpu 92 | } 93 | output { 94 | File output_bam = "~{output_bam_basename}.bam" 95 | File dragmap_stderr_log = "~{output_bam_basename}.dragmap.stderr.log" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /config_files/HTCondor/condor_config.comp-node: 
-------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## condor_config 4 | ## 5 | ## This is the global configuration file for condor. This is where 6 | ## you define where the local config file is. Any settings 7 | ## made here may potentially be overridden in the local configuration 8 | ## file. KEEP THAT IN MIND! To double-check that a variable is 9 | ## getting set from the configuration file that you expect, use 10 | ## condor_config_val -v 11 | ## 12 | ## condor_config.annotated is a more detailed sample config file 13 | ## 14 | ## Unless otherwise specified, settings that are commented out show 15 | ## the defaults that are used if you don't define a value. Settings 16 | ## that are defined here MUST BE DEFINED since they have no default 17 | ## value. 18 | ## 19 | ###################################################################### 20 | 21 | ## Where have you installed the bin, sbin and lib condor directories? 22 | RELEASE_DIR = /usr 23 | 24 | ## Where is the local condor directory for each host? This is where the local config file(s), logs and 25 | ## spool/execute directories are located. this is the default for Linux and Unix systems. 26 | LOCAL_DIR = /var 27 | 28 | ## Where is the machine-specific local config file for each host? 29 | LOCAL_CONFIG_FILE = /etc/condor/condor_config.local 30 | ## If your configuration is on a shared file system, then this might be a better default 31 | #LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local 32 | ## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.) 33 | REQUIRE_LOCAL_CONFIG_FILE = false 34 | 35 | ## The normal way to do configuration with RPMs is to read all of the 36 | ## files in a given directory that don't match a regex as configuration files. 37 | ## Config files are read in lexicographic order. 38 | LOCAL_CONFIG_DIR = /etc/condor/config.d 39 | #LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$ 40 | 41 | ## Use a host-based security policy. By default CONDOR_HOST and the local machine will be allowed 42 | use SECURITY : HOST_BASED 43 | ## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts 44 | #ALLOW_WRITE = *.cs.wisc.edu 45 | ## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool). 46 | #FLOCK_FROM = 47 | ## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd). 48 | #FLOCK_TO = condor.cs.wisc.edu, cm.example.edu 49 | 50 | ##-------------------------------------------------------------------- 51 | ## Values set by the rpm patch script: 52 | ##-------------------------------------------------------------------- 53 | 54 | ## For Unix machines, the path and file name of the file containing 55 | ## the pool password for password authentication. 
56 | #SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password 57 | 58 | ## Pathnames 59 | RUN = $(LOCAL_DIR)/run/condor 60 | LOG = $(LOCAL_DIR)/log/condor 61 | LOCK = $(LOCAL_DIR)/lock/condor 62 | SPOOL = $(LOCAL_DIR)/lib/condor/spool 63 | EXECUTE = $(LOCAL_DIR)/lib/condor/execute 64 | BIN = $(RELEASE_DIR)/bin 65 | LIB = $(RELEASE_DIR)/lib64/condor 66 | INCLUDE = $(RELEASE_DIR)/include/condor 67 | SBIN = $(RELEASE_DIR)/sbin 68 | LIBEXEC = $(RELEASE_DIR)/libexec/condor 69 | SHARE = $(RELEASE_DIR)/share/condor 70 | 71 | PROCD_ADDRESS = $(RUN)/procd_pipe 72 | 73 | JAVA_CLASSPATH_DEFAULT = $(SHARE) $(SHARE)/scimark2lib.jar . 74 | 75 | SSH_TO_JOB_SSHD_CONFIG_TEMPLATE = /etc/condor/condor_ssh_to_job_sshd_config_template 76 | 77 | ## What machine is your central manager? 78 | 79 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 80 | CONDOR_HOST = 81 | 82 | ## This macro determines what daemons the condor_master will start and keep its watchful eyes on. 83 | ## The list is a comma or space separated list of subsystem names 84 | 85 | DAEMON_LIST = MASTER, SCHEDD, STARTD 86 | # domain 87 | # REPLACE WITH YOUR CLUSTER'S DOMAIN, e.g., iogs.yourorg.com 88 | UID_DOMAIN = 89 | FILESYSTEM_DOMAIN = $(UID_DOMAIN) 90 | 91 | #Configure the whole machine as 1 slot - set it to be dynamically 92 | ##partitionable so that Condor can assign portions as needed. 93 | SLOT_TYPE_1 = 100% 94 | NUM_SLOTS_TYPE_1 = 1 95 | SLOT_TYPE_1_PARTITIONABLE = True 96 | NUM_SLOTS=1 97 | 98 | # permissions 99 | ALLOW_READ = * 100 | ALLOW_WRITE = * 101 | 102 | # dedicated scheduler 103 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 104 | DedicatedScheduler="DedicatedScheduler@" 105 | STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler 106 | -------------------------------------------------------------------------------- /20k_Tutorial/step02_Download_Single_Sample_20k_Data.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | echo "BIGStack Tutorial Step 2" 3 | echo "Downloading Reference Data (if it doesn't already exist)" 4 | GCP_PATH="https://storage.googleapis.com" 5 | #Edit the below DATA_PATH to where you want the data to reside in your shared file system 6 | DATA_PATH="/cluster_share/data/RefArch_Broad_data" 7 | 8 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 9 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 10 | echo "Downloading the reference files" 11 | #Reference Genome 12 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 13 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dict 14 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 15 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta 16 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 17 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai 18 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 19 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 20 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 21 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 22 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 23 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 28 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu 36 | echo "Done downloading reference files" 37 | sleep 1 38 | echo "Downloading the resource files" 39 | #Resource Files 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 42 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 43 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 44 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 45 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 52 | echo "Done downloading resource files" 53 | sleep 1 54 | echo "Downloading the intervals files" 55 | 
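# NOTE: the two wget calls for wgs_coverage_regions.hg38.interval_list below are identical,
# so with -nc the second one is a no-op. The tutorial inputs JSON also expects
# wgs_evaluation_regions.hg38.interval_list from this same directory; the second call may
# have been intended to fetch that file instead, along these lines (URL unverified):
#   wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \
#     $GCP_PATH/broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list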
#Interval Files 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list 58 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 59 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 60 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 61 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 62 | echo "Done downloading interval files" 63 | sleep 1 64 | echo "Downloading 20k Test Data for Single Sample Workflow" 65 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 66 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 67 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 68 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 69 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 70 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 71 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 72 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 73 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 74 | echo "Data for tutorial downloaded successfully" 75 | 76 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step02_Download_Single_Sample_20k_Data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | echo "BIGStack Tutorial Step 2" 3 | echo "Downloading Reference Data (if it doesn't already exist)" 4 | GCP_PATH="https://storage.googleapis.com" 5 | #Edit the below DATA_PATH to where you want the data to reside in your shared file system 6 | DATA_PATH="/cluster_share/data/RefArch_Broad_data" 7 | 8 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 9 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 10 | echo "Downloading the reference files" 11 | #Reference Genome 12 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 13 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dict 14 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 15 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta 16 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 17 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai 18 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 19 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 20 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 21 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 22 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 23 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 28 | wget -nc -v -P 
$DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu 36 | echo "Done downloading reference files" 37 | sleep 1 38 | echo "Downloading the resource files" 39 | #Resource Files 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 42 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 43 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 44 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 45 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 52 | echo "Done downloading resource files" 53 | sleep 1 54 | echo "Downloading the intervals files" 55 | #Interval Files 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list 58 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 59 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 60 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 61 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 62 | echo "Done downloading interval files" 63 | sleep 1 64 | echo "Downloading 20k Test Data for Single Sample Workflow" 65 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 66 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 67 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 68 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 69 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 70 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 71 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 72 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 73 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 74 | echo "Data for tutorial downloaded successfully" 75 | 76 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/AggregatedBamQC.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 
## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 5 | ## for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "Qc.wdl" as QC 19 | import "DNASeqStructs.wdl" 20 | 21 | # WORKFLOW DEFINITION 22 | workflow AggregatedBamQC { 23 | input { 24 | File base_recalibrated_bam 25 | File base_recalibrated_bam_index 26 | String base_name 27 | String sample_name 28 | String recalibrated_bam_base_name 29 | File haplotype_database_file 30 | DNASeqSingleSampleReferences references 31 | PapiSettings papi_settings 32 | File? fingerprint_genotypes_file 33 | File? fingerprint_genotypes_index 34 | } 35 | 36 | # QC the final BAM (consolidated after scattered BQSR) 37 | call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { 38 | input: 39 | input_bam = base_recalibrated_bam, 40 | input_bam_index = base_recalibrated_bam_index, 41 | output_bam_prefix = base_name + ".readgroup", 42 | ref_dict = references.reference_fasta.ref_dict, 43 | ref_fasta = references.reference_fasta.ref_fasta, 44 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 45 | } 46 | 47 | # QC the final BAM some more (no such thing as too much QC) 48 | call QC.CollectAggregationMetrics as CollectAggregationMetrics { 49 | input: 50 | input_bam = base_recalibrated_bam, 51 | input_bam_index = base_recalibrated_bam_index, 52 | output_bam_prefix = base_name, 53 | ref_dict = references.reference_fasta.ref_dict, 54 | ref_fasta = references.reference_fasta.ref_fasta, 55 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 56 | } 57 | 58 | if (defined(haplotype_database_file) && defined(fingerprint_genotypes_file)) { 59 | # Check the sample BAM fingerprint against the sample array 60 | call QC.CheckFingerprintTask as CheckFingerprintTask { 61 | input: 62 | input_bam = base_recalibrated_bam, 63 | input_bam_index = base_recalibrated_bam_index, 64 | genotypes = select_first([fingerprint_genotypes_file]), 65 | genotypes_index = fingerprint_genotypes_index, 66 | expected_sample_alias = sample_name, 67 | output_basename = base_name, 68 | haplotype_database_file = haplotype_database_file, 69 | } 70 | } 71 | 72 | # Generate a checksum per readgroup in the final BAM 73 | call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { 74 | input: 75 | input_bam = base_recalibrated_bam, 76 | input_bam_index = base_recalibrated_bam_index, 77 | read_group_md5_filename = recalibrated_bam_base_name + ".bam.read_group_md5", 78 | } 79 | 80 | output { 81 | File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics 82 | File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics 83 | File read_group_gc_bias_pdf = 
CollectReadgroupBamQualityMetrics.gc_bias_pdf 84 | File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics 85 | 86 | File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file 87 | 88 | File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics 89 | File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics 90 | File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics 91 | File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics 92 | File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf 93 | File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics 94 | File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf 95 | File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics 96 | File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics 97 | File agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics 98 | File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf 99 | File agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics 100 | File agg_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics 101 | 102 | File? fingerprint_summary_metrics = CheckFingerprintTask.summary_metrics 103 | File? fingerprint_detail_metrics = CheckFingerprintTask.detail_metrics 104 | } 105 | meta { 106 | allowNestedInputs: true 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Alignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "DNASeqStructs.wdl" 19 | 20 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 21 | task SamToFastqAndBwaMemAndMba { 22 | input { 23 | File input_bam 24 | String bwa_commandline 25 | String output_bam_basename 26 | 27 | # reference_fasta.ref_alt is the .alt file from bwa-kit 28 | # (https://github.com/lh3/bwa/tree/master/bwakit), 29 | # listing the reference contigs that are "alternative". 
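    # If the .alt file is empty or missing, ALT-aware mapping cannot be confirmed; the command
    # below treats that as a task failure unless allow_empty_ref_alt is set to true.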
30 | ReferenceFasta reference_fasta 31 | 32 | Int compression_level 33 | Boolean hard_clip_reads = false 34 | Boolean unmap_contaminant_reads = true 35 | Boolean allow_empty_ref_alt = false 36 | } 37 | 38 | Float unmapped_bam_size = size(input_bam, "GiB") 39 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 40 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 41 | # Sometimes the output is larger than the input, or a task can spill to disk. 42 | # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). 43 | Float disk_multiplier = 2.5 44 | Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 45 | 46 | command <<< 47 | 48 | 49 | # This is done before "set -o pipefail" because "bwa" will have a rc=1 and we don't want to allow rc=1 to succeed 50 | # because the sed may also fail with that error and that is something we actually want to fail on. 51 | BWA_VERSION=$(/mnt/lustre/genomics/tools/bwa/bwa 2>&1 | \ 52 | grep -e '^Version' | \ 53 | sed 's/Version: //') 54 | 55 | set -o pipefail 56 | set -e 57 | 58 | if [ -z ${BWA_VERSION} ]; then 59 | exit 1; 60 | fi 61 | 62 | # set the bash variable needed for the command-line 63 | bash_ref_fasta=~{reference_fasta.ref_fasta} 64 | # if reference_fasta.ref_alt has data in it or allow_empty_ref_alt is set 65 | if [ -s ~{reference_fasta.ref_alt} ] || ~{allow_empty_ref_alt}; then 66 | java -Xms1000m -Xmx1000m -jar /mnt/lustre/genomics/tools/picard.jar \ 67 | SamToFastq \ 68 | INPUT=~{input_bam} \ 69 | FASTQ=/dev/stdout \ 70 | INTERLEAVE=true \ 71 | NON_PF=true | \ 72 | /mnt/lustre/genomics/tools/bwa/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \ 73 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /mnt/lustre/genomics/tools/picard.jar \ 74 | MergeBamAlignment \ 75 | VALIDATION_STRINGENCY=SILENT \ 76 | EXPECTED_ORIENTATIONS=FR \ 77 | ATTRIBUTES_TO_RETAIN=X0 \ 78 | ATTRIBUTES_TO_REMOVE=NM \ 79 | ATTRIBUTES_TO_REMOVE=MD \ 80 | ALIGNED_BAM=/dev/stdin \ 81 | UNMAPPED_BAM=~{input_bam} \ 82 | OUTPUT=~{output_bam_basename}.bam \ 83 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 84 | SORT_ORDER="unsorted" \ 85 | IS_BISULFITE_SEQUENCE=false \ 86 | ALIGNED_READS_ONLY=false \ 87 | CLIP_ADAPTERS=false \ 88 | ~{true='CLIP_OVERLAPPING_READS=true' false="" hard_clip_reads} \ 89 | ~{true='CLIP_OVERLAPPING_READS_OPERATOR=H' false="" hard_clip_reads} \ 90 | MAX_RECORDS_IN_RAM=2000000 \ 91 | ADD_MATE_CIGAR=true \ 92 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 93 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 94 | PROGRAM_RECORD_ID="bwamem" \ 95 | PROGRAM_GROUP_VERSION="${BWA_VERSION}" \ 96 | PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \ 97 | PROGRAM_GROUP_NAME="bwamem" \ 98 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 99 | ALIGNER_PROPER_PAIR_FLAGS=true \ 100 | UNMAP_CONTAMINANT_READS=~{unmap_contaminant_reads} \ 101 | ADD_PG_TAG_TO_READS=false 102 | 103 | if ~{!allow_empty_ref_alt}; then 104 | grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \ 105 | grep -v "read 0 ALT contigs" 106 | fi 107 | 108 | # else reference_fasta.ref_alt is empty or could not be found 109 | else 110 | 
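      # Reached only when the .alt file is empty or absent and allow_empty_ref_alt is false:
      # fail fast here rather than emit a BAM that was aligned without ALT-aware mapping.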
echo ref_alt input is empty or not provided. >&2 111 | exit 1; 112 | fi 113 | >>> 114 | runtime { 115 | memory: "14 GiB" 116 | cpu: "16" 117 | backend: "SLURM-BWA" 118 | } 119 | output { 120 | File output_bam = "~{output_bam_basename}.bam" 121 | File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log" 122 | } 123 | } 124 | 125 | task SamSplitter { 126 | input { 127 | File input_bam 128 | Int n_reads 129 | Int compression_level 130 | } 131 | 132 | Float unmapped_bam_size = size(input_bam, "GiB") 133 | # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 134 | Float disk_multiplier = 2.5 135 | Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20) 136 | 137 | command { 138 | set -e 139 | mkdir output_dir 140 | 141 | total_reads=$(/mnt/lustre/genomics/tools/samtools/samtools view -c ~{input_bam}) 142 | 143 | java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -Xmx3600m -jar /mnt/lustre/genomics/tools/picard.jar SplitSamByNumberOfReads \ 144 | INPUT=~{input_bam} \ 145 | OUTPUT=output_dir \ 146 | SPLIT_TO_N_READS=~{n_reads} \ 147 | TOTAL_READS_IN_INPUT=$total_reads 148 | } 149 | output { 150 | Array[File] split_bams = glob("output_dir/*.bam") 151 | } 152 | runtime { 153 | memory: "3.75 GiB" 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /20k_Throughput-run/step02_Download_20k_Data_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | 13 | source ./configure 14 | 15 | echo "Downloading Reference Data (if it doesn't already exist)" 16 | GCP_PATH="https://storage.googleapis.com" 17 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | #specify the path to data download directory.By default, data is downloaded to current folder 19 | DATA_PATH="$BASEDIR/data" 20 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 21 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 22 | echo "Downloading the reference files" 23 | #Reference Genome 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta 28 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 36 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 37 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 38 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 39 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 42 | echo "Done downloading reference files" 43 | sleep 1 44 | echo "Downloading the resource files" 45 | #Resource Files 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 52 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 53 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 54 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 55 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 58 | echo "Done downloading resource files" 59 | sleep 1 60 | echo "Downloading the intervals files" 61 | #Interval Files 62 | wget -nc -v -P 
$DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 63 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list 64 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 65 | $GCP_PATH/gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list 66 | 67 | #Alternatively gsutil cp -r gs://genomics-public-data/references/hg38/v0/* . 68 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 69 | $GCP_PATH/genomics-public-data/references/hg38/v0/wgs_coverage_regions.hg38.interval_list 70 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 71 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt 72 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 73 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.UD 74 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 75 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.mu 76 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 77 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.bed 78 | # Need to find following reference 79 | # wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 80 | #$GCP_PATH/genomics-public-data/references/hg38/v0/hg38_wgs_scattered_calling_intervals.txt 81 | # wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 82 | #$GCP_PATH/genomics-public-data/references/hg38/v0/NA12878.hg38.reference.fingerprint.vcf 83 | 84 | echo "Done downloading interval files" 85 | sleep 1 86 | echo "Downloading 20k Test Data for Single Sample Workflow" 87 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 88 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 89 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 90 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 91 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 92 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 93 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 94 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 95 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 96 | echo "Data for tutorial downloaded successfully" 97 | 98 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/VariantCalling.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "GermlineVariantDiscovery.wdl" as Calling 4 | import "Qc.wdl" as QC 5 | import "Utilities.wdl" as Utils 6 | import "BamProcessing.wdl" as BamProcessing 7 | import "DragenTasks.wdl" as DragenTasks 8 | 9 | workflow VariantCalling { 10 | 11 | 12 | String pipeline_version = "2.1.17" 13 | 14 | 15 | input { 16 | Boolean run_dragen_mode_variant_calling = false 17 | Boolean use_spanning_event_genotyping = true 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? 
contamination 23 | File input_bam 24 | File input_bam_index 25 | File ref_fasta 26 | File ref_fasta_index 27 | File ref_dict 28 | File? ref_str 29 | File dbsnp_vcf 30 | File dbsnp_vcf_index 31 | String base_file_name 32 | String final_vcf_base_name 33 | Boolean make_gvcf = true 34 | Boolean make_bamout = false 35 | Boolean use_gatk3_haplotype_caller = false 36 | Boolean skip_reblocking = false 37 | Boolean use_dragen_hard_filtering = false 38 | } 39 | 40 | parameter_meta { 41 | make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details." 42 | run_dragen_mode_variant_calling: "Run variant calling using the DRAGEN-GATK pipeline, false by default." 43 | } 44 | 45 | if (run_dragen_mode_variant_calling) { 46 | call DragenTasks.CalibrateDragstrModel as DragstrAutoCalibration { 47 | input: 48 | ref_fasta = ref_fasta, 49 | ref_fasta_idx = ref_fasta_index, 50 | ref_dict = ref_dict, 51 | alignment = input_bam, 52 | alignment_index = input_bam_index, 53 | str_table_file = select_first([ref_str]) 54 | } 55 | } 56 | 57 | 58 | # Break the calling interval_list into sub-intervals 59 | # Perform variant calling on the sub-intervals, and then gather the results 60 | call Utils.ScatterIntervalList as ScatterIntervalList { 61 | input: 62 | interval_list = calling_interval_list, 63 | scatter_count = haplotype_scatter_count, 64 | break_bands_at_multiples_of = break_bands_at_multiples_of 65 | } 66 | 67 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 68 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 69 | # to account for the fact that the data is quite uneven across the shards. 
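  # For example, with a haplotype_scatter_count of 50 the scatter typically yields about 50
  # interval lists, giving an hc_divisor of roughly 30; the if/else below keeps the divisor
  # at least 1 whenever interval_count is 21 or fewer.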
70 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 71 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 72 | 73 | # Call variants in parallel over WGS calling intervals 74 | scatter (scattered_interval_list in ScatterIntervalList.out) { 75 | 76 | if (use_gatk3_haplotype_caller) { 77 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 78 | input: 79 | input_bam = input_bam, 80 | input_bam_index = input_bam_index, 81 | interval_list = scattered_interval_list, 82 | gvcf_basename = base_file_name, 83 | ref_dict = ref_dict, 84 | ref_fasta = ref_fasta, 85 | ref_fasta_index = ref_fasta_index, 86 | contamination = contamination, 87 | hc_scatter = hc_divisor 88 | } 89 | } 90 | 91 | if (!use_gatk3_haplotype_caller) { 92 | # Generate GVCF by interval 93 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 94 | input: 95 | contamination = if run_dragen_mode_variant_calling then 0 else contamination, 96 | input_bam = input_bam, 97 | input_bam_index = input_bam_index, 98 | interval_list = scattered_interval_list, 99 | vcf_basename = base_file_name, 100 | ref_dict = ref_dict, 101 | ref_fasta = ref_fasta, 102 | ref_fasta_index = ref_fasta_index, 103 | hc_scatter = hc_divisor, 104 | make_gvcf = make_gvcf, 105 | make_bamout = make_bamout, 106 | run_dragen_mode_variant_calling = run_dragen_mode_variant_calling, 107 | use_dragen_hard_filtering = use_dragen_hard_filtering, 108 | use_spanning_event_genotyping = use_spanning_event_genotyping, 109 | dragstr_model = DragstrAutoCalibration.dragstr_model, 110 | } 111 | 112 | if (use_dragen_hard_filtering) { 113 | call Calling.DragenHardFilterVcf as DragenHardFilterVcf { 114 | input: 115 | input_vcf = HaplotypeCallerGATK4.output_vcf, 116 | input_vcf_index = HaplotypeCallerGATK4.output_vcf_index, 117 | make_gvcf = make_gvcf, 118 | vcf_basename = base_file_name, 119 | } 120 | } 121 | 122 | # If bamout files were created, we need to sort and gather them into one bamout 123 | if (make_bamout) { 124 | call BamProcessing.SortSam as SortBamout { 125 | input: 126 | input_bam = HaplotypeCallerGATK4.bamout, 127 | output_bam_basename = final_vcf_base_name, 128 | compression_level = 2 129 | } 130 | } 131 | } 132 | 133 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, DragenHardFilterVcf.output_vcf, HaplotypeCallerGATK4.output_vcf]) 134 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, DragenHardFilterVcf.output_vcf_index, HaplotypeCallerGATK4.output_vcf_index]) 135 | } 136 | 137 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 138 | String hard_filter_suffix = if use_dragen_hard_filtering then ".hard-filtered" else "" 139 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 140 | call Calling.MergeVCFs as MergeVCFs { 141 | input: 142 | input_vcfs = vcfs_to_merge, 143 | input_vcfs_indexes = vcf_indices_to_merge, 144 | output_vcf_name = final_vcf_base_name + hard_filter_suffix + merge_suffix, 145 | } 146 | 147 | if (make_gvcf && !skip_reblocking) { 148 | call Calling.Reblock as Reblock { 149 | input: 150 | gvcf = MergeVCFs.output_vcf, 151 | gvcf_index = MergeVCFs.output_vcf_index, 152 | ref_fasta = ref_fasta, 153 | ref_fasta_index = ref_fasta_index, 154 | ref_dict = ref_dict, 155 | output_vcf_filename = basename(MergeVCFs.output_vcf, ".g.vcf.gz") + ".rb.g.vcf.gz" 156 | } 157 | } 158 | 159 | if (make_bamout) { 160 | call MergeBamouts { 161 | input: 162 | bams = select_all(SortBamout.output_bam), 163 | 
output_base_name = final_vcf_base_name 164 | } 165 | } 166 | 167 | # Validate the (g)VCF output of HaplotypeCaller 168 | call QC.ValidateVCF as ValidateVCF { 169 | input: 170 | input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]), 171 | input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]), 172 | dbsnp_vcf = dbsnp_vcf, 173 | dbsnp_vcf_index = dbsnp_vcf_index, 174 | ref_fasta = ref_fasta, 175 | ref_fasta_index = ref_fasta_index, 176 | ref_dict = ref_dict, 177 | calling_interval_list = calling_interval_list, 178 | is_gvcf = make_gvcf, 179 | extra_args = if (skip_reblocking == false) then "--no-overlaps" else "" 180 | } 181 | 182 | # QC the (g)VCF 183 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 184 | input: 185 | input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]), 186 | input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]), 187 | metrics_basename = final_vcf_base_name, 188 | dbsnp_vcf = dbsnp_vcf, 189 | dbsnp_vcf_index = dbsnp_vcf_index, 190 | ref_dict = ref_dict, 191 | evaluation_interval_list = evaluation_interval_list, 192 | is_gvcf = make_gvcf, 193 | } 194 | 195 | output { 196 | File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics 197 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 198 | File output_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]) 199 | File output_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]) 200 | File? bamout = MergeBamouts.output_bam 201 | File? bamout_index = MergeBamouts.output_bam_index 202 | } 203 | meta { 204 | allowNestedInputs: true 205 | } 206 | } 207 | 208 | # This task is here because merging bamout files using Picard produces an error. 209 | task MergeBamouts { 210 | 211 | input { 212 | Array[File] bams 213 | String output_base_name 214 | } 215 | 216 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 217 | 218 | command <<< 219 | /mnt/lustre/genomics/tools/samtools/samtools merge ~{output_base_name}.bam ~{sep=" " bams} 220 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_base_name}.bam 221 | mv ~{output_base_name}.bam.bai ~{output_base_name}.bai 222 | >>> 223 | 224 | output { 225 | File output_bam = "~{output_base_name}.bam" 226 | File output_bam_index = "~{output_base_name}.bai" 227 | } 228 | 229 | runtime { 230 | memory: "4 GiB" 231 | cpu: "1" 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Utilities.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines utility tasks used for processing of sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
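## Tasks defined below: CreateSequenceGroupingTSV, ScatterIntervalList, ConvertToCram,
## ConvertToBam, SumFloats, ErrorWithMessage, and GetValidationInputs.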
17 | 18 | # Generate sets of intervals for scatter-gathering over chromosomes 19 | task CreateSequenceGroupingTSV { 20 | input { 21 | File ref_dict 22 | } 23 | # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. 24 | # It outputs to stdout where it is parsed into a wdl Array[Array[String]] 25 | # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] 26 | command <<< 27 | python3 <>> 62 | runtime { 63 | cpu: "2" 64 | memory: "2 GiB" 65 | } 66 | output { 67 | Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") 68 | Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") 69 | } 70 | } 71 | 72 | # This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists 73 | # Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. 74 | # Thus we have the block of python to count the number of generated sub interval lists. 75 | task ScatterIntervalList { 76 | input { 77 | File interval_list 78 | Int scatter_count 79 | Int break_bands_at_multiples_of 80 | } 81 | 82 | command <<< 83 | set -e 84 | mkdir out 85 | java -Xms1000m -Xmx2g -jar /mnt/lustre/genomics/tools/picard.jar \ 86 | IntervalListTools \ 87 | SCATTER_COUNT=~{scatter_count} \ 88 | SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ 89 | UNIQUE=true \ 90 | SORT=true \ 91 | BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ 92 | INPUT=~{interval_list} \ 93 | OUTPUT=out 94 | 95 | python3 <>> 106 | output { 107 | Array[File] out = glob("out/*/*.interval_list") 108 | Int interval_count = read_int(stdout()) 109 | } 110 | runtime { 111 | cpu: "2" 112 | memory: "2000 MiB" 113 | } 114 | } 115 | 116 | # Convert BAM file to CRAM format 117 | # Note that reading CRAMs directly with Picard is not yet supported 118 | task ConvertToCram { 119 | input { 120 | File input_bam 121 | File ref_fasta 122 | File ref_fasta_index 123 | String output_basename 124 | 125 | Int disk_size = ceil((2 * size(input_bam, "GiB")) + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB")) + 20 126 | } 127 | 128 | command <<< 129 | set -e 130 | set -o pipefail 131 | 132 | /mnt/lustre/genomics/tools/samtools/samtools view -C -T ~{ref_fasta} ~{input_bam} | \ 133 | tee ~{output_basename}.cram | \ 134 | md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 135 | 136 | # Create REF_CACHE. 
Used when indexing a CRAM 137 | /mnt/lustre/genomics/tools/samtools/misc/seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} 138 | export REF_PATH=: 139 | export REF_CACHE=./ref/cache/%2s/%2s/%s 140 | 141 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_basename}.cram 142 | >>> 143 | runtime { 144 | memory: "3 GiB" 145 | cpu: "2" 146 | } 147 | output { 148 | File output_cram = "~{output_basename}.cram" 149 | File output_cram_index = "~{output_basename}.cram.crai" 150 | File output_cram_md5 = "~{output_basename}.cram.md5" 151 | } 152 | } 153 | 154 | # Convert CRAM file to BAM format 155 | task ConvertToBam { 156 | input { 157 | File input_cram 158 | File ref_fasta 159 | File ref_fasta_index 160 | String output_basename 161 | } 162 | 163 | command <<< 164 | set -e 165 | set -o pipefail 166 | 167 | /mnt/lustre/genomics/tools/samtools/samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} 168 | 169 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_basename}.bam 170 | >>> 171 | runtime { 172 | memory: "3 GiB" 173 | cpu: "2" 174 | } 175 | output { 176 | File output_bam = "~{output_basename}.bam" 177 | File output_bam_index = "~{output_basename}.bam.bai" 178 | } 179 | } 180 | 181 | # Calculates sum of a list of floats 182 | task SumFloats { 183 | input { 184 | Array[Float] sizes 185 | } 186 | 187 | command <<< 188 | python3 -c 'print(~{sep="+" sizes})' 189 | >>> 190 | output { 191 | Float total_size = read_float(stdout()) 192 | } 193 | } 194 | 195 | # Print given message to stderr and return an error 196 | task ErrorWithMessage { 197 | input { 198 | String message 199 | } 200 | command <<< 201 | >&2 echo "Error: ~{message}" 202 | exit 1 203 | >>> 204 | } 205 | 206 | # This task is unused for now, going to keep it in here though if we need it in the future 207 | task GetValidationInputs { 208 | input { 209 | String results_path 210 | String truth_path 211 | Array[String]? input_files 212 | String? 
input_file 213 | 214 | Int cpu = 1 215 | Int memory_mb = 2000 216 | Int disk_size_gb = 20 217 | } 218 | 219 | meta { 220 | description: "Given either a file or list of files, output both the truth and results path" 221 | } 222 | 223 | command <<< 224 | set -e 225 | 226 | touch truth_file.txt 227 | touch truth_files.txt 228 | touch results_file.txt 229 | touch results_files.txt 230 | 231 | python3 <>> 267 | 268 | runtime { 269 | cpu: cpu 270 | memory: "~{memory_mb} MiB" 271 | } 272 | 273 | output { 274 | String truth_file = read_string("truth_file.txt") 275 | String results_file = read_string("results_file.txt") 276 | Array[String] truth_files = read_lines("truth_files.txt") 277 | Array[String] results_files = read_lines("results_files.txt") 278 | } 279 | 280 | } 281 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/16T_PairedSingleSampleWf_optimized.inputs.20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "Take note of the .64 extensions on the reference files, issues between 32 and 64 bit OS", 3 | 4 | "##_COMMENT2": "SAMPLES - read the README to find other examples.", 5 | "PairedEndSingleSampleWorkflow.sample_name": "NA12878", 6 | "PairedEndSingleSampleWorkflow.base_file_name": "NA12878", 7 | "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ 8 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 9 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 10 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 11 | ], 12 | "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", 13 | "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".unmapped.bam", 14 | 15 | "##_COMMENT3": "REFERENCES", 16 | "PairedEndSingleSampleWorkflow.fingerprint_genotypes_file": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/NA12878.hg38.reference.fingerprint.vcf", 17 | "PairedEndSingleSampleWorkflow.contamination_sites_ud": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "PairedEndSingleSampleWorkflow.contamination_sites_bed": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "PairedEndSingleSampleWorkflow.contamination_sites_mu": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "PairedEndSingleSampleWorkflow.haplotype_scatter_count" : 50, 22 | "PairedEndSingleSampleWorkflow.break_bands_at_multiples_of" : 1000000, 23 | "PairedEndSingleSampleWorkflow.ref_dict": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 24 | "PairedEndSingleSampleWorkflow.ref_fasta": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 25 | "PairedEndSingleSampleWorkflow.ref_fasta_index": 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 26 | "PairedEndSingleSampleWorkflow.ref_alt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 27 | "PairedEndSingleSampleWorkflow.ref_sa": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 28 | "PairedEndSingleSampleWorkflow.ref_amb": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 29 | "PairedEndSingleSampleWorkflow.ref_bwt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 30 | "PairedEndSingleSampleWorkflow.ref_ann": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 31 | "PairedEndSingleSampleWorkflow.ref_pac": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", 32 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 33 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 34 | "PairedEndSingleSampleWorkflow.dbSNP_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 35 | "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 36 | "PairedEndSingleSampleWorkflow.wgs_coverage_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 37 | "PairedEndSingleSampleWorkflow.wgs_evaluation_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 38 | "PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ 39 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 40 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 41 | ], 42 | "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ 43 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 44 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 45 | ], 46 | 47 | "##_COMMENT6":"OPTIMIZATION FLAGS FOR BWA, SAMTOOLS, GATK-GKL", 48 | "## SamToFastQ_COMMENT": "BWA mem is the bottleneck - request 16 cores for the task, assign 16T to bwa and let SamToFastQ and MergeBamAlignment timeshare", 49 | "PairedEndSingleSampleWorkflow.bwa_threads":16, 50 | "## SamtoolsSort_COMMENT": "Threads for Samtools sort step", 51 | "PairedEndSingleSampleWorkflow.samtools_threads":16, 52 | "## GENERAL_COMMENT": "Compression level for all java commands", 53 | "PairedEndSingleSampleWorkflow.compression_level":1, 54 | "## 
HaplotypeCaller_MT_comment": "See comment in the WDL file", 55 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_implementation":"AVX_LOGLESS_CACHING", 56 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_threads":1, 57 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.smith_waterman_implementation":"AVX_ENABLED", 58 | "##COMMENT_TMPDIR": "defaults to /tmp, BUT you might want to customize this. For example, if your working directory is on an NVMe SSD", 59 | "PairedEndSingleSampleWorkflow.tmp_directory": "/tmp", 60 | "PairedEndSingleSampleWorkflow.SortSampleBam.mem_limit": "64M", 61 | 62 | "##_COMMENT7":"JAVA HEAP MEMORY OPTIONS", 63 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.java_heap_memory_initial":"128m", 64 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.java_heap_memory_initial":"2g", 65 | "PairedEndSingleSampleWorkflow.SortSampleBam.java_heap_memory_initial":"4g", 66 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 67 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 68 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.java_heap_memory_initial":"5g", 69 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.java_heap_memory_initial":"2g", 70 | "PairedEndSingleSampleWorkflow.CheckFingerprint.java_heap_memory_initial":"1g", 71 | "PairedEndSingleSampleWorkflow.MarkDuplicates.java_heap_memory_initial":"4g", 72 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.java_heap_memory_initial":"4g", 73 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.java_heap_memory_initial":"3g", 74 | "PairedEndSingleSampleWorkflow.ApplyBQSR.java_heap_memory_initial":"3g", 75 | "PairedEndSingleSampleWorkflow.GatherBamFiles.java_heap_memory_initial":"2g", 76 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.java_heap_memory_initial":"6g", 77 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.java_heap_memory_initial":"2g", 78 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.java_heap_memory_initial":"2g", 79 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.java_heap_memory_initial":"1g", 80 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.java_heap_memory_initial":"1g", 81 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.haplotypecaller_java_heap_memory_initial":"6g", 82 | "PairedEndSingleSampleWorkflow.MergeVCFs.java_heap_memory_initial":"2g", 83 | "PairedEndSingleSampleWorkflow.ValidateGVCF.java_heap_memory_initial":"3g", 84 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.java_heap_memory_initial":"2g", 85 | 86 | "##_COMMENT8":"RUNTIME SECTION MEMORY OPTIONS", 87 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.memory":"2GB", 88 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.memory":"2GB", 89 | "PairedEndSingleSampleWorkflow.GetBwaVersion.memory":"1GB", 90 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.memory":"10GB", 91 | "PairedEndSingleSampleWorkflow.SortSampleBam.memory":"5GB", 92 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.memory":"7GB", 93 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.memory":"7GB", 94 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.memory":"7GB", 95 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.memory":"2GB", 96 | "PairedEndSingleSampleWorkflow.CheckFingerprint.memory":"1GB", 97 | "PairedEndSingleSampleWorkflow.MarkDuplicates.memory":"7GB", 98 | 
"PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.memory":"2GB", 99 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.memory":"6GB", 100 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.memory":"3GB", 101 | "PairedEndSingleSampleWorkflow.ApplyBQSR.memory":"4GB", 102 | "PairedEndSingleSampleWorkflow.GatherBamFiles.memory":"3GB", 103 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.memory":"7GB", 104 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.memory":"3GB", 105 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.memory":"3GB", 106 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.memory":"2GB", 107 | "PairedEndSingleSampleWorkflow.CheckContamination.memory":"2GB", 108 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.memory":"2GB", 109 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.memory":"7GB", 110 | "PairedEndSingleSampleWorkflow.MergeVCFs.memory":"3GB", 111 | "PairedEndSingleSampleWorkflow.ValidateGVCF.memory":"4GB", 112 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.memory":"3GB", 113 | "PairedEndSingleSampleWorkflow.ConvertToCram.memory":"3GB", 114 | "PairedEndSingleSampleWorkflow.CramToBam.memory":"3GB", 115 | 116 | "##_COMMENT9":"RUNTIME SECTION CPU OPTIONS", 117 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.cpu":1, 118 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.cpu":1, 119 | "PairedEndSingleSampleWorkflow.GetBwaVersion.cpu":1, 120 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.cpu":16, 121 | "PairedEndSingleSampleWorkflow.SortSampleBam.cpu":16, 122 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.cpu":1, 123 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.cpu":1, 124 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.cpu":1, 125 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.cpu":1, 126 | "PairedEndSingleSampleWorkflow.CheckFingerprint.cpu":1, 127 | "PairedEndSingleSampleWorkflow.MarkDuplicates.cpu":1, 128 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.cpu":1, 129 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.cpu":1, 130 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.cpu":1, 131 | "PairedEndSingleSampleWorkflow.ApplyBQSR.cpu":1, 132 | "PairedEndSingleSampleWorkflow.GatherBamFiles.cpu":1, 133 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.cpu":1, 134 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.cpu":1, 135 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.cpu":1, 136 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.cpu":1, 137 | "PairedEndSingleSampleWorkflow.CheckContamination.cpu":1, 138 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.cpu":1, 139 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.cpu":1, 140 | "PairedEndSingleSampleWorkflow.MergeVCFs.cpu":1, 141 | "PairedEndSingleSampleWorkflow.ValidateGVCF.cpu":1, 142 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.cpu":1, 143 | "PairedEndSingleSampleWorkflow.ConvertToCram.cpu":1, 144 | "PairedEndSingleSampleWorkflow.CramToBam.cpu":1 145 | } 146 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/GermlineVariantDiscovery.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. 
6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | task HaplotypeCaller_GATK35_GVCF { 19 | input { 20 | File input_bam 21 | File input_bam_index 22 | File interval_list 23 | String gvcf_basename 24 | File ref_dict 25 | File ref_fasta 26 | File ref_fasta_index 27 | Float? contamination 28 | Int hc_scatter 29 | } 30 | 31 | parameter_meta { 32 | input_bam: { 33 | localization_optional: true 34 | } 35 | } 36 | 37 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 38 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 39 | 40 | # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around 41 | # the interval because the assembly uses them. 42 | # 43 | # Using PrintReads is a temporary solution until we update HaplotypeCaller to use GATK4. Once that is done, 44 | # HaplotypeCaller can stream the required intervals directly from the cloud. 45 | command { 46 | /usr/gitc/gatk4/gatk --java-options "-Xms2000m -Xmx9000m"\ 47 | PrintReads \ 48 | -I ~{input_bam} \ 49 | --interval-padding 500 \ 50 | -L ~{interval_list} \ 51 | -O local.sharded.bam \ 52 | && \ 53 | java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m -Xmx9000m\ 54 | -jar /usr/gitc/GATK35.jar \ 55 | -T HaplotypeCaller \ 56 | -R ~{ref_fasta} \ 57 | -o ~{gvcf_basename}.vcf.gz \ 58 | -I local.sharded.bam \ 59 | -L ~{interval_list} \ 60 | -ERC GVCF \ 61 | --max_alternate_alleles 3 \ 62 | -variant_index_parameter 128000 \ 63 | -variant_index_type LINEAR \ 64 | -contamination ~{default=0 contamination} \ 65 | --read_filter OverclippedRead 66 | } 67 | runtime { 68 | memory: "10000 MiB" 69 | cpu: "2" 70 | backend: "SLURM-HAPLO" 71 | } 72 | output { 73 | File output_gvcf = "~{gvcf_basename}.vcf.gz" 74 | File output_gvcf_index = "~{gvcf_basename}.vcf.gz.tbi" 75 | } 76 | } 77 | 78 | task HaplotypeCaller_GATK4_VCF { 79 | input { 80 | File input_bam 81 | File input_bam_index 82 | File interval_list 83 | String vcf_basename 84 | File ref_dict 85 | File ref_fasta 86 | File ref_fasta_index 87 | Float? contamination 88 | Boolean make_gvcf 89 | Boolean make_bamout 90 | Int hc_scatter 91 | Boolean run_dragen_mode_variant_calling = false 92 | Boolean use_dragen_hard_filtering = false 93 | Boolean use_spanning_event_genotyping = true 94 | File?
dragstr_model 95 | Int memory_multiplier = 1 96 | } 97 | 98 | Int memory_size_mb = ceil(8000 * memory_multiplier) 99 | 100 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 101 | String output_file_name = vcf_basename + output_suffix 102 | 103 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 104 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 105 | 106 | String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" 107 | 108 | parameter_meta { 109 | input_bam: { 110 | localization_optional: true 111 | } 112 | } 113 | 114 | command <<< 115 | set -e 116 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms6000m -Xmx6400m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ 117 | HaplotypeCaller \ 118 | -R ~{ref_fasta} \ 119 | -I ~{input_bam} \ 120 | -L ~{interval_list} \ 121 | -O ~{output_file_name} \ 122 | -contamination ~{default=0 contamination} \ 123 | -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ 124 | ~{true="--dragen-mode" false="" run_dragen_mode_variant_calling} \ 125 | ~{false="--disable-spanning-event-genotyping" true="" use_spanning_event_genotyping} \ 126 | ~{if defined(dragstr_model) then "--dragstr-params-path " + dragstr_model else ""} \ 127 | -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ 128 | ~{true="-ERC GVCF" false="" make_gvcf} \ 129 | ~{bamout_arg} 130 | 131 | # Cromwell doesn't like optional task outputs, so we have to touch this file. 132 | touch ~{vcf_basename}.bamout.bam 133 | >>> 134 | 135 | runtime { 136 | memory: "6.5 GiB" 137 | cpu: "2" 138 | backend: "SLURM-HAPLO" 139 | } 140 | 141 | output { 142 | File output_vcf = "~{output_file_name}" 143 | File output_vcf_index = "~{output_file_name}.tbi" 144 | File bamout = "~{vcf_basename}.bamout.bam" 145 | } 146 | } 147 | 148 | # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs 149 | task MergeVCFs { 150 | input { 151 | Array[File] input_vcfs 152 | Array[File] input_vcfs_indexes 153 | String output_vcf_name 154 | } 155 | 156 | Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 157 | 158 | # Using MergeVcfs instead of GatherVcfs so we can create indices 159 | # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket 160 | command { 161 | java -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 162 | MergeVcfs \ 163 | INPUT=~{sep=' INPUT=' input_vcfs} \ 164 | OUTPUT=~{output_vcf_name} 165 | } 166 | runtime { 167 | cpu: "2" 168 | memory: "3000 MiB" 169 | } 170 | output { 171 | File output_vcf = "~{output_vcf_name}" 172 | File output_vcf_index = "~{output_vcf_name}.tbi" 173 | } 174 | } 175 | 176 | task Reblock { 177 | 178 | input { 179 | File gvcf 180 | File gvcf_index 181 | File ref_dict 182 | File ref_fasta 183 | File ref_fasta_index 184 | String output_vcf_filename 185 | Int additional_disk = 20 186 | String? annotations_to_keep_command 187 | String? annotations_to_remove_command 188 | Float? 
tree_score_cutoff 189 | Boolean move_filters_to_genotypes = false 190 | } 191 | 192 | Int disk_size = ceil((size(gvcf, "GiB")) * 4) + additional_disk 193 | String gvcf_basename = basename(gvcf) 194 | String gvcf_index_basename = basename(gvcf_index) 195 | 196 | command { 197 | set -e 198 | 199 | # We can't always assume the index was located with the gvcf, so make a link so that the paths look the same 200 | ln -s ~{gvcf} ~{gvcf_basename} 201 | ln -s ~{gvcf_index} ~{gvcf_index_basename} 202 | 203 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms3000m -Xmx3000m" \ 204 | ReblockGVCF \ 205 | -R ~{ref_fasta} \ 206 | -V ~{gvcf_basename} \ 207 | -do-qual-approx \ 208 | --floor-blocks -GQB 20 -GQB 30 -GQB 40 \ 209 | ~{annotations_to_keep_command} \ 210 | ~{annotations_to_remove_command} \ 211 | ~{"--tree-score-threshold-to-no-call " + tree_score_cutoff} \ 212 | ~{if move_filters_to_genotypes then "--add-site-filters-to-genotype" else ""} \ 213 | -O ~{output_vcf_filename} 214 | } 215 | 216 | runtime { 217 | memory: "3750 MiB" 218 | } 219 | 220 | output { 221 | File output_vcf = output_vcf_filename 222 | File output_vcf_index = output_vcf_filename + ".tbi" 223 | } 224 | } 225 | 226 | task HardFilterVcf { 227 | input { 228 | File input_vcf 229 | File input_vcf_index 230 | String vcf_basename 231 | File interval_list 232 | } 233 | 234 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 235 | String output_vcf_name = vcf_basename + ".filtered.vcf.gz" 236 | 237 | command { 238 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms2000m -Xmx2500m" \ 239 | VariantFiltration \ 240 | -V ~{input_vcf} \ 241 | -L ~{interval_list} \ 242 | --filter-expression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ 243 | --filter-name "HardFiltered" \ 244 | -O ~{output_vcf_name} 245 | } 246 | output { 247 | File output_vcf = "~{output_vcf_name}" 248 | File output_vcf_index = "~{output_vcf_name}.tbi" 249 | } 250 | runtime { 251 | memory: "3000 MiB" 252 | } 253 | } 254 | 255 | # This hard filtering matches DRAGEN 3.4.12. For later DRAGEN versions, this needs to be updated. 256 | task DragenHardFilterVcf { 257 | input { 258 | File input_vcf 259 | File input_vcf_index 260 | Boolean make_gvcf 261 | String vcf_basename 262 | } 263 | 264 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 265 | 266 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 267 | String output_vcf_name = vcf_basename + ".hard-filtered" + output_suffix 268 | 269 | command { 270 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms2000m -Xmx2500m" \ 271 | VariantFiltration \ 272 | -V ~{input_vcf} \ 273 | --filter-expression "QUAL < 10.4139" \ 274 | --filter-name "DRAGENHardQUAL" \ 275 | -O ~{output_vcf_name} 276 | } 277 | output { 278 | File output_vcf = "~{output_vcf_name}" 279 | File output_vcf_index = "~{output_vcf_name}.tbi" 280 | } 281 | runtime { 282 | memory: "3000 MiB" 283 | } 284 | } 285 | 286 | task CNNScoreVariants { 287 | input { 288 | File? bamout 289 | File? 
bamout_index 290 | File input_vcf 291 | File input_vcf_index 292 | String vcf_basename 293 | File ref_fasta 294 | File ref_fasta_index 295 | File ref_dict 296 | } 297 | 298 | Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) 299 | 300 | String base_vcf = basename(input_vcf) 301 | Boolean is_compressed = basename(base_vcf, "gz") != base_vcf 302 | String vcf_suffix = if is_compressed then ".vcf.gz" else ".vcf" 303 | String vcf_index_suffix = if is_compressed then ".tbi" else ".idx" 304 | String output_vcf = base_vcf + ".scored" + vcf_suffix 305 | String output_vcf_index = output_vcf + vcf_index_suffix 306 | 307 | String bamout_param = if defined(bamout) then "-I ~{bamout}" else "" 308 | String tensor_type = if defined(bamout) then "read-tensor" else "reference" 309 | 310 | command { 311 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx10000m" CNNScoreVariants \ 312 | -V ~{input_vcf} \ 313 | -R ~{ref_fasta} \ 314 | -O ~{output_vcf} \ 315 | ~{bamout_param} \ 316 | -tensor-type ~{tensor_type} 317 | } 318 | 319 | output { 320 | File scored_vcf = "~{output_vcf}" 321 | File scored_vcf_index = "~{output_vcf_index}" 322 | } 323 | 324 | runtime { 325 | memory: "15000 MiB" 326 | cpu: "2" 327 | } 328 | } 329 | 330 | task FilterVariantTranches { 331 | 332 | input { 333 | File input_vcf 334 | File input_vcf_index 335 | String vcf_basename 336 | Array[String] snp_tranches 337 | Array[String] indel_tranches 338 | File hapmap_resource_vcf 339 | File hapmap_resource_vcf_index 340 | File omni_resource_vcf 341 | File omni_resource_vcf_index 342 | File one_thousand_genomes_resource_vcf 343 | File one_thousand_genomes_resource_vcf_index 344 | File dbsnp_resource_vcf 345 | File dbsnp_resource_vcf_index 346 | String info_key 347 | } 348 | 349 | 350 | command { 351 | 352 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx6000m" FilterVariantTranches \ 353 | -V ~{input_vcf} \ 354 | -O ~{vcf_basename}.filtered.vcf.gz \ 355 | ~{sep=" " prefix("--snp-tranche ", snp_tranches)} \ 356 | ~{sep=" " prefix("--indel-tranche ", indel_tranches)} \ 357 | --resource ~{hapmap_resource_vcf} \ 358 | --resource ~{omni_resource_vcf} \ 359 | --resource ~{one_thousand_genomes_resource_vcf} \ 360 | --resource ~{dbsnp_resource_vcf} \ 361 | --info-key ~{info_key} \ 362 | --create-output-variant-index true 363 | } 364 | 365 | output { 366 | File filtered_vcf = "~{vcf_basename}.filtered.vcf.gz" 367 | File filtered_vcf_index = "~{vcf_basename}.filtered.vcf.gz.tbi" 368 | } 369 | 370 | runtime { 371 | memory: "7000 MiB" 372 | cpu: "2" 373 | } 374 | } 375 | -------------------------------------------------------------------------------- /20k_Tutorial/16T_PairedSingleSampleWf_optimized.inputs.20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "Take note of the .64 extensions on the reference files, issues between 32 and 64 bit OS", 3 | 4 | "##_COMMENT2": "SAMPLES - read the README to find other examples.", 5 | "PairedEndSingleSampleWorkflow.sample_name": "NA12878", 6 | "PairedEndSingleSampleWorkflow.base_file_name": "NA12878", 7 | "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ 8 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 9 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 10 | 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 11 | ], 12 | "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", 13 | "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".unmapped.bam", 14 | 15 | "##_COMMENT3": "REFERENCES", 16 | "PairedEndSingleSampleWorkflow.fingerprint_genotypes_file": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/NA12878.hg38.reference.fingerprint.vcf", 17 | "PairedEndSingleSampleWorkflow.contamination_sites_ud": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "PairedEndSingleSampleWorkflow.contamination_sites_bed": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "PairedEndSingleSampleWorkflow.contamination_sites_mu": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "PairedEndSingleSampleWorkflow.haplotype_scatter_count" : 50, 22 | "PairedEndSingleSampleWorkflow.break_bands_at_multiples_of" : 1000000, 23 | "PairedEndSingleSampleWorkflow.ref_dict": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 24 | "PairedEndSingleSampleWorkflow.ref_fasta": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 25 | "PairedEndSingleSampleWorkflow.ref_fasta_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 26 | "PairedEndSingleSampleWorkflow.ref_alt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 27 | "PairedEndSingleSampleWorkflow.ref_sa": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 28 | "PairedEndSingleSampleWorkflow.ref_amb": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 29 | "PairedEndSingleSampleWorkflow.ref_bwt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 30 | "PairedEndSingleSampleWorkflow.ref_ann": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 31 | "PairedEndSingleSampleWorkflow.ref_pac": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", 32 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 33 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 34 | "PairedEndSingleSampleWorkflow.dbSNP_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 35 | "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 36 | "PairedEndSingleSampleWorkflow.wgs_coverage_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 37 | "PairedEndSingleSampleWorkflow.wgs_evaluation_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 38 | "PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ 39 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 40 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 41 | ], 42 | "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ 43 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 44 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 45 | ], 46 | 47 | "##_COMMENT5": "PATH TO GENOMICS TOOLS FOR NON-DOCKERIZED WORKFLOW", 48 | "PairedEndSingleSampleWorkflow.tool_path": "/cluster_share/Hybrid_Pipeline/tools", 49 | 50 | "##_COMMENT6":"OPTIMIZATION FLAGS FOR BWA, SAMTOOLS, GATK-GKL", 51 | "## SamToFastQ_COMMENT": "BWA mem is the bottleneck - request 16 cores for the task, assign 16T to bwa and let SamToFastQ and MergeBamAlignment timeshare", 52 | "PairedEndSingleSampleWorkflow.bwa_threads":16, 53 | "## SamtoolsSort_COMMENT": "Threads for Samtools sort step", 54 | "PairedEndSingleSampleWorkflow.samtools_threads":16, 55 | "## GENERAL_COMMENT": "Compression level for all java commands", 56 | "PairedEndSingleSampleWorkflow.compression_level":1, 57 | "## HaplotypeCaller_MT_comment": "See comment in the WDL file", 58 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_implementation":"AVX_LOGLESS_CACHING", 59 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_threads":1, 60 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.smith_waterman_implementation":"AVX_ENABLED", 61 | "##COMMENT_TMPDIR": "defaults to /tmp, BUT you might want to customize this. 
For example, if your working directory is on an NVMe SSD", 62 | "PairedEndSingleSampleWorkflow.tmp_directory": "/tmp", 63 | "PairedEndSingleSampleWorkflow.SortSampleBam.mem_limit": "64M", 64 | 65 | "##_COMMENT7":"JAVA HEAP MEMORY OPTIONS", 66 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.java_heap_memory_initial":"128m", 67 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.java_heap_memory_initial":"2g", 68 | "PairedEndSingleSampleWorkflow.SortSampleBam.java_heap_memory_initial":"4g", 69 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 70 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 71 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.java_heap_memory_initial":"5g", 72 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.java_heap_memory_initial":"2g", 73 | "PairedEndSingleSampleWorkflow.CheckFingerprint.java_heap_memory_initial":"1g", 74 | "PairedEndSingleSampleWorkflow.MarkDuplicates.java_heap_memory_initial":"4g", 75 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.java_heap_memory_initial":"4g", 76 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.java_heap_memory_initial":"3g", 77 | "PairedEndSingleSampleWorkflow.ApplyBQSR.java_heap_memory_initial":"3g", 78 | "PairedEndSingleSampleWorkflow.GatherBamFiles.java_heap_memory_initial":"2g", 79 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.java_heap_memory_initial":"6g", 80 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.java_heap_memory_initial":"2g", 81 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.java_heap_memory_initial":"2g", 82 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.java_heap_memory_initial":"1g", 83 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.java_heap_memory_initial":"1g", 84 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.haplotypecaller_java_heap_memory_initial":"6g", 85 | "PairedEndSingleSampleWorkflow.MergeVCFs.java_heap_memory_initial":"2g", 86 | "PairedEndSingleSampleWorkflow.ValidateGVCF.java_heap_memory_initial":"3g", 87 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.java_heap_memory_initial":"2g", 88 | 89 | "##_COMMENT8":"RUNTIME SECTION MEMORY OPTIONS", 90 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.memory":"2GB", 91 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.memory":"2GB", 92 | "PairedEndSingleSampleWorkflow.GetBwaVersion.memory":"1GB", 93 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.memory":"10GB", 94 | "PairedEndSingleSampleWorkflow.SortSampleBam.memory":"5GB", 95 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.memory":"7GB", 96 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.memory":"7GB", 97 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.memory":"7GB", 98 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.memory":"2GB", 99 | "PairedEndSingleSampleWorkflow.CheckFingerprint.memory":"1GB", 100 | "PairedEndSingleSampleWorkflow.MarkDuplicates.memory":"7GB", 101 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.memory":"2GB", 102 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.memory":"6GB", 103 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.memory":"3GB", 104 | "PairedEndSingleSampleWorkflow.ApplyBQSR.memory":"4GB", 105 | "PairedEndSingleSampleWorkflow.GatherBamFiles.memory":"3GB", 106 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.memory":"7GB", 107 | 
"PairedEndSingleSampleWorkflow.CollectWgsMetrics.memory":"3GB", 108 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.memory":"3GB", 109 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.memory":"2GB", 110 | "PairedEndSingleSampleWorkflow.CheckContamination.memory":"2GB", 111 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.memory":"2GB", 112 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.memory":"7GB", 113 | "PairedEndSingleSampleWorkflow.MergeVCFs.memory":"3GB", 114 | "PairedEndSingleSampleWorkflow.ValidateGVCF.memory":"4GB", 115 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.memory":"3GB", 116 | "PairedEndSingleSampleWorkflow.ConvertToCram.memory":"3GB", 117 | "PairedEndSingleSampleWorkflow.CramToBam.memory":"3GB", 118 | 119 | "##_COMMENT9":"RUNTIME SECTION CPU OPTIONS", 120 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.cpu":1, 121 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.cpu":1, 122 | "PairedEndSingleSampleWorkflow.GetBwaVersion.cpu":1, 123 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.cpu":16, 124 | "PairedEndSingleSampleWorkflow.SortSampleBam.cpu":16, 125 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.cpu":1, 126 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.cpu":1, 127 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.cpu":1, 128 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.cpu":1, 129 | "PairedEndSingleSampleWorkflow.CheckFingerprint.cpu":1, 130 | "PairedEndSingleSampleWorkflow.MarkDuplicates.cpu":1, 131 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.cpu":1, 132 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.cpu":1, 133 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.cpu":1, 134 | "PairedEndSingleSampleWorkflow.ApplyBQSR.cpu":1, 135 | "PairedEndSingleSampleWorkflow.GatherBamFiles.cpu":1, 136 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.cpu":1, 137 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.cpu":1, 138 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.cpu":1, 139 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.cpu":1, 140 | "PairedEndSingleSampleWorkflow.CheckContamination.cpu":1, 141 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.cpu":1, 142 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.cpu":1, 143 | "PairedEndSingleSampleWorkflow.MergeVCFs.cpu":1, 144 | "PairedEndSingleSampleWorkflow.ValidateGVCF.cpu":1, 145 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.cpu":1, 146 | "PairedEndSingleSampleWorkflow.ConvertToCram.cpu":1, 147 | "PairedEndSingleSampleWorkflow.CramToBam.cpu":1 148 | } 149 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/UnmappedBamToAlignedBam.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 6 | ## for human whole-genome and exome sequencing data. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. 
Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | import "Alignment.wdl" as Alignment 20 | import "DragmapAlignment.wdl" as DragmapAlignment 21 | import "SplitLargeReadGroup.wdl" as SplitRG 22 | import "Qc.wdl" as QC 23 | import "BamProcessing.wdl" as Processing 24 | import "Utilities.wdl" as Utils 25 | import "DNASeqStructs.wdl" as Structs 26 | 27 | # WORKFLOW DEFINITION 28 | workflow UnmappedBamToAlignedBam { 29 | 30 | input { 31 | SampleAndUnmappedBams sample_and_unmapped_bams 32 | DNASeqSingleSampleReferences references 33 | DragmapReference? dragmap_reference 34 | PapiSettings papi_settings 35 | 36 | File contamination_sites_ud 37 | File contamination_sites_bed 38 | File contamination_sites_mu 39 | 40 | String cross_check_fingerprints_by 41 | File haplotype_database_file 42 | Float lod_threshold 43 | String recalibrated_bam_basename 44 | Boolean hard_clip_reads = false 45 | Boolean unmap_contaminant_reads = true 46 | Boolean bin_base_qualities = true 47 | Boolean somatic = false 48 | Boolean perform_bqsr = true 49 | Boolean use_bwa_mem = true 50 | Boolean allow_empty_ref_alt = false 51 | } 52 | 53 | Float cutoff_for_large_rg_in_gb = 20.0 54 | 55 | String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" 56 | 57 | Int compression_level = 2 58 | 59 | # Get the size of the standard reference files as well as the additional reference files needed for BWA 60 | 61 | # Align flowcell-level unmapped input bams in parallel 62 | scatter (unmapped_bam in sample_and_unmapped_bams.flowcell_unmapped_bams) { 63 | 64 | Float unmapped_bam_size = size(unmapped_bam, "GiB") 65 | 66 | String unmapped_bam_basename = basename(unmapped_bam, sample_and_unmapped_bams.unmapped_bam_suffix) 67 | 68 | # QC the unmapped BAM 69 | call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { 70 | input: 71 | input_bam = unmapped_bam, 72 | metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics" 73 | } 74 | 75 | if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { 76 | # Split bam into multiple smaller bams, 77 | # map reads to reference and recombine into one bam 78 | call SplitRG.SplitLargeReadGroup as SplitRG { 79 | input: 80 | input_bam = unmapped_bam, 81 | bwa_commandline = bwa_commandline, 82 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 83 | reference_fasta = references.reference_fasta, 84 | dragmap_reference = dragmap_reference, 85 | compression_level = compression_level, 86 | hard_clip_reads = hard_clip_reads, 87 | unmap_contaminant_reads = unmap_contaminant_reads, 88 | use_bwa_mem = use_bwa_mem, 89 | allow_empty_ref_alt = allow_empty_ref_alt 90 | } 91 | } 92 | 93 | if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { 94 | # Map reads to reference 95 | if (use_bwa_mem) { 96 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 97 | input: 98 | input_bam = unmapped_bam, 99 | bwa_commandline = bwa_commandline, 100 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 101 | reference_fasta = references.reference_fasta, 102 | compression_level = compression_level, 103 | hard_clip_reads = hard_clip_reads, 104 | unmap_contaminant_reads = unmap_contaminant_reads, 105 | allow_empty_ref_alt = allow_empty_ref_alt 106 | } 107 | } 108 | if (!use_bwa_mem) { 109 | 
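# Map reads to the reference with DRAGMAP instead of BWA-MEM; on this branch a dragmap_reference must be provided,
# otherwise the select_first([dragmap_reference]) in the call below fails the workflow.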
call DragmapAlignment.SamToFastqAndDragmapAndMba as SamToFastqAndDragmapAndMba { 110 | input: 111 | input_bam = unmapped_bam, 112 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 113 | reference_fasta = references.reference_fasta, 114 | dragmap_reference = select_first([dragmap_reference]), 115 | compression_level = compression_level, 116 | hard_clip_reads = hard_clip_reads, 117 | unmap_contaminant_reads = unmap_contaminant_reads 118 | } 119 | } 120 | } 121 | 122 | File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SamToFastqAndDragmapAndMba.output_bam, SplitRG.aligned_bam]) 123 | 124 | Float mapped_bam_size = size(output_aligned_bam, "GiB") 125 | 126 | # QC the aligned but unsorted readgroup BAM 127 | # no reference as the input here is unsorted, providing a reference would cause an error 128 | call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { 129 | input: 130 | input_bam = output_aligned_bam, 131 | output_bam_prefix = unmapped_bam_basename + ".readgroup" 132 | } 133 | } 134 | 135 | # MarkDuplicates and SortSam currently take too long for preemptibles if the input data is too large 136 | Float gb_size_cutoff_for_preemptibles = 110.0 137 | Boolean data_too_large_for_preemptibles = size(output_aligned_bam, "GiB") > gb_size_cutoff_for_preemptibles 138 | 139 | # Aggregate aligned+merged flowcell BAM files and mark duplicates 140 | # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output 141 | # to avoid having to spend time just merging BAM files. 142 | call Processing.MarkDuplicates as MarkDuplicates { 143 | input: 144 | input_bams = output_aligned_bam, 145 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.unsorted.duplicates_marked", 146 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".duplicate_metrics", 147 | total_input_size = size(output_aligned_bam, "GiB"), 148 | compression_level = compression_level 149 | } 150 | 151 | # Sort aggregated+deduped BAM file and fix tags 152 | call Processing.SortSam as SortSampleBam { 153 | input: 154 | input_bam = MarkDuplicates.output_bam, 155 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicate_marked.sorted", 156 | compression_level = compression_level 157 | } 158 | 159 | Float agg_bam_size = size(SortSampleBam.output_bam, "GiB") 160 | 161 | if (defined(haplotype_database_file)) { 162 | # Check identity of fingerprints across readgroups 163 | call QC.CrossCheckFingerprints as CrossCheckFingerprints { 164 | input: 165 | input_bams = [ SortSampleBam.output_bam ], 166 | input_bam_indexes = [SortSampleBam.output_bam_index], 167 | haplotype_database_file = haplotype_database_file, 168 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".crosscheck", 169 | total_input_size = agg_bam_size, 170 | lod_threshold = lod_threshold, 171 | cross_check_by = cross_check_fingerprints_by 172 | } 173 | } 174 | 175 | # Create list of sequences for scatter-gather parallelization 176 | call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { 177 | input: 178 | ref_dict = references.reference_fasta.ref_dict 179 | } 180 | 181 | # Estimate level of cross-sample contamination 182 | call Processing.CheckContamination as CheckContamination { 183 | input: 184 | input_bam = SortSampleBam.output_bam, 185 | input_bam_index = SortSampleBam.output_bam_index, 186 | contamination_sites_ud = contamination_sites_ud, 187 | contamination_sites_bed = 
contamination_sites_bed, 188 | contamination_sites_mu = contamination_sites_mu, 189 | ref_fasta = references.reference_fasta.ref_fasta, 190 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 191 | output_prefix = sample_and_unmapped_bams.base_file_name + ".preBqsr", 192 | contamination_underestimation_factor = 0.75 193 | } 194 | 195 | # We need disk to localize the sharded input and output due to the scatter for BQSR. 196 | # If we take the number we are scattering by and reduce it by 10 we will have enough disk space 197 | # to account for the fact that the data is not split evenly. 198 | Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) 199 | Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 200 | Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 201 | 202 | # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel 203 | 204 | if (perform_bqsr) { 205 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { 206 | # Generate the recalibration model by interval 207 | call Processing.BaseRecalibrator as BaseRecalibrator { 208 | input: 209 | input_bam = SortSampleBam.output_bam, 210 | input_bam_index = SortSampleBam.output_bam_index, 211 | recalibration_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 212 | sequence_group_interval = subgroup, 213 | dbsnp_vcf = references.dbsnp_vcf, 214 | dbsnp_vcf_index = references.dbsnp_vcf_index, 215 | known_indels_sites_vcfs = references.known_indels_sites_vcfs, 216 | known_indels_sites_indices = references.known_indels_sites_indices, 217 | ref_dict = references.reference_fasta.ref_dict, 218 | ref_fasta = references.reference_fasta.ref_fasta, 219 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 220 | bqsr_scatter = bqsr_divisor 221 | } 222 | } 223 | 224 | # Merge the recalibration reports resulting from by-interval recalibration 225 | # The reports are always the same size 226 | call Processing.GatherBqsrReports as GatherBqsrReports { 227 | input: 228 | input_bqsr_reports = BaseRecalibrator.recalibration_report, 229 | output_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv" 230 | } 231 | 232 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { 233 | # Apply the recalibration model by interval 234 | call Processing.ApplyBQSR as ApplyBQSR { 235 | input: 236 | input_bam = SortSampleBam.output_bam, 237 | input_bam_index = SortSampleBam.output_bam_index, 238 | output_bam_basename = recalibrated_bam_basename, 239 | recalibration_report = GatherBqsrReports.output_bqsr_report, 240 | sequence_group_interval = subgroup, 241 | ref_dict = references.reference_fasta.ref_dict, 242 | ref_fasta = references.reference_fasta.ref_fasta, 243 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 244 | bqsr_scatter = bqsr_divisor, 245 | compression_level = compression_level, 246 | bin_base_qualities = bin_base_qualities, 247 | somatic = somatic 248 | } 249 | } 250 | } 251 | 252 | # Merge the recalibrated BAM files resulting from by-interval recalibration 253 | call Processing.GatherSortedBamFiles as GatherBamFiles { 254 | input: 255 | input_bams = select_first([ApplyBQSR.recalibrated_bam, [SortSampleBam.output_bam]]), 256 | output_bam_basename = sample_and_unmapped_bams.base_file_name, 257 | total_input_size = agg_bam_size, 258 | compression_level = compression_level, 259 | } 260 | 261 | # Outputs that will be retained when execution is complete 262 |
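# Note: output_bqsr_reports is optional because BQSR can be skipped (perform_bqsr = false); in that case
# GatherBamFiles above falls back, via select_first, to the sorted duplicate-marked BAM from SortSampleBam.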
output { 263 | Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics 264 | 265 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf 266 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics 267 | Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf 268 | Array[File] unsorted_read_group_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics 269 | Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf 270 | Array[File] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics 271 | Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf 272 | Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics 273 | 274 | File? cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics 275 | 276 | File selfSM = CheckContamination.selfSM 277 | Float contamination = CheckContamination.contamination 278 | 279 | File duplicate_metrics = MarkDuplicates.duplicate_metrics 280 | File? output_bqsr_reports = GatherBqsrReports.output_bqsr_report 281 | 282 | File output_bam = GatherBamFiles.output_bam 283 | File output_bam_index = GatherBamFiles.output_bam_index 284 | } 285 | meta { 286 | allowNestedInputs: true 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /20k_Throughput-run/WholeGenomeGermlineSingleSample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data pre-processing and initial variant calling (GVCF 6 | ## generation) according to the GATK Best Practices (June 2016) for germline SNP and 7 | ## Indel discovery in human whole-genome data. 8 | ## 9 | ## Requirements/expectations : 10 | ## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format 11 | ## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) 12 | ## - Input uBAM files must additionally comply with the following requirements: 13 | ## - - filenames all have the same suffix (we use ".unmapped.bam") 14 | ## - - files must pass validation by ValidateSamFile 15 | ## - - reads are provided in query-sorted order 16 | ## - - all reads must have an RG tag 17 | ## - GVCF output names must end in ".g.vcf.gz" 18 | ## - Reference genome must be Hg38 with ALT contigs 19 | ## 20 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 21 | ## For program versions, see docker containers. 22 | ## 23 | ## LICENSING : 24 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 25 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 26 | ## be subject to different licenses. Users are responsible for checking that they are 27 | ## authorized to run all programs before running this script. 
Please see the docker 28 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 29 | ## licensing information pertaining to the included programs. 30 | 31 | import "UnmappedBamToAlignedBam.wdl" as ToBam 32 | import "AggregatedBamQC.wdl" as AggregatedQC 33 | import "Qc.wdl" as QC 34 | import "BamToCram.wdl" as ToCram 35 | import "Utilities.wdl" as Utilities 36 | import "VariantCalling.wdl" as ToGvcf 37 | import "DNASeqStructs.wdl" 38 | 39 | # WORKFLOW DEFINITION 40 | workflow WholeGenomeGermlineSingleSample { 41 | 42 | 43 | String pipeline_version = "3.1.19" 44 | 45 | 46 | input { 47 | SampleAndUnmappedBams sample_and_unmapped_bams 48 | DNASeqSingleSampleReferences references 49 | DragmapReference? dragmap_reference 50 | VariantCallingScatterSettings scatter_settings 51 | PapiSettings papi_settings 52 | 53 | File? fingerprint_genotypes_file 54 | File? fingerprint_genotypes_index 55 | 56 | File wgs_coverage_interval_list 57 | 58 | Boolean provide_bam_output = false 59 | Boolean use_gatk3_haplotype_caller = false 60 | 61 | Boolean dragen_functional_equivalence_mode = false 62 | Boolean dragen_maximum_quality_mode = false 63 | 64 | Boolean run_dragen_mode_variant_calling = false 65 | Boolean use_spanning_event_genotyping = true 66 | Boolean unmap_contaminant_reads = true 67 | Boolean perform_bqsr = true 68 | Boolean use_bwa_mem = true 69 | Boolean allow_empty_ref_alt = false 70 | Boolean use_dragen_hard_filtering = false 71 | } 72 | 73 | if (dragen_functional_equivalence_mode && dragen_maximum_quality_mode) { 74 | call Utilities.ErrorWithMessage as PresetArgumentsError { 75 | input: 76 | message = "Both dragen_functional_equivalence_mode and dragen_maximum_quality_mode have been set to true, however, they are mutually exclusive. You can set either of them to true, or set them both to false and adjust the arguments individually." 77 | } 78 | } 79 | 80 | if (run_dragen_mode_variant_calling && use_gatk3_haplotype_caller) { 81 | call Utilities.ErrorWithMessage as DragenModeVariantCallingAndGATK3Error { 82 | input: 83 | message = "DRAGEN mode variant calling has been activated, however, the HaplotypeCaller version has been set to use GATK 3. Please set use_gatk3_haplotype_caller to false to use DRAGEN mode variant calling." 
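# Fail fast: DRAGEN mode variant calling is only supported with the GATK4 HaplotypeCaller, so these two options cannot be combined.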
84 | } 85 | } 86 | 87 | # Set DRAGEN-related arguments according to the preset arguments 88 | Boolean run_dragen_mode_variant_calling_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else run_dragen_mode_variant_calling 89 | Boolean use_spanning_event_genotyping_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else use_spanning_event_genotyping) 90 | Boolean unmap_contaminant_reads_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else unmap_contaminant_reads) 91 | Boolean perform_bqsr_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else perform_bqsr 92 | Boolean use_bwa_mem_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_bwa_mem 93 | Boolean use_gatk3_haplotype_caller_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_gatk3_haplotype_caller 94 | Boolean use_dragen_hard_filtering_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else use_dragen_hard_filtering 95 | 96 | # Not overridable: 97 | Float lod_threshold = -20.0 98 | String cross_check_fingerprints_by = "READGROUP" 99 | String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" 100 | 101 | String final_gvcf_base_name = select_first([sample_and_unmapped_bams.final_gvcf_base_name, sample_and_unmapped_bams.base_file_name]) 102 | 103 | call ToBam.UnmappedBamToAlignedBam { 104 | input: 105 | sample_and_unmapped_bams = sample_and_unmapped_bams, 106 | references = references, 107 | dragmap_reference = dragmap_reference, 108 | papi_settings = papi_settings, 109 | 110 | contamination_sites_ud = references.contamination_sites_ud, 111 | contamination_sites_bed = references.contamination_sites_bed, 112 | contamination_sites_mu = references.contamination_sites_mu, 113 | 114 | cross_check_fingerprints_by = cross_check_fingerprints_by, 115 | haplotype_database_file = references.haplotype_database_file, 116 | lod_threshold = lod_threshold, 117 | recalibrated_bam_basename = recalibrated_bam_basename, 118 | perform_bqsr = perform_bqsr_, 119 | use_bwa_mem = use_bwa_mem_, 120 | unmap_contaminant_reads = unmap_contaminant_reads_, 121 | allow_empty_ref_alt = allow_empty_ref_alt 122 | } 123 | 124 | call AggregatedQC.AggregatedBamQC { 125 | input: 126 | base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, 127 | base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, 128 | base_name = sample_and_unmapped_bams.base_file_name, 129 | sample_name = sample_and_unmapped_bams.sample_name, 130 | recalibrated_bam_base_name = recalibrated_bam_basename, 131 | haplotype_database_file = references.haplotype_database_file, 132 | references = references, 133 | fingerprint_genotypes_file = fingerprint_genotypes_file, 134 | fingerprint_genotypes_index = fingerprint_genotypes_index, 135 | papi_settings = papi_settings 136 | } 137 | 138 | call ToCram.BamToCram as BamToCram { 139 | input: 140 | input_bam = UnmappedBamToAlignedBam.output_bam, 141 | ref_fasta = references.reference_fasta.ref_fasta, 142 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 143 | ref_dict = references.reference_fasta.ref_dict, 144 | duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, 145 | chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, 146 | base_file_name = 
sample_and_unmapped_bams.base_file_name, 147 | } 148 | 149 | # QC the sample WGS metrics (stringent thresholds) 150 | call QC.CollectWgsMetrics as CollectWgsMetrics { 151 | input: 152 | input_bam = UnmappedBamToAlignedBam.output_bam, 153 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 154 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", 155 | ref_fasta = references.reference_fasta.ref_fasta, 156 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 157 | wgs_coverage_interval_list = wgs_coverage_interval_list, 158 | } 159 | 160 | # QC the sample raw WGS metrics (common thresholds) 161 | call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { 162 | input: 163 | input_bam = UnmappedBamToAlignedBam.output_bam, 164 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 165 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", 166 | ref_fasta = references.reference_fasta.ref_fasta, 167 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 168 | wgs_coverage_interval_list = wgs_coverage_interval_list, 169 | } 170 | 171 | call ToGvcf.VariantCalling as BamToGvcf { 172 | input: 173 | run_dragen_mode_variant_calling = run_dragen_mode_variant_calling_, 174 | use_spanning_event_genotyping = use_spanning_event_genotyping_, 175 | calling_interval_list = references.calling_interval_list, 176 | evaluation_interval_list = references.evaluation_interval_list, 177 | haplotype_scatter_count = scatter_settings.haplotype_scatter_count, 178 | break_bands_at_multiples_of = scatter_settings.break_bands_at_multiples_of, 179 | contamination = UnmappedBamToAlignedBam.contamination, 180 | input_bam = UnmappedBamToAlignedBam.output_bam, 181 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 182 | ref_fasta = references.reference_fasta.ref_fasta, 183 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 184 | ref_dict = references.reference_fasta.ref_dict, 185 | ref_str = references.reference_fasta.ref_str, 186 | dbsnp_vcf = references.dbsnp_vcf, 187 | dbsnp_vcf_index = references.dbsnp_vcf_index, 188 | base_file_name = sample_and_unmapped_bams.base_file_name, 189 | final_vcf_base_name = final_gvcf_base_name, 190 | use_gatk3_haplotype_caller = use_gatk3_haplotype_caller_, 191 | use_dragen_hard_filtering = use_dragen_hard_filtering_ 192 | } 193 | 194 | if (provide_bam_output) { 195 | File provided_output_bam = UnmappedBamToAlignedBam.output_bam 196 | File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index 197 | } 198 | 199 | # Outputs that will be retained when execution is complete 200 | output { 201 | Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics 202 | 203 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf 204 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics 205 | Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf 206 | Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics 207 | Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf 208 | Array[File] unsorted_read_group_quality_by_cycle_metrics = 
UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics 209 | Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf 210 | Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics 211 | 212 | File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics 213 | File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics 214 | File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf 215 | File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics 216 | 217 | File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics 218 | 219 | File selfSM = UnmappedBamToAlignedBam.selfSM 220 | Float contamination = UnmappedBamToAlignedBam.contamination 221 | 222 | File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 223 | 224 | File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics 225 | File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics 226 | File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics 227 | File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics 228 | File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf 229 | File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics 230 | File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf 231 | File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics 232 | File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics 233 | File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics 234 | File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf 235 | File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics 236 | File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics 237 | 238 | File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics 239 | File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics 240 | 241 | File wgs_metrics = CollectWgsMetrics.metrics 242 | File raw_wgs_metrics = CollectRawWgsMetrics.metrics 243 | 244 | File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics 245 | File? output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports 246 | 247 | File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics 248 | File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics 249 | 250 | File? output_bam = provided_output_bam 251 | File? 
output_bam_index = provided_output_bam_index 252 | 253 | File output_cram = BamToCram.output_cram 254 | File output_cram_index = BamToCram.output_cram_index 255 | File output_cram_md5 = BamToCram.output_cram_md5 256 | 257 | File validate_cram_file_report = BamToCram.validate_cram_file_report 258 | 259 | File output_vcf = BamToGvcf.output_vcf 260 | File output_vcf_index = BamToGvcf.output_vcf_index 261 | } 262 | meta { 263 | allowNestedInputs: true 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/BamProcessing.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Sort BAM file by coordinate order 19 | task SortSam { 20 | input { 21 | File input_bam 22 | String output_bam_basename 23 | Int compression_level 24 | Int additional_disk = 20 25 | Int memory_multiplier = 1 26 | } 27 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 28 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 29 | Float sort_sam_disk_multiplier = 3.25 30 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + additional_disk 31 | 32 | command { 33 | java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -Xmx4900m -jar /mnt/lustre/genomics/tools/picard.jar \ 34 | SortSam \ 35 | INPUT=~{input_bam} \ 36 | OUTPUT=~{output_bam_basename}.bam \ 37 | SORT_ORDER="coordinate" \ 38 | CREATE_INDEX=true \ 39 | CREATE_MD5_FILE=true \ 40 | MAX_RECORDS_IN_RAM=300000 41 | 42 | } 43 | runtime { 44 | cpu: "16" 45 | memory: "5000 MiB" 46 | } 47 | output { 48 | File output_bam = "~{output_bam_basename}.bam" 49 | File output_bam_index = "~{output_bam_basename}.bai" 50 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 51 | } 52 | } 53 | 54 | 55 | # Mark duplicate reads to avoid counting non-independent observations 56 | task MarkDuplicates { 57 | input { 58 | Array[File] input_bams 59 | String output_bam_basename 60 | String metrics_filename 61 | Float total_input_size 62 | Int compression_level 63 | 64 | # The program default for READ_NAME_REGEX is appropriate in nearly every case. 65 | # Sometimes we wish to supply "null" in order to turn off optical duplicate detection 66 | # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing 67 | String? read_name_regex 68 | Int memory_multiplier = 1 69 | Int additional_disk = 20 70 | 71 | Float? 
sorting_collection_size_ratio 72 | } 73 | 74 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 75 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom 76 | Float md_disk_multiplier = 3 77 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + additional_disk 78 | 79 | Float memory_size = 7.5 * memory_multiplier 80 | Int java_memory_size = (ceil(memory_size) - 2) 81 | 82 | # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly 83 | # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 84 | # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" 85 | 86 | command { 87 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /mnt/lustre/genomics/tools/picard.jar \ 88 | MarkDuplicates \ 89 | INPUT=~{sep=' INPUT=' input_bams} \ 90 | OUTPUT=~{output_bam_basename}.bam \ 91 | METRICS_FILE=~{metrics_filename} \ 92 | VALIDATION_STRINGENCY=SILENT \ 93 | ~{"READ_NAME_REGEX=" + read_name_regex} \ 94 | ~{"SORTING_COLLECTION_SIZE_RATIO=" + sorting_collection_size_ratio} \ 95 | OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ 96 | ASSUME_SORT_ORDER="queryname" \ 97 | CLEAR_DT="false" \ 98 | ADD_PG_TAG_TO_READS=false 99 | } 100 | runtime { 101 | cpu: "2" 102 | memory: "~{memory_size} GiB" 103 | } 104 | output { 105 | File output_bam = "~{output_bam_basename}.bam" 106 | File duplicate_metrics = "~{metrics_filename}" 107 | } 108 | } 109 | 110 | # Generate Base Quality Score Recalibration (BQSR) model 111 | task BaseRecalibrator { 112 | input { 113 | File input_bam 114 | File input_bam_index 115 | String recalibration_report_filename 116 | Array[String] sequence_group_interval 117 | File dbsnp_vcf 118 | File dbsnp_vcf_index 119 | Array[File] known_indels_sites_vcfs 120 | Array[File] known_indels_sites_indices 121 | File ref_dict 122 | File ref_fasta 123 | File ref_fasta_index 124 | Int bqsr_scatter 125 | } 126 | 127 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 128 | Float dbsnp_size = size(dbsnp_vcf, "GiB") 129 | Int disk_size = ceil((size(input_bam, "GiB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 130 | 131 | parameter_meta { 132 | input_bam: { 133 | localization_optional: true 134 | } 135 | } 136 | 137 | command { 138 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ 139 | -Xlog:gc*:gc_log.log:time,level,tags \ 140 | -Xloggc:gc_log.log -Xms5g -Xmx6g" \ 141 | BaseRecalibrator \ 142 | -R ~{ref_fasta} \ 143 | -I ~{input_bam} \ 144 | --use-original-qualities \ 145 | -O ~{recalibration_report_filename} \ 146 | --known-sites ~{dbsnp_vcf} \ 147 | --known-sites ~{sep=" -known-sites " known_indels_sites_vcfs} \ 148 | -L ~{sep=" -L " sequence_group_interval} 149 | } 150 | runtime { 151 | cpu: "2" 152 | memory: "6000 MiB" 153 | } 154 | output { 155 | File recalibration_report = "~{recalibration_report_filename}" 156 | } 157 | } 158 | 159 | # Apply Base Quality Score Recalibration (BQSR) model 160 | task ApplyBQSR { 161 | input { 162 | File input_bam 163 | File input_bam_index 164 | String output_bam_basename 165 | File recalibration_report 166 | Array[String] sequence_group_interval 167 | File ref_dict 168 | File ref_fasta 169 | File ref_fasta_index 
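# compression_level and bqsr_scatter below are supplied by the calling workflow; bqsr_scatter is the shard count
# used to scale this task's per-shard disk-size estimate.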
170 | Int compression_level 171 | Int bqsr_scatter 172 | Int memory_multiplier = 1 173 | Int additional_disk = 20 174 | Boolean bin_base_qualities = true 175 | Boolean somatic = false 176 | } 177 | 178 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 179 | Int disk_size = ceil((size(input_bam, "GiB") * 3 / bqsr_scatter) + ref_size) + additional_disk 180 | 181 | Int memory_size = ceil(3500 * memory_multiplier) 182 | Int java_memory_mb = memory_size - 500 183 | 184 | Boolean bin_somatic_base_qualities = bin_base_qualities && somatic 185 | 186 | parameter_meta { 187 | input_bam: { 188 | localization_optional: true 189 | } 190 | } 191 | 192 | command { 193 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-XX:+PrintFlagsFinal \ 194 | -Xlog:gc*:gc_log.log:time,level,tags \ 195 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms3000m -Xmx~{java_memory_mb}m" \ 196 | ApplyBQSR \ 197 | --create-output-bam-md5 \ 198 | --add-output-sam-program-record \ 199 | -R ~{ref_fasta} \ 200 | -I ~{input_bam} \ 201 | --use-original-qualities \ 202 | -O ~{output_bam_basename}.bam \ 203 | -bqsr ~{recalibration_report} \ 204 | ~{true='--static-quantized-quals 10' false='' bin_base_qualities} \ 205 | ~{true='--static-quantized-quals 20' false='' bin_base_qualities} \ 206 | ~{true='--static-quantized-quals 30' false='' bin_base_qualities} \ 207 | ~{true='--static-quantized-quals 40' false='' bin_somatic_base_qualities} \ 208 | ~{true='--static-quantized-quals 50' false='' bin_somatic_base_qualities} \ 209 | -L ~{sep=" -L " sequence_group_interval} 210 | } 211 | runtime { 212 | memory: "~{memory_size} MiB" 213 | cpu: "2" 214 | } 215 | output { 216 | File recalibrated_bam = "~{output_bam_basename}.bam" 217 | File recalibrated_bam_checksum = "~{output_bam_basename}.bam.md5" 218 | } 219 | } 220 | 221 | # Combine multiple recalibration tables from scattered BaseRecalibrator runs 222 | task GatherBqsrReports { 223 | input { 224 | Array[File] input_bqsr_reports 225 | String output_report_filename 226 | } 227 | 228 | command { 229 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms3000m -Xmx3400m" \ 230 | GatherBQSRReports \ 231 | -I ~{sep=' -I ' input_bqsr_reports} \ 232 | -O ~{output_report_filename} 233 | } 234 | runtime { 235 | cpu: "2" 236 | memory: "3500 MiB" 237 | } 238 | output { 239 | File output_bqsr_report = "~{output_report_filename}" 240 | } 241 | } 242 | 243 | # Combine multiple *sorted* BAM files 244 | task GatherSortedBamFiles { 245 | input { 246 | Array[File] input_bams 247 | String output_bam_basename 248 | Float total_input_size 249 | Int compression_level 250 | Int additional_disk = 20 251 | Int memory_multiplier = 1 252 | } 253 | 254 | # Multiply the input bam size by two to account for the input and output 255 | Int disk_size = ceil(2 * total_input_size) + additional_disk 256 | 257 | command { 258 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 259 | GatherBamFiles \ 260 | INPUT=~{sep=' INPUT=' input_bams} \ 261 | OUTPUT=~{output_bam_basename}.bam \ 262 | CREATE_INDEX=true \ 263 | CREATE_MD5_FILE=true 264 | } 265 | runtime { 266 | memory: "3000 MiB" 267 | } 268 | output { 269 | File output_bam = "~{output_bam_basename}.bam" 270 | File output_bam_index = "~{output_bam_basename}.bai" 271 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 272 | } 273 | } 274 | 275 | # Combine multiple *unsorted* BAM files 276 | # Note that 
if/when WDL supports optional outputs, we should merge this task with the sorted version 277 | task GatherUnsortedBamFiles { 278 | input { 279 | Array[File] input_bams 280 | String output_bam_basename 281 | Float total_input_size 282 | Int compression_level 283 | } 284 | 285 | # Multiply the input bam size by two to account for the input and output 286 | Int disk_size = ceil(2 * total_input_size) + 20 287 | 288 | command { 289 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 290 | GatherBamFiles \ 291 | INPUT=~{sep=' INPUT=' input_bams} \ 292 | OUTPUT=~{output_bam_basename}.bam \ 293 | CREATE_INDEX=false \ 294 | CREATE_MD5_FILE=false 295 | } 296 | runtime { 297 | cpu: "2" 298 | memory: "3 GiB" 299 | } 300 | output { 301 | File output_bam = "~{output_bam_basename}.bam" 302 | } 303 | } 304 | 305 | task GenerateSubsettedContaminationResources { 306 | input { 307 | String bait_set_name 308 | File target_interval_list 309 | File contamination_sites_ud 310 | File contamination_sites_bed 311 | File contamination_sites_mu 312 | } 313 | 314 | String output_ud = bait_set_name + "." + basename(contamination_sites_ud) 315 | String output_bed = bait_set_name + "." + basename(contamination_sites_bed) 316 | String output_mu = bait_set_name + "." + basename(contamination_sites_mu) 317 | String target_overlap_counts = "target_overlap_counts.txt" 318 | 319 | command <<< 320 | set -e -o pipefail 321 | 322 | grep -vE "^@" ~{target_interval_list} | 323 | awk -v OFS='\t' '$2=$2-1' | 324 | /app/bedtools intersect -c -a ~{contamination_sites_bed} -b - | 325 | cut -f6 > ~{target_overlap_counts} 326 | 327 | function restrict_to_overlaps() { 328 | # print lines from whole-genome file from loci with non-zero overlap 329 | # with target intervals 330 | WGS_FILE=$1 331 | EXOME_FILE=$2 332 | paste ~{target_overlap_counts} $WGS_FILE | 333 | grep -Ev "^0" | 334 | cut -f 2- > $EXOME_FILE 335 | echo "Generated $EXOME_FILE" 336 | } 337 | 338 | restrict_to_overlaps ~{contamination_sites_ud} ~{output_ud} 339 | restrict_to_overlaps ~{contamination_sites_bed} ~{output_bed} 340 | restrict_to_overlaps ~{contamination_sites_mu} ~{output_mu} 341 | 342 | >>> 343 | runtime { 344 | memory: "3.5 GiB" 345 | } 346 | output { 347 | File subsetted_contamination_ud = output_ud 348 | File subsetted_contamination_bed = output_bed 349 | File subsetted_contamination_mu = output_mu 350 | } 351 | } 352 | 353 | # Notes on the contamination estimate: 354 | # The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId 355 | # 356 | # In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM 357 | # 358 | # Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER 359 | # But first, it is divided by an underestimation factor thusly: 360 | # float(FREEMIX) / ContaminationUnderestimationFactor 361 | # where the denominator is hardcoded in Zamboni: 362 | # val ContaminationUnderestimationFactor = 0.75f 363 | # 364 | # Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted 365 | # contamination estimate for use in variant calling 366 | task CheckContamination { 367 | input { 368 | File input_bam 369 | File input_bam_index 370 | File contamination_sites_ud 371 | File contamination_sites_bed 372 | File contamination_sites_mu 373 | File ref_fasta 374 | File ref_fasta_index 375 | String output_prefix 376 | Float contamination_underestimation_factor 377 | 
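# Worked example of the adjustment described in the notes above (the FREEMIX value is
# illustrative only): with FREEMIX = 0.015 in the .selfSM file and
# contamination_underestimation_factor = 0.75 (the hardcoded Zamboni value), the task
# prints 0.015 / 0.75 = 0.02 to stdout, which becomes the `contamination` output used
# for variant calling, while the unadjusted value remains in the returned selfSM file.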
Boolean disable_sanity_check = false 378 | } 379 | 380 | Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 30 381 | 382 | command <<< 383 | set -e 384 | 385 | # creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. 386 | # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values 387 | /mnt/lustre/genomics/tools/VerifyBamID/bin/VerifyBamID \ 388 | --Verbose \ 389 | --NumPC 4 \ 390 | --Output ~{output_prefix} \ 391 | --BamFile ~{input_bam} \ 392 | --Reference ~{ref_fasta} \ 393 | --UDPath ~{contamination_sites_ud} \ 394 | --MeanPath ~{contamination_sites_mu} \ 395 | --BedPath ~{contamination_sites_bed} \ 396 | ~{true="--DisableSanityCheck" false="" disable_sanity_check} \ 397 | 1>/dev/null 398 | 399 | # used to read from the selfSM file and calculate contamination, which gets printed out 400 | python3 <>> 422 | runtime { 423 | memory: "7.5 GiB" 424 | cpu: "2" 425 | } 426 | output { 427 | File selfSM = "~{output_prefix}.selfSM" 428 | Float contamination = read_float(stdout()) 429 | } 430 | } 431 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Qc.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
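# Note on the tasks in this file: the commands call picard.jar, gatk and related tools
# from /mnt/lustre/genomics/tools rather than the Docker images referenced above, and
# each task computes an Int disk_size that the runtime sections shown here never
# reference, since this cluster-oriented configuration only requests cpu and memory.
# On a cloud backend the same value would typically be surfaced through a disks
# attribute, along these lines (illustrative sketch, not part of this repo):
#
#   runtime {
#     cpu: "2"
#     memory: "3500 MiB"
#     disks: "local-disk ~{disk_size} HDD"
#   }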
17 | 18 | # Collect sequencing yield quality metrics 19 | task CollectQualityYieldMetrics { 20 | input { 21 | File input_bam 22 | String metrics_filename 23 | } 24 | 25 | Int disk_size = ceil(size(input_bam, "GiB")) + 20 26 | 27 | command { 28 | java -Xms2000m -Xmx3400m -jar /mnt/lustre/genomics/tools/picard.jar \ 29 | CollectQualityYieldMetrics \ 30 | INPUT=~{input_bam} \ 31 | OQ=true \ 32 | OUTPUT=~{metrics_filename} 33 | } 34 | runtime { 35 | cpu: "2" 36 | memory: "3500 MiB" 37 | } 38 | output { 39 | File quality_yield_metrics = "~{metrics_filename}" 40 | } 41 | } 42 | 43 | # Collect base quality and insert size metrics 44 | task CollectUnsortedReadgroupBamQualityMetrics { 45 | input { 46 | File input_bam 47 | String output_bam_prefix 48 | } 49 | 50 | Int disk_size = ceil(size(input_bam, "GiB")) + 20 51 | 52 | command { 53 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 54 | CollectMultipleMetrics \ 55 | INPUT=~{input_bam} \ 56 | OUTPUT=~{output_bam_prefix} \ 57 | ASSUME_SORTED=true \ 58 | PROGRAM=null \ 59 | PROGRAM=CollectBaseDistributionByCycle \ 60 | PROGRAM=CollectInsertSizeMetrics \ 61 | PROGRAM=MeanQualityByCycle \ 62 | PROGRAM=QualityScoreDistribution \ 63 | METRIC_ACCUMULATION_LEVEL=null \ 64 | METRIC_ACCUMULATION_LEVEL=ALL_READS 65 | 66 | touch ~{output_bam_prefix}.insert_size_metrics 67 | touch ~{output_bam_prefix}.insert_size_histogram.pdf 68 | } 69 | runtime { 70 | memory: "7000 MiB" 71 | cpu: "2" 72 | } 73 | output { 74 | File base_distribution_by_cycle_pdf = "~{output_bam_prefix}.base_distribution_by_cycle.pdf" 75 | File base_distribution_by_cycle_metrics = "~{output_bam_prefix}.base_distribution_by_cycle_metrics" 76 | File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" 77 | File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" 78 | File quality_by_cycle_pdf = "~{output_bam_prefix}.quality_by_cycle.pdf" 79 | File quality_by_cycle_metrics = "~{output_bam_prefix}.quality_by_cycle_metrics" 80 | File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" 81 | File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" 82 | } 83 | } 84 | 85 | # Collect alignment summary and GC bias quality metrics 86 | task CollectReadgroupBamQualityMetrics { 87 | input { 88 | File input_bam 89 | File input_bam_index 90 | String output_bam_prefix 91 | File ref_dict 92 | File ref_fasta 93 | File ref_fasta_index 94 | Boolean collect_gc_bias_metrics = true 95 | } 96 | 97 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 98 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 99 | 100 | command { 101 | # These are optionally generated, but need to exist for Cromwell's sake 102 | touch ~{output_bam_prefix}.gc_bias.detail_metrics \ 103 | ~{output_bam_prefix}.gc_bias.pdf \ 104 | ~{output_bam_prefix}.gc_bias.summary_metrics 105 | 106 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 107 | CollectMultipleMetrics \ 108 | INPUT=~{input_bam} \ 109 | REFERENCE_SEQUENCE=~{ref_fasta} \ 110 | OUTPUT=~{output_bam_prefix} \ 111 | ASSUME_SORTED=true \ 112 | PROGRAM=null \ 113 | PROGRAM=CollectAlignmentSummaryMetrics \ 114 | ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ 115 | METRIC_ACCUMULATION_LEVEL=null \ 116 | METRIC_ACCUMULATION_LEVEL=READ_GROUP 117 | } 118 | runtime { 119 | cpu: "2" 120 | memory: "7000 MiB" 121 | } 122 | output { 123 | File alignment_summary_metrics = 
"~{output_bam_prefix}.alignment_summary_metrics" 124 | File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" 125 | File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" 126 | File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" 127 | } 128 | } 129 | 130 | # Collect quality metrics from the aggregated bam 131 | task CollectAggregationMetrics { 132 | input { 133 | File input_bam 134 | File input_bam_index 135 | String output_bam_prefix 136 | File ref_dict 137 | File ref_fasta 138 | File ref_fasta_index 139 | Boolean collect_gc_bias_metrics = true 140 | } 141 | 142 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 143 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 144 | 145 | command { 146 | # These are optionally generated, but need to exist for Cromwell's sake 147 | touch ~{output_bam_prefix}.gc_bias.detail_metrics \ 148 | ~{output_bam_prefix}.gc_bias.pdf \ 149 | ~{output_bam_prefix}.gc_bias.summary_metrics \ 150 | ~{output_bam_prefix}.insert_size_metrics \ 151 | ~{output_bam_prefix}.insert_size_histogram.pdf 152 | 153 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 154 | CollectMultipleMetrics \ 155 | INPUT=~{input_bam} \ 156 | REFERENCE_SEQUENCE=~{ref_fasta} \ 157 | OUTPUT=~{output_bam_prefix} \ 158 | ASSUME_SORTED=true \ 159 | PROGRAM=null \ 160 | PROGRAM=CollectAlignmentSummaryMetrics \ 161 | PROGRAM=CollectInsertSizeMetrics \ 162 | PROGRAM=CollectSequencingArtifactMetrics \ 163 | PROGRAM=QualityScoreDistribution \ 164 | ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ 165 | METRIC_ACCUMULATION_LEVEL=null \ 166 | METRIC_ACCUMULATION_LEVEL=SAMPLE \ 167 | METRIC_ACCUMULATION_LEVEL=LIBRARY 168 | } 169 | runtime { 170 | cpu: "2" 171 | memory: "7000 MiB" 172 | } 173 | output { 174 | File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" 175 | File bait_bias_detail_metrics = "~{output_bam_prefix}.bait_bias_detail_metrics" 176 | File bait_bias_summary_metrics = "~{output_bam_prefix}.bait_bias_summary_metrics" 177 | File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" 178 | File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" 179 | File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" 180 | File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" 181 | File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" 182 | File pre_adapter_detail_metrics = "~{output_bam_prefix}.pre_adapter_detail_metrics" 183 | File pre_adapter_summary_metrics = "~{output_bam_prefix}.pre_adapter_summary_metrics" 184 | File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" 185 | File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" 186 | File error_summary_metrics = "~{output_bam_prefix}.error_summary_metrics" 187 | } 188 | } 189 | 190 | task ConvertSequencingArtifactToOxoG { 191 | input { 192 | File pre_adapter_detail_metrics 193 | File bait_bias_detail_metrics 194 | String base_name 195 | File ref_dict 196 | File ref_fasta 197 | File ref_fasta_index 198 | Int memory_multiplier = 1 199 | } 200 | 201 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 202 | Int disk_size = ceil(size(pre_adapter_detail_metrics, "GiB") + size(bait_bias_detail_metrics, "GiB") + ref_size) + 20 203 | 204 | Int memory_size = ceil(4000 * memory_multiplier) 205 | Int 
java_memory_size = memory_size - 1000 206 | Int max_heap = memory_size - 500 207 | 208 | command { 209 | input_base=$(dirname ~{pre_adapter_detail_metrics})/~{base_name} 210 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m \ 211 | -jar /mnt/lustre/genomics/tools/picard.jar \ 212 | ConvertSequencingArtifactToOxoG \ 213 | --INPUT_BASE $input_base \ 214 | --OUTPUT_BASE ~{base_name} \ 215 | --REFERENCE_SEQUENCE ~{ref_fasta} 216 | } 217 | runtime { 218 | memory: "~{memory_size} MiB" 219 | } 220 | output { 221 | File oxog_metrics = "~{base_name}.oxog_metrics" 222 | } 223 | } 224 | 225 | # Check that the fingerprints of separate readgroups all match 226 | task CrossCheckFingerprints { 227 | input { 228 | Array[File] input_bams 229 | Array[File] input_bam_indexes 230 | File haplotype_database_file 231 | String metrics_filename 232 | Float total_input_size 233 | Float lod_threshold 234 | String cross_check_by 235 | } 236 | 237 | Int disk_size = ceil(total_input_size) + 20 238 | 239 | command <<< 240 | java -Dsamjdk.buffer_size=131072 \ 241 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx3400m \ 242 | -jar /mnt/lustre/genomics/tools/picard.jar \ 243 | CrosscheckFingerprints \ 244 | OUTPUT=~{metrics_filename} \ 245 | HAPLOTYPE_MAP=~{haplotype_database_file} \ 246 | EXPECT_ALL_GROUPS_TO_MATCH=true \ 247 | INPUT=~{sep=' INPUT=' input_bams} \ 248 | LOD_THRESHOLD=~{lod_threshold} \ 249 | CROSSCHECK_BY=~{cross_check_by} 250 | >>> 251 | runtime { 252 | cpu: "2" 253 | memory: "3500 MiB" 254 | } 255 | output { 256 | File cross_check_fingerprints_metrics = "~{metrics_filename}" 257 | } 258 | } 259 | 260 | task CheckFingerprintTask { 261 | input { 262 | File? input_bam 263 | File? input_bam_index 264 | File? input_vcf 265 | File? input_vcf_index 266 | String? input_sample_alias 267 | 268 | File genotypes 269 | File? genotypes_index 270 | String expected_sample_alias 271 | 272 | String output_basename 273 | Float genotype_lod_threshold = 5.0 274 | 275 | File haplotype_database_file 276 | File? ref_fasta 277 | File? ref_fasta_index 278 | 279 | Int memory_size = 2500 280 | 281 | Boolean allow_lod_zero = false 282 | } 283 | 284 | Int java_memory_size = memory_size - 1000 285 | Int max_heap = memory_size - 500 286 | 287 | Int disk_size = ceil(size(input_bam, "GiB") + size(input_vcf, "GiB")) + 20 288 | # Picard has different behavior depending on whether or not the OUTPUT parameter ends with a '.', so we are explicitly 289 | # passing in where we want the two metrics files to go to avoid any potential confusion. 
290 | String summary_metrics_location = "~{output_basename}.fingerprinting_summary_metrics" 291 | String detail_metrics_location = "~{output_basename}.fingerprinting_detail_metrics" 292 | 293 | File input_file = select_first([input_vcf, input_bam]) 294 | 295 | command <<< 296 | set -e 297 | java -Xms3g -Xmx3400m -Dpicard.useLegacyParser=false -jar /mnt/lustre/genomics/tools/picard.jar \ 298 | CheckFingerprint \ 299 | --INPUT ~{input_file} \ 300 | ~{if defined(input_vcf) then "--OBSERVED_SAMPLE_ALIAS \"" + input_sample_alias + "\"" else ""} \ 301 | --GENOTYPES ~{genotypes} \ 302 | --EXPECTED_SAMPLE_ALIAS "~{expected_sample_alias}" \ 303 | ~{if defined(input_bam) then "--IGNORE_READ_GROUPS true" else ""} \ 304 | --HAPLOTYPE_MAP ~{haplotype_database_file} \ 305 | --GENOTYPE_LOD_THRESHOLD ~{genotype_lod_threshold} \ 306 | --SUMMARY_OUTPUT ~{summary_metrics_location} \ 307 | --DETAIL_OUTPUT ~{detail_metrics_location} \ 308 | ~{"--REFERENCE_SEQUENCE " + ref_fasta} \ 309 | ~{true='--EXIT_CODE_WHEN_NO_VALID_CHECKS 0' false='' allow_lod_zero} 310 | 311 | CONTENT_LINE=$(cat ~{summary_metrics_location} | 312 | grep -n "## METRICS CLASS\tpicard.analysis.FingerprintingSummaryMetrics" | 313 | cut -f1 -d:) 314 | CONTENT_LINE=$(($CONTENT_LINE+2)) 315 | sed '8q;d' ~{summary_metrics_location} | cut -f5 > lod 316 | >>> 317 | 318 | runtime { 319 | cpu: "2" 320 | memory: "~{memory_size} MiB" 321 | } 322 | 323 | output { 324 | File summary_metrics = summary_metrics_location 325 | File detail_metrics = detail_metrics_location 326 | Float lod = read_float("lod") 327 | } 328 | } 329 | 330 | task CheckPreValidation { 331 | input { 332 | File duplication_metrics 333 | File chimerism_metrics 334 | Float max_duplication_in_reasonable_sample 335 | Float max_chimerism_in_reasonable_sample 336 | } 337 | 338 | command <<< 339 | set -o pipefail 340 | set -e 341 | 342 | grep -A 1 PERCENT_DUPLICATION ~{duplication_metrics} > duplication.csv 343 | grep -A 3 PCT_CHIMERAS ~{chimerism_metrics} | grep -v OF_PAIR > chimerism.csv 344 | 345 | python3 <>> 365 | runtime { 366 | memory: "2 GiB" 367 | } 368 | output { 369 | Float duplication_rate = read_float("duplication_value.txt") 370 | Float chimerism_rate = read_float("chimerism_value.txt") 371 | Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample 372 | } 373 | } 374 | 375 | task ValidateSamFile { 376 | input { 377 | File input_bam 378 | File? input_bam_index 379 | String report_filename 380 | File ref_dict 381 | File ref_fasta 382 | File ref_fasta_index 383 | Int? max_output 384 | Array[String]? ignore 385 | Boolean? 
is_outlier_data 386 | Int memory_multiplier = 1 387 | Int additional_disk = 20 388 | 389 | Int disk_size = ceil(size(input_bam, "GiB") 390 | + size(ref_fasta, "GiB") 391 | + size(ref_fasta_index, "GiB") 392 | + size(ref_dict, "GiB")) + additional_disk 393 | } 394 | 395 | Int memory_size = ceil(16000 * memory_multiplier) 396 | Int java_memory_size = memory_size - 1000 397 | Int max_heap = memory_size - 500 398 | 399 | command { 400 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /mnt/lustre/genomics/tools/picard.jar \ 401 | ValidateSamFile \ 402 | INPUT=~{input_bam} \ 403 | OUTPUT=~{report_filename} \ 404 | REFERENCE_SEQUENCE=~{ref_fasta} \ 405 | ~{"MAX_OUTPUT=" + max_output} \ 406 | IGNORE=~{default="null" sep=" IGNORE=" ignore} \ 407 | MODE=VERBOSE \ 408 | ~{default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ 409 | IS_BISULFITE_SEQUENCED=false 410 | } 411 | runtime { 412 | memory: "~{memory_size} MiB" 413 | cpu: "2" 414 | } 415 | output { 416 | File report = "~{report_filename}" 417 | } 418 | } 419 | 420 | task CollectWgsMetrics { 421 | input { 422 | File input_bam 423 | File input_bam_index 424 | String metrics_filename 425 | File wgs_coverage_interval_list 426 | File ref_fasta 427 | File ref_fasta_index 428 | Int read_length = 250 429 | } 430 | 431 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 432 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 433 | 434 | command { 435 | java -Xms2000m -Xmx3g -jar /mnt/lustre/genomics/tools/picard.jar \ 436 | CollectWgsMetrics \ 437 | INPUT=~{input_bam} \ 438 | VALIDATION_STRINGENCY=SILENT \ 439 | REFERENCE_SEQUENCE=~{ref_fasta} \ 440 | INCLUDE_BQ_HISTOGRAM=true \ 441 | INTERVALS=~{wgs_coverage_interval_list} \ 442 | OUTPUT=~{metrics_filename} \ 443 | USE_FAST_ALGORITHM=true \ 444 | READ_LENGTH=~{read_length} 445 | } 446 | runtime { 447 | cpu: "2" 448 | memory: "3000 MiB" 449 | } 450 | output { 451 | File metrics = "~{metrics_filename}" 452 | } 453 | } 454 | 455 | # Collect raw WGS metrics (commonly used QC thresholds) 456 | task CollectRawWgsMetrics { 457 | input { 458 | File input_bam 459 | File input_bam_index 460 | String metrics_filename 461 | File wgs_coverage_interval_list 462 | File ref_fasta 463 | File ref_fasta_index 464 | Int read_length = 250 465 | Int memory_multiplier = 1 466 | Int additional_disk = 20 467 | } 468 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 469 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk 470 | 471 | Int memory_size = ceil((if (disk_size < 110) then 5 else 7) * memory_multiplier) 472 | String java_memory_size = (memory_size - 1) * 1000 473 | 474 | command { 475 | java -Xms~{java_memory_size}m -Xmx~{memory_size}g -jar /mnt/lustre/genomics/tools/picard.jar \ 476 | CollectRawWgsMetrics \ 477 | INPUT=~{input_bam} \ 478 | VALIDATION_STRINGENCY=SILENT \ 479 | REFERENCE_SEQUENCE=~{ref_fasta} \ 480 | INCLUDE_BQ_HISTOGRAM=true \ 481 | INTERVALS=~{wgs_coverage_interval_list} \ 482 | OUTPUT=~{metrics_filename} \ 483 | USE_FAST_ALGORITHM=true \ 484 | READ_LENGTH=~{read_length} 485 | } 486 | runtime { 487 | cpu: "2" 488 | memory: "~{memory_size} GiB" 489 | } 490 | output { 491 | File metrics = "~{metrics_filename}" 492 | } 493 | } 494 | 495 | task CollectHsMetrics { 496 | input { 497 | File input_bam 498 | File input_bam_index 499 | File ref_fasta 500 | File ref_fasta_index 501 | String metrics_filename 502 | File target_interval_list 503 | File 
bait_interval_list 504 | Int memory_multiplier = 1 505 | Int additional_disk = 20 506 | } 507 | 508 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 509 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk 510 | # Try to fit the input bam into memory, within reason. 511 | Int rounded_bam_size = ceil(size(input_bam, "GiB") + 0.5) 512 | Int rounded_memory_size = ceil((if (rounded_bam_size > 10) then 10 else rounded_bam_size) * memory_multiplier) 513 | Int memory_size = if rounded_memory_size < 7 then 7000 else (rounded_memory_size * 1000) 514 | Int java_memory_size = memory_size - 1000 515 | Int max_heap = memory_size - 500 516 | 517 | # There are probably more metrics we want to generate with this tool 518 | command { 519 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /mnt/lustre/genomics/tools/picard.jar \ 520 | CollectHsMetrics \ 521 | INPUT=~{input_bam} \ 522 | REFERENCE_SEQUENCE=~{ref_fasta} \ 523 | VALIDATION_STRINGENCY=SILENT \ 524 | TARGET_INTERVALS=~{target_interval_list} \ 525 | BAIT_INTERVALS=~{bait_interval_list} \ 526 | METRIC_ACCUMULATION_LEVEL=null \ 527 | METRIC_ACCUMULATION_LEVEL=SAMPLE \ 528 | METRIC_ACCUMULATION_LEVEL=LIBRARY \ 529 | OUTPUT=~{metrics_filename} 530 | } 531 | 532 | runtime { 533 | memory: "~{memory_size} MiB" 534 | } 535 | 536 | output { 537 | File metrics = metrics_filename 538 | } 539 | } 540 | 541 | # Generate a checksum per readgroup 542 | task CalculateReadGroupChecksum { 543 | input { 544 | File input_bam 545 | File input_bam_index 546 | String read_group_md5_filename 547 | } 548 | 549 | Int disk_size = ceil(size(input_bam, "GiB")) + 40 550 | 551 | command { 552 | java -Xms1000m -Xmx1900m -jar /mnt/lustre/genomics/tools/picard.jar \ 553 | CalculateReadGroupChecksum \ 554 | INPUT=~{input_bam} \ 555 | OUTPUT=~{read_group_md5_filename} 556 | } 557 | runtime { 558 | cpu: "2" 559 | memory: "2 GiB" 560 | } 561 | output { 562 | File md5_file = "~{read_group_md5_filename}" 563 | } 564 | } 565 | 566 | # Validate a (g)VCF with -gvcf specific validation 567 | task ValidateVCF { 568 | input { 569 | File input_vcf 570 | File input_vcf_index 571 | File ref_fasta 572 | File ref_fasta_index 573 | File ref_dict 574 | File? dbsnp_vcf 575 | File? dbsnp_vcf_index 576 | File calling_interval_list 577 | File? calling_interval_list_index # if the interval list is a VCF, than an index file is also required 578 | Boolean is_gvcf = true 579 | String? 
extra_args 580 | } 581 | 582 | Boolean calling_intervals_is_vcf = defined(calling_interval_list_index) 583 | String calling_interval_list_basename = basename(calling_interval_list) 584 | String calling_interval_list_index_basename = if calling_intervals_is_vcf then basename(select_first([calling_interval_list_index])) else "" 585 | 586 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 587 | Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 588 | 589 | command { 590 | set -e 591 | 592 | # We can't always assume the index was located with the vcf, so make a link so that the paths look the same 593 | ln -s ~{calling_interval_list} ~{calling_interval_list_basename} 594 | if [ ~{calling_intervals_is_vcf} == "true" ]; then 595 | ln -s ~{calling_interval_list_index} ~{calling_interval_list_index_basename} 596 | fi 597 | 598 | # Note that WGS needs a lot of memory to do the -L *.vcf if an interval file is not supplied 599 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms6000m -Xmx6900m" \ 600 | ValidateVariants \ 601 | -V ~{input_vcf} \ 602 | -R ~{ref_fasta} \ 603 | -L ~{calling_interval_list_basename} \ 604 | ~{true="-gvcf" false="" is_gvcf} \ 605 | --validation-type-to-exclude ALLELES \ 606 | ~{"--dbsnp " + dbsnp_vcf} \ 607 | ~{extra_args} 608 | } 609 | runtime { 610 | cpu: "2" 611 | memory: "7000 MiB" 612 | } 613 | } 614 | 615 | # Collect variant calling metrics from GVCF output 616 | task CollectVariantCallingMetrics { 617 | input { 618 | File input_vcf 619 | File input_vcf_index 620 | String metrics_basename 621 | File dbsnp_vcf 622 | File dbsnp_vcf_index 623 | File ref_dict 624 | File evaluation_interval_list 625 | Boolean is_gvcf = true 626 | } 627 | 628 | Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 629 | 630 | command { 631 | java -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 632 | CollectVariantCallingMetrics \ 633 | INPUT=~{input_vcf} \ 634 | OUTPUT=~{metrics_basename} \ 635 | DBSNP=~{dbsnp_vcf} \ 636 | SEQUENCE_DICTIONARY=~{ref_dict} \ 637 | TARGET_INTERVALS=~{evaluation_interval_list} \ 638 | ~{true="GVCF_INPUT=true" false="" is_gvcf} 639 | } 640 | runtime { 641 | cpu: "2" 642 | memory: "3000 MiB" 643 | } 644 | output { 645 | File summary_metrics = "~{metrics_basename}.variant_calling_summary_metrics" 646 | File detail_metrics = "~{metrics_basename}.variant_calling_detail_metrics" 647 | } 648 | } 649 | --------------------------------------------------------------------------------
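# Illustrative sketch (not part of the repository dump above): how the Qc.wdl tasks are
# typically wired into the top-level workflow. The import alias, call, and file naming
# below are assumptions for illustration and are not copied from this repo's
# WholeGenomeGermlineSingleSample.wdl.
version 1.0

import "Qc.wdl" as QC

workflow QcUsageSketch {
  input {
    File input_bam
    String sample_name
  }

  # CollectQualityYieldMetrics needs only the BAM and a metrics filename,
  # matching that task's input block in Qc.wdl above.
  call QC.CollectQualityYieldMetrics {
    input:
      input_bam = input_bam,
      metrics_filename = sample_name + ".quality_yield_metrics"
  }

  output {
    File quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics
  }
}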