├── 20k_Tutorial_Docker ├── 20k_WF_ID.txt ├── 20k_submission_response.txt ├── step05_Single_Sample_20k_Workflow_Output.sh ├── step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh ├── step03_Cromwell_Run_20k_Single_Sample_Workflow.sh ├── step02_Download_Single_Sample_20k_Data.sh └── 16T_PairedSingleSampleWf_optimized.inputs.20k.json ├── images ├── Layer-Cake.png ├── SW-Arch-Diagram.png ├── Pipeline-Overview.png └── Directory-Quick-Reference.png ├── .gitignore ├── 20k_Tutorial ├── step05_Single_Sample_20k_Workflow_Output.sh ├── step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh ├── step03_Cromwell_Run_20k_Single_Sample_Workflow.sh ├── step02_Download_Single_Sample_20k_Data.sh └── 16T_PairedSingleSampleWf_optimized.inputs.20k.json ├── SECURITY.md ├── config_files ├── SLURM │ ├── slurmdbd.conf │ └── slurm.conf └── HTCondor │ ├── condor_config.app-node │ └── condor_config.comp-node ├── LICENSE ├── 20k_Throughput-run ├── configure ├── WDL │ ├── DNASeqStructs.wdl │ ├── BamToCram.wdl │ ├── DragenTasks.wdl │ ├── SplitLargeReadGroup.wdl │ ├── DragmapAlignment.wdl │ ├── AggregatedBamQC.wdl │ ├── Alignment.wdl │ ├── VariantCalling.wdl │ ├── Utilities.wdl │ ├── GermlineVariantDiscovery.wdl │ ├── UnmappedBamToAlignedBam.wdl │ ├── BamProcessing.wdl │ └── Qc.wdl ├── step01_Configure_20k_Throughput-run.sh ├── step04_Cromwell_Monitor_20k_Throughput-run.sh ├── step03_Cromwell_Run_20k_Throughput-run.sh ├── step05_Output_20k_Throughput-run.sh ├── README.md ├── WholeGenomeGermlineSingleSample_20k.json ├── step02_Download_20k_Data_Throughput-run.sh └── WholeGenomeGermlineSingleSample.wdl └── README.md /20k_Tutorial_Docker/20k_WF_ID.txt: -------------------------------------------------------------------------------- 1 | cd21d43f-302e-484f-8025-a5ebded0e6e5 2 | -------------------------------------------------------------------------------- /images/Layer-Cake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Layer-Cake.png -------------------------------------------------------------------------------- /images/SW-Arch-Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/SW-Arch-Diagram.png -------------------------------------------------------------------------------- /20k_Tutorial_Docker/20k_submission_response.txt: -------------------------------------------------------------------------------- 1 | {"id":"cd21d43f-302e-484f-8025-a5ebded0e6e5","status":"Submitted"} -------------------------------------------------------------------------------- /images/Pipeline-Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Pipeline-Overview.png -------------------------------------------------------------------------------- /images/Directory-Quick-Reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intel-HLS/BIGstack/HEAD/images/Directory-Quick-Reference.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 20k_Throughput-run/20k_WF_ID* 2 | *.zip 3 | 20k_Throughput-run/20k_submission_response.txt 4 | 20k_Throughput-run/JSON 5 | 20k_Throughput-run/cromwell-monitor 6 | 20k_Throughput-run/cromwell* 7 | 
20k_Throughput-run/data 8 | -------------------------------------------------------------------------------- /20k_Tutorial/step05_Single_Sample_20k_Workflow_Output.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ROOT_PATH="/cluster_share" 3 | echo "Go to $ROOT_PATH/cromwell-executions/SingleSample20k/$(cat 20k_WF_ID.txt) to view the output of workflow instance $(cat 20k_WF_ID.txt)" 4 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step05_Single_Sample_20k_Workflow_Output.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ROOT_PATH="/cluster_share" 3 | echo "Go to $ROOT_PATH/cromwell-executions/SingleSample20k/$(cat 20k_WF_ID.txt) to view the output of workflow instance $(cat 20k_WF_ID.txt)" 4 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Intel® Select Solutions for Genomics Analytics Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /20k_Tutorial/step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" \ 6 | #| awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 7 | #Contact your admin to obtain Cromwell port - 8000 by default 8 | 9 | curl -vXGET $CROMWELL_HOST:8000/api/workflows/v1/$(cat 20k_WF_ID.txt)/status 10 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step04_Cromwell_Monitor_Single_Sample_20k_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" \ 6 | #| awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 7 | #Contact your admin to obtain Cromwell port - 8000 by default 8 | 9 | curl -vXGET $CROMWELL_HOST:8000/api/workflows/v1/$(cat 20k_WF_ID.txt)/status 10 | -------------------------------------------------------------------------------- /20k_Tutorial/step03_Cromwell_Run_20k_Single_Sample_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" | awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 6 | #Cromwell port is 8000 by default. 
Contact your admin if port is different 7 | 8 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@PairedSingleSampleWf_noqc_nocram_optimized.wdl \ 9 | -F workflowInputs=@16T_PairedSingleSampleWf_optimized.inputs.20k.json > 20k_submission_response.txt 10 | 11 | cat 20k_submission_response.txt | grep -o -E "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" \ 12 | > 20k_WF_ID.txt 13 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step03_Cromwell_Run_20k_Single_Sample_Workflow.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export http_proxy= 3 | export https_proxy= 4 | CROMWELL_HOST=$HOSTNAME 5 | #CROMWELL_PORT=$(lsof -Pan -p $(ps waux | grep cromwell | grep "java -jar" | awk '{print $2}') -i | grep -i listen | awk '{print $9}' | grep -E -o "[0-9]{4}") 6 | #Cromwell port is 8000 by default. Contact your admin if port is different 7 | 8 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@PairedSingleSampleWf_noqc_nocram_optimized.wdl \ 9 | -F workflowInputs=@16T_PairedSingleSampleWf_optimized.inputs.20k.json > 20k_submission_response.txt 10 | 11 | cat 20k_submission_response.txt | grep -o -E "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" \ 12 | > 20k_WF_ID.txt 13 | -------------------------------------------------------------------------------- /config_files/SLURM/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Example slurmdbd.conf file. 3 | # 4 | # See the slurmdbd.conf man page for more information. 5 | # 6 | # Archive info 7 | #ArchiveJobs=yes 8 | #ArchiveDir="/tmp" 9 | #ArchiveSteps=yes 10 | #ArchiveScript= 11 | #JobPurge=12 12 | #StepPurge=1 13 | # 14 | # Authentication info 15 | AuthType=auth/munge 16 | #AuthInfo=/var/run/munge/munge.socket.2 17 | # 18 | # slurmDBD info 19 | DbdAddr=localhost 20 | DbdHost=localhost 21 | #DbdPort=7031 22 | SlurmUser=slurm 23 | #MessageTimeout=300 24 | DebugLevel=4 25 | #DefaultQOS=normal,standby 26 | LogFile=/var/log/slurmdbd.log 27 | PidFile=/var/run/slurmdbd.pid 28 | #PluginDir=/usr/lib/slurm 29 | #PrivateData=accounts,users,usage,jobs 30 | #TrackWCKey=yes 31 | # 32 | # Database info 33 | StorageType=accounting_storage/mysql 34 | #StorageHost=localhost 35 | #StoragePort=1234 36 | StoragePass=password 37 | StorageUser=slurm 38 | #StorageLoc=slurm_acct_db 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2018 Intel Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /20k_Throughput-run/configure: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | export http_proxy=http://proxy-jf.intel.com:911 15 | export https_proxy=http://proxy-jf.intel.com:912 16 | export no_proxy="localhost,intel.com" 17 | CROMWELL_HOST=$HOSTNAME 18 | 19 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 20 | GENOMICS_PATH="/mnt/lustre/genomics" 21 | CROMWELL_PATH="/fastdata/02/genomics/cromwell" 22 | 23 | #specify the path to tools directory.By default,script expects tools to be in the following path 24 | TOOLS_PATH="$GENOMICS_PATH/tools" 25 | 26 | #specify the path to data download directory.By default, data is downloaded to current folder 27 | DATA_PATH="$BASEDIR/data" 28 | 29 | #Enter the number of workflow for throughput run 30 | NUM_WORKFLOW=16 31 | 32 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 33 | 34 | # sudo yum install -y R 35 | # sudo yum install -y jq 36 | # Create generic symlinks for tools e.g. : 37 | # for tool in bwa samtools gatk; do export tool_version=`ls $GENOMICS_PATH/tools | grep ${tool}- | head -n1` && echo ${tool_version} && ln -sfn $GENOMICS_PATH/tools/$tool_version $GENOMICS_PATH/tools/$tool; done; 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PROJECT NOT UNDER ACTIVE MANAGEMENT 2 | 3 | This project will no longer be maintained by Intel. 4 | 5 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 6 | 7 | Intel no longer accepts patches to this project. 8 | 9 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 
10 | 11 | Contact: webadmin@linux.intel.com 12 | # Intel® Select Solutions for Genomics Analytics 13 | 14 | [Intel® Select Solutions for Genomics Analytics](https://www.intel.com/content/dam/www/public/us/en/documents/product-briefs/select-solutions-for-genomics-analytics-brief-v2.pdf) 15 | is an end-to-end, optimized hardware and software solution for analyzing 16 | genomic data. It provides a way to run pre-packaged, optimized workflows, including the Genome Analysis Toolkit* 17 | (GATK*) Best Practices workflows from the Broad Institute. 18 | 19 | This repo contains a simple smoketest benchmark for HPC clusters ("20k Throughput Run"). The test ensures your HPC system is configured correctly to run whole genome and whole exome samples. 20 | 21 | For an overview on how to set up an HPC cluster for running GATK, see the [Broad documentation here](https://gatk.broadinstitute.org/hc/en-us/articles/360035530872). An overview of the Intel Solution, including a HW reference design, can be found [here](https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/genomics-analytics-v2.html). 22 | 23 | For detailed, line-by-line instructions on how to configure an HPC system for running genomics workflows, please contact your Intel representative. 24 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DNASeqStructs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | struct SampleAndUnmappedBams { 4 | String base_file_name 5 | String? final_gvcf_base_name 6 | Array[File] flowcell_unmapped_bams 7 | String sample_name 8 | String unmapped_bam_suffix 9 | } 10 | 11 | struct ReferenceFasta { 12 | File ref_dict 13 | File ref_fasta 14 | File ref_fasta_index 15 | File ref_alt 16 | File ref_sa 17 | File ref_amb 18 | File ref_bwt 19 | File ref_ann 20 | File ref_pac 21 | File? ref_str 22 | } 23 | 24 | struct DragmapReference { 25 | File reference_bin 26 | File hash_table_cfg_bin 27 | File hash_table_cmp 28 | } 29 | 30 | struct DNASeqSingleSampleReferences { 31 | File contamination_sites_ud 32 | File contamination_sites_bed 33 | File contamination_sites_mu 34 | File calling_interval_list 35 | 36 | ReferenceFasta reference_fasta 37 | 38 | Array[File] known_indels_sites_vcfs 39 | Array[File] known_indels_sites_indices 40 | 41 | File dbsnp_vcf 42 | File dbsnp_vcf_index 43 | 44 | File evaluation_interval_list 45 | 46 | File haplotype_database_file 47 | } 48 | 49 | struct VariantCallingScatterSettings { 50 | Int haplotype_scatter_count 51 | Int break_bands_at_multiples_of 52 | } 53 | 54 | struct ExomeGermlineSingleSampleOligos { 55 | File target_interval_list 56 | File bait_interval_list 57 | String bait_set_name 58 | } 59 | 60 | struct CrossSpeciesContaminationReferences { 61 | File filter_bwa_image 62 | File kmer_file 63 | File meats_bwa_image 64 | File meats_fasta 65 | File meats_fasta_dict 66 | File meats_taxonomy_file 67 | File microbe_bwa_image 68 | File microbe_fasta 69 | File microbe_fasta_dict 70 | File microbe_taxonomy_file 71 | File normalization_file 72 | File metrics_script_file 73 | Float score_min_identity 74 | Int reads_after_downsampling 75 | } 76 | 77 | struct PapiSettings { 78 | } 79 | 80 | -------------------------------------------------------------------------------- /20k_Throughput-run/step01_Configure_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | ###########Editing JSON file############# 15 | source ./configure 16 | 17 | #datapath is the existing path specified in the WholeGenomeGermlineSingleSample_20k.json.Do not edit this path 18 | datapath=/mnt/lustre/genomics/data 19 | #toolspath is the existing path specified in the WholeGenomeGermlineSingleSample_20k.json.Do not edit this path 20 | toolspath=/mnt/lustre/genomics/tools 21 | 22 | mkdir -p $BASEDIR/JSON 23 | cd $BASEDIR/JSON 24 | cp $BASEDIR/WholeGenomeGermlineSingleSample_20k.json $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 25 | 26 | newdatapath=${DATA_PATH} 27 | newtoolspath=${TOOLS_PATH} 28 | 29 | #pointing the correct data path to wdl 30 | sed -i "s%$datapath%$newdatapath%g" $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 31 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 32 | 33 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/WDL/WholeGenomeGermlineSingleSample.wdl 34 | sed -i "s%$toolspath%$newtoolspath%g" $BASEDIR/WDL/*.wdl 35 | 36 | FILE="$BASEDIR/WDL/warp.zip" 37 | 38 | echo "Creating zip file for WDLS "$FILE" " 39 | zip -j $BASEDIR/WDL/warp.zip $BASEDIR/WDL/*.wdl 40 | 41 | -------------------------------------------------------------------------------- /20k_Throughput-run/step04_Cromwell_Monitor_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | source ./configure 15 | 16 | curl -s $CROMWELL_HOST:8000/api/workflows/v1/query 2>/dev/null | json_pp>cromwell_stop 17 | 18 | start_date=`cat cromwell_start_date` 19 | count=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Running&includeSubworkflows=false" | jq '.totalResultsCount'` 20 | finish=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Succeeded&includeSubworkflows=false" | jq '.totalResultsCount'` 21 | failed=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Failed&includeSubworkflows=false" | jq '.totalResultsCount'` 22 | echo running: $count finished: $finish failed: $failed 23 | 24 | echo "-----------------------------" 25 | echo ' workflow_id | status | start | end | name | parent_workflow_id' 26 | for WFID in `cat $BASEDIR/20k_WF_ID/*`; do 27 | echo "-----------------------------" 28 | curl -sXGET $CROMWELL_HOST:8000/api/workflows/v1/query?status={Submitted,Running,Aborting,Failed,Succeeded,Aborted} | jq ' .results | [.|= sort_by(.start)] | .[] | .[] | ( .id + " | " + .status + " | " + .start + " | "+ .end +" | " + .name + " | " + .rootWorkflowId )' | grep $WFID | tr '"' '|' 29 | done; 30 | -------------------------------------------------------------------------------- /20k_Throughput-run/step03_Cromwell_Run_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | 14 | source ./configure 15 | 16 | WDL=$BASEDIR/WholeGenomeGermlineSingleSample.wdl 17 | JSON=$BASEDIR/JSON/WholeGenomeGermlineSingleSample_20k.json 18 | 19 | limit=$NUM_WORKFLOW 20 | 21 | export DATE_WITH_TIME=`date "+%Y%m%d:%H-%M-%S"` 22 | mkdir "20k_WF_ID-"$DATE_WITH_TIME"" 23 | mkdir "cromwell-status-"$DATE_WITH_TIME"" 24 | #remove the temporary directories from previous runs. 
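# Optional sanity check (a sketch, assuming the Cromwell server referenced by ./configure is already up):
# probe the REST API before submitting, e.g.
#   curl -s -o /dev/null -w "%{http_code}\n" http://$CROMWELL_HOST:8000/engine/v1/version
# which should print 200. The directories below are recreated on each run so that
# workflow IDs left over from a previous submission are not picked up by the monitoring and output steps.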
25 | rm -rf cromwell-monitor 26 | rm -rf 20k_WF_ID 27 | #creating new temporary directories for monitoring and output results 28 | mkdir cromwell-monitor 29 | mkdir 20k_WF_ID 30 | 31 | curl localhost:8000/api/workflows/v1/query 2>/dev/null | json_pp>"cromwell-status-"$DATE_WITH_TIME""/cromwell_start 32 | cp "cromwell-status-"$DATE_WITH_TIME""/cromwell_start cromwell-monitor 33 | 34 | date -u +"%Y-%m-%dT%H:%M:%S.000Z"> cromwell_start_date 35 | echo Start time is `date` : `date +"%H:%M:%S"` 36 | 37 | 38 | for i in $(seq $limit) 39 | do 40 | echo $i 41 | curl -vXPOST http://$CROMWELL_HOST:8000/api/workflows/v1 -F workflowSource=@${WDL} -F workflowInputs=@${JSON} -F workflowDependencies=@$BASEDIR/WDL/warp.zip > 20k_submission_response.txt 42 | cat 20k_submission_response.txt | cut -d '"' -f4 >"20k_WF_ID-"$DATE_WITH_TIME""/20k_WF_ID_${i}.txt 43 | cp "20k_WF_ID-"$DATE_WITH_TIME""/20k_WF_ID_* 20k_WF_ID 44 | done 45 | 46 | 47 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/BamToCram.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "Utilities.wdl" as Utils 4 | import "Qc.wdl" as QC 5 | 6 | workflow BamToCram { 7 | 8 | input { 9 | File input_bam 10 | File ref_fasta 11 | File ref_fasta_index 12 | File ref_dict 13 | File duplication_metrics 14 | File chimerism_metrics 15 | String base_file_name 16 | } 17 | 18 | 19 | # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation 20 | # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 21 | Float max_duplication_in_reasonable_sample = 0.30 22 | Float max_chimerism_in_reasonable_sample = 0.15 23 | 24 | # Convert the final merged recalibrated BAM file to CRAM format 25 | call Utils.ConvertToCram as ConvertToCram { 26 | input: 27 | input_bam = input_bam, 28 | ref_fasta = ref_fasta, 29 | ref_fasta_index = ref_fasta_index, 30 | output_basename = base_file_name, 31 | } 32 | 33 | # Check whether the data has massively high duplication or chimerism rates 34 | call QC.CheckPreValidation as CheckPreValidation { 35 | input: 36 | duplication_metrics = duplication_metrics, 37 | chimerism_metrics = chimerism_metrics, 38 | max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, 39 | max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, 40 | } 41 | 42 | # Validate the CRAM file 43 | call QC.ValidateSamFile as ValidateCram { 44 | input: 45 | input_bam = ConvertToCram.output_cram, 46 | input_bam_index = ConvertToCram.output_cram_index, 47 | report_filename = base_file_name + ".cram.validation_report", 48 | ref_dict = ref_dict, 49 | ref_fasta = ref_fasta, 50 | ref_fasta_index = ref_fasta_index, 51 | ignore = ["MISSING_TAG_NM"], 52 | max_output = 1000000000, 53 | is_outlier_data = CheckPreValidation.is_outlier_data, 54 | } 55 | 56 | output { 57 | File output_cram = ConvertToCram.output_cram 58 | File output_cram_index = ConvertToCram.output_cram_index 59 | File output_cram_md5 = ConvertToCram.output_cram_md5 60 | File validate_cram_file_report = ValidateCram.report 61 | } 62 | meta { 63 | allowNestedInputs: true 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /config_files/SLURM/slurm.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Example slurm.conf file. 
Please run configurator.html 3 | # (in doc/html) to build a configuration file customized 4 | # for your environment. 5 | # 6 | # 7 | # slurm.conf file generated by configurator.html. 8 | # 9 | # See the slurm.conf man page for more information. 10 | # 11 | ClusterName=linux 12 | ControlMachine=slurm-0-0 13 | #ControlAddr= 14 | #BackupController= 15 | #BackupAddr= 16 | # 17 | SlurmUser=slurm 18 | #SlurmdUser=root 19 | SlurmctldPort=6817 20 | SlurmdPort=6818 21 | AuthType=auth/munge 22 | CryptoType=crypto/munge 23 | #JobCredentialPrivateKey= 24 | #JobCredentialPublicCertificate= 25 | StateSaveLocation=/var/spool/slurm/ctld 26 | SlurmdSpoolDir=/var/spool/slurm/d 27 | SwitchType=switch/none 28 | MpiDefault=none 29 | SlurmctldPidFile=/var/run/slurmctld.pid 30 | SlurmdPidFile=/var/run/slurmd.pid 31 | ProctrackType=proctrack/pgid 32 | #PluginDir= 33 | #FirstJobId= 34 | ReturnToService=0 35 | #MaxJobCount= 36 | #PlugStackConfig= 37 | #PropagatePrioProcess= 38 | #PropagateResourceLimits= 39 | #PropagateResourceLimitsExcept= 40 | #Prolog= 41 | #Epilog= 42 | #SrunProlog= 43 | #SrunEpilog= 44 | #TaskProlog= 45 | #TaskEpilog= 46 | #TaskPlugin= 47 | #TrackWCKey=no 48 | #TreeWidth=50 49 | #TmpFS= 50 | #UsePAM= 51 | # 52 | # TIMERS 53 | SlurmctldTimeout=300 54 | SlurmdTimeout=300 55 | InactiveLimit=0 56 | MinJobAge=300 57 | KillWait=30 58 | Waittime=0 59 | # 60 | # SCHEDULING 61 | SchedulerType=sched/backfill 62 | #SchedulerAuth= 63 | #SelectType=select/linear 64 | FastSchedule=1 65 | #PriorityType=priority/multifactor 66 | #PriorityDecayHalfLife=14-0 67 | #PriorityUsageResetPeriod=14-0 68 | #PriorityWeightFairshare=100000 69 | #PriorityWeightAge=1000 70 | #PriorityWeightPartition=10000 71 | #PriorityWeightJobSize=1000 72 | #PriorityMaxAge=1-0 73 | # 74 | # LOGGING 75 | SlurmctldDebug=3 76 | SlurmctldLogFile=/var/log/slurmctld.log 77 | SlurmdDebug=3 78 | SlurmdLogFile=/var/log/slurmd.log 79 | JobCompType=jobcomp/none 80 | #JobCompLoc= 81 | # 82 | # ACCOUNTING 83 | #JobAcctGatherType=jobacct_gather/linux 84 | #JobAcctGatherFrequency=30 85 | # 86 | #AccountingStorageType=accounting_storage/slurmdbd 87 | #AccountingStorageHost= 88 | #AccountingStorageLoc= 89 | #AccountingStoragePass= 90 | #AccountingStorageUser= 91 | # 92 | # COMPUTE NODES 93 | NodeName=slurm-0-0 Sockets=2 CoresPerSocket=22 ThreadsPerCore=1 Procs=44 RealMemory=257671 State=IDLE 94 | PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP 95 | -------------------------------------------------------------------------------- /20k_Throughput-run/step05_Output_20k_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | # 13 | source ./configure 14 | 15 | curl localhost:8000/api/workflows/v1/query 2>/dev/null | json_pp>cromwell_stop 16 | 17 | start_date=`cat cromwell_start_date` 18 | count=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Running&includeSubworkflows=false" | jq '.totalResultsCount'` 19 | finish=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Succeeded&includeSubworkflows=false" | jq '.totalResultsCount'` 20 | failed=`curl -sGET "$CROMWELL_HOST:8000/api/workflows/v1/query?start=$start_date&status=Failed&includeSubworkflows=false" | jq '.totalResultsCount'` 21 | echo running: $count finished: $finish failed: $failed 22 | #grep suceeded runs to cromwell-times to calculate elapse time: 23 | 24 | sh step04_Cromwell_Monitor_20k_Throughput-run.sh | grep WholeGenomeGermlineSingleSample | sort>cromwell-times 25 | s=`cat cromwell-times | cut -d '|' -f4 | sort | head -1` 26 | e=`cat cromwell-times | cut -d '|' -f5 | sort | tail -n 1 ` 27 | 28 | if [ $count -gt 0 ] 29 | then 30 | echo workflow still in progress 31 | exit 32 | fi 33 | 34 | #echo $s $e 35 | 36 | s=`echo $s | tr 'T' ' ' | tr 'Z' '\n'` 37 | e=`echo $e | tr 'T' ' ' | tr 'Z' '\n'` 38 | #echo $s $e 39 | 40 | s=`date -d "$s" +%s` 41 | e=`date -d "$e" +%s` 42 | 43 | sec=`expr $e - $s` 44 | min=$(($sec / 60)) 45 | minsec=$(($sec % 60)) 46 | 47 | printf "Total Elapsed Time for $NUM_WORKFLOW workflows: $min minutes:%2d seconds \n " $minsec 48 | 49 | ########## Average elapse time taken for Mark Duplicates############# 50 | sum=0 51 | limit=$NUM_WORKFLOW 52 | 53 | for i in `cat 20k_WF_ID/20k_WF_ID_*`; 54 | do 55 | 56 | data=`grep "Elapsed time: " $CROMWELL_PATH/cromwell-slurm-exec/WholeGenomeGermlineSingleSample/$i/call-*/*/*/call-MarkDuplicates/execution/stderr | cut -d ':' -f 4 | cut -d " " -f 2` 57 | 58 | x=`echo $data | cut -d '.' -f 1` 59 | y=`echo $data | cut -d '.' -f 2` 60 | let "z= 10#$x*100 + 10#$y" 61 | 62 | let "sum= 10#$sum + 10#$z" 63 | 64 | done 65 | 66 | let "avg = sum / $limit" 67 | let "x = $avg / 100" 68 | let "y = $avg % 100" 69 | printf "Average Elapsed Time for Mark Duplicates: $x.%02d minutes\n" $y 70 | 71 | 72 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DragenTasks.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2021 4 | ## 5 | ## This WDL defines tasks to use Dragen's DRAGstr approach to STR sequencing artifacts 6 | ## Indel genotype priors in the DRAGEN-Gatk pipeline. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | task CalibrateDragstrModel { 20 | input { 21 | File ref_fasta 22 | File ref_fasta_idx 23 | File ref_dict 24 | File str_table_file 25 | File alignment ## can handle cram or bam. 
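# index for the alignment above (typically .bai for a BAM or .crai for a CRAM)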
26 | File alignment_index 27 | Int threads = 4 28 | Int? memory_mb 29 | Boolean use_ssd = true 30 | } 31 | 32 | # If CRAM, restrict threads to a maximum of 4 33 | Boolean is_cram = sub(alignment, "\\.cram$", "") != "" + alignment 34 | Int java_threads = if (threads < 1 ) then 1 35 | else if (is_cram && threads > 4) then 4 # more than 4 threads in cram is probrably contra-productive. 36 | else threads 37 | 38 | String base_name = basename(alignment) 39 | String out_file_name = base_name + ".dragstr" 40 | Int disk_size_gb = ceil(size([ref_fasta, ref_fasta_idx, ref_dict, alignment, alignment_index, str_table_file], "GiB")) + 41 | 40 # 40 for the rest of the fs. 42 | 43 | String parallel_args = if (java_threads <= 1) then "" else "--threads " + java_threads 44 | 45 | # If the input is a CRAM we need an additional 500MB of memory per thread 46 | Int recommended_memory_mb = ceil(2000 + (if (is_cram) then 500 else 100) * java_threads) 47 | Int selected_memory_mb = select_first([memory_mb, recommended_memory_mb]) 48 | Int runtime_memory_mb = if (selected_memory_mb < 1500) then 1500 else selected_memory_mb 49 | Int java_memory_mb = if (runtime_memory_mb < 2000) then 1000 else runtime_memory_mb - 1000 50 | 51 | command <<< 52 | set -x 53 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx~{java_memory_mb}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true -Dsamjdk.reference_fasta=~{ref_fasta}" \ 54 | CalibrateDragstrModel \ 55 | -R ~{ref_fasta} \ 56 | -I ~{alignment} \ 57 | -str ~{str_table_file} \ 58 | -O ~{out_file_name} \ 59 | ~{parallel_args} 60 | 61 | >>> 62 | 63 | runtime { 64 | memory: runtime_memory_mb + " MiB" 65 | cpu: java_threads 66 | } 67 | 68 | output { 69 | File dragstr_model = "~{out_file_name}" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /20k_Throughput-run/README.md: -------------------------------------------------------------------------------- 1 | # 20k Single Sample Workflow 2 | To submit, monitor, and receive output from these workflows, follow these steps: 3 | 4 | ## Prerequisites 5 | 6 | | Genomics Tools | Version | 7 | | :---: | --- | 8 | | **WARP** | v3.1.6 | 9 | | **GATK** | 4.2.6.1 | 10 | | **bwa** | 0.7.17 | 11 | | **cromwell** | 84 | 12 | | **samtools** | 1.11 | 13 | | **picard** | 2.27.4 | 14 | | **VerifyBamID2** | 2.0.1 | 15 | | **java** | java-11-openjdk - Cromwell
java-1.8.0-openjdk - GATK | 16 | 17 | Please refer to [WARP Requirement](https://broadinstitute.github.io/warp/docs/Pipelines/Whole_Genome_Germline_Single_Sample_Pipeline/README#software-version-requirements) for more details. 18 | 19 | ## 1. Clone the repository 20 | To clone the repository, run these commands: 21 | 22 | git clone https://github.com/Intel-HLS/BIGstack.git 23 | 24 | cd 20k_Throughput-run 25 | 26 | ## 2. Configure and set up environment variables 27 | Edit the configure file to set the work-directory paths (GENOMICS_PATH, TOOLS_PATH, DATA_PATH) and the number of workflows to submit (NUM_WORKFLOW): 28 | 29 | ./configure 30 | 31 | ## 3. Configure JSON file 32 | 33 | ./step01_Configure_20k_Throughput-run.sh 34 | 35 | This step updates the tool and dataset paths in WholeGenomeGermlineSingleSample_20k.json and zips the WDL files into WDL/warp.zip. 36 | 37 | ## 4. Download datasets 38 | This step downloads the dataset into the 'data' directory under the working directory. 39 | 40 | ./step02_Download_20k_Data_Throughput-run.sh 41 | 42 | ## 5. Run the 20k Throughput workflow 43 | Submit the workflows to the Cromwell workflow engine using this script: 44 | 45 | ./step03_Cromwell_Run_20k_Throughput-run.sh 46 | 47 | After running this script, the HTTP response and workflow submission information are written to 20k_submission_response.txt in the working directory. Additionally, the workflow identifiers for the throughput run (for example: "id": "6ec0643c-1ea1-42bf-b60c-507cd1e3e96c") are written to files under 20k_WF_ID-<timestamp>/ and copied into 20k_WF_ID/, which are used by steps 6 and 7. 48 | 49 | ## 6. Monitor the workflow status - Running, Succeeded, Failed 50 | To monitor the 20k throughput-run workflows, execute: 51 | 52 | ./step04_Cromwell_Monitor_20k_Throughput-run.sh 53 | 54 | ## 7. View the 20k Throughput-run output 55 | This prints the total elapsed time and the average MarkDuplicates elapsed time. 56 | 57 | ./step05_Output_20k_Throughput-run.sh 58 | 59 | # Troubleshooting 60 | 61 | ## Install dependencies for Steps 3-5: 62 | sudo yum install R -y 63 | 64 | sudo yum install jq -y 65 | 66 | Make sure python2 and python3 are installed and symlinks are created. 67 | 68 | ## Create generic symlinks for tools to the latest/desired version - by default, tool paths in the WDL files use generic symlinks: 69 | for tool in bwa samtools gatk; 70 | do 71 | export tool_version=`ls $GENOMICS_PATH/tools | grep ${tool}- | head -n1` && echo ${tool_version} && ln -sfn $GENOMICS_PATH/tools/$tool_version $GENOMICS_PATH/tools/$tool; 72 | 73 | done; 74 | 75 | ## Java version 76 | 77 | Use Java 11 to compile and run Cromwell, but switch to Java 8 as the default to run the workflows. 78 | 79 | ``` 80 | sudo alternatives --config java 81 | ``` -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/SplitLargeReadGroup.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements a split of large readgroups for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses.
Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "Alignment.wdl" as Alignment 19 | import "DragmapAlignment.wdl" as DragmapAlignment 20 | import "BamProcessing.wdl" as Processing 21 | import "Utilities.wdl" as Utils 22 | import "DNASeqStructs.wdl" as Structs 23 | 24 | workflow SplitLargeReadGroup { 25 | 26 | input { 27 | File input_bam 28 | 29 | String bwa_commandline 30 | String output_bam_basename 31 | 32 | # reference_fasta.ref_alt is the .alt file from bwa-kit 33 | # (https://github.com/lh3/bwa/tree/master/bwakit), 34 | # listing the reference contigs that are "alternative". 35 | ReferenceFasta reference_fasta 36 | DragmapReference? dragmap_reference 37 | 38 | Int compression_level 39 | Int reads_per_file = 48000000 40 | Boolean hard_clip_reads = false 41 | Boolean unmap_contaminant_reads = true 42 | Boolean use_bwa_mem = true 43 | Boolean allow_empty_ref_alt = false 44 | } 45 | 46 | call Alignment.SamSplitter as SamSplitter { 47 | input : 48 | input_bam = input_bam, 49 | n_reads = reads_per_file, 50 | compression_level = compression_level 51 | } 52 | 53 | scatter(unmapped_bam in SamSplitter.split_bams) { 54 | Float current_unmapped_bam_size = size(unmapped_bam, "GiB") 55 | String current_name = basename(unmapped_bam, ".bam") 56 | 57 | if (use_bwa_mem) { 58 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 59 | input: 60 | input_bam = unmapped_bam, 61 | bwa_commandline = bwa_commandline, 62 | output_bam_basename = current_name, 63 | reference_fasta = reference_fasta, 64 | compression_level = compression_level, 65 | hard_clip_reads = hard_clip_reads, 66 | unmap_contaminant_reads = unmap_contaminant_reads, 67 | allow_empty_ref_alt = allow_empty_ref_alt 68 | } 69 | } 70 | if (!use_bwa_mem) { 71 | call DragmapAlignment.SamToFastqAndDragmapAndMba as SamToFastqAndDragmapAndMba { 72 | input: 73 | input_bam = unmapped_bam, 74 | output_bam_basename = current_name, 75 | reference_fasta = reference_fasta, 76 | dragmap_reference = select_first([dragmap_reference]), 77 | compression_level = compression_level, 78 | hard_clip_reads = hard_clip_reads, 79 | unmap_contaminant_reads = unmap_contaminant_reads 80 | } 81 | } 82 | 83 | File output_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SamToFastqAndDragmapAndMba.output_bam]) 84 | } 85 | 86 | call Processing.GatherUnsortedBamFiles as GatherMonolithicBamFile { 87 | input: 88 | input_bams = output_bam, 89 | total_input_size = size(output_bam, "GiB"), 90 | output_bam_basename = output_bam_basename, 91 | compression_level = compression_level 92 | } 93 | output { 94 | File aligned_bam = GatherMonolithicBamFile.output_bam 95 | } 96 | meta { 97 | allowNestedInputs: true 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /20k_Throughput-run/WholeGenomeGermlineSingleSample_20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": { 3 | "sample_name": "NA12878", 4 | "base_file_name": "NA1278", 5 | "flowcell_unmapped_bams": [ 6 | "/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 7 | 
"/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 8 | "/mnt/lustre/genomics/data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 9 | ], 10 | "final_gvcf_base_name": "NA12878", 11 | "unmapped_bam_suffix": ".bam" 12 | }, 13 | 14 | "WholeGenomeGermlineSingleSample.scatter_settings": { 15 | "haplotype_scatter_count": 50, 16 | "break_bands_at_multiples_of": 100000 17 | }, 18 | 19 | "WholeGenomeGermlineSingleSample.references": { 20 | "contamination_sites_ud": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 21 | "contamination_sites_bed": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 22 | "contamination_sites_mu": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 23 | "haplotype_database_file": "/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", 24 | "calling_interval_list": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 25 | "evaluation_interval_list": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 26 | "reference_fasta" : { 27 | "ref_dict": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 28 | "ref_fasta": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 29 | "ref_fasta_index": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 30 | "ref_alt": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 31 | "ref_sa": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 32 | "ref_amb": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 33 | "ref_bwt": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 34 | "ref_ann": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 35 | "ref_pac": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac" 36 | }, 37 | "known_indels_sites_vcfs": [ 38 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 39 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 40 | ], 41 | "known_indels_sites_indices": [ 42 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 43 | "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 44 | ], 45 | "dbsnp_vcf": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 46 | "dbsnp_vcf_index": "/mnt/lustre/genomics/data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" 47 | }, 48 | 49 | 50 | "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": 
"/mnt/lustre/genomics/data/genomics-public-data/references/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 51 | 52 | "WholeGenomeGermlineSingleSample.papi_settings": { 53 | "preemptible_tries": 3, 54 | "agg_preemptible_tries": 3 55 | }, 56 | 57 | "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true 58 | } 59 | -------------------------------------------------------------------------------- /config_files/HTCondor/condor_config.app-node: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## condor_config 4 | ## 5 | ## This is the global configuration file for condor. This is where 6 | ## you define where the local config file is. Any settings 7 | ## made here may potentially be overridden in the local configuration 8 | ## file. KEEP THAT IN MIND! To double-check that a variable is 9 | ## getting set from the configuration file that you expect, use 10 | ## condor_config_val -v 11 | ## 12 | ## condor_config.annotated is a more detailed sample config file 13 | ## 14 | ## Unless otherwise specified, settings that are commented out show 15 | ## the defaults that are used if you don't define a value. Settings 16 | ## that are defined here MUST BE DEFINED since they have no default 17 | ## value. 18 | ## 19 | ###################################################################### 20 | 21 | ## Where have you installed the bin, sbin and lib condor directories? 22 | RELEASE_DIR = /usr 23 | 24 | ## Where is the local condor directory for each host? This is where the local config file(s), logs and 25 | ## spool/execute directories are located. this is the default for Linux and Unix systems. 26 | LOCAL_DIR = /var 27 | 28 | ## Where is the machine-specific local config file for each host? 29 | LOCAL_CONFIG_FILE = /etc/condor/condor_config.local 30 | ## If your configuration is on a shared file system, then this might be a better default 31 | #LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local 32 | ## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.) 33 | REQUIRE_LOCAL_CONFIG_FILE = false 34 | 35 | ## The normal way to do configuration with RPMs is to read all of the 36 | ## files in a given directory that don't match a regex as configuration files. 37 | ## Config files are read in lexicographic order. 38 | LOCAL_CONFIG_DIR = /etc/condor/config.d 39 | #LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$ 40 | 41 | ## Use a host-based security policy. By default CONDOR_HOST and the local machine will be allowed 42 | use SECURITY : HOST_BASED 43 | ## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts 44 | #ALLOW_WRITE = *.cs.wisc.edu 45 | ## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool). 46 | #FLOCK_FROM = 47 | ## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd). 48 | #FLOCK_TO = condor.cs.wisc.edu, cm.example.edu 49 | 50 | ##-------------------------------------------------------------------- 51 | ## Values set by the rpm patch script: 52 | ##-------------------------------------------------------------------- 53 | 54 | ## For Unix machines, the path and file name of the file containing 55 | ## the pool password for password authentication. 
56 | #SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password 57 | 58 | ## Pathnames 59 | RUN = $(LOCAL_DIR)/run/condor 60 | LOG = $(LOCAL_DIR)/log/condor 61 | LOCK = $(LOCAL_DIR)/lock/condor 62 | SPOOL = $(LOCAL_DIR)/lib/condor/spool 63 | EXECUTE = $(LOCAL_DIR)/lib/condor/execute 64 | BIN = $(RELEASE_DIR)/bin 65 | LIB = $(RELEASE_DIR)/lib64/condor 66 | INCLUDE = $(RELEASE_DIR)/include/condor 67 | SBIN = $(RELEASE_DIR)/sbin 68 | LIBEXEC = $(RELEASE_DIR)/libexec/condor 69 | SHARE = $(RELEASE_DIR)/share/condor 70 | 71 | PROCD_ADDRESS = $(RUN)/procd_pipe 72 | 73 | JAVA_CLASSPATH_DEFAULT = $(SHARE) $(SHARE)/scimark2lib.jar . 74 | 75 | SSH_TO_JOB_SSHD_CONFIG_TEMPLATE = /etc/condor/condor_ssh_to_job_sshd_config_template 76 | 77 | ## What machine is your central manager? 78 | 79 | CONDOR_HOST = $(FULL_HOSTNAME) 80 | 81 | ## This macro determines what daemons the condor_master will start and keep its watchful eyes on. 82 | ## The list is a comma or space separated list of subsystem names 83 | 84 | DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD 85 | 86 | # domain 87 | # REPLACE WITH YOUR CLUSTER'S DOMAIN, e.g., iogs.yourorg.com 88 | UID_DOMAIN = 89 | FILESYSTEM_DOMAIN = $(UID_DOMAIN) 90 | 91 | # permissions 92 | ALLOW_READ = * 93 | ALLOW_WRITE = * 94 | 95 | # dedicated scheduler 96 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 97 | DedicatedScheduler="DedicatedScheduler@" 98 | STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler 99 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/DragmapAlignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2021 4 | ## 5 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data using Illumina's DRAGEN open source mapper. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
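## Note: this copy of the task calls locally installed tools (e.g. samtools under
## /mnt/lustre/genomics/tools) rather than the Broad docker images referenced above;
## those paths are rewritten to TOOLS_PATH by configure/step01_Configure_20k_Throughput-run.sh.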
17 | 18 | import "DNASeqStructs.wdl" 19 | 20 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 21 | task SamToFastqAndDragmapAndMba { 22 | input { 23 | File input_bam 24 | String output_bam_basename 25 | 26 | ReferenceFasta reference_fasta 27 | DragmapReference dragmap_reference 28 | 29 | Int compression_level 30 | Boolean hard_clip_reads = false 31 | Boolean unmap_contaminant_reads = true 32 | 33 | Int cpu = 16 34 | Float disk_multiplier = 8 35 | Int memory_mb = 40960 36 | } 37 | 38 | Float unmapped_bam_size = size(input_bam, "GiB") 39 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 40 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 41 | Float dragmap_ref_size = size(dragmap_reference.reference_bin, "GiB") + size(dragmap_reference.hash_table_cfg_bin, "GiB") + size(dragmap_reference.hash_table_cmp, "GiB") 42 | Int disk_size_gb = ceil(unmapped_bam_size + bwa_ref_size + dragmap_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 43 | 44 | command <<< 45 | set -euxo pipefail 46 | 47 | DRAGMAP_VERSION=$(dragen-os --version) 48 | 49 | if [ -z ${DRAGMAP_VERSION} ]; then 50 | exit 1; 51 | fi 52 | 53 | mkdir dragen_reference 54 | mv ~{dragmap_reference.reference_bin} ~{dragmap_reference.hash_table_cfg_bin} ~{dragmap_reference.hash_table_cmp} dragen_reference 55 | 56 | dragen-os -b ~{input_bam} -r dragen_reference --interleaved=1 2> >(tee ~{output_bam_basename}.dragmap.stderr.log >&2) | /mnt/lustre/genomics/tools/samtools/samtools view -h -O BAM - > aligned.bam 57 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /picard/picard.jar \ 58 | MergeBamAlignment \ 59 | VALIDATION_STRINGENCY=SILENT \ 60 | EXPECTED_ORIENTATIONS=FR \ 61 | ATTRIBUTES_TO_RETAIN=X0 \ 62 | ATTRIBUTES_TO_REMOVE=RG \ 63 | ATTRIBUTES_TO_REMOVE=NM \ 64 | ATTRIBUTES_TO_REMOVE=MD \ 65 | ALIGNED_BAM=aligned.bam \ 66 | UNMAPPED_BAM=~{input_bam} \ 67 | OUTPUT=~{output_bam_basename}.bam \ 68 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 69 | PAIRED_RUN=true \ 70 | SORT_ORDER="unsorted" \ 71 | IS_BISULFITE_SEQUENCE=false \ 72 | ALIGNED_READS_ONLY=false \ 73 | CLIP_ADAPTERS=false \ 74 | ~{true='CLIP_OVERLAPPING_READS=true' false="" hard_clip_reads} \ 75 | ~{true='CLIP_OVERLAPPING_READS_OPERATOR=H' false="" hard_clip_reads} \ 76 | MAX_RECORDS_IN_RAM=2000000 \ 77 | ADD_MATE_CIGAR=true \ 78 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 79 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 80 | PROGRAM_RECORD_ID="dragen-os" \ 81 | PROGRAM_GROUP_VERSION="${DRAGMAP_VERSION}" \ 82 | PROGRAM_GROUP_COMMAND_LINE="dragen-os -b ~{input_bam} -r dragen_reference --interleaved=1" \ 83 | PROGRAM_GROUP_NAME="dragen-os" \ 84 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 85 | ALIGNER_PROPER_PAIR_FLAGS=true \ 86 | UNMAP_CONTAMINANT_READS=~{unmap_contaminant_reads} \ 87 | ADD_PG_TAG_TO_READS=false 88 | >>> 89 | runtime { 90 | memory: "${memory_mb} MiB" 91 | cpu: cpu 92 | } 93 | output { 94 | File output_bam = "~{output_bam_basename}.bam" 95 | File dragmap_stderr_log = "~{output_bam_basename}.dragmap.stderr.log" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /config_files/HTCondor/condor_config.comp-node: 
-------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## condor_config 4 | ## 5 | ## This is the global configuration file for condor. This is where 6 | ## you define where the local config file is. Any settings 7 | ## made here may potentially be overridden in the local configuration 8 | ## file. KEEP THAT IN MIND! To double-check that a variable is 9 | ## getting set from the configuration file that you expect, use 10 | ## condor_config_val -v 11 | ## 12 | ## condor_config.annotated is a more detailed sample config file 13 | ## 14 | ## Unless otherwise specified, settings that are commented out show 15 | ## the defaults that are used if you don't define a value. Settings 16 | ## that are defined here MUST BE DEFINED since they have no default 17 | ## value. 18 | ## 19 | ###################################################################### 20 | 21 | ## Where have you installed the bin, sbin and lib condor directories? 22 | RELEASE_DIR = /usr 23 | 24 | ## Where is the local condor directory for each host? This is where the local config file(s), logs and 25 | ## spool/execute directories are located. this is the default for Linux and Unix systems. 26 | LOCAL_DIR = /var 27 | 28 | ## Where is the machine-specific local config file for each host? 29 | LOCAL_CONFIG_FILE = /etc/condor/condor_config.local 30 | ## If your configuration is on a shared file system, then this might be a better default 31 | #LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local 32 | ## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.) 33 | REQUIRE_LOCAL_CONFIG_FILE = false 34 | 35 | ## The normal way to do configuration with RPMs is to read all of the 36 | ## files in a given directory that don't match a regex as configuration files. 37 | ## Config files are read in lexicographic order. 38 | LOCAL_CONFIG_DIR = /etc/condor/config.d 39 | #LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$ 40 | 41 | ## Use a host-based security policy. By default CONDOR_HOST and the local machine will be allowed 42 | use SECURITY : HOST_BASED 43 | ## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts 44 | #ALLOW_WRITE = *.cs.wisc.edu 45 | ## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool). 46 | #FLOCK_FROM = 47 | ## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd). 48 | #FLOCK_TO = condor.cs.wisc.edu, cm.example.edu 49 | 50 | ##-------------------------------------------------------------------- 51 | ## Values set by the rpm patch script: 52 | ##-------------------------------------------------------------------- 53 | 54 | ## For Unix machines, the path and file name of the file containing 55 | ## the pool password for password authentication. 
56 | #SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password 57 | 58 | ## Pathnames 59 | RUN = $(LOCAL_DIR)/run/condor 60 | LOG = $(LOCAL_DIR)/log/condor 61 | LOCK = $(LOCAL_DIR)/lock/condor 62 | SPOOL = $(LOCAL_DIR)/lib/condor/spool 63 | EXECUTE = $(LOCAL_DIR)/lib/condor/execute 64 | BIN = $(RELEASE_DIR)/bin 65 | LIB = $(RELEASE_DIR)/lib64/condor 66 | INCLUDE = $(RELEASE_DIR)/include/condor 67 | SBIN = $(RELEASE_DIR)/sbin 68 | LIBEXEC = $(RELEASE_DIR)/libexec/condor 69 | SHARE = $(RELEASE_DIR)/share/condor 70 | 71 | PROCD_ADDRESS = $(RUN)/procd_pipe 72 | 73 | JAVA_CLASSPATH_DEFAULT = $(SHARE) $(SHARE)/scimark2lib.jar . 74 | 75 | SSH_TO_JOB_SSHD_CONFIG_TEMPLATE = /etc/condor/condor_ssh_to_job_sshd_config_template 76 | 77 | ## What machine is your central manager? 78 | 79 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 80 | CONDOR_HOST = 81 | 82 | ## This macro determines what daemons the condor_master will start and keep its watchful eyes on. 83 | ## The list is a comma or space separated list of subsystem names 84 | 85 | DAEMON_LIST = MASTER, SCHEDD, STARTD 86 | # domain 87 | # REPLACE WITH YOUR CLUSTER'S DOMAIN, e.g., iogs.yourorg.com 88 | UID_DOMAIN = 89 | FILESYSTEM_DOMAIN = $(UID_DOMAIN) 90 | 91 | #Configure the whole machine as 1 slot - set it to be dynamically 92 | ##partitionable so that Condor can assign portions as needed. 93 | SLOT_TYPE_1 = 100% 94 | NUM_SLOTS_TYPE_1 = 1 95 | SLOT_TYPE_1_PARTITIONABLE = True 96 | NUM_SLOTS=1 97 | 98 | # permissions 99 | ALLOW_READ = * 100 | ALLOW_WRITE = * 101 | 102 | # dedicated scheduler 103 | # REPLACE WITH THE APPLICATION NODE'S IP ADDRESS, e.g., 192.168.1.5 104 | DedicatedScheduler="DedicatedScheduler@" 105 | STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler 106 | -------------------------------------------------------------------------------- /20k_Tutorial/step02_Download_Single_Sample_20k_Data.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | echo "BIGStack Tutorial Step 2" 3 | echo "Downloading Reference Data (if it doesn't already exist)" 4 | GCP_PATH="https://storage.googleapis.com" 5 | #Edit the below DATA_PATH to where you want the data to reside in your shared file system 6 | DATA_PATH="/cluster_share/data/RefArch_Broad_data" 7 | 8 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 9 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 10 | echo "Downloading the reference files" 11 | #Reference Genome 12 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 13 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dict 14 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 15 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta 16 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 17 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai 18 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 19 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 20 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 21 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 22 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 23 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 28 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu 36 | echo "Done downloading reference files" 37 | sleep 1 38 | echo "Downloading the resource files" 39 | #Resource Files 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 42 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 43 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 44 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 45 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 52 | echo "Done downloading resource files" 53 | sleep 1 54 | echo "Downloading the intervals files" 55 | 
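# NOTE: the two wget calls for wgs_coverage_regions.hg38.interval_list below are identical,
# so with -nc the second one is a no-op. The tutorial inputs JSON also expects
# wgs_evaluation_regions.hg38.interval_list from this same directory; the second call may
# have been intended to fetch that file instead, along these lines (URL unverified):
#   wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \
#     $GCP_PATH/broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list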
#Interval Files 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list 58 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 59 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 60 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 61 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 62 | echo "Done downloading interval files" 63 | sleep 1 64 | echo "Downloading 20k Test Data for Single Sample Workflow" 65 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 66 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 67 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 68 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 69 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 70 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 71 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 72 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 73 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 74 | echo "Data for tutorial downloaded successfully" 75 | 76 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/step02_Download_Single_Sample_20k_Data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | echo "BIGStack Tutorial Step 2" 3 | echo "Downloading Reference Data (if it doesn't already exist)" 4 | GCP_PATH="https://storage.googleapis.com" 5 | #Edit the below DATA_PATH to where you want the data to reside in your shared file system 6 | DATA_PATH="/cluster_share/data/RefArch_Broad_data" 7 | 8 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 9 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 10 | echo "Downloading the reference files" 11 | #Reference Genome 12 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 13 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dict 14 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 15 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta 16 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 17 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai 18 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 19 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 20 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 21 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 22 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 23 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 28 | wget -nc -v -P 
$DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu 36 | echo "Done downloading reference files" 37 | sleep 1 38 | echo "Downloading the resource files" 39 | #Resource Files 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 42 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 43 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 44 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 45 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 52 | echo "Done downloading resource files" 53 | sleep 1 54 | echo "Downloading the intervals files" 55 | #Interval Files 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list 58 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 59 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 60 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 61 | $GCP_PATH/broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list 62 | echo "Done downloading interval files" 63 | sleep 1 64 | echo "Downloading 20k Test Data for Single Sample Workflow" 65 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 66 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 67 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 68 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 69 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 70 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 71 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 72 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 73 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 74 | echo "Data for tutorial downloaded successfully" 75 | 76 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/AggregatedBamQC.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 
## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 5 | ## for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "Qc.wdl" as QC 19 | import "DNASeqStructs.wdl" 20 | 21 | # WORKFLOW DEFINITION 22 | workflow AggregatedBamQC { 23 | input { 24 | File base_recalibrated_bam 25 | File base_recalibrated_bam_index 26 | String base_name 27 | String sample_name 28 | String recalibrated_bam_base_name 29 | File haplotype_database_file 30 | DNASeqSingleSampleReferences references 31 | PapiSettings papi_settings 32 | File? fingerprint_genotypes_file 33 | File? fingerprint_genotypes_index 34 | } 35 | 36 | # QC the final BAM (consolidated after scattered BQSR) 37 | call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { 38 | input: 39 | input_bam = base_recalibrated_bam, 40 | input_bam_index = base_recalibrated_bam_index, 41 | output_bam_prefix = base_name + ".readgroup", 42 | ref_dict = references.reference_fasta.ref_dict, 43 | ref_fasta = references.reference_fasta.ref_fasta, 44 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 45 | } 46 | 47 | # QC the final BAM some more (no such thing as too much QC) 48 | call QC.CollectAggregationMetrics as CollectAggregationMetrics { 49 | input: 50 | input_bam = base_recalibrated_bam, 51 | input_bam_index = base_recalibrated_bam_index, 52 | output_bam_prefix = base_name, 53 | ref_dict = references.reference_fasta.ref_dict, 54 | ref_fasta = references.reference_fasta.ref_fasta, 55 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 56 | } 57 | 58 | if (defined(haplotype_database_file) && defined(fingerprint_genotypes_file)) { 59 | # Check the sample BAM fingerprint against the sample array 60 | call QC.CheckFingerprintTask as CheckFingerprintTask { 61 | input: 62 | input_bam = base_recalibrated_bam, 63 | input_bam_index = base_recalibrated_bam_index, 64 | genotypes = select_first([fingerprint_genotypes_file]), 65 | genotypes_index = fingerprint_genotypes_index, 66 | expected_sample_alias = sample_name, 67 | output_basename = base_name, 68 | haplotype_database_file = haplotype_database_file, 69 | } 70 | } 71 | 72 | # Generate a checksum per readgroup in the final BAM 73 | call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { 74 | input: 75 | input_bam = base_recalibrated_bam, 76 | input_bam_index = base_recalibrated_bam_index, 77 | read_group_md5_filename = recalibrated_bam_base_name + ".bam.read_group_md5", 78 | } 79 | 80 | output { 81 | File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics 82 | File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics 83 | File read_group_gc_bias_pdf = 
CollectReadgroupBamQualityMetrics.gc_bias_pdf 84 | File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics 85 | 86 | File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file 87 | 88 | File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics 89 | File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics 90 | File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics 91 | File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics 92 | File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf 93 | File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics 94 | File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf 95 | File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics 96 | File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics 97 | File agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics 98 | File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf 99 | File agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics 100 | File agg_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics 101 | 102 | File? fingerprint_summary_metrics = CheckFingerprintTask.summary_metrics 103 | File? fingerprint_detail_metrics = CheckFingerprintTask.detail_metrics 104 | } 105 | meta { 106 | allowNestedInputs: true 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Alignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "DNASeqStructs.wdl" 19 | 20 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 21 | task SamToFastqAndBwaMemAndMba { 22 | input { 23 | File input_bam 24 | String bwa_commandline 25 | String output_bam_basename 26 | 27 | # reference_fasta.ref_alt is the .alt file from bwa-kit 28 | # (https://github.com/lh3/bwa/tree/master/bwakit), 29 | # listing the reference contigs that are "alternative". 
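    # If the .alt file is empty or missing, ALT-aware mapping cannot be confirmed; the command
    # below treats that as a task failure unless allow_empty_ref_alt is set to true.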
30 | ReferenceFasta reference_fasta 31 | 32 | Int compression_level 33 | Boolean hard_clip_reads = false 34 | Boolean unmap_contaminant_reads = true 35 | Boolean allow_empty_ref_alt = false 36 | } 37 | 38 | Float unmapped_bam_size = size(input_bam, "GiB") 39 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 40 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 41 | # Sometimes the output is larger than the input, or a task can spill to disk. 42 | # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). 43 | Float disk_multiplier = 2.5 44 | Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 45 | 46 | command <<< 47 | 48 | 49 | # This is done before "set -o pipefail" because "bwa" will have a rc=1 and we don't want to allow rc=1 to succeed 50 | # because the sed may also fail with that error and that is something we actually want to fail on. 51 | BWA_VERSION=$(/mnt/lustre/genomics/tools/bwa/bwa 2>&1 | \ 52 | grep -e '^Version' | \ 53 | sed 's/Version: //') 54 | 55 | set -o pipefail 56 | set -e 57 | 58 | if [ -z ${BWA_VERSION} ]; then 59 | exit 1; 60 | fi 61 | 62 | # set the bash variable needed for the command-line 63 | bash_ref_fasta=~{reference_fasta.ref_fasta} 64 | # if reference_fasta.ref_alt has data in it or allow_empty_ref_alt is set 65 | if [ -s ~{reference_fasta.ref_alt} ] || ~{allow_empty_ref_alt}; then 66 | java -Xms1000m -Xmx1000m -jar /mnt/lustre/genomics/tools/picard.jar \ 67 | SamToFastq \ 68 | INPUT=~{input_bam} \ 69 | FASTQ=/dev/stdout \ 70 | INTERLEAVE=true \ 71 | NON_PF=true | \ 72 | /mnt/lustre/genomics/tools/bwa/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \ 73 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /mnt/lustre/genomics/tools/picard.jar \ 74 | MergeBamAlignment \ 75 | VALIDATION_STRINGENCY=SILENT \ 76 | EXPECTED_ORIENTATIONS=FR \ 77 | ATTRIBUTES_TO_RETAIN=X0 \ 78 | ATTRIBUTES_TO_REMOVE=NM \ 79 | ATTRIBUTES_TO_REMOVE=MD \ 80 | ALIGNED_BAM=/dev/stdin \ 81 | UNMAPPED_BAM=~{input_bam} \ 82 | OUTPUT=~{output_bam_basename}.bam \ 83 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 84 | SORT_ORDER="unsorted" \ 85 | IS_BISULFITE_SEQUENCE=false \ 86 | ALIGNED_READS_ONLY=false \ 87 | CLIP_ADAPTERS=false \ 88 | ~{true='CLIP_OVERLAPPING_READS=true' false="" hard_clip_reads} \ 89 | ~{true='CLIP_OVERLAPPING_READS_OPERATOR=H' false="" hard_clip_reads} \ 90 | MAX_RECORDS_IN_RAM=2000000 \ 91 | ADD_MATE_CIGAR=true \ 92 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 93 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 94 | PROGRAM_RECORD_ID="bwamem" \ 95 | PROGRAM_GROUP_VERSION="${BWA_VERSION}" \ 96 | PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \ 97 | PROGRAM_GROUP_NAME="bwamem" \ 98 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 99 | ALIGNER_PROPER_PAIR_FLAGS=true \ 100 | UNMAP_CONTAMINANT_READS=~{unmap_contaminant_reads} \ 101 | ADD_PG_TAG_TO_READS=false 102 | 103 | if ~{!allow_empty_ref_alt}; then 104 | grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \ 105 | grep -v "read 0 ALT contigs" 106 | fi 107 | 108 | # else reference_fasta.ref_alt is empty or could not be found 109 | else 110 | 
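      # Reached only when the .alt file is empty or absent and allow_empty_ref_alt is false:
      # fail fast here rather than emit a BAM that was aligned without ALT-aware mapping.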
echo ref_alt input is empty or not provided. >&2 111 | exit 1; 112 | fi 113 | >>> 114 | runtime { 115 | memory: "14 GiB" 116 | cpu: "16" 117 | backend: "SLURM-BWA" 118 | } 119 | output { 120 | File output_bam = "~{output_bam_basename}.bam" 121 | File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log" 122 | } 123 | } 124 | 125 | task SamSplitter { 126 | input { 127 | File input_bam 128 | Int n_reads 129 | Int compression_level 130 | } 131 | 132 | Float unmapped_bam_size = size(input_bam, "GiB") 133 | # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 134 | Float disk_multiplier = 2.5 135 | Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20) 136 | 137 | command { 138 | set -e 139 | mkdir output_dir 140 | 141 | total_reads=$(/mnt/lustre/genomics/tools/samtools/samtools view -c ~{input_bam}) 142 | 143 | java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -Xmx3600m -jar /mnt/lustre/genomics/tools/picard.jar SplitSamByNumberOfReads \ 144 | INPUT=~{input_bam} \ 145 | OUTPUT=output_dir \ 146 | SPLIT_TO_N_READS=~{n_reads} \ 147 | TOTAL_READS_IN_INPUT=$total_reads 148 | } 149 | output { 150 | Array[File] split_bams = glob("output_dir/*.bam") 151 | } 152 | runtime { 153 | memory: "3.75 GiB" 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /20k_Throughput-run/step02_Download_20k_Data_Throughput-run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and # limitations under the License. 
10 | # 11 | # SPDX-License-Identifier: Apache-2.0 12 | 13 | source ./configure 14 | 15 | echo "Downloading Reference Data (if it doesn't already exist)" 16 | GCP_PATH="https://storage.googleapis.com" 17 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | #specify the path to data download directory.By default, data is downloaded to current folder 19 | DATA_PATH="$BASEDIR/data" 20 | mkdir -p $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 21 | cd $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 22 | echo "Downloading the reference files" 23 | #Reference Genome 24 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 25 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict 26 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 27 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta 28 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 29 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai 30 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 31 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 32 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 33 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 34 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 35 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 36 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 37 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 38 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 39 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 40 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 41 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 42 | echo "Done downloading reference files" 43 | sleep 1 44 | echo "Downloading the resource files" 45 | #Resource Files 46 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 47 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 48 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 49 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 50 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 51 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 52 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 53 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 54 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 55 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 56 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 57 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 58 | echo "Done downloading resource files" 59 | sleep 1 60 | echo "Downloading the intervals files" 61 | #Interval Files 62 | wget -nc -v -P 
$DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 63 | $GCP_PATH/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list 64 | wget -nc -v -P $DATA_PATH/genomics-public-data/resources/broad/hg38/v0 \ 65 | $GCP_PATH/gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list 66 | 67 | #Alternatively gsutil cp -r gs://genomics-public-data/references/hg38/v0/* . 68 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 69 | $GCP_PATH/genomics-public-data/references/hg38/v0/wgs_coverage_regions.hg38.interval_list 70 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 71 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt 72 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 73 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.UD 74 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 75 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.mu 76 | wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 77 | $GCP_PATH/genomics-public-data/references/hg38/v0/Homo_sapiens_assembly38.contam.bed 78 | # Need to find following reference 79 | # wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 80 | #$GCP_PATH/genomics-public-data/references/hg38/v0/hg38_wgs_scattered_calling_intervals.txt 81 | # wget -nc -v -P $DATA_PATH/genomics-public-data/references/broad/hg38/v0/ \ 82 | #$GCP_PATH/genomics-public-data/references/hg38/v0/NA12878.hg38.reference.fingerprint.vcf 83 | 84 | echo "Done downloading interval files" 85 | sleep 1 86 | echo "Downloading 20k Test Data for Single Sample Workflow" 87 | mkdir -p $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 88 | cd $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 89 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 90 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 91 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 92 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 93 | wget -nc -v -P $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 \ 94 | $GCP_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 95 | chmod -R 777 $DATA_PATH/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878 96 | echo "Data for tutorial downloaded successfully" 97 | 98 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/VariantCalling.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "GermlineVariantDiscovery.wdl" as Calling 4 | import "Qc.wdl" as QC 5 | import "Utilities.wdl" as Utils 6 | import "BamProcessing.wdl" as BamProcessing 7 | import "DragenTasks.wdl" as DragenTasks 8 | 9 | workflow VariantCalling { 10 | 11 | 12 | String pipeline_version = "2.1.17" 13 | 14 | 15 | input { 16 | Boolean run_dragen_mode_variant_calling = false 17 | Boolean use_spanning_event_genotyping = true 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? 
contamination 23 | File input_bam 24 | File input_bam_index 25 | File ref_fasta 26 | File ref_fasta_index 27 | File ref_dict 28 | File? ref_str 29 | File dbsnp_vcf 30 | File dbsnp_vcf_index 31 | String base_file_name 32 | String final_vcf_base_name 33 | Boolean make_gvcf = true 34 | Boolean make_bamout = false 35 | Boolean use_gatk3_haplotype_caller = false 36 | Boolean skip_reblocking = false 37 | Boolean use_dragen_hard_filtering = false 38 | } 39 | 40 | parameter_meta { 41 | make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details." 42 | run_dragen_mode_variant_calling: "Run variant calling using the DRAGEN-GATK pipeline, false by default." 43 | } 44 | 45 | if (run_dragen_mode_variant_calling) { 46 | call DragenTasks.CalibrateDragstrModel as DragstrAutoCalibration { 47 | input: 48 | ref_fasta = ref_fasta, 49 | ref_fasta_idx = ref_fasta_index, 50 | ref_dict = ref_dict, 51 | alignment = input_bam, 52 | alignment_index = input_bam_index, 53 | str_table_file = select_first([ref_str]) 54 | } 55 | } 56 | 57 | 58 | # Break the calling interval_list into sub-intervals 59 | # Perform variant calling on the sub-intervals, and then gather the results 60 | call Utils.ScatterIntervalList as ScatterIntervalList { 61 | input: 62 | interval_list = calling_interval_list, 63 | scatter_count = haplotype_scatter_count, 64 | break_bands_at_multiples_of = break_bands_at_multiples_of 65 | } 66 | 67 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 68 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 69 | # to account for the fact that the data is quite uneven across the shards. 
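  # For example, with a haplotype_scatter_count of 50 the scatter typically yields about 50
  # interval lists, giving an hc_divisor of roughly 30; the if/else below keeps the divisor
  # at least 1 whenever interval_count is 21 or fewer.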
70 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 71 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 72 | 73 | # Call variants in parallel over WGS calling intervals 74 | scatter (scattered_interval_list in ScatterIntervalList.out) { 75 | 76 | if (use_gatk3_haplotype_caller) { 77 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 78 | input: 79 | input_bam = input_bam, 80 | input_bam_index = input_bam_index, 81 | interval_list = scattered_interval_list, 82 | gvcf_basename = base_file_name, 83 | ref_dict = ref_dict, 84 | ref_fasta = ref_fasta, 85 | ref_fasta_index = ref_fasta_index, 86 | contamination = contamination, 87 | hc_scatter = hc_divisor 88 | } 89 | } 90 | 91 | if (!use_gatk3_haplotype_caller) { 92 | # Generate GVCF by interval 93 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 94 | input: 95 | contamination = if run_dragen_mode_variant_calling then 0 else contamination, 96 | input_bam = input_bam, 97 | input_bam_index = input_bam_index, 98 | interval_list = scattered_interval_list, 99 | vcf_basename = base_file_name, 100 | ref_dict = ref_dict, 101 | ref_fasta = ref_fasta, 102 | ref_fasta_index = ref_fasta_index, 103 | hc_scatter = hc_divisor, 104 | make_gvcf = make_gvcf, 105 | make_bamout = make_bamout, 106 | run_dragen_mode_variant_calling = run_dragen_mode_variant_calling, 107 | use_dragen_hard_filtering = use_dragen_hard_filtering, 108 | use_spanning_event_genotyping = use_spanning_event_genotyping, 109 | dragstr_model = DragstrAutoCalibration.dragstr_model, 110 | } 111 | 112 | if (use_dragen_hard_filtering) { 113 | call Calling.DragenHardFilterVcf as DragenHardFilterVcf { 114 | input: 115 | input_vcf = HaplotypeCallerGATK4.output_vcf, 116 | input_vcf_index = HaplotypeCallerGATK4.output_vcf_index, 117 | make_gvcf = make_gvcf, 118 | vcf_basename = base_file_name, 119 | } 120 | } 121 | 122 | # If bamout files were created, we need to sort and gather them into one bamout 123 | if (make_bamout) { 124 | call BamProcessing.SortSam as SortBamout { 125 | input: 126 | input_bam = HaplotypeCallerGATK4.bamout, 127 | output_bam_basename = final_vcf_base_name, 128 | compression_level = 2 129 | } 130 | } 131 | } 132 | 133 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, DragenHardFilterVcf.output_vcf, HaplotypeCallerGATK4.output_vcf]) 134 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, DragenHardFilterVcf.output_vcf_index, HaplotypeCallerGATK4.output_vcf_index]) 135 | } 136 | 137 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 138 | String hard_filter_suffix = if use_dragen_hard_filtering then ".hard-filtered" else "" 139 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 140 | call Calling.MergeVCFs as MergeVCFs { 141 | input: 142 | input_vcfs = vcfs_to_merge, 143 | input_vcfs_indexes = vcf_indices_to_merge, 144 | output_vcf_name = final_vcf_base_name + hard_filter_suffix + merge_suffix, 145 | } 146 | 147 | if (make_gvcf && !skip_reblocking) { 148 | call Calling.Reblock as Reblock { 149 | input: 150 | gvcf = MergeVCFs.output_vcf, 151 | gvcf_index = MergeVCFs.output_vcf_index, 152 | ref_fasta = ref_fasta, 153 | ref_fasta_index = ref_fasta_index, 154 | ref_dict = ref_dict, 155 | output_vcf_filename = basename(MergeVCFs.output_vcf, ".g.vcf.gz") + ".rb.g.vcf.gz" 156 | } 157 | } 158 | 159 | if (make_bamout) { 160 | call MergeBamouts { 161 | input: 162 | bams = select_all(SortBamout.output_bam), 163 | 
output_base_name = final_vcf_base_name 164 | } 165 | } 166 | 167 | # Validate the (g)VCF output of HaplotypeCaller 168 | call QC.ValidateVCF as ValidateVCF { 169 | input: 170 | input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]), 171 | input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]), 172 | dbsnp_vcf = dbsnp_vcf, 173 | dbsnp_vcf_index = dbsnp_vcf_index, 174 | ref_fasta = ref_fasta, 175 | ref_fasta_index = ref_fasta_index, 176 | ref_dict = ref_dict, 177 | calling_interval_list = calling_interval_list, 178 | is_gvcf = make_gvcf, 179 | extra_args = if (skip_reblocking == false) then "--no-overlaps" else "" 180 | } 181 | 182 | # QC the (g)VCF 183 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 184 | input: 185 | input_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]), 186 | input_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]), 187 | metrics_basename = final_vcf_base_name, 188 | dbsnp_vcf = dbsnp_vcf, 189 | dbsnp_vcf_index = dbsnp_vcf_index, 190 | ref_dict = ref_dict, 191 | evaluation_interval_list = evaluation_interval_list, 192 | is_gvcf = make_gvcf, 193 | } 194 | 195 | output { 196 | File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics 197 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 198 | File output_vcf = select_first([Reblock.output_vcf, MergeVCFs.output_vcf]) 199 | File output_vcf_index = select_first([Reblock.output_vcf_index, MergeVCFs.output_vcf_index]) 200 | File? bamout = MergeBamouts.output_bam 201 | File? bamout_index = MergeBamouts.output_bam_index 202 | } 203 | meta { 204 | allowNestedInputs: true 205 | } 206 | } 207 | 208 | # This task is here because merging bamout files using Picard produces an error. 209 | task MergeBamouts { 210 | 211 | input { 212 | Array[File] bams 213 | String output_base_name 214 | } 215 | 216 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 217 | 218 | command <<< 219 | /mnt/lustre/genomics/tools/samtools/samtools merge ~{output_base_name}.bam ~{sep=" " bams} 220 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_base_name}.bam 221 | mv ~{output_base_name}.bam.bai ~{output_base_name}.bai 222 | >>> 223 | 224 | output { 225 | File output_bam = "~{output_base_name}.bam" 226 | File output_bam_index = "~{output_base_name}.bai" 227 | } 228 | 229 | runtime { 230 | memory: "4 GiB" 231 | cpu: "1" 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Utilities.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines utility tasks used for processing of sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
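## Tasks defined below: CreateSequenceGroupingTSV, ScatterIntervalList, ConvertToCram,
## ConvertToBam, SumFloats, ErrorWithMessage, and GetValidationInputs.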
17 | 18 | # Generate sets of intervals for scatter-gathering over chromosomes 19 | task CreateSequenceGroupingTSV { 20 | input { 21 | File ref_dict 22 | } 23 | # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. 24 | # It outputs to stdout where it is parsed into a wdl Array[Array[String]] 25 | # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] 26 | command <<< 27 | python3 <>> 62 | runtime { 63 | cpu: "2" 64 | memory: "2 GiB" 65 | } 66 | output { 67 | Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") 68 | Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") 69 | } 70 | } 71 | 72 | # This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists 73 | # Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. 74 | # Thus we have the block of python to count the number of generated sub interval lists. 75 | task ScatterIntervalList { 76 | input { 77 | File interval_list 78 | Int scatter_count 79 | Int break_bands_at_multiples_of 80 | } 81 | 82 | command <<< 83 | set -e 84 | mkdir out 85 | java -Xms1000m -Xmx2g -jar /mnt/lustre/genomics/tools/picard.jar \ 86 | IntervalListTools \ 87 | SCATTER_COUNT=~{scatter_count} \ 88 | SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ 89 | UNIQUE=true \ 90 | SORT=true \ 91 | BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ 92 | INPUT=~{interval_list} \ 93 | OUTPUT=out 94 | 95 | python3 <>> 106 | output { 107 | Array[File] out = glob("out/*/*.interval_list") 108 | Int interval_count = read_int(stdout()) 109 | } 110 | runtime { 111 | cpu: "2" 112 | memory: "2000 MiB" 113 | } 114 | } 115 | 116 | # Convert BAM file to CRAM format 117 | # Note that reading CRAMs directly with Picard is not yet supported 118 | task ConvertToCram { 119 | input { 120 | File input_bam 121 | File ref_fasta 122 | File ref_fasta_index 123 | String output_basename 124 | 125 | Int disk_size = ceil((2 * size(input_bam, "GiB")) + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB")) + 20 126 | } 127 | 128 | command <<< 129 | set -e 130 | set -o pipefail 131 | 132 | /mnt/lustre/genomics/tools/samtools/samtools view -C -T ~{ref_fasta} ~{input_bam} | \ 133 | tee ~{output_basename}.cram | \ 134 | md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 135 | 136 | # Create REF_CACHE. 
Used when indexing a CRAM 137 | /mnt/lustre/genomics/tools/samtools/misc/seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} 138 | export REF_PATH=: 139 | export REF_CACHE=./ref/cache/%2s/%2s/%s 140 | 141 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_basename}.cram 142 | >>> 143 | runtime { 144 | memory: "3 GiB" 145 | cpu: "2" 146 | } 147 | output { 148 | File output_cram = "~{output_basename}.cram" 149 | File output_cram_index = "~{output_basename}.cram.crai" 150 | File output_cram_md5 = "~{output_basename}.cram.md5" 151 | } 152 | } 153 | 154 | # Convert CRAM file to BAM format 155 | task ConvertToBam { 156 | input { 157 | File input_cram 158 | File ref_fasta 159 | File ref_fasta_index 160 | String output_basename 161 | } 162 | 163 | command <<< 164 | set -e 165 | set -o pipefail 166 | 167 | /mnt/lustre/genomics/tools/samtools/samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} 168 | 169 | /mnt/lustre/genomics/tools/samtools/samtools index ~{output_basename}.bam 170 | >>> 171 | runtime { 172 | memory: "3 GiB" 173 | cpu: "2" 174 | } 175 | output { 176 | File output_bam = "~{output_basename}.bam" 177 | File output_bam_index = "~{output_basename}.bam.bai" 178 | } 179 | } 180 | 181 | # Calculates sum of a list of floats 182 | task SumFloats { 183 | input { 184 | Array[Float] sizes 185 | } 186 | 187 | command <<< 188 | python3 -c 'print(~{sep="+" sizes})' 189 | >>> 190 | output { 191 | Float total_size = read_float(stdout()) 192 | } 193 | } 194 | 195 | # Print given message to stderr and return an error 196 | task ErrorWithMessage { 197 | input { 198 | String message 199 | } 200 | command <<< 201 | >&2 echo "Error: ~{message}" 202 | exit 1 203 | >>> 204 | } 205 | 206 | # This task is unused for now, going to keep it in here though if we need it in the future 207 | task GetValidationInputs { 208 | input { 209 | String results_path 210 | String truth_path 211 | Array[String]? input_files 212 | String? 
input_file 213 | 214 | Int cpu = 1 215 | Int memory_mb = 2000 216 | Int disk_size_gb = 20 217 | } 218 | 219 | meta { 220 | description: "Given either a file or list of files, output both the truth and results path" 221 | } 222 | 223 | command <<< 224 | set -e 225 | 226 | touch truth_file.txt 227 | touch truth_files.txt 228 | touch results_file.txt 229 | touch results_files.txt 230 | 231 | python3 <>> 267 | 268 | runtime { 269 | cpu: cpu 270 | memory: "~{memory_mb} MiB" 271 | } 272 | 273 | output { 274 | String truth_file = read_string("truth_file.txt") 275 | String results_file = read_string("results_file.txt") 276 | Array[String] truth_files = read_lines("truth_files.txt") 277 | Array[String] results_files = read_lines("results_files.txt") 278 | } 279 | 280 | } 281 | -------------------------------------------------------------------------------- /20k_Tutorial_Docker/16T_PairedSingleSampleWf_optimized.inputs.20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "Take note of the .64 extensions on the reference files, issues between 32 and 64 bit OS", 3 | 4 | "##_COMMENT2": "SAMPLES - read the README to find other examples.", 5 | "PairedEndSingleSampleWorkflow.sample_name": "NA12878", 6 | "PairedEndSingleSampleWorkflow.base_file_name": "NA12878", 7 | "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ 8 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 9 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 10 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 11 | ], 12 | "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", 13 | "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".unmapped.bam", 14 | 15 | "##_COMMENT3": "REFERENCES", 16 | "PairedEndSingleSampleWorkflow.fingerprint_genotypes_file": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/NA12878.hg38.reference.fingerprint.vcf", 17 | "PairedEndSingleSampleWorkflow.contamination_sites_ud": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "PairedEndSingleSampleWorkflow.contamination_sites_bed": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "PairedEndSingleSampleWorkflow.contamination_sites_mu": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "PairedEndSingleSampleWorkflow.haplotype_scatter_count" : 50, 22 | "PairedEndSingleSampleWorkflow.break_bands_at_multiples_of" : 1000000, 23 | "PairedEndSingleSampleWorkflow.ref_dict": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 24 | "PairedEndSingleSampleWorkflow.ref_fasta": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 25 | "PairedEndSingleSampleWorkflow.ref_fasta_index": 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 26 | "PairedEndSingleSampleWorkflow.ref_alt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 27 | "PairedEndSingleSampleWorkflow.ref_sa": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 28 | "PairedEndSingleSampleWorkflow.ref_amb": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 29 | "PairedEndSingleSampleWorkflow.ref_bwt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 30 | "PairedEndSingleSampleWorkflow.ref_ann": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 31 | "PairedEndSingleSampleWorkflow.ref_pac": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", 32 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 33 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 34 | "PairedEndSingleSampleWorkflow.dbSNP_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 35 | "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 36 | "PairedEndSingleSampleWorkflow.wgs_coverage_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 37 | "PairedEndSingleSampleWorkflow.wgs_evaluation_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 38 | "PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ 39 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 40 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 41 | ], 42 | "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ 43 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 44 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 45 | ], 46 | 47 | "##_COMMENT6":"OPTIMIZATION FLAGS FOR BWA, SAMTOOLS, GATK-GKL", 48 | "## SamToFastQ_COMMENT": "BWA mem is the bottleneck - request 16 cores for the task, assign 16T to bwa and let SamToFastQ and MergeBamAlignment timeshare", 49 | "PairedEndSingleSampleWorkflow.bwa_threads":16, 50 | "## SamtoolsSort_COMMENT": "Threads for Samtools sort step", 51 | "PairedEndSingleSampleWorkflow.samtools_threads":16, 52 | "## GENERAL_COMMENT": "Compression level for all java commands", 53 | "PairedEndSingleSampleWorkflow.compression_level":1, 54 | "## 
HaplotypeCaller_MT_comment": "See comment in the WDL file", 55 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_implementation":"AVX_LOGLESS_CACHING", 56 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_threads":1, 57 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.smith_waterman_implementation":"AVX_ENABLED", 58 | "##COMMENT_TMPDIR": "defaults to /tmp, BUT you might want to customize this. For example, if your working directory is on an NVMe SSD", 59 | "PairedEndSingleSampleWorkflow.tmp_directory": "/tmp", 60 | "PairedEndSingleSampleWorkflow.SortSampleBam.mem_limit": "64M", 61 | 62 | "##_COMMENT7":"JAVA HEAP MEMORY OPTIONS", 63 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.java_heap_memory_initial":"128m", 64 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.java_heap_memory_initial":"2g", 65 | "PairedEndSingleSampleWorkflow.SortSampleBam.java_heap_memory_initial":"4g", 66 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 67 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 68 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.java_heap_memory_initial":"5g", 69 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.java_heap_memory_initial":"2g", 70 | "PairedEndSingleSampleWorkflow.CheckFingerprint.java_heap_memory_initial":"1g", 71 | "PairedEndSingleSampleWorkflow.MarkDuplicates.java_heap_memory_initial":"4g", 72 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.java_heap_memory_initial":"4g", 73 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.java_heap_memory_initial":"3g", 74 | "PairedEndSingleSampleWorkflow.ApplyBQSR.java_heap_memory_initial":"3g", 75 | "PairedEndSingleSampleWorkflow.GatherBamFiles.java_heap_memory_initial":"2g", 76 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.java_heap_memory_initial":"6g", 77 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.java_heap_memory_initial":"2g", 78 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.java_heap_memory_initial":"2g", 79 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.java_heap_memory_initial":"1g", 80 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.java_heap_memory_initial":"1g", 81 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.haplotypecaller_java_heap_memory_initial":"6g", 82 | "PairedEndSingleSampleWorkflow.MergeVCFs.java_heap_memory_initial":"2g", 83 | "PairedEndSingleSampleWorkflow.ValidateGVCF.java_heap_memory_initial":"3g", 84 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.java_heap_memory_initial":"2g", 85 | 86 | "##_COMMENT8":"RUNTIME SECTION MEMORY OPTIONS", 87 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.memory":"2GB", 88 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.memory":"2GB", 89 | "PairedEndSingleSampleWorkflow.GetBwaVersion.memory":"1GB", 90 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.memory":"10GB", 91 | "PairedEndSingleSampleWorkflow.SortSampleBam.memory":"5GB", 92 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.memory":"7GB", 93 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.memory":"7GB", 94 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.memory":"7GB", 95 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.memory":"2GB", 96 | "PairedEndSingleSampleWorkflow.CheckFingerprint.memory":"1GB", 97 | "PairedEndSingleSampleWorkflow.MarkDuplicates.memory":"7GB", 98 | 
"PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.memory":"2GB", 99 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.memory":"6GB", 100 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.memory":"3GB", 101 | "PairedEndSingleSampleWorkflow.ApplyBQSR.memory":"4GB", 102 | "PairedEndSingleSampleWorkflow.GatherBamFiles.memory":"3GB", 103 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.memory":"7GB", 104 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.memory":"3GB", 105 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.memory":"3GB", 106 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.memory":"2GB", 107 | "PairedEndSingleSampleWorkflow.CheckContamination.memory":"2GB", 108 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.memory":"2GB", 109 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.memory":"7GB", 110 | "PairedEndSingleSampleWorkflow.MergeVCFs.memory":"3GB", 111 | "PairedEndSingleSampleWorkflow.ValidateGVCF.memory":"4GB", 112 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.memory":"3GB", 113 | "PairedEndSingleSampleWorkflow.ConvertToCram.memory":"3GB", 114 | "PairedEndSingleSampleWorkflow.CramToBam.memory":"3GB", 115 | 116 | "##_COMMENT9":"RUNTIME SECTION CPU OPTIONS", 117 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.cpu":1, 118 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.cpu":1, 119 | "PairedEndSingleSampleWorkflow.GetBwaVersion.cpu":1, 120 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.cpu":16, 121 | "PairedEndSingleSampleWorkflow.SortSampleBam.cpu":16, 122 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.cpu":1, 123 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.cpu":1, 124 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.cpu":1, 125 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.cpu":1, 126 | "PairedEndSingleSampleWorkflow.CheckFingerprint.cpu":1, 127 | "PairedEndSingleSampleWorkflow.MarkDuplicates.cpu":1, 128 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.cpu":1, 129 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.cpu":1, 130 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.cpu":1, 131 | "PairedEndSingleSampleWorkflow.ApplyBQSR.cpu":1, 132 | "PairedEndSingleSampleWorkflow.GatherBamFiles.cpu":1, 133 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.cpu":1, 134 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.cpu":1, 135 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.cpu":1, 136 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.cpu":1, 137 | "PairedEndSingleSampleWorkflow.CheckContamination.cpu":1, 138 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.cpu":1, 139 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.cpu":1, 140 | "PairedEndSingleSampleWorkflow.MergeVCFs.cpu":1, 141 | "PairedEndSingleSampleWorkflow.ValidateGVCF.cpu":1, 142 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.cpu":1, 143 | "PairedEndSingleSampleWorkflow.ConvertToCram.cpu":1, 144 | "PairedEndSingleSampleWorkflow.CramToBam.cpu":1 145 | } 146 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/GermlineVariantDiscovery.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. 
6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | task HaplotypeCaller_GATK35_GVCF { 19 | input { 20 | File input_bam 21 | File input_bam_index 22 | File interval_list 23 | String gvcf_basename 24 | File ref_dict 25 | File ref_fasta 26 | File ref_fasta_index 27 | Float? contamination 28 | Int hc_scatter 29 | } 30 | 31 | parameter_meta { 32 | input_bam: { 33 | localization_optional: true 34 | } 35 | } 36 | 37 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 38 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 39 | 40 | # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around 41 | # the interval because the assembly uses them. 42 | # 43 | # Using PrintReads is a temporary solution until we update HaplotypeCaller to use GATK4. Once that is done, 44 | # HaplotypeCaller can stream the required intervals directly from the cloud. 45 | command { 46 | /usr/gitc/gatk4/gatk --java-options "-Xms2000m -Xmx9000m"\ 47 | PrintReads \ 48 | -I ~{input_bam} \ 49 | --interval-padding 500 \ 50 | -L ~{interval_list} \ 51 | -O local.sharded.bam \ 52 | && \ 53 | java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m -Xmx9000m\ 54 | -jar /usr/gitc/GATK35.jar \ 55 | -T HaplotypeCaller \ 56 | -R ~{ref_fasta} \ 57 | -o ~{gvcf_basename}.vcf.gz \ 58 | -I local.sharded.bam \ 59 | -L ~{interval_list} \ 60 | -ERC GVCF \ 61 | --max_alternate_alleles 3 \ 62 | -variant_index_parameter 128000 \ 63 | -variant_index_type LINEAR \ 64 | -contamination ~{default=0 contamination} \ 65 | --read_filter OverclippedRead 66 | } 67 | runtime { 68 | memory: "10000 MiB" 69 | cpu: "2" 70 | backend: "SLURM-HAPLO" 71 | } 72 | output { 73 | File output_gvcf = "~{gvcf_basename}.vcf.gz" 74 | File output_gvcf_index = "~{gvcf_basename}.vcf.gz.tbi" 75 | } 76 | } 77 | 78 | task HaplotypeCaller_GATK4_VCF { 79 | input { 80 | File input_bam 81 | File input_bam_index 82 | File interval_list 83 | String vcf_basename 84 | File ref_dict 85 | File ref_fasta 86 | File ref_fasta_index 87 | Float? contamination 88 | Boolean make_gvcf 89 | Boolean make_bamout 90 | Int hc_scatter 91 | Boolean run_dragen_mode_variant_calling = false 92 | Boolean use_dragen_hard_filtering = false 93 | Boolean use_spanning_event_genotyping = true 94 | File?
dragstr_model 95 | Int memory_multiplier = 1 96 | } 97 | 98 | Int memory_size_mb = ceil(8000 * memory_multiplier) 99 | 100 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 101 | String output_file_name = vcf_basename + output_suffix 102 | 103 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 104 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 105 | 106 | String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" 107 | 108 | parameter_meta { 109 | input_bam: { 110 | localization_optional: true 111 | } 112 | } 113 | 114 | command <<< 115 | set -e 116 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms6000m -Xmx6400m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ 117 | HaplotypeCaller \ 118 | -R ~{ref_fasta} \ 119 | -I ~{input_bam} \ 120 | -L ~{interval_list} \ 121 | -O ~{output_file_name} \ 122 | -contamination ~{default=0 contamination} \ 123 | -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ 124 | ~{true="--dragen-mode" false="" run_dragen_mode_variant_calling} \ 125 | ~{false="--disable-spanning-event-genotyping" true="" use_spanning_event_genotyping} \ 126 | ~{if defined(dragstr_model) then "--dragstr-params-path " + dragstr_model else ""} \ 127 | -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ 128 | ~{true="-ERC GVCF" false="" make_gvcf} \ 129 | ~{bamout_arg} 130 | 131 | # Cromwell doesn't like optional task outputs, so we have to touch this file. 132 | touch ~{vcf_basename}.bamout.bam 133 | >>> 134 | 135 | runtime { 136 | memory: "6.5 GiB" 137 | cpu: "2" 138 | backend: "SLURM-HAPLO" 139 | } 140 | 141 | output { 142 | File output_vcf = "~{output_file_name}" 143 | File output_vcf_index = "~{output_file_name}.tbi" 144 | File bamout = "~{vcf_basename}.bamout.bam" 145 | } 146 | } 147 | 148 | # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs 149 | task MergeVCFs { 150 | input { 151 | Array[File] input_vcfs 152 | Array[File] input_vcfs_indexes 153 | String output_vcf_name 154 | } 155 | 156 | Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 157 | 158 | # Using MergeVcfs instead of GatherVcfs so we can create indices 159 | # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket 160 | command { 161 | java -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 162 | MergeVcfs \ 163 | INPUT=~{sep=' INPUT=' input_vcfs} \ 164 | OUTPUT=~{output_vcf_name} 165 | } 166 | runtime { 167 | cpu: "2" 168 | memory: "3000 MiB" 169 | } 170 | output { 171 | File output_vcf = "~{output_vcf_name}" 172 | File output_vcf_index = "~{output_vcf_name}.tbi" 173 | } 174 | } 175 | 176 | task Reblock { 177 | 178 | input { 179 | File gvcf 180 | File gvcf_index 181 | File ref_dict 182 | File ref_fasta 183 | File ref_fasta_index 184 | String output_vcf_filename 185 | Int additional_disk = 20 186 | String? annotations_to_keep_command 187 | String? annotations_to_remove_command 188 | Float? 
tree_score_cutoff 189 | Boolean move_filters_to_genotypes = false 190 | } 191 | 192 | Int disk_size = ceil((size(gvcf, "GiB")) * 4) + additional_disk 193 | String gvcf_basename = basename(gvcf) 194 | String gvcf_index_basename = basename(gvcf_index) 195 | 196 | command { 197 | set -e 198 | 199 | # We can't always assume the index was located with the gvcf, so make a link so that the paths look the same 200 | ln -s ~{gvcf} ~{gvcf_basename} 201 | ln -s ~{gvcf_index} ~{gvcf_index_basename} 202 | 203 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms3000m -Xmx3000m" \ 204 | ReblockGVCF \ 205 | -R ~{ref_fasta} \ 206 | -V ~{gvcf_basename} \ 207 | -do-qual-approx \ 208 | --floor-blocks -GQB 20 -GQB 30 -GQB 40 \ 209 | ~{annotations_to_keep_command} \ 210 | ~{annotations_to_remove_command} \ 211 | ~{"--tree-score-threshold-to-no-call " + tree_score_cutoff} \ 212 | ~{if move_filters_to_genotypes then "--add-site-filters-to-genotype" else ""} \ 213 | -O ~{output_vcf_filename} 214 | } 215 | 216 | runtime { 217 | memory: "3750 MiB" 218 | } 219 | 220 | output { 221 | File output_vcf = output_vcf_filename 222 | File output_vcf_index = output_vcf_filename + ".tbi" 223 | } 224 | } 225 | 226 | task HardFilterVcf { 227 | input { 228 | File input_vcf 229 | File input_vcf_index 230 | String vcf_basename 231 | File interval_list 232 | } 233 | 234 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 235 | String output_vcf_name = vcf_basename + ".filtered.vcf.gz" 236 | 237 | command { 238 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms2000m -Xmx2500m" \ 239 | VariantFiltration \ 240 | -V ~{input_vcf} \ 241 | -L ~{interval_list} \ 242 | --filter-expression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ 243 | --filter-name "HardFiltered" \ 244 | -O ~{output_vcf_name} 245 | } 246 | output { 247 | File output_vcf = "~{output_vcf_name}" 248 | File output_vcf_index = "~{output_vcf_name}.tbi" 249 | } 250 | runtime { 251 | memory: "3000 MiB" 252 | } 253 | } 254 | 255 | # This hard filtering matches DRAGEN 3.4.12. For later DRAGEN versions, this needs to be updated. 256 | task DragenHardFilterVcf { 257 | input { 258 | File input_vcf 259 | File input_vcf_index 260 | Boolean make_gvcf 261 | String vcf_basename 262 | } 263 | 264 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 265 | 266 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 267 | String output_vcf_name = vcf_basename + ".hard-filtered" + output_suffix 268 | 269 | command { 270 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms2000m -Xmx2500m" \ 271 | VariantFiltration \ 272 | -V ~{input_vcf} \ 273 | --filter-expression "QUAL < 10.4139" \ 274 | --filter-name "DRAGENHardQUAL" \ 275 | -O ~{output_vcf_name} 276 | } 277 | output { 278 | File output_vcf = "~{output_vcf_name}" 279 | File output_vcf_index = "~{output_vcf_name}.tbi" 280 | } 281 | runtime { 282 | memory: "3000 MiB" 283 | } 284 | } 285 | 286 | task CNNScoreVariants { 287 | input { 288 | File? bamout 289 | File? 
bamout_index 290 | File input_vcf 291 | File input_vcf_index 292 | String vcf_basename 293 | File ref_fasta 294 | File ref_fasta_index 295 | File ref_dict 296 | } 297 | 298 | Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) 299 | 300 | String base_vcf = basename(input_vcf) 301 | Boolean is_compressed = basename(base_vcf, "gz") != base_vcf 302 | String vcf_suffix = if is_compressed then ".vcf.gz" else ".vcf" 303 | String vcf_index_suffix = if is_compressed then ".tbi" else ".idx" 304 | String output_vcf = base_vcf + ".scored" + vcf_suffix 305 | String output_vcf_index = output_vcf + vcf_index_suffix 306 | 307 | String bamout_param = if defined(bamout) then "-I ~{bamout}" else "" 308 | String tensor_type = if defined(bamout) then "read-tensor" else "reference" 309 | 310 | command { 311 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx10000m" CNNScoreVariants \ 312 | -V ~{input_vcf} \ 313 | -R ~{ref_fasta} \ 314 | -O ~{output_vcf} \ 315 | ~{bamout_param} \ 316 | -tensor-type ~{tensor_type} 317 | } 318 | 319 | output { 320 | File scored_vcf = "~{output_vcf}" 321 | File scored_vcf_index = "~{output_vcf_index}" 322 | } 323 | 324 | runtime { 325 | memory: "15000 MiB" 326 | cpu: "2" 327 | } 328 | } 329 | 330 | task FilterVariantTranches { 331 | 332 | input { 333 | File input_vcf 334 | File input_vcf_index 335 | String vcf_basename 336 | Array[String] snp_tranches 337 | Array[String] indel_tranches 338 | File hapmap_resource_vcf 339 | File hapmap_resource_vcf_index 340 | File omni_resource_vcf 341 | File omni_resource_vcf_index 342 | File one_thousand_genomes_resource_vcf 343 | File one_thousand_genomes_resource_vcf_index 344 | File dbsnp_resource_vcf 345 | File dbsnp_resource_vcf_index 346 | String info_key 347 | } 348 | 349 | 350 | command { 351 | 352 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xmx6000m" FilterVariantTranches \ 353 | -V ~{input_vcf} \ 354 | -O ~{vcf_basename}.filtered.vcf.gz \ 355 | ~{sep=" " prefix("--snp-tranche ", snp_tranches)} \ 356 | ~{sep=" " prefix("--indel-tranche ", indel_tranches)} \ 357 | --resource ~{hapmap_resource_vcf} \ 358 | --resource ~{omni_resource_vcf} \ 359 | --resource ~{one_thousand_genomes_resource_vcf} \ 360 | --resource ~{dbsnp_resource_vcf} \ 361 | --info-key ~{info_key} \ 362 | --create-output-variant-index true 363 | } 364 | 365 | output { 366 | File filtered_vcf = "~{vcf_basename}.filtered.vcf.gz" 367 | File filtered_vcf_index = "~{vcf_basename}.filtered.vcf.gz.tbi" 368 | } 369 | 370 | runtime { 371 | memory: "7000 MiB" 372 | cpu: "2" 373 | } 374 | } 375 | -------------------------------------------------------------------------------- /20k_Tutorial/16T_PairedSingleSampleWf_optimized.inputs.20k.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "Take note of the .64 extensions on the reference files, issues between 32 and 64 bit OS", 3 | 4 | "##_COMMENT2": "SAMPLES - read the README to find other examples.", 5 | "PairedEndSingleSampleWorkflow.sample_name": "NA12878", 6 | "PairedEndSingleSampleWorkflow.base_file_name": "NA12878", 7 | "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ 8 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 9 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 10 | 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 11 | ], 12 | "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", 13 | "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".unmapped.bam", 14 | 15 | "##_COMMENT3": "REFERENCES", 16 | "PairedEndSingleSampleWorkflow.fingerprint_genotypes_file": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/NA12878.hg38.reference.fingerprint.vcf", 17 | "PairedEndSingleSampleWorkflow.contamination_sites_ud": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "PairedEndSingleSampleWorkflow.contamination_sites_bed": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "PairedEndSingleSampleWorkflow.contamination_sites_mu": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "PairedEndSingleSampleWorkflow.haplotype_scatter_count" : 50, 22 | "PairedEndSingleSampleWorkflow.break_bands_at_multiples_of" : 1000000, 23 | "PairedEndSingleSampleWorkflow.ref_dict": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", 24 | "PairedEndSingleSampleWorkflow.ref_fasta": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", 25 | "PairedEndSingleSampleWorkflow.ref_fasta_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 26 | "PairedEndSingleSampleWorkflow.ref_alt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 27 | "PairedEndSingleSampleWorkflow.ref_sa": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 28 | "PairedEndSingleSampleWorkflow.ref_amb": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 29 | "PairedEndSingleSampleWorkflow.ref_bwt": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 30 | "PairedEndSingleSampleWorkflow.ref_ann": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 31 | "PairedEndSingleSampleWorkflow.ref_pac": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", 32 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 33 | "PairedEndSingleSampleWorkflow.known_snps_sites_vcf_index": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 34 | "PairedEndSingleSampleWorkflow.dbSNP_vcf": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 35 | "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": 
"/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 36 | "PairedEndSingleSampleWorkflow.wgs_coverage_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_coverage_regions.hg38.interval_list", 37 | "PairedEndSingleSampleWorkflow.wgs_evaluation_interval_list": "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/wgs_evaluation_regions.hg38.interval_list", 38 | "PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ 39 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 40 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 41 | ], 42 | "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ 43 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 44 | "/cluster_share/data/RefArch_Broad_data/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 45 | ], 46 | 47 | "##_COMMENT5": "PATH TO GENOMICS TOOLS FOR NON-DOCKERIZED WORKFLOW", 48 | "PairedEndSingleSampleWorkflow.tool_path": "/cluster_share/Hybrid_Pipeline/tools", 49 | 50 | "##_COMMENT6":"OPTIMIZATION FLAGS FOR BWA, SAMTOOLS, GATK-GKL", 51 | "## SamToFastQ_COMMENT": "BWA mem is the bottleneck - request 16 cores for the task, assign 16T to bwa and let SamToFastQ and MergeBamAlignment timeshare", 52 | "PairedEndSingleSampleWorkflow.bwa_threads":16, 53 | "## SamtoolsSort_COMMENT": "Threads for Samtools sort step", 54 | "PairedEndSingleSampleWorkflow.samtools_threads":16, 55 | "## GENERAL_COMMENT": "Compression level for all java commands", 56 | "PairedEndSingleSampleWorkflow.compression_level":1, 57 | "## HaplotypeCaller_MT_comment": "See comment in the WDL file", 58 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_implementation":"AVX_LOGLESS_CACHING", 59 | "PairedEndSingleSampleWorkflow.gatk_gkl_pairhmm_threads":1, 60 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.smith_waterman_implementation":"AVX_ENABLED", 61 | "##COMMENT_TMPDIR": "defaults to /tmp, BUT you might want to customize this. 
For example, if your working directory is on an NVMe SSD", 62 | "PairedEndSingleSampleWorkflow.tmp_directory": "/tmp", 63 | "PairedEndSingleSampleWorkflow.SortSampleBam.mem_limit": "64M", 64 | 65 | "##_COMMENT7":"JAVA HEAP MEMORY OPTIONS", 66 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.java_heap_memory_initial":"128m", 67 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.java_heap_memory_initial":"2g", 68 | "PairedEndSingleSampleWorkflow.SortSampleBam.java_heap_memory_initial":"4g", 69 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 70 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.java_heap_memory_initial":"5g", 71 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.java_heap_memory_initial":"5g", 72 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.java_heap_memory_initial":"2g", 73 | "PairedEndSingleSampleWorkflow.CheckFingerprint.java_heap_memory_initial":"1g", 74 | "PairedEndSingleSampleWorkflow.MarkDuplicates.java_heap_memory_initial":"4g", 75 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.java_heap_memory_initial":"4g", 76 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.java_heap_memory_initial":"3g", 77 | "PairedEndSingleSampleWorkflow.ApplyBQSR.java_heap_memory_initial":"3g", 78 | "PairedEndSingleSampleWorkflow.GatherBamFiles.java_heap_memory_initial":"2g", 79 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.java_heap_memory_initial":"6g", 80 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.java_heap_memory_initial":"2g", 81 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.java_heap_memory_initial":"2g", 82 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.java_heap_memory_initial":"1g", 83 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.java_heap_memory_initial":"1g", 84 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.haplotypecaller_java_heap_memory_initial":"6g", 85 | "PairedEndSingleSampleWorkflow.MergeVCFs.java_heap_memory_initial":"2g", 86 | "PairedEndSingleSampleWorkflow.ValidateGVCF.java_heap_memory_initial":"3g", 87 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.java_heap_memory_initial":"2g", 88 | 89 | "##_COMMENT8":"RUNTIME SECTION MEMORY OPTIONS", 90 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.memory":"2GB", 91 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.memory":"2GB", 92 | "PairedEndSingleSampleWorkflow.GetBwaVersion.memory":"1GB", 93 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.memory":"10GB", 94 | "PairedEndSingleSampleWorkflow.SortSampleBam.memory":"5GB", 95 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.memory":"7GB", 96 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.memory":"7GB", 97 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.memory":"7GB", 98 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.memory":"2GB", 99 | "PairedEndSingleSampleWorkflow.CheckFingerprint.memory":"1GB", 100 | "PairedEndSingleSampleWorkflow.MarkDuplicates.memory":"7GB", 101 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.memory":"2GB", 102 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.memory":"6GB", 103 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.memory":"3GB", 104 | "PairedEndSingleSampleWorkflow.ApplyBQSR.memory":"4GB", 105 | "PairedEndSingleSampleWorkflow.GatherBamFiles.memory":"3GB", 106 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.memory":"7GB", 107 | 
"PairedEndSingleSampleWorkflow.CollectWgsMetrics.memory":"3GB", 108 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.memory":"3GB", 109 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.memory":"2GB", 110 | "PairedEndSingleSampleWorkflow.CheckContamination.memory":"2GB", 111 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.memory":"2GB", 112 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.memory":"7GB", 113 | "PairedEndSingleSampleWorkflow.MergeVCFs.memory":"3GB", 114 | "PairedEndSingleSampleWorkflow.ValidateGVCF.memory":"4GB", 115 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.memory":"3GB", 116 | "PairedEndSingleSampleWorkflow.ConvertToCram.memory":"3GB", 117 | "PairedEndSingleSampleWorkflow.CramToBam.memory":"3GB", 118 | 119 | "##_COMMENT9":"RUNTIME SECTION CPU OPTIONS", 120 | "PairedEndSingleSampleWorkflow.CollectQualityYieldMetrics.cpu":1, 121 | "PairedEndSingleSampleWorkflow.CheckFinalVcfExtension.cpu":1, 122 | "PairedEndSingleSampleWorkflow.GetBwaVersion.cpu":1, 123 | "PairedEndSingleSampleWorkflow.SamToFastqAndBwaMemAndMba.cpu":16, 124 | "PairedEndSingleSampleWorkflow.SortSampleBam.cpu":16, 125 | "PairedEndSingleSampleWorkflow.CollectUnsortedReadgroupBamQualityMetrics.cpu":1, 126 | "PairedEndSingleSampleWorkflow.CollectReadgroupBamQualityMetrics.cpu":1, 127 | "PairedEndSingleSampleWorkflow.CollectAggregationMetrics.cpu":1, 128 | "PairedEndSingleSampleWorkflow.CrossCheckFingerprints.cpu":1, 129 | "PairedEndSingleSampleWorkflow.CheckFingerprint.cpu":1, 130 | "PairedEndSingleSampleWorkflow.MarkDuplicates.cpu":1, 131 | "PairedEndSingleSampleWorkflow.CreateSequenceGroupingTSV.cpu":1, 132 | "PairedEndSingleSampleWorkflow.BaseRecalibrator.cpu":1, 133 | "PairedEndSingleSampleWorkflow.GatherBqsrReports.cpu":1, 134 | "PairedEndSingleSampleWorkflow.ApplyBQSR.cpu":1, 135 | "PairedEndSingleSampleWorkflow.GatherBamFiles.cpu":1, 136 | "PairedEndSingleSampleWorkflow.ValidateBamFromCram.cpu":1, 137 | "PairedEndSingleSampleWorkflow.CollectWgsMetrics.cpu":1, 138 | "PairedEndSingleSampleWorkflow.CollectRawWgsMetrics.cpu":1, 139 | "PairedEndSingleSampleWorkflow.CalculateReadGroupChecksum.cpu":1, 140 | "PairedEndSingleSampleWorkflow.CheckContamination.cpu":1, 141 | "PairedEndSingleSampleWorkflow.ScatterIntervalList.cpu":1, 142 | "PairedEndSingleSampleWorkflow.HaplotypeCaller.cpu":1, 143 | "PairedEndSingleSampleWorkflow.MergeVCFs.cpu":1, 144 | "PairedEndSingleSampleWorkflow.ValidateGVCF.cpu":1, 145 | "PairedEndSingleSampleWorkflow.CollectGvcfCallingMetrics.cpu":1, 146 | "PairedEndSingleSampleWorkflow.ConvertToCram.cpu":1, 147 | "PairedEndSingleSampleWorkflow.CramToBam.cpu":1 148 | } 149 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/UnmappedBamToAlignedBam.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 6 | ## for human whole-genome and exome sequencing data. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. 
Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | import "Alignment.wdl" as Alignment 20 | import "DragmapAlignment.wdl" as DragmapAlignment 21 | import "SplitLargeReadGroup.wdl" as SplitRG 22 | import "Qc.wdl" as QC 23 | import "BamProcessing.wdl" as Processing 24 | import "Utilities.wdl" as Utils 25 | import "DNASeqStructs.wdl" as Structs 26 | 27 | # WORKFLOW DEFINITION 28 | workflow UnmappedBamToAlignedBam { 29 | 30 | input { 31 | SampleAndUnmappedBams sample_and_unmapped_bams 32 | DNASeqSingleSampleReferences references 33 | DragmapReference? dragmap_reference 34 | PapiSettings papi_settings 35 | 36 | File contamination_sites_ud 37 | File contamination_sites_bed 38 | File contamination_sites_mu 39 | 40 | String cross_check_fingerprints_by 41 | File haplotype_database_file 42 | Float lod_threshold 43 | String recalibrated_bam_basename 44 | Boolean hard_clip_reads = false 45 | Boolean unmap_contaminant_reads = true 46 | Boolean bin_base_qualities = true 47 | Boolean somatic = false 48 | Boolean perform_bqsr = true 49 | Boolean use_bwa_mem = true 50 | Boolean allow_empty_ref_alt = false 51 | } 52 | 53 | Float cutoff_for_large_rg_in_gb = 20.0 54 | 55 | String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" 56 | 57 | Int compression_level = 2 58 | 59 | # Get the size of the standard reference files as well as the additional reference files needed for BWA 60 | 61 | # Align flowcell-level unmapped input bams in parallel 62 | scatter (unmapped_bam in sample_and_unmapped_bams.flowcell_unmapped_bams) { 63 | 64 | Float unmapped_bam_size = size(unmapped_bam, "GiB") 65 | 66 | String unmapped_bam_basename = basename(unmapped_bam, sample_and_unmapped_bams.unmapped_bam_suffix) 67 | 68 | # QC the unmapped BAM 69 | call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { 70 | input: 71 | input_bam = unmapped_bam, 72 | metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics" 73 | } 74 | 75 | if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { 76 | # Split bam into multiple smaller bams, 77 | # map reads to reference and recombine into one bam 78 | call SplitRG.SplitLargeReadGroup as SplitRG { 79 | input: 80 | input_bam = unmapped_bam, 81 | bwa_commandline = bwa_commandline, 82 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 83 | reference_fasta = references.reference_fasta, 84 | dragmap_reference = dragmap_reference, 85 | compression_level = compression_level, 86 | hard_clip_reads = hard_clip_reads, 87 | unmap_contaminant_reads = unmap_contaminant_reads, 88 | use_bwa_mem = use_bwa_mem, 89 | allow_empty_ref_alt = allow_empty_ref_alt 90 | } 91 | } 92 | 93 | if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { 94 | # Map reads to reference 95 | if (use_bwa_mem) { 96 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 97 | input: 98 | input_bam = unmapped_bam, 99 | bwa_commandline = bwa_commandline, 100 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 101 | reference_fasta = references.reference_fasta, 102 | compression_level = compression_level, 103 | hard_clip_reads = hard_clip_reads, 104 | unmap_contaminant_reads = unmap_contaminant_reads, 105 | allow_empty_ref_alt = allow_empty_ref_alt 106 | } 107 | } 108 | if (!use_bwa_mem) { 109 | 
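# Map reads to the reference with DRAGMAP instead of BWA-MEM; on this branch a dragmap_reference must be provided,
# otherwise the select_first([dragmap_reference]) in the call below fails the workflow.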
call DragmapAlignment.SamToFastqAndDragmapAndMba as SamToFastqAndDragmapAndMba { 110 | input: 111 | input_bam = unmapped_bam, 112 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 113 | reference_fasta = references.reference_fasta, 114 | dragmap_reference = select_first([dragmap_reference]), 115 | compression_level = compression_level, 116 | hard_clip_reads = hard_clip_reads, 117 | unmap_contaminant_reads = unmap_contaminant_reads 118 | } 119 | } 120 | } 121 | 122 | File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SamToFastqAndDragmapAndMba.output_bam, SplitRG.aligned_bam]) 123 | 124 | Float mapped_bam_size = size(output_aligned_bam, "GiB") 125 | 126 | # QC the aligned but unsorted readgroup BAM 127 | # no reference as the input here is unsorted, providing a reference would cause an error 128 | call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { 129 | input: 130 | input_bam = output_aligned_bam, 131 | output_bam_prefix = unmapped_bam_basename + ".readgroup" 132 | } 133 | } 134 | 135 | # MarkDuplicates and SortSam currently take too long for preemptibles if the input data is too large 136 | Float gb_size_cutoff_for_preemptibles = 110.0 137 | Boolean data_too_large_for_preemptibles = size(output_aligned_bam, "GiB") > gb_size_cutoff_for_preemptibles 138 | 139 | # Aggregate aligned+merged flowcell BAM files and mark duplicates 140 | # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output 141 | # to avoid having to spend time just merging BAM files. 142 | call Processing.MarkDuplicates as MarkDuplicates { 143 | input: 144 | input_bams = output_aligned_bam, 145 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.unsorted.duplicates_marked", 146 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".duplicate_metrics", 147 | total_input_size = size(output_aligned_bam, "GiB"), 148 | compression_level = compression_level 149 | } 150 | 151 | # Sort aggregated+deduped BAM file and fix tags 152 | call Processing.SortSam as SortSampleBam { 153 | input: 154 | input_bam = MarkDuplicates.output_bam, 155 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicate_marked.sorted", 156 | compression_level = compression_level 157 | } 158 | 159 | Float agg_bam_size = size(SortSampleBam.output_bam, "GiB") 160 | 161 | if (defined(haplotype_database_file)) { 162 | # Check identity of fingerprints across readgroups 163 | call QC.CrossCheckFingerprints as CrossCheckFingerprints { 164 | input: 165 | input_bams = [ SortSampleBam.output_bam ], 166 | input_bam_indexes = [SortSampleBam.output_bam_index], 167 | haplotype_database_file = haplotype_database_file, 168 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".crosscheck", 169 | total_input_size = agg_bam_size, 170 | lod_threshold = lod_threshold, 171 | cross_check_by = cross_check_fingerprints_by 172 | } 173 | } 174 | 175 | # Create list of sequences for scatter-gather parallelization 176 | call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { 177 | input: 178 | ref_dict = references.reference_fasta.ref_dict 179 | } 180 | 181 | # Estimate level of cross-sample contamination 182 | call Processing.CheckContamination as CheckContamination { 183 | input: 184 | input_bam = SortSampleBam.output_bam, 185 | input_bam_index = SortSampleBam.output_bam_index, 186 | contamination_sites_ud = contamination_sites_ud, 187 | contamination_sites_bed = 
contamination_sites_bed, 188 | contamination_sites_mu = contamination_sites_mu, 189 | ref_fasta = references.reference_fasta.ref_fasta, 190 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 191 | output_prefix = sample_and_unmapped_bams.base_file_name + ".preBqsr", 192 | contamination_underestimation_factor = 0.75 193 | } 194 | 195 | # We need disk to localize the sharded input and output due to the scatter for BQSR. 196 | # If we take the number we are scattering by and reduce it by 10 we will have enough disk space 197 | # to account for the fact that the data is not split evenly. 198 | Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) 199 | Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 200 | Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 201 | 202 | # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel 203 | 204 | if (perform_bqsr) { 205 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { 206 | # Generate the recalibration model by interval 207 | call Processing.BaseRecalibrator as BaseRecalibrator { 208 | input: 209 | input_bam = SortSampleBam.output_bam, 210 | input_bam_index = SortSampleBam.output_bam_index, 211 | recalibration_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 212 | sequence_group_interval = subgroup, 213 | dbsnp_vcf = references.dbsnp_vcf, 214 | dbsnp_vcf_index = references.dbsnp_vcf_index, 215 | known_indels_sites_vcfs = references.known_indels_sites_vcfs, 216 | known_indels_sites_indices = references.known_indels_sites_indices, 217 | ref_dict = references.reference_fasta.ref_dict, 218 | ref_fasta = references.reference_fasta.ref_fasta, 219 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 220 | bqsr_scatter = bqsr_divisor 221 | } 222 | } 223 | 224 | # Merge the recalibration reports resulting from by-interval recalibration 225 | # The reports are always the same size 226 | call Processing.GatherBqsrReports as GatherBqsrReports { 227 | input: 228 | input_bqsr_reports = BaseRecalibrator.recalibration_report, 229 | output_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv" 230 | } 231 | 232 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { 233 | # Apply the recalibration model by interval 234 | call Processing.ApplyBQSR as ApplyBQSR { 235 | input: 236 | input_bam = SortSampleBam.output_bam, 237 | input_bam_index = SortSampleBam.output_bam_index, 238 | output_bam_basename = recalibrated_bam_basename, 239 | recalibration_report = GatherBqsrReports.output_bqsr_report, 240 | sequence_group_interval = subgroup, 241 | ref_dict = references.reference_fasta.ref_dict, 242 | ref_fasta = references.reference_fasta.ref_fasta, 243 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 244 | bqsr_scatter = bqsr_divisor, 245 | compression_level = compression_level, 246 | bin_base_qualities = bin_base_qualities, 247 | somatic = somatic 248 | } 249 | } 250 | } 251 | 252 | # Merge the recalibrated BAM files resulting from by-interval recalibration 253 | call Processing.GatherSortedBamFiles as GatherBamFiles { 254 | input: 255 | input_bams = select_first([ApplyBQSR.recalibrated_bam, [SortSampleBam.output_bam]]), 256 | output_bam_basename = sample_and_unmapped_bams.base_file_name, 257 | total_input_size = agg_bam_size, 258 | compression_level = compression_level, 259 | } 260 | 261 | # Outputs that will be retained when execution is complete 262 |
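# Note: output_bqsr_reports is optional because BQSR can be skipped (perform_bqsr = false); in that case
# GatherBamFiles above falls back, via select_first, to the sorted duplicate-marked BAM from SortSampleBam.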
output { 263 | Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics 264 | 265 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf 266 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics 267 | Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf 268 | Array[File] unsorted_read_group_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics 269 | Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf 270 | Array[File] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics 271 | Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf 272 | Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics 273 | 274 | File? cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics 275 | 276 | File selfSM = CheckContamination.selfSM 277 | Float contamination = CheckContamination.contamination 278 | 279 | File duplicate_metrics = MarkDuplicates.duplicate_metrics 280 | File? output_bqsr_reports = GatherBqsrReports.output_bqsr_report 281 | 282 | File output_bam = GatherBamFiles.output_bam 283 | File output_bam_index = GatherBamFiles.output_bam_index 284 | } 285 | meta { 286 | allowNestedInputs: true 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /20k_Throughput-run/WholeGenomeGermlineSingleSample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data pre-processing and initial variant calling (GVCF 6 | ## generation) according to the GATK Best Practices (June 2016) for germline SNP and 7 | ## Indel discovery in human whole-genome data. 8 | ## 9 | ## Requirements/expectations : 10 | ## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format 11 | ## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) 12 | ## - Input uBAM files must additionally comply with the following requirements: 13 | ## - - filenames all have the same suffix (we use ".unmapped.bam") 14 | ## - - files must pass validation by ValidateSamFile 15 | ## - - reads are provided in query-sorted order 16 | ## - - all reads must have an RG tag 17 | ## - GVCF output names must end in ".g.vcf.gz" 18 | ## - Reference genome must be Hg38 with ALT contigs 19 | ## 20 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 21 | ## For program versions, see docker containers. 22 | ## 23 | ## LICENSING : 24 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 25 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 26 | ## be subject to different licenses. Users are responsible for checking that they are 27 | ## authorized to run all programs before running this script. 
Please see the docker 28 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 29 | ## licensing information pertaining to the included programs. 30 | 31 | import "UnmappedBamToAlignedBam.wdl" as ToBam 32 | import "AggregatedBamQC.wdl" as AggregatedQC 33 | import "Qc.wdl" as QC 34 | import "BamToCram.wdl" as ToCram 35 | import "Utilities.wdl" as Utilities 36 | import "VariantCalling.wdl" as ToGvcf 37 | import "DNASeqStructs.wdl" 38 | 39 | # WORKFLOW DEFINITION 40 | workflow WholeGenomeGermlineSingleSample { 41 | 42 | 43 | String pipeline_version = "3.1.19" 44 | 45 | 46 | input { 47 | SampleAndUnmappedBams sample_and_unmapped_bams 48 | DNASeqSingleSampleReferences references 49 | DragmapReference? dragmap_reference 50 | VariantCallingScatterSettings scatter_settings 51 | PapiSettings papi_settings 52 | 53 | File? fingerprint_genotypes_file 54 | File? fingerprint_genotypes_index 55 | 56 | File wgs_coverage_interval_list 57 | 58 | Boolean provide_bam_output = false 59 | Boolean use_gatk3_haplotype_caller = false 60 | 61 | Boolean dragen_functional_equivalence_mode = false 62 | Boolean dragen_maximum_quality_mode = false 63 | 64 | Boolean run_dragen_mode_variant_calling = false 65 | Boolean use_spanning_event_genotyping = true 66 | Boolean unmap_contaminant_reads = true 67 | Boolean perform_bqsr = true 68 | Boolean use_bwa_mem = true 69 | Boolean allow_empty_ref_alt = false 70 | Boolean use_dragen_hard_filtering = false 71 | } 72 | 73 | if (dragen_functional_equivalence_mode && dragen_maximum_quality_mode) { 74 | call Utilities.ErrorWithMessage as PresetArgumentsError { 75 | input: 76 | message = "Both dragen_functional_equivalence_mode and dragen_maximum_quality_mode have been set to true, however, they are mutually exclusive. You can set either of them to true, or set them both to false and adjust the arguments individually." 77 | } 78 | } 79 | 80 | if (run_dragen_mode_variant_calling && use_gatk3_haplotype_caller) { 81 | call Utilities.ErrorWithMessage as DragenModeVariantCallingAndGATK3Error { 82 | input: 83 | message = "DRAGEN mode variant calling has been activated, however, the HaplotypeCaller version has been set to use GATK 3. Please set use_gatk3_haplotype_caller to false to use DRAGEN mode variant calling." 
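# Fail fast: DRAGEN mode variant calling is only supported with the GATK4 HaplotypeCaller, so these two options cannot be combined.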
84 | } 85 | } 86 | 87 | # Set DRAGEN-related arguments according to the preset arguments 88 | Boolean run_dragen_mode_variant_calling_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else run_dragen_mode_variant_calling 89 | Boolean use_spanning_event_genotyping_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else use_spanning_event_genotyping) 90 | Boolean unmap_contaminant_reads_ = if dragen_functional_equivalence_mode then false else (if dragen_maximum_quality_mode then true else unmap_contaminant_reads) 91 | Boolean perform_bqsr_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else perform_bqsr 92 | Boolean use_bwa_mem_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_bwa_mem 93 | Boolean use_gatk3_haplotype_caller_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then false else use_gatk3_haplotype_caller 94 | Boolean use_dragen_hard_filtering_ = if (dragen_functional_equivalence_mode || dragen_maximum_quality_mode) then true else use_dragen_hard_filtering 95 | 96 | # Not overridable: 97 | Float lod_threshold = -20.0 98 | String cross_check_fingerprints_by = "READGROUP" 99 | String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" 100 | 101 | String final_gvcf_base_name = select_first([sample_and_unmapped_bams.final_gvcf_base_name, sample_and_unmapped_bams.base_file_name]) 102 | 103 | call ToBam.UnmappedBamToAlignedBam { 104 | input: 105 | sample_and_unmapped_bams = sample_and_unmapped_bams, 106 | references = references, 107 | dragmap_reference = dragmap_reference, 108 | papi_settings = papi_settings, 109 | 110 | contamination_sites_ud = references.contamination_sites_ud, 111 | contamination_sites_bed = references.contamination_sites_bed, 112 | contamination_sites_mu = references.contamination_sites_mu, 113 | 114 | cross_check_fingerprints_by = cross_check_fingerprints_by, 115 | haplotype_database_file = references.haplotype_database_file, 116 | lod_threshold = lod_threshold, 117 | recalibrated_bam_basename = recalibrated_bam_basename, 118 | perform_bqsr = perform_bqsr_, 119 | use_bwa_mem = use_bwa_mem_, 120 | unmap_contaminant_reads = unmap_contaminant_reads_, 121 | allow_empty_ref_alt = allow_empty_ref_alt 122 | } 123 | 124 | call AggregatedQC.AggregatedBamQC { 125 | input: 126 | base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, 127 | base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, 128 | base_name = sample_and_unmapped_bams.base_file_name, 129 | sample_name = sample_and_unmapped_bams.sample_name, 130 | recalibrated_bam_base_name = recalibrated_bam_basename, 131 | haplotype_database_file = references.haplotype_database_file, 132 | references = references, 133 | fingerprint_genotypes_file = fingerprint_genotypes_file, 134 | fingerprint_genotypes_index = fingerprint_genotypes_index, 135 | papi_settings = papi_settings 136 | } 137 | 138 | call ToCram.BamToCram as BamToCram { 139 | input: 140 | input_bam = UnmappedBamToAlignedBam.output_bam, 141 | ref_fasta = references.reference_fasta.ref_fasta, 142 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 143 | ref_dict = references.reference_fasta.ref_dict, 144 | duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, 145 | chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, 146 | base_file_name = 
sample_and_unmapped_bams.base_file_name, 147 | } 148 | 149 | # QC the sample WGS metrics (stringent thresholds) 150 | call QC.CollectWgsMetrics as CollectWgsMetrics { 151 | input: 152 | input_bam = UnmappedBamToAlignedBam.output_bam, 153 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 154 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", 155 | ref_fasta = references.reference_fasta.ref_fasta, 156 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 157 | wgs_coverage_interval_list = wgs_coverage_interval_list, 158 | } 159 | 160 | # QC the sample raw WGS metrics (common thresholds) 161 | call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { 162 | input: 163 | input_bam = UnmappedBamToAlignedBam.output_bam, 164 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 165 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", 166 | ref_fasta = references.reference_fasta.ref_fasta, 167 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 168 | wgs_coverage_interval_list = wgs_coverage_interval_list, 169 | } 170 | 171 | call ToGvcf.VariantCalling as BamToGvcf { 172 | input: 173 | run_dragen_mode_variant_calling = run_dragen_mode_variant_calling_, 174 | use_spanning_event_genotyping = use_spanning_event_genotyping_, 175 | calling_interval_list = references.calling_interval_list, 176 | evaluation_interval_list = references.evaluation_interval_list, 177 | haplotype_scatter_count = scatter_settings.haplotype_scatter_count, 178 | break_bands_at_multiples_of = scatter_settings.break_bands_at_multiples_of, 179 | contamination = UnmappedBamToAlignedBam.contamination, 180 | input_bam = UnmappedBamToAlignedBam.output_bam, 181 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 182 | ref_fasta = references.reference_fasta.ref_fasta, 183 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 184 | ref_dict = references.reference_fasta.ref_dict, 185 | ref_str = references.reference_fasta.ref_str, 186 | dbsnp_vcf = references.dbsnp_vcf, 187 | dbsnp_vcf_index = references.dbsnp_vcf_index, 188 | base_file_name = sample_and_unmapped_bams.base_file_name, 189 | final_vcf_base_name = final_gvcf_base_name, 190 | use_gatk3_haplotype_caller = use_gatk3_haplotype_caller_, 191 | use_dragen_hard_filtering = use_dragen_hard_filtering_ 192 | } 193 | 194 | if (provide_bam_output) { 195 | File provided_output_bam = UnmappedBamToAlignedBam.output_bam 196 | File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index 197 | } 198 | 199 | # Outputs that will be retained when execution is complete 200 | output { 201 | Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics 202 | 203 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf 204 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics 205 | Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf 206 | Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics 207 | Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf 208 | Array[File] unsorted_read_group_quality_by_cycle_metrics = 
UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics 209 | Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf 210 | Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics 211 | 212 | File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics 213 | File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics 214 | File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf 215 | File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics 216 | 217 | File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics 218 | 219 | File selfSM = UnmappedBamToAlignedBam.selfSM 220 | Float contamination = UnmappedBamToAlignedBam.contamination 221 | 222 | File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 223 | 224 | File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics 225 | File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics 226 | File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics 227 | File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics 228 | File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf 229 | File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics 230 | File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf 231 | File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics 232 | File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics 233 | File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics 234 | File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf 235 | File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics 236 | File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics 237 | 238 | File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics 239 | File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics 240 | 241 | File wgs_metrics = CollectWgsMetrics.metrics 242 | File raw_wgs_metrics = CollectRawWgsMetrics.metrics 243 | 244 | File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics 245 | File? output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports 246 | 247 | File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics 248 | File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics 249 | 250 | File? output_bam = provided_output_bam 251 | File? 
output_bam_index = provided_output_bam_index 252 | 253 | File output_cram = BamToCram.output_cram 254 | File output_cram_index = BamToCram.output_cram_index 255 | File output_cram_md5 = BamToCram.output_cram_md5 256 | 257 | File validate_cram_file_report = BamToCram.validate_cram_file_report 258 | 259 | File output_vcf = BamToGvcf.output_vcf 260 | File output_vcf_index = BamToGvcf.output_vcf_index 261 | } 262 | meta { 263 | allowNestedInputs: true 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/BamProcessing.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Sort BAM file by coordinate order 19 | task SortSam { 20 | input { 21 | File input_bam 22 | String output_bam_basename 23 | Int compression_level 24 | Int additional_disk = 20 25 | Int memory_multiplier = 1 26 | } 27 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 28 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 29 | Float sort_sam_disk_multiplier = 3.25 30 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + additional_disk 31 | 32 | command { 33 | java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -Xmx4900m -jar /mnt/lustre/genomics/tools/picard.jar \ 34 | SortSam \ 35 | INPUT=~{input_bam} \ 36 | OUTPUT=~{output_bam_basename}.bam \ 37 | SORT_ORDER="coordinate" \ 38 | CREATE_INDEX=true \ 39 | CREATE_MD5_FILE=true \ 40 | MAX_RECORDS_IN_RAM=300000 41 | 42 | } 43 | runtime { 44 | cpu: "16" 45 | memory: "5000 MiB" 46 | } 47 | output { 48 | File output_bam = "~{output_bam_basename}.bam" 49 | File output_bam_index = "~{output_bam_basename}.bai" 50 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 51 | } 52 | } 53 | 54 | 55 | # Mark duplicate reads to avoid counting non-independent observations 56 | task MarkDuplicates { 57 | input { 58 | Array[File] input_bams 59 | String output_bam_basename 60 | String metrics_filename 61 | Float total_input_size 62 | Int compression_level 63 | 64 | # The program default for READ_NAME_REGEX is appropriate in nearly every case. 65 | # Sometimes we wish to supply "null" in order to turn off optical duplicate detection 66 | # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing 67 | String? read_name_regex 68 | Int memory_multiplier = 1 69 | Int additional_disk = 20 70 | 71 | Float? 
sorting_collection_size_ratio 72 | } 73 | 74 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 75 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom 76 | Float md_disk_multiplier = 3 77 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + additional_disk 78 | 79 | Float memory_size = 7.5 * memory_multiplier 80 | Int java_memory_size = (ceil(memory_size) - 2) 81 | 82 | # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly 83 | # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 84 | # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" 85 | 86 | command { 87 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /mnt/lustre/genomics/tools/picard.jar \ 88 | MarkDuplicates \ 89 | INPUT=~{sep=' INPUT=' input_bams} \ 90 | OUTPUT=~{output_bam_basename}.bam \ 91 | METRICS_FILE=~{metrics_filename} \ 92 | VALIDATION_STRINGENCY=SILENT \ 93 | ~{"READ_NAME_REGEX=" + read_name_regex} \ 94 | ~{"SORTING_COLLECTION_SIZE_RATIO=" + sorting_collection_size_ratio} \ 95 | OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ 96 | ASSUME_SORT_ORDER="queryname" \ 97 | CLEAR_DT="false" \ 98 | ADD_PG_TAG_TO_READS=false 99 | } 100 | runtime { 101 | cpu: "2" 102 | memory: "~{memory_size} GiB" 103 | } 104 | output { 105 | File output_bam = "~{output_bam_basename}.bam" 106 | File duplicate_metrics = "~{metrics_filename}" 107 | } 108 | } 109 | 110 | # Generate Base Quality Score Recalibration (BQSR) model 111 | task BaseRecalibrator { 112 | input { 113 | File input_bam 114 | File input_bam_index 115 | String recalibration_report_filename 116 | Array[String] sequence_group_interval 117 | File dbsnp_vcf 118 | File dbsnp_vcf_index 119 | Array[File] known_indels_sites_vcfs 120 | Array[File] known_indels_sites_indices 121 | File ref_dict 122 | File ref_fasta 123 | File ref_fasta_index 124 | Int bqsr_scatter 125 | } 126 | 127 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 128 | Float dbsnp_size = size(dbsnp_vcf, "GiB") 129 | Int disk_size = ceil((size(input_bam, "GiB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 130 | 131 | parameter_meta { 132 | input_bam: { 133 | localization_optional: true 134 | } 135 | } 136 | 137 | command { 138 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ 139 | -Xlog:gc*:gc_log.log:time,level,tags \ 140 | -Xloggc:gc_log.log -Xms5g -Xmx6g" \ 141 | BaseRecalibrator \ 142 | -R ~{ref_fasta} \ 143 | -I ~{input_bam} \ 144 | --use-original-qualities \ 145 | -O ~{recalibration_report_filename} \ 146 | --known-sites ~{dbsnp_vcf} \ 147 | --known-sites ~{sep=" -known-sites " known_indels_sites_vcfs} \ 148 | -L ~{sep=" -L " sequence_group_interval} 149 | } 150 | runtime { 151 | cpu: "2" 152 | memory: "6000 MiB" 153 | } 154 | output { 155 | File recalibration_report = "~{recalibration_report_filename}" 156 | } 157 | } 158 | 159 | # Apply Base Quality Score Recalibration (BQSR) model 160 | task ApplyBQSR { 161 | input { 162 | File input_bam 163 | File input_bam_index 164 | String output_bam_basename 165 | File recalibration_report 166 | Array[String] sequence_group_interval 167 | File ref_dict 168 | File ref_fasta 169 | File ref_fasta_index 
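# compression_level and bqsr_scatter below are supplied by the calling workflow; bqsr_scatter is the shard count
# used to scale this task's per-shard disk-size estimate.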
170 | Int compression_level 171 | Int bqsr_scatter 172 | Int memory_multiplier = 1 173 | Int additional_disk = 20 174 | Boolean bin_base_qualities = true 175 | Boolean somatic = false 176 | } 177 | 178 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 179 | Int disk_size = ceil((size(input_bam, "GiB") * 3 / bqsr_scatter) + ref_size) + additional_disk 180 | 181 | Int memory_size = ceil(3500 * memory_multiplier) 182 | Int java_memory_mb = memory_size - 500 183 | 184 | Boolean bin_somatic_base_qualities = bin_base_qualities && somatic 185 | 186 | parameter_meta { 187 | input_bam: { 188 | localization_optional: true 189 | } 190 | } 191 | 192 | command { 193 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-XX:+PrintFlagsFinal \ 194 | -Xlog:gc*:gc_log.log:time,level,tags \ 195 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms3000m -Xmx~{java_memory_mb}m" \ 196 | ApplyBQSR \ 197 | --create-output-bam-md5 \ 198 | --add-output-sam-program-record \ 199 | -R ~{ref_fasta} \ 200 | -I ~{input_bam} \ 201 | --use-original-qualities \ 202 | -O ~{output_bam_basename}.bam \ 203 | -bqsr ~{recalibration_report} \ 204 | ~{true='--static-quantized-quals 10' false='' bin_base_qualities} \ 205 | ~{true='--static-quantized-quals 20' false='' bin_base_qualities} \ 206 | ~{true='--static-quantized-quals 30' false='' bin_base_qualities} \ 207 | ~{true='--static-quantized-quals 40' false='' bin_somatic_base_qualities} \ 208 | ~{true='--static-quantized-quals 50' false='' bin_somatic_base_qualities} \ 209 | -L ~{sep=" -L " sequence_group_interval} 210 | } 211 | runtime { 212 | memory: "~{memory_size} MiB" 213 | cpu: "2" 214 | } 215 | output { 216 | File recalibrated_bam = "~{output_bam_basename}.bam" 217 | File recalibrated_bam_checksum = "~{output_bam_basename}.bam.md5" 218 | } 219 | } 220 | 221 | # Combine multiple recalibration tables from scattered BaseRecalibrator runs 222 | task GatherBqsrReports { 223 | input { 224 | Array[File] input_bqsr_reports 225 | String output_report_filename 226 | } 227 | 228 | command { 229 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms3000m -Xmx3400m" \ 230 | GatherBQSRReports \ 231 | -I ~{sep=' -I ' input_bqsr_reports} \ 232 | -O ~{output_report_filename} 233 | } 234 | runtime { 235 | cpu: "2" 236 | memory: "3500 MiB" 237 | } 238 | output { 239 | File output_bqsr_report = "~{output_report_filename}" 240 | } 241 | } 242 | 243 | # Combine multiple *sorted* BAM files 244 | task GatherSortedBamFiles { 245 | input { 246 | Array[File] input_bams 247 | String output_bam_basename 248 | Float total_input_size 249 | Int compression_level 250 | Int additional_disk = 20 251 | Int memory_multiplier = 1 252 | } 253 | 254 | # Multiply the input bam size by two to account for the input and output 255 | Int disk_size = ceil(2 * total_input_size) + additional_disk 256 | 257 | command { 258 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 259 | GatherBamFiles \ 260 | INPUT=~{sep=' INPUT=' input_bams} \ 261 | OUTPUT=~{output_bam_basename}.bam \ 262 | CREATE_INDEX=true \ 263 | CREATE_MD5_FILE=true 264 | } 265 | runtime { 266 | memory: "3000 MiB" 267 | } 268 | output { 269 | File output_bam = "~{output_bam_basename}.bam" 270 | File output_bam_index = "~{output_bam_basename}.bai" 271 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 272 | } 273 | } 274 | 275 | # Combine multiple *unsorted* BAM files 276 | # Note that 
if/when WDL supports optional outputs, we should merge this task with the sorted version 277 | task GatherUnsortedBamFiles { 278 | input { 279 | Array[File] input_bams 280 | String output_bam_basename 281 | Float total_input_size 282 | Int compression_level 283 | } 284 | 285 | # Multiply the input bam size by two to account for the input and output 286 | Int disk_size = ceil(2 * total_input_size) + 20 287 | 288 | command { 289 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 290 | GatherBamFiles \ 291 | INPUT=~{sep=' INPUT=' input_bams} \ 292 | OUTPUT=~{output_bam_basename}.bam \ 293 | CREATE_INDEX=false \ 294 | CREATE_MD5_FILE=false 295 | } 296 | runtime { 297 | cpu: "2" 298 | memory: "3 GiB" 299 | } 300 | output { 301 | File output_bam = "~{output_bam_basename}.bam" 302 | } 303 | } 304 | 305 | task GenerateSubsettedContaminationResources { 306 | input { 307 | String bait_set_name 308 | File target_interval_list 309 | File contamination_sites_ud 310 | File contamination_sites_bed 311 | File contamination_sites_mu 312 | } 313 | 314 | String output_ud = bait_set_name + "." + basename(contamination_sites_ud) 315 | String output_bed = bait_set_name + "." + basename(contamination_sites_bed) 316 | String output_mu = bait_set_name + "." + basename(contamination_sites_mu) 317 | String target_overlap_counts = "target_overlap_counts.txt" 318 | 319 | command <<< 320 | set -e -o pipefail 321 | 322 | grep -vE "^@" ~{target_interval_list} | 323 | awk -v OFS='\t' '$2=$2-1' | 324 | /app/bedtools intersect -c -a ~{contamination_sites_bed} -b - | 325 | cut -f6 > ~{target_overlap_counts} 326 | 327 | function restrict_to_overlaps() { 328 | # print lines from whole-genome file from loci with non-zero overlap 329 | # with target intervals 330 | WGS_FILE=$1 331 | EXOME_FILE=$2 332 | paste ~{target_overlap_counts} $WGS_FILE | 333 | grep -Ev "^0" | 334 | cut -f 2- > $EXOME_FILE 335 | echo "Generated $EXOME_FILE" 336 | } 337 | 338 | restrict_to_overlaps ~{contamination_sites_ud} ~{output_ud} 339 | restrict_to_overlaps ~{contamination_sites_bed} ~{output_bed} 340 | restrict_to_overlaps ~{contamination_sites_mu} ~{output_mu} 341 | 342 | >>> 343 | runtime { 344 | memory: "3.5 GiB" 345 | } 346 | output { 347 | File subsetted_contamination_ud = output_ud 348 | File subsetted_contamination_bed = output_bed 349 | File subsetted_contamination_mu = output_mu 350 | } 351 | } 352 | 353 | # Notes on the contamination estimate: 354 | # The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId 355 | # 356 | # In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM 357 | # 358 | # Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER 359 | # But first, it is divided by an underestimation factor thusly: 360 | # float(FREEMIX) / ContaminationUnderestimationFactor 361 | # where the denominator is hardcoded in Zamboni: 362 | # val ContaminationUnderestimationFactor = 0.75f 363 | # 364 | # Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted 365 | # contamination estimate for use in variant calling 366 | task CheckContamination { 367 | input { 368 | File input_bam 369 | File input_bam_index 370 | File contamination_sites_ud 371 | File contamination_sites_bed 372 | File contamination_sites_mu 373 | File ref_fasta 374 | File ref_fasta_index 375 | String output_prefix 376 | Float contamination_underestimation_factor 377 | 
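# Worked example of the adjustment described in the notes above (the FREEMIX value is
# illustrative only): with FREEMIX = 0.015 in the .selfSM file and
# contamination_underestimation_factor = 0.75 (the hardcoded Zamboni value), the task
# prints 0.015 / 0.75 = 0.02 to stdout, which becomes the `contamination` output used
# for variant calling, while the unadjusted value remains in the returned selfSM file.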
Boolean disable_sanity_check = false 378 | } 379 | 380 | Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 30 381 | 382 | command <<< 383 | set -e 384 | 385 | # creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. 386 | # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values 387 | /mnt/lustre/genomics/tools/VerifyBamID/bin/VerifyBamID \ 388 | --Verbose \ 389 | --NumPC 4 \ 390 | --Output ~{output_prefix} \ 391 | --BamFile ~{input_bam} \ 392 | --Reference ~{ref_fasta} \ 393 | --UDPath ~{contamination_sites_ud} \ 394 | --MeanPath ~{contamination_sites_mu} \ 395 | --BedPath ~{contamination_sites_bed} \ 396 | ~{true="--DisableSanityCheck" false="" disable_sanity_check} \ 397 | 1>/dev/null 398 | 399 | # used to read from the selfSM file and calculate contamination, which gets printed out 400 | python3 <>> 422 | runtime { 423 | memory: "7.5 GiB" 424 | cpu: "2" 425 | } 426 | output { 427 | File selfSM = "~{output_prefix}.selfSM" 428 | Float contamination = read_float(stdout()) 429 | } 430 | } 431 | -------------------------------------------------------------------------------- /20k_Throughput-run/WDL/Qc.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 
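# Note on the tasks in this file: the commands call picard.jar, gatk and related tools
# from /mnt/lustre/genomics/tools rather than the Docker images referenced above, and
# each task computes an Int disk_size that the runtime sections shown here never
# reference, since this cluster-oriented configuration only requests cpu and memory.
# On a cloud backend the same value would typically be surfaced through a disks
# attribute, along these lines (illustrative sketch, not part of this repo):
#
#   runtime {
#     cpu: "2"
#     memory: "3500 MiB"
#     disks: "local-disk ~{disk_size} HDD"
#   }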
17 | 18 | # Collect sequencing yield quality metrics 19 | task CollectQualityYieldMetrics { 20 | input { 21 | File input_bam 22 | String metrics_filename 23 | } 24 | 25 | Int disk_size = ceil(size(input_bam, "GiB")) + 20 26 | 27 | command { 28 | java -Xms2000m -Xmx3400m -jar /mnt/lustre/genomics/tools/picard.jar \ 29 | CollectQualityYieldMetrics \ 30 | INPUT=~{input_bam} \ 31 | OQ=true \ 32 | OUTPUT=~{metrics_filename} 33 | } 34 | runtime { 35 | cpu: "2" 36 | memory: "3500 MiB" 37 | } 38 | output { 39 | File quality_yield_metrics = "~{metrics_filename}" 40 | } 41 | } 42 | 43 | # Collect base quality and insert size metrics 44 | task CollectUnsortedReadgroupBamQualityMetrics { 45 | input { 46 | File input_bam 47 | String output_bam_prefix 48 | } 49 | 50 | Int disk_size = ceil(size(input_bam, "GiB")) + 20 51 | 52 | command { 53 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 54 | CollectMultipleMetrics \ 55 | INPUT=~{input_bam} \ 56 | OUTPUT=~{output_bam_prefix} \ 57 | ASSUME_SORTED=true \ 58 | PROGRAM=null \ 59 | PROGRAM=CollectBaseDistributionByCycle \ 60 | PROGRAM=CollectInsertSizeMetrics \ 61 | PROGRAM=MeanQualityByCycle \ 62 | PROGRAM=QualityScoreDistribution \ 63 | METRIC_ACCUMULATION_LEVEL=null \ 64 | METRIC_ACCUMULATION_LEVEL=ALL_READS 65 | 66 | touch ~{output_bam_prefix}.insert_size_metrics 67 | touch ~{output_bam_prefix}.insert_size_histogram.pdf 68 | } 69 | runtime { 70 | memory: "7000 MiB" 71 | cpu: "2" 72 | } 73 | output { 74 | File base_distribution_by_cycle_pdf = "~{output_bam_prefix}.base_distribution_by_cycle.pdf" 75 | File base_distribution_by_cycle_metrics = "~{output_bam_prefix}.base_distribution_by_cycle_metrics" 76 | File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" 77 | File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" 78 | File quality_by_cycle_pdf = "~{output_bam_prefix}.quality_by_cycle.pdf" 79 | File quality_by_cycle_metrics = "~{output_bam_prefix}.quality_by_cycle_metrics" 80 | File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" 81 | File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" 82 | } 83 | } 84 | 85 | # Collect alignment summary and GC bias quality metrics 86 | task CollectReadgroupBamQualityMetrics { 87 | input { 88 | File input_bam 89 | File input_bam_index 90 | String output_bam_prefix 91 | File ref_dict 92 | File ref_fasta 93 | File ref_fasta_index 94 | Boolean collect_gc_bias_metrics = true 95 | } 96 | 97 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 98 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 99 | 100 | command { 101 | # These are optionally generated, but need to exist for Cromwell's sake 102 | touch ~{output_bam_prefix}.gc_bias.detail_metrics \ 103 | ~{output_bam_prefix}.gc_bias.pdf \ 104 | ~{output_bam_prefix}.gc_bias.summary_metrics 105 | 106 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 107 | CollectMultipleMetrics \ 108 | INPUT=~{input_bam} \ 109 | REFERENCE_SEQUENCE=~{ref_fasta} \ 110 | OUTPUT=~{output_bam_prefix} \ 111 | ASSUME_SORTED=true \ 112 | PROGRAM=null \ 113 | PROGRAM=CollectAlignmentSummaryMetrics \ 114 | ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ 115 | METRIC_ACCUMULATION_LEVEL=null \ 116 | METRIC_ACCUMULATION_LEVEL=READ_GROUP 117 | } 118 | runtime { 119 | cpu: "2" 120 | memory: "7000 MiB" 121 | } 122 | output { 123 | File alignment_summary_metrics = 
"~{output_bam_prefix}.alignment_summary_metrics" 124 | File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" 125 | File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" 126 | File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" 127 | } 128 | } 129 | 130 | # Collect quality metrics from the aggregated bam 131 | task CollectAggregationMetrics { 132 | input { 133 | File input_bam 134 | File input_bam_index 135 | String output_bam_prefix 136 | File ref_dict 137 | File ref_fasta 138 | File ref_fasta_index 139 | Boolean collect_gc_bias_metrics = true 140 | } 141 | 142 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 143 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 144 | 145 | command { 146 | # These are optionally generated, but need to exist for Cromwell's sake 147 | touch ~{output_bam_prefix}.gc_bias.detail_metrics \ 148 | ~{output_bam_prefix}.gc_bias.pdf \ 149 | ~{output_bam_prefix}.gc_bias.summary_metrics \ 150 | ~{output_bam_prefix}.insert_size_metrics \ 151 | ~{output_bam_prefix}.insert_size_histogram.pdf 152 | 153 | java -Xms5000m -Xmx6900m -jar /mnt/lustre/genomics/tools/picard.jar \ 154 | CollectMultipleMetrics \ 155 | INPUT=~{input_bam} \ 156 | REFERENCE_SEQUENCE=~{ref_fasta} \ 157 | OUTPUT=~{output_bam_prefix} \ 158 | ASSUME_SORTED=true \ 159 | PROGRAM=null \ 160 | PROGRAM=CollectAlignmentSummaryMetrics \ 161 | PROGRAM=CollectInsertSizeMetrics \ 162 | PROGRAM=CollectSequencingArtifactMetrics \ 163 | PROGRAM=QualityScoreDistribution \ 164 | ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ 165 | METRIC_ACCUMULATION_LEVEL=null \ 166 | METRIC_ACCUMULATION_LEVEL=SAMPLE \ 167 | METRIC_ACCUMULATION_LEVEL=LIBRARY 168 | } 169 | runtime { 170 | cpu: "2" 171 | memory: "7000 MiB" 172 | } 173 | output { 174 | File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" 175 | File bait_bias_detail_metrics = "~{output_bam_prefix}.bait_bias_detail_metrics" 176 | File bait_bias_summary_metrics = "~{output_bam_prefix}.bait_bias_summary_metrics" 177 | File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" 178 | File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" 179 | File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" 180 | File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" 181 | File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" 182 | File pre_adapter_detail_metrics = "~{output_bam_prefix}.pre_adapter_detail_metrics" 183 | File pre_adapter_summary_metrics = "~{output_bam_prefix}.pre_adapter_summary_metrics" 184 | File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" 185 | File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" 186 | File error_summary_metrics = "~{output_bam_prefix}.error_summary_metrics" 187 | } 188 | } 189 | 190 | task ConvertSequencingArtifactToOxoG { 191 | input { 192 | File pre_adapter_detail_metrics 193 | File bait_bias_detail_metrics 194 | String base_name 195 | File ref_dict 196 | File ref_fasta 197 | File ref_fasta_index 198 | Int memory_multiplier = 1 199 | } 200 | 201 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 202 | Int disk_size = ceil(size(pre_adapter_detail_metrics, "GiB") + size(bait_bias_detail_metrics, "GiB") + ref_size) + 20 203 | 204 | Int memory_size = ceil(4000 * memory_multiplier) 205 | Int 
java_memory_size = memory_size - 1000 206 | Int max_heap = memory_size - 500 207 | 208 | command { 209 | input_base=$(dirname ~{pre_adapter_detail_metrics})/~{base_name} 210 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m \ 211 | -jar /mnt/lustre/genomics/tools/picard.jar \ 212 | ConvertSequencingArtifactToOxoG \ 213 | --INPUT_BASE $input_base \ 214 | --OUTPUT_BASE ~{base_name} \ 215 | --REFERENCE_SEQUENCE ~{ref_fasta} 216 | } 217 | runtime { 218 | memory: "~{memory_size} MiB" 219 | } 220 | output { 221 | File oxog_metrics = "~{base_name}.oxog_metrics" 222 | } 223 | } 224 | 225 | # Check that the fingerprints of separate readgroups all match 226 | task CrossCheckFingerprints { 227 | input { 228 | Array[File] input_bams 229 | Array[File] input_bam_indexes 230 | File haplotype_database_file 231 | String metrics_filename 232 | Float total_input_size 233 | Float lod_threshold 234 | String cross_check_by 235 | } 236 | 237 | Int disk_size = ceil(total_input_size) + 20 238 | 239 | command <<< 240 | java -Dsamjdk.buffer_size=131072 \ 241 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx3400m \ 242 | -jar /mnt/lustre/genomics/tools/picard.jar \ 243 | CrosscheckFingerprints \ 244 | OUTPUT=~{metrics_filename} \ 245 | HAPLOTYPE_MAP=~{haplotype_database_file} \ 246 | EXPECT_ALL_GROUPS_TO_MATCH=true \ 247 | INPUT=~{sep=' INPUT=' input_bams} \ 248 | LOD_THRESHOLD=~{lod_threshold} \ 249 | CROSSCHECK_BY=~{cross_check_by} 250 | >>> 251 | runtime { 252 | cpu: "2" 253 | memory: "3500 MiB" 254 | } 255 | output { 256 | File cross_check_fingerprints_metrics = "~{metrics_filename}" 257 | } 258 | } 259 | 260 | task CheckFingerprintTask { 261 | input { 262 | File? input_bam 263 | File? input_bam_index 264 | File? input_vcf 265 | File? input_vcf_index 266 | String? input_sample_alias 267 | 268 | File genotypes 269 | File? genotypes_index 270 | String expected_sample_alias 271 | 272 | String output_basename 273 | Float genotype_lod_threshold = 5.0 274 | 275 | File haplotype_database_file 276 | File? ref_fasta 277 | File? ref_fasta_index 278 | 279 | Int memory_size = 2500 280 | 281 | Boolean allow_lod_zero = false 282 | } 283 | 284 | Int java_memory_size = memory_size - 1000 285 | Int max_heap = memory_size - 500 286 | 287 | Int disk_size = ceil(size(input_bam, "GiB") + size(input_vcf, "GiB")) + 20 288 | # Picard has different behavior depending on whether or not the OUTPUT parameter ends with a '.', so we are explicitly 289 | # passing in where we want the two metrics files to go to avoid any potential confusion. 
290 | String summary_metrics_location = "~{output_basename}.fingerprinting_summary_metrics" 291 | String detail_metrics_location = "~{output_basename}.fingerprinting_detail_metrics" 292 | 293 | File input_file = select_first([input_vcf, input_bam]) 294 | 295 | command <<< 296 | set -e 297 | java -Xms3g -Xmx3400m -Dpicard.useLegacyParser=false -jar /mnt/lustre/genomics/tools/picard.jar \ 298 | CheckFingerprint \ 299 | --INPUT ~{input_file} \ 300 | ~{if defined(input_vcf) then "--OBSERVED_SAMPLE_ALIAS \"" + input_sample_alias + "\"" else ""} \ 301 | --GENOTYPES ~{genotypes} \ 302 | --EXPECTED_SAMPLE_ALIAS "~{expected_sample_alias}" \ 303 | ~{if defined(input_bam) then "--IGNORE_READ_GROUPS true" else ""} \ 304 | --HAPLOTYPE_MAP ~{haplotype_database_file} \ 305 | --GENOTYPE_LOD_THRESHOLD ~{genotype_lod_threshold} \ 306 | --SUMMARY_OUTPUT ~{summary_metrics_location} \ 307 | --DETAIL_OUTPUT ~{detail_metrics_location} \ 308 | ~{"--REFERENCE_SEQUENCE " + ref_fasta} \ 309 | ~{true='--EXIT_CODE_WHEN_NO_VALID_CHECKS 0' false='' allow_lod_zero} 310 | 311 | CONTENT_LINE=$(cat ~{summary_metrics_location} | 312 | grep -n "## METRICS CLASS\tpicard.analysis.FingerprintingSummaryMetrics" | 313 | cut -f1 -d:) 314 | CONTENT_LINE=$(($CONTENT_LINE+2)) 315 | sed '8q;d' ~{summary_metrics_location} | cut -f5 > lod 316 | >>> 317 | 318 | runtime { 319 | cpu: "2" 320 | memory: "~{memory_size} MiB" 321 | } 322 | 323 | output { 324 | File summary_metrics = summary_metrics_location 325 | File detail_metrics = detail_metrics_location 326 | Float lod = read_float("lod") 327 | } 328 | } 329 | 330 | task CheckPreValidation { 331 | input { 332 | File duplication_metrics 333 | File chimerism_metrics 334 | Float max_duplication_in_reasonable_sample 335 | Float max_chimerism_in_reasonable_sample 336 | } 337 | 338 | command <<< 339 | set -o pipefail 340 | set -e 341 | 342 | grep -A 1 PERCENT_DUPLICATION ~{duplication_metrics} > duplication.csv 343 | grep -A 3 PCT_CHIMERAS ~{chimerism_metrics} | grep -v OF_PAIR > chimerism.csv 344 | 345 | python3 <>> 365 | runtime { 366 | memory: "2 GiB" 367 | } 368 | output { 369 | Float duplication_rate = read_float("duplication_value.txt") 370 | Float chimerism_rate = read_float("chimerism_value.txt") 371 | Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample 372 | } 373 | } 374 | 375 | task ValidateSamFile { 376 | input { 377 | File input_bam 378 | File? input_bam_index 379 | String report_filename 380 | File ref_dict 381 | File ref_fasta 382 | File ref_fasta_index 383 | Int? max_output 384 | Array[String]? ignore 385 | Boolean? 
is_outlier_data 386 | Int memory_multiplier = 1 387 | Int additional_disk = 20 388 | 389 | Int disk_size = ceil(size(input_bam, "GiB") 390 | + size(ref_fasta, "GiB") 391 | + size(ref_fasta_index, "GiB") 392 | + size(ref_dict, "GiB")) + additional_disk 393 | } 394 | 395 | Int memory_size = ceil(16000 * memory_multiplier) 396 | Int java_memory_size = memory_size - 1000 397 | Int max_heap = memory_size - 500 398 | 399 | command { 400 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /mnt/lustre/genomics/tools/picard.jar \ 401 | ValidateSamFile \ 402 | INPUT=~{input_bam} \ 403 | OUTPUT=~{report_filename} \ 404 | REFERENCE_SEQUENCE=~{ref_fasta} \ 405 | ~{"MAX_OUTPUT=" + max_output} \ 406 | IGNORE=~{default="null" sep=" IGNORE=" ignore} \ 407 | MODE=VERBOSE \ 408 | ~{default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ 409 | IS_BISULFITE_SEQUENCED=false 410 | } 411 | runtime { 412 | memory: "~{memory_size} MiB" 413 | cpu: "2" 414 | } 415 | output { 416 | File report = "~{report_filename}" 417 | } 418 | } 419 | 420 | task CollectWgsMetrics { 421 | input { 422 | File input_bam 423 | File input_bam_index 424 | String metrics_filename 425 | File wgs_coverage_interval_list 426 | File ref_fasta 427 | File ref_fasta_index 428 | Int read_length = 250 429 | } 430 | 431 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 432 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 433 | 434 | command { 435 | java -Xms2000m -Xmx3g -jar /mnt/lustre/genomics/tools/picard.jar \ 436 | CollectWgsMetrics \ 437 | INPUT=~{input_bam} \ 438 | VALIDATION_STRINGENCY=SILENT \ 439 | REFERENCE_SEQUENCE=~{ref_fasta} \ 440 | INCLUDE_BQ_HISTOGRAM=true \ 441 | INTERVALS=~{wgs_coverage_interval_list} \ 442 | OUTPUT=~{metrics_filename} \ 443 | USE_FAST_ALGORITHM=true \ 444 | READ_LENGTH=~{read_length} 445 | } 446 | runtime { 447 | cpu: "2" 448 | memory: "3000 MiB" 449 | } 450 | output { 451 | File metrics = "~{metrics_filename}" 452 | } 453 | } 454 | 455 | # Collect raw WGS metrics (commonly used QC thresholds) 456 | task CollectRawWgsMetrics { 457 | input { 458 | File input_bam 459 | File input_bam_index 460 | String metrics_filename 461 | File wgs_coverage_interval_list 462 | File ref_fasta 463 | File ref_fasta_index 464 | Int read_length = 250 465 | Int memory_multiplier = 1 466 | Int additional_disk = 20 467 | } 468 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 469 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk 470 | 471 | Int memory_size = ceil((if (disk_size < 110) then 5 else 7) * memory_multiplier) 472 | String java_memory_size = (memory_size - 1) * 1000 473 | 474 | command { 475 | java -Xms~{java_memory_size}m -Xmx~{memory_size}g -jar /mnt/lustre/genomics/tools/picard.jar \ 476 | CollectRawWgsMetrics \ 477 | INPUT=~{input_bam} \ 478 | VALIDATION_STRINGENCY=SILENT \ 479 | REFERENCE_SEQUENCE=~{ref_fasta} \ 480 | INCLUDE_BQ_HISTOGRAM=true \ 481 | INTERVALS=~{wgs_coverage_interval_list} \ 482 | OUTPUT=~{metrics_filename} \ 483 | USE_FAST_ALGORITHM=true \ 484 | READ_LENGTH=~{read_length} 485 | } 486 | runtime { 487 | cpu: "2" 488 | memory: "~{memory_size} GiB" 489 | } 490 | output { 491 | File metrics = "~{metrics_filename}" 492 | } 493 | } 494 | 495 | task CollectHsMetrics { 496 | input { 497 | File input_bam 498 | File input_bam_index 499 | File ref_fasta 500 | File ref_fasta_index 501 | String metrics_filename 502 | File target_interval_list 503 | File 
bait_interval_list 504 | Int memory_multiplier = 1 505 | Int additional_disk = 20 506 | } 507 | 508 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 509 | Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk 510 | # Try to fit the input bam into memory, within reason. 511 | Int rounded_bam_size = ceil(size(input_bam, "GiB") + 0.5) 512 | Int rounded_memory_size = ceil((if (rounded_bam_size > 10) then 10 else rounded_bam_size) * memory_multiplier) 513 | Int memory_size = if rounded_memory_size < 7 then 7000 else (rounded_memory_size * 1000) 514 | Int java_memory_size = memory_size - 1000 515 | Int max_heap = memory_size - 500 516 | 517 | # There are probably more metrics we want to generate with this tool 518 | command { 519 | java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /mnt/lustre/genomics/tools/picard.jar \ 520 | CollectHsMetrics \ 521 | INPUT=~{input_bam} \ 522 | REFERENCE_SEQUENCE=~{ref_fasta} \ 523 | VALIDATION_STRINGENCY=SILENT \ 524 | TARGET_INTERVALS=~{target_interval_list} \ 525 | BAIT_INTERVALS=~{bait_interval_list} \ 526 | METRIC_ACCUMULATION_LEVEL=null \ 527 | METRIC_ACCUMULATION_LEVEL=SAMPLE \ 528 | METRIC_ACCUMULATION_LEVEL=LIBRARY \ 529 | OUTPUT=~{metrics_filename} 530 | } 531 | 532 | runtime { 533 | memory: "~{memory_size} MiB" 534 | } 535 | 536 | output { 537 | File metrics = metrics_filename 538 | } 539 | } 540 | 541 | # Generate a checksum per readgroup 542 | task CalculateReadGroupChecksum { 543 | input { 544 | File input_bam 545 | File input_bam_index 546 | String read_group_md5_filename 547 | } 548 | 549 | Int disk_size = ceil(size(input_bam, "GiB")) + 40 550 | 551 | command { 552 | java -Xms1000m -Xmx1900m -jar /mnt/lustre/genomics/tools/picard.jar \ 553 | CalculateReadGroupChecksum \ 554 | INPUT=~{input_bam} \ 555 | OUTPUT=~{read_group_md5_filename} 556 | } 557 | runtime { 558 | cpu: "2" 559 | memory: "2 GiB" 560 | } 561 | output { 562 | File md5_file = "~{read_group_md5_filename}" 563 | } 564 | } 565 | 566 | # Validate a (g)VCF with -gvcf specific validation 567 | task ValidateVCF { 568 | input { 569 | File input_vcf 570 | File input_vcf_index 571 | File ref_fasta 572 | File ref_fasta_index 573 | File ref_dict 574 | File? dbsnp_vcf 575 | File? dbsnp_vcf_index 576 | File calling_interval_list 577 | File? calling_interval_list_index # if the interval list is a VCF, than an index file is also required 578 | Boolean is_gvcf = true 579 | String? 
extra_args 580 | } 581 | 582 | Boolean calling_intervals_is_vcf = defined(calling_interval_list_index) 583 | String calling_interval_list_basename = basename(calling_interval_list) 584 | String calling_interval_list_index_basename = if calling_intervals_is_vcf then basename(select_first([calling_interval_list_index])) else "" 585 | 586 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 587 | Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 588 | 589 | command { 590 | set -e 591 | 592 | # We can't always assume the index was located with the vcf, so make a link so that the paths look the same 593 | ln -s ~{calling_interval_list} ~{calling_interval_list_basename} 594 | if [ ~{calling_intervals_is_vcf} == "true" ]; then 595 | ln -s ~{calling_interval_list_index} ~{calling_interval_list_index_basename} 596 | fi 597 | 598 | # Note that WGS needs a lot of memory to do the -L *.vcf if an interval file is not supplied 599 | /mnt/lustre/genomics/tools/gatk/gatk --java-options "-Xms6000m -Xmx6900m" \ 600 | ValidateVariants \ 601 | -V ~{input_vcf} \ 602 | -R ~{ref_fasta} \ 603 | -L ~{calling_interval_list_basename} \ 604 | ~{true="-gvcf" false="" is_gvcf} \ 605 | --validation-type-to-exclude ALLELES \ 606 | ~{"--dbsnp " + dbsnp_vcf} \ 607 | ~{extra_args} 608 | } 609 | runtime { 610 | cpu: "2" 611 | memory: "7000 MiB" 612 | } 613 | } 614 | 615 | # Collect variant calling metrics from GVCF output 616 | task CollectVariantCallingMetrics { 617 | input { 618 | File input_vcf 619 | File input_vcf_index 620 | String metrics_basename 621 | File dbsnp_vcf 622 | File dbsnp_vcf_index 623 | File ref_dict 624 | File evaluation_interval_list 625 | Boolean is_gvcf = true 626 | } 627 | 628 | Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 629 | 630 | command { 631 | java -Xms2000m -Xmx2900m -jar /mnt/lustre/genomics/tools/picard.jar \ 632 | CollectVariantCallingMetrics \ 633 | INPUT=~{input_vcf} \ 634 | OUTPUT=~{metrics_basename} \ 635 | DBSNP=~{dbsnp_vcf} \ 636 | SEQUENCE_DICTIONARY=~{ref_dict} \ 637 | TARGET_INTERVALS=~{evaluation_interval_list} \ 638 | ~{true="GVCF_INPUT=true" false="" is_gvcf} 639 | } 640 | runtime { 641 | cpu: "2" 642 | memory: "3000 MiB" 643 | } 644 | output { 645 | File summary_metrics = "~{metrics_basename}.variant_calling_summary_metrics" 646 | File detail_metrics = "~{metrics_basename}.variant_calling_detail_metrics" 647 | } 648 | } 649 | --------------------------------------------------------------------------------
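# Illustrative sketch (not part of the repository dump above): how the Qc.wdl tasks are
# typically wired into the top-level workflow. The import alias, call, and file naming
# below are assumptions for illustration and are not copied from this repo's
# WholeGenomeGermlineSingleSample.wdl.
version 1.0

import "Qc.wdl" as QC

workflow QcUsageSketch {
  input {
    File input_bam
    String sample_name
  }

  # CollectQualityYieldMetrics needs only the BAM and a metrics filename,
  # matching that task's input block in Qc.wdl above.
  call QC.CollectQualityYieldMetrics {
    input:
      input_bam = input_bam,
      metrics_filename = sample_name + ".quality_yield_metrics"
  }

  output {
    File quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics
  }
}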