├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── commands ├── 04-commands.txt ├── 05-commands.txt ├── 06-commands.txt ├── 07-commands.txt ├── 08-commands.txt ├── 09-commands.txt ├── 10-commands.txt └── 12-commands.txt ├── config ├── empty.options.json └── google.conf ├── figures ├── .keep └── README.md ├── metadata ├── README.md ├── book_sample-metadata.tsv └── workspace-metadata.tsv ├── notebooks ├── Basic-genomics-notebook.ipynb ├── README.md ├── Working-IGV-example.ipynb ├── install_GATK_4130_with_igv.sh ├── notebooks_Genomics-Notebook-executed.ipynb ├── notebooks_Genomics-Notebook.ipynb └── plotting.R ├── production ├── README.md ├── notebook_images │ ├── Genomics-Notebook-executed-igv.ipynb │ ├── cell_27.png │ ├── cell_28.png │ ├── cell_29.png │ ├── cell_32.png │ ├── cell_35.png │ ├── cell_36.png │ ├── cell_37.png │ └── cell_39.png └── pygments_lexer │ ├── README.md │ ├── hello-world.wdl │ ├── run_wdl_lexer.sh │ ├── test.html │ └── wdl_lexer.py ├── temp └── 05-plotting.R └── workflows ├── README.md ├── hello-hc ├── hc-break1.wdl ├── hc-break2.wdl ├── hello-haplotypecaller.inputs.json └── hello-haplotypecaller.wdl ├── hello-world ├── hello-world-again.wdl ├── hello-world-var.wdl ├── hello-world.inputs.json └── hello-world.wdl ├── mystery-1 ├── haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json └── haplotypecaller-gvcf-gatk4.wdl ├── mystery-2 ├── WholeGenomeGermlineSingleSample.hg38.inputs.json ├── WholeGenomeGermlineSingleSample.wdl ├── structs │ └── GermlineStructs.wdl └── tasks │ ├── AggregatedBamQC.wdl │ ├── Alignment.wdl │ ├── BamProcessing.wdl │ ├── BamToCram.wdl │ ├── GermlineVariantDiscovery.wdl │ ├── Qc.wdl │ ├── SplitLargeReadGroup.wdl │ ├── UnmappedBamToAlignedBam.wdl │ ├── Utilities.wdl │ ├── VariantCalling-local.wdl │ └── VariantCalling.wdl └── scatter-hc ├── scatter-haplotypecaller.gcs.inputs.json ├── scatter-haplotypecaller.gcs.inputs.test.json ├── scatter-haplotypecaller.local.inputs.json └── scatter-haplotypecaller.wdl /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | pygments_lexer/html_output 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Broad Institute 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # genomics-in-the-cloud 2 | 3 | Source code and related materials for Genomics in the Cloud, an O'Reilly book by [Geraldine A. Van der Auwera](https://www.linkedin.com/in/geraldine-van-der-auwera-5a5811) and [Brian D. O'Connor](https://www.linkedin.com/in/briandoconnor/). 4 | 5 | You can find the book in the O'Reilly Learning Library at [https://oreil.ly/genomics-cloud](https://oreil.ly/genomics-cloud), on Amazon ([Kindle](https://www.amazon.com/Genomics-Cloud-Using-Docker-Terra-ebook-dp-B086Q7D47V/dp/B086Q7D47V/) or [paperback](https://www.amazon.com/Genomics-Cloud-GATK-Spark-Docker-dp-1491975199/dp/1491975199/)), and in both ebook and print formats from a variety of other booksellers. We do encourage you to get it through your local independent bookstore if you’re able. 6 | 7 | ## Book overview 8 | 9 | Data in the genomics field is booming. In just a few years, organizations such as the National Institutes of Health (NIH) will host 50+ petabytes—or 50 million gigabytes—of genomic data, and they’re turning to cloud infrastructure to make that data available to the research community. How do you adapt analysis tools and protocols to access and analyze that data in the cloud? 10 | 11 | With this practical book, researchers will learn how to work with genomics algorithms using open source tools including the Genome Analysis Toolkit (GATK), Docker, WDL, and Terra. Geraldine Van der Auwera, longtime custodian of the GATK user community, and Brian O’Connor of the UC Santa Cruz Genomics Institute guide you through the process. You’ll learn by working with real data and genomics algorithms from the field. 12 | 13 | This book takes you through: 14 | 15 | - Essential genomics and computing technology background 16 | - Basic cloud computing operations 17 | - Getting started with GATK 18 | - Three major GATK Best Practices pipelines for variant discovery 19 | - Automating analysis with scripted workflows using WDL and Cromwell 20 | - Scaling up workflow execution in the cloud, including parallelization and cost optimization 21 | - Interactive analysis in the cloud using Jupyter notebooks 22 | - Secure collaboration and computational reproducibility using Terra 23 | 24 | For more information about the book and why you might find it useful, please see the [Genomics in the Cloud blog](https://broadinstitute.github.io/genomics-in-the-cloud). 25 | 26 | ---- 27 | 28 | ## Resources 29 | 30 | ### List of commands 31 | 32 | See the [commands](commands/) folder for text files that let you easily copy and paste the commands from the hands-on exercises. 
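For example, assuming you have `git` available, you can clone this repository (using the same URL as in the Chapter 4 exercises) and page through a chapter's command file while you read:

```bash
# Clone the book's code repository (adjust the destination directory to taste)
git clone https://github.com/broadinstitute/genomics-in-the-cloud.git

# Browse the Chapter 4 commands so you can copy and paste them as you go
less genomics-in-the-cloud/commands/04-commands.txt
```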
33 | 34 | ### Figures and semi-official companion booklet 35 | 36 | For those of you reading the print version of the book, which does not include color figures, we've made the figures available in full color in the [figures](https://console.cloud.google.com/storage/browser/genomics-in-the-cloud/figures/) directory of the GCS bucket. 37 | You may use all figures except 3-3 and 6-15 in your own non-commercial work, preferably with a notice of attribution referring to the book. For commercial use, please contact permissions@oreilly.com. Figures 3-3 and 6-15 do not belong to us, so you must request permission from their respective owners, which are noted in the book. 38 | 39 | We also put together a [companion booklet](https://console.cloud.google.com/storage/browser/_details/genomics-in-the-cloud/figures/Genomics_in_the_Cloud___Figures_Booklet.pdf) that contains the figures and their captions for more convenient browsing or printing. It's "semi-official" in the sense that we created and maintain it, but it is not published by O'Reilly, so it does not go through their quality control process. Think of it as an artisanal, locally sourced side dish. 40 | 41 | ### Blog 42 | 43 | We have a blog for the book at [https://broadinstitute.github.io/genomics-in-the-cloud/](https://broadinstitute.github.io/genomics-in-the-cloud/) where we cover various topics including additional tutorials, errata for the book, and regular updates on new features that you may be interested in. Feel free to suggest blog topics by reaching out to us on Twitter or LinkedIn (see contact info below). 44 | 45 | ### Reporting errors 46 | 47 | If you encounter errors or broken links in the book, please file an issue on O'Reilly's [Errata page](https://www.oreilly.com/catalog/errata.csp?isbn=0636920058182). Anything reported there that we can verify will get fixed and updated in both the electronic versions and subsequent printing runs of the book, so others won't run into the same problems. 48 | 49 | *We don't use GitHub Issues for this project to avoid confusion and redundancy with the O'Reilly Errata page.* 50 | 51 | ### Getting help 52 | 53 | If you run into problems while working through the hands-on exercises, or if you have follow-up questions about the topics we discuss in the book, please post your questions in either the [GATK forum](https://gatk.broadinstitute.org/hc/en-us/community/topics) or the [Terra forum](https://support.terra.bio/hc/en-us/community/topics). The frontline support team will most likely be able to address your questions, and for anything else they will loop us into the conversation if you mention that your question is related to our book. If you're not sure which forum to use, just flip a coin; it's the same team that maintains both communities. 54 | 55 | Remember also that you can often save yourself some time by searching the [GATK documentation](https://gatk.broadinstitute.org/hc/en-us) or [Terra documentation](https://support.terra.bio/hc/en-us) before posting a question -- that way you don't have to wait for someone to get back to you. 56 | 57 | ### Getting in touch with us 58 | If you'd like to get in touch, you can reach us on Twitter ([@VdAGeraldine](https://twitter.com/VdaGeraldine) and [@boconnor](https://twitter.com/boconnor)) and on LinkedIn ([Geraldine](https://www.linkedin.com/in/geraldine-van-der-auwera-5a5811) and [Brian](https://www.linkedin.com/in/briandoconnor/)). We look forward to hearing what you think of the book! If you like it, please consider posting a review on Amazon.
59 | 60 | 61 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /commands/04-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 4. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ gcloud config set project ferrous-layout-260200 20 | 21 | $ ls 22 | 23 | $ cat README-cloudshell.txt 24 | 25 | $ gsutil ls gs://genomics-in-the-cloud 26 | 27 | $ gsutil cat gs://genomics-in-the-cloud/hello.txt 28 | 29 | $ gsutil cp gs://genomics-in-the-cloud/hello.txt . 30 | 31 | $ ls 32 | 33 | $ gsutil mb gs://my-bucket 34 | 35 | $ export BUCKET="gs://my-bucket" 36 | 37 | $ echo $BUCKET 38 | 39 | $ gsutil cp gs://genomics-in-the-cloud/hello.txt $BUCKET/ 40 | 41 | $ gsutil cp hello.txt $BUCKET/ 42 | 43 | $ gsutil cp $BUCKET/hello.txt $BUCKET/my-directory/ 44 | 45 | $ docker pull ubuntu 46 | 47 | $ docker run ubuntu 48 | 49 | $ docker run ubuntu echo "Hello World!" 50 | 51 | $ docker run -it ubuntu /bin/bash 52 | 53 | $ docker ps -a 54 | 55 | $ mkdir book 56 | 57 | $ mv hello.txt book/ 58 | 59 | $ ls book 60 | 61 | $ docker run -v ~/book:/home/book -it ubuntu /bin/bash 62 | 63 | # ls home/book 64 | 65 | $ gcloud init 66 | 67 | $ mkdir ~/book 68 | 69 | $ gsutil -m cp -r gs://genomics-in-the-cloud/v1/* ~/book/ 70 | 71 | $ cd ~/book 72 | 73 | $ git clone https://github.com/broadinstitute/genomics-in-the-cloud.git code 74 | 75 | $ cd ~/book/code 76 | 77 | $ git pull 78 | 79 | $ docker 80 | 81 | $ curl -sSL https://get.docker.com/ | sh 82 | 83 | $ sudo usermod -aG docker $USER 84 | 85 | $ exit 86 | 87 | $ docker pull us.gcr.io/broad-gatk/gatk:4.1.3.0 88 | 89 | $ docker run -v ~/book:/home/book -it us.gcr.io/broad-gatk/gatk:4.1.3.0 /bin/bash 90 | 91 | # gatk 92 | -------------------------------------------------------------------------------- /commands/05-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 5. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 
16 | 17 | Commands: 18 | 19 | $ java -jar program.jar [program arguments] 20 | 21 | $ gatk ToolName [tool arguments] 22 | 23 | $ java -Xmx4G -XX:+PrintGCDetails -jar program.jar [program arguments] 24 | 25 | $ gatk --java-options "-Xmx4G -XX:+PrintGCDetails" ToolName [tool arguments] 26 | 27 | gatk MySparkTool \ 28 | -R data/reference.fasta \ 29 | -I data/sample1.bam \ 30 | -O data/variants.vcf \ 31 | -- \ 32 | --spark-master 'local[4]' 33 | 34 | --spark-runner SPARK --spark-master spark://23.195.26.187:7077 35 | 36 | --spark-runner GCS --cluster my_cluster 37 | 38 | $ docker run -v ~/book:/home/book -it us.gcr.io/broad-gatk/gatk:4.1.3.0 /bin/bash 39 | 40 | # ls 41 | 42 | # gatk 43 | 44 | # gatk HaplotypeCaller --help 45 | 46 | # cd /home/book/data/germline 47 | # mkdir sandbox 48 | 49 | # gatk HaplotypeCaller \ 50 | -R ref/ref.fasta \ 51 | -I bams/mother.bam \ 52 | -O sandbox/mother_variants.vcf 53 | 54 | # gatk ValidateSamFile \ 55 | -R ref/ref.fasta \ 56 | -I bams/mother.bam \ 57 | -O sandbox/mother_validation.txt 58 | 59 | # gatk HaplotypeCaller \ 60 | -R ref/ref.fasta \ 61 | -I bams/mother.bam \ 62 | -O sandbox/mother_variants.200k.vcf \ 63 | -L 20:10,000,000-10,200,000 64 | 65 | $ cd ~/book/data/germline/sandbox 66 | 67 | $ export BUCKET="gs://my-bucket" 68 | 69 | $ echo $BUCKET 70 | 71 | $ gsutil cp mother_variants.200k.vcf* $BUCKET/germline-sandbox/ 72 | 73 | # gatk HaplotypeCaller \ 74 | -R ref/ref.fasta \ 75 | -I bams/mother.bam \ 76 | -O sandbox/mother_variants.snippet.debug.vcf \ 77 | -bamout sandbox/mother_variants.snippet.debug.bam \ 78 | -L 20:10,002,000-10,003,000 79 | 80 | # zcat vcfs/motherSNP.vcf.gz | grep -v '##' | head -3 81 | 82 | # zcat vcfs/motherSNP.giab.vcf.gz | grep -v '##' | head -3 83 | 84 | # gatk VariantFiltration \ 85 | -R ref/ref.fasta \ 86 | -V vcfs/motherSNP.vcf.gz \ 87 | --filter-expression "QD < 2.0" \ 88 | --filter-name "QD2" \ 89 | -O sandbox/motherSNP.QD2.vcf.gz 90 | 91 | # zcat sandbox/motherSNP.QD2.vcf.gz | grep -v '##' | head -3 92 | 93 | # gatk VariantFiltration \ 94 | -R ref/ref.fasta \ 95 | -V vcfs/motherSNP.vcf.gz \ 96 | --filter-expression "QD < 2.0 || DP > 100.0" \ 97 | --filter-name "lowQD_highDP" \ 98 | -O sandbox/motherSNP.QD2.DP100.vcf.gz 99 | -------------------------------------------------------------------------------- /commands/06-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 6. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them.
16 | 17 | Commands: 18 | 19 | bwa mem -M -t 7 -p reference.fasta unmapped_reads.fq > mapped_reads.sam 20 | 21 | gatk MarkDuplicates \ 22 | -R reference.fasta \ 23 | -I mapped_reads_rg1.bam \ 24 | -I mapped_reads_rg2.bam \ 25 | -I mapped_reads_rg3.bam \ 26 | -O sample_markdups.bam 27 | 28 | gatk BaseRecalibrator \ 29 | -R reference.fasta \ 30 | -I sample_markdups.bam \ 31 | --known-sites known_variation.vcf \ 32 | -O recal_data.table 33 | 34 | gatk ApplyBQSR \ 35 | -R reference.fasta \ 36 | -I sample_markdups.bam \ 37 | --bqsr-recal-file recal_data.table \ 38 | -O sample_markdups_recal.bam 39 | 40 | # gatk HaplotypeCaller \ 41 | -R ref/ref.fasta \ 42 | -I bams/mother.bam \ 43 | -O sandbox/mother_variants.200k.g.vcf.gz \ 44 | -L 20:10,000,000-10,200,000 \ 45 | -ERC GVCF 46 | 47 | # zcat sandbox/mother_variants.200k.g.vcf.gz | grep -v '##' | head -3 48 | 49 | # gatk GenomicsDBImport \ 50 | -V gvcfs/mother.g.vcf.gz \ 51 | -V gvcfs/father.g.vcf.gz \ 52 | --genomicsdb-workspace-path sandbox/trio-gdb \ 53 | --intervals 20:10,000,000-10,200,000 54 | 55 | # gatk SelectVariants \ 56 | -R ref/ref.fasta \ 57 | -V gendb://sandbox/trio-gdb \ 58 | -O sandbox/duo_selectvariants.g.vcf.gz 59 | 60 | $ zcat sandbox/duo_selectvariants.g.vcf.gz | grep -v '##' | head -3 61 | 62 | # gatk GenomicsDBImport \ 63 | -V gvcfs/son.g.vcf.gz \ 64 | --genomicsdb-update-workspace-path sandbox/trio-gdb 65 | 66 | # gatk SelectVariants \ 67 | -R ref/ref.fasta \ 68 | -V gendb://sandbox/trio-gdb \ 69 | -O sandbox/trio_selectvariants.g.vcf.gz 70 | 71 | $ zcat sandbox/trio_selectvariants.g.vcf.gz | grep -v '##' | head -3 72 | 73 | # gatk GenotypeGVCFs \ 74 | -R ref/ref.fasta \ 75 | -V gendb://sandbox/trio-gdb \ 76 | -O sandbox/trio-jointcalls.vcf.gz \ 77 | -L 20:10,000,000-10,200,000 78 | 79 | # zcat sandbox/trio-jointcalls.vcf.gz | grep -v '##' | head -3 80 | 81 | # gatk HaplotypeCaller \ 82 | -R ref/ref.fasta \ 83 | -I bams/mother.bam \ 84 | -I bams/father.bam \ 85 | -I bams/son.bam \ 86 | -O sandbox/trio_jointcalls_hc.vcf.gz \ 87 | -L 20:10,000,000-10,200,000 88 | 89 | gatk VariantRecalibrator \ 90 | -R reference.fasta \ 91 | -V jointcalls_hc.vcf.gz \ 92 | --resource:hapmap,known=false,training=true,truth=true,prior=15.0 \ 93 | hapmap_sites.vcf.gz \ 94 | --resource:omni,known=false,training=true,truth=false,prior=12.0 \ 95 | 1000G_omni2.5.sites.vcf.gz \ 96 | --resource:1000G,known=false,training=true,truth=false,prior=10.0 \ 97 | 1000G_phase1.snps.high_conf.vcf.gz \ 98 | --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf.gz \ 99 | -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \ 100 | -mode SNP \ 101 | -O output.recal \ 102 | --tranches-file output.tranches 103 | 104 | gatk ApplyVQSR \ 105 | -R reference.fasta \ 106 | -V jointcalls_hc.vcf.gz \ 107 | -O jointcalls_filtered.vcf.gz \ 108 | --truth-sensitivity-filter-level 99.9 \ 109 | --tranches-file output.tranches \ 110 | --recal-file output.recal \ 111 | -mode SNP 112 | 113 | # gatk CalculateGenotypePosteriors \ 114 | -V sandbox/trio-jointcalls.vcf.gz \ 115 | -ped resources/trio-pedigree.ped \ 116 | --supporting-callsets resources/af-only-gnomad.vcf.gz \ 117 | -O sandbox/trio-refined.vcf.gz 118 | 119 | # cd ../cnn 120 | # mkdir sandbox 121 | # ls 122 | 123 | # gatk CNNScoreVariants \ 124 | -R ref/Homo_sapiens_assembly19.fasta \ 125 | -V vcfs/g94982_b37_chr20_1m_15871.vcf.gz \ 126 | -O sandbox/my_1d_cnn_scored.vcf 127 | 128 | 129 | # gatk FilterVariantTranches \ 130 | -V sandbox/my_1d_cnn_scored.vcf \ 131 | -O sandbox/my_1d_cnn_filtered.vcf 
\ 132 | --resource resources/1000G_omni2.5.b37.vcf.gz \ 133 | --resource resources/hapmap_3.3.b37.vcf.gz \ 134 | --info-key CNN_1D \ 135 | --snp-tranche 99.9 \ 136 | --indel-tranche 95.0 137 | 138 | # cat sandbox/my_1d_cnn_filtered.vcf | grep -v '##' | head -3 139 | 140 | # gatk CNNScoreVariants \ 141 | -R ref/Homo_sapiens_assembly19.fasta \ 142 | -I bams/g94982_chr20_1m_10m_bamout.bam \ 143 | -V vcfs/g94982_b37_chr20_1m_895.vcf \ 144 | -O sandbox/my_2d_cnn_scored.vcf \ 145 | --tensor-type read_tensor \ 146 | --transfer-batch-size 8 \ 147 | --inference-batch-size 8 148 | 149 | # gatk FilterVariantTranches \ 150 | -V sandbox/my_2d_cnn_scored.vcf \ 151 | -O sandbox/my_2d_cnn_filtered.vcf \ 152 | --resource resources/1000G_omni2.5.b37.vcf.gz \ 153 | --resource resources/hapmap_3.3.b37.vcf.gz \ 154 | --info-key CNN_2D \ 155 | --snp-tranche 99.9 \ 156 | --indel-tranche 95.0 157 | 158 | # cat sandbox/my_2d_cnn_filtered.vcf | grep -v '##' | head -3 159 | -------------------------------------------------------------------------------- /commands/07-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 7. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | gatk Mutect2 \ 20 | -R reference.fasta \ 21 | -I normal_1.bam \ 22 | -O normal_1.vcf.gz \ 23 | --max-mnp-distance 0 24 | 25 | gatk GenomicsDBImport \ 26 | -R reference.fasta \ 27 | -L intervals.interval_list \ 28 | -V normal_1.vcf.gz \ 29 | -V normal_2.vcf.gz \ 30 | -V normal_3.vcf.gz \ 31 | --genomicsdb-workspace-path pon_db 32 | 33 | gatk CreateSomaticPanelOfNormals \ 34 | -R reference.fasta \ 35 | -V gendb://pon_db \ 36 | --germline-resource af-only-gnomad.vcf.gz \ 37 | -O pon.vcf.gz 38 | 39 | # zcat resources/chr17_m2pon.vcf.gz | grep -v '##' | head -3 40 | 41 | # cd /home/book/data/somatic 42 | # mkdir sandbox 43 | 44 | # gatk Mutect2 \ 45 | -R ref/Homo_sapiens_assembly38.fasta \ 46 | -I bams/tumor.bam \ 47 | -I bams/normal.bam \ 48 | -normal HCC1143_normal \ 49 | -L resources/chr17plus.interval_list \ 50 | -pon resources/chr17_m2pon.vcf.gz \ 51 | --germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \ 52 | -bamout sandbox/m2_tumor_normal.bam \ 53 | -O sandbox/m2_somatic_calls.vcf.gz 54 | 55 | # gatk GetPileupSummaries \ 56 | -I bams/normal.bam \ 57 | -V resources/chr17_small_exac_common_3_grch38.vcf.gz \ 58 | -L resources/chr17_small_exac_common_3_grch38.vcf.gz \ 59 | -O sandbox/normal_getpileupsummaries.table 60 | 61 | # gatk GetPileupSummaries \ 62 | -I bams/tumor.bam \ 63 | -V resources/chr17_small_exac_common_3_grch38.vcf.gz \ 64 | -L resources/chr17_small_exac_common_3_grch38.vcf.gz \ 65 | -O sandbox/tumor_getpileupsummaries.table 66 | 67 | # head -5 sandbox/normal_getpileupsummaries.table 68 | 69 | # head -5 sandbox/tumor_getpileupsummaries.table 70 | 71 | # gatk CalculateContamination \ 72 | -I sandbox/tumor_getpileupsummaries.table \ 73 | -matched sandbox/normal_getpileupsummaries.table \ 74 | 
-tumor-segmentation sandbox/segments.table \ 75 | -O sandbox/pair_calculatecontamination.table 76 | 77 | $ cat sandbox/pair_calculatecontamination.table 78 | 79 | # gatk FilterMutectCalls \ 80 | -R ref/Homo_sapiens_assembly38.fasta \ 81 | -V sandbox/m2_somatic_calls.vcf.gz \ 82 | --contamination-table sandbox/pair_calculatecontamination.table \ 83 | -O sandbox/m2_somatic_calls.filtered.vcf.gz \ 84 | --stats sandbox/m2_somatic_calls.vcf.gz.stats \ 85 | --tumor-segmentation sandbox/segments.table 86 | 87 | # gatk Funcotator \ 88 | --data-sources-path resources/funcotator_dataSources_GATK_Workshop_20181205/ \ 89 | --ref-version hg38 \ 90 | -R ref/Homo_sapiens_assembly38.fasta \ 91 | -V sandbox/m2_somatic_calls.filtered.vcf.gz \ 92 | -O sandbox/m2_somatic_calls.funcotated.vcf.gz \ 93 | --output-file-format VCF 94 | 95 | # zcat sandbox/m2_somatic_calls.funcotated.vcf.gz | grep -v '##' | head -3 96 | 97 | # zcat sandbox/m2_somatic_calls.funcotated.vcf.gz | grep 7674220 98 | 99 | # gatk PreprocessIntervals \ 100 | -R ref/Homo_sapiens_assembly38.fasta \ 101 | -L resources/targets_chr17.interval_list \ 102 | -O sandbox/targets_chr17.preprocessed.interval_list \ 103 | --padding 250 \ 104 | --bin-length 0 \ 105 | --interval-merging-rule OVERLAPPING_ONLY 106 | 107 | # gatk CollectReadCounts \ 108 | -I bams/tumor.bam \ 109 | -L sandbox/targets_chr17.preprocessed.interval_list \ 110 | -R ref/Homo_sapiens_assembly38.fasta \ 111 | -O sandbox/tumor.counts.tsv \ 112 | --format TSV \ 113 | -imr OVERLAPPING_ONLY 114 | 115 | # head -5 sandbox/tumor.counts.tsv 116 | 117 | # tail -5 sandbox/tumor.counts.tsv 118 | 119 | gatk CreateReadCountPanelOfNormals \ 120 | -I file1_clean.counts.hdf5 \ 121 | … 122 | -I file40_clean.counts.hdf5 \ 123 | -O cnaponC.pon.hdf5 124 | 125 | # gatk DenoiseReadCounts \ 126 | -I cna_inputs/hcc1143_T_clean.counts.hdf5 \ 127 | --count-panel-of-normals cna_inputs/cnaponC.pon.hdf5 \ 128 | --standardized-copy-ratios sandbox/hcc1143_T_clean.standardizedCR.tsv \ 129 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv 130 | 131 | # gatk PlotDenoisedCopyRatios \ 132 | --sequence-dictionary ref/Homo_sapiens_assembly38.dict \ 133 | --standardized-copy-ratios sandbox/hcc1143_T_clean.standardizedCR.tsv \ 134 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 135 | --minimum-contig-length 46709983 \ 136 | --output sandbox/cna_plots \ 137 | --output-prefix hcc1143_T_clean 138 | 139 | $ export BUCKET="gs://my-bucket" 140 | $ gsutil -m cp -R sandbox/cna_plots $BUCKET/somatic-sandbox/ 141 | 142 | # gatk ModelSegments \ 143 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 144 | --output sandbox \ 145 | --output-prefix hcc1143_T_clean 146 | 147 | # gatk PlotModeledSegments \ 148 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 149 | --segments sandbox/hcc1143_T_clean.modelFinal.seg \ 150 | --sequence-dictionary ref/Homo_sapiens_assembly38.dict \ 151 | --minimum-contig-length 46709983 \ 152 | --output sandbox/cna_plots \ 153 | --output-prefix hcc1143_T_clean 154 | 155 | # gatk CallCopyRatioSegments \ 156 | -I sandbox/hcc1143_T_clean.cr.seg \ 157 | -O sandbox/hcc1143_T_clean.called.seg 158 | 159 | # tail -5 sandbox/hcc1143_T_clean.called.seg 160 | -------------------------------------------------------------------------------- /commands/08-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 8. 
We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ java -version 20 | 21 | $ sudo apt install openjdk-8-jre-headless 22 | 23 | $ java -version 24 | 25 | $ export BIN=~/book/bin 26 | 27 | $ curl -L -o ~/book/bin/cromwell-48.jar \ 28 | https://github.com/broadinstitute/cromwell/releases/download/48/cromwell-48.jar 29 | 30 | $ java -jar $BIN/cromwell-48.jar --help 31 | 32 | $ java -jar $BIN/womtool-48.jar --help 33 | 34 | $ export WF=~/book/code/workflows 35 | 36 | $ nano $WF/hello-world/hello-world.wdl 37 | 38 | $ echo "Hello World" 39 | 40 | $ java -jar $BIN/cromwell-48.jar run $WF/hello-world/hello-world.wdl 41 | 42 | $ cat ~/cromwell-executions/HelloWorld/b6d224b0-ccee-468f-83fa- 43 | ab2ce7e62ab7/call-WriteGreeting/execution/stdout 44 | 45 | $ nano $WF/hello-world/hello-world-var.wdl 46 | 47 | $ java -jar $BIN/cromwell-48.jar run $WF/hello-world/hello-world-var.wdl \ 48 | -i $WF/hello-world/hello-world.inputs.json 49 | 50 | $ nano $WF/hello-world/hello-world-again.wdl 51 | 52 | $ nano $WF/hello-hc/hello-haplotypecaller.wdl 53 | 54 | $ mkdir ~/sandbox-8 55 | 56 | $ java -jar $BIN/womtool-48.jar \ 57 | inputs $WF/hello-hc/hello-haplotypecaller.wdl \ 58 | > ~/sandbox-8/hello-haplotypecaller.inputs.json 59 | 60 | $ cat ~/sandbox-8/hello-haplotypecaller.inputs.json 61 | 62 | $ cat $WF/hello-hc/hello-haplotypecaller.inputs.json 63 | 64 | $ java -jar $BIN/cromwell-48.jar \ 65 | run $WF/hello-hc/hello-haplotypecaller.wdl \ 66 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 67 | 68 | $ head ~/cromwell-executions/HelloHaplotypeCaller/9a6a9c97-7453-455c 69 | -8cd8-be8af8cb6f7c/call-HaplotypeCallerGVCF/execution/mother.g.vcf 70 | 71 | $ cp $WF/hello-hc/hello-haplotypecaller.wdl ~/sandbox-8/hc-break1.wdl 72 | 73 | $ java -jar $BIN/cromwell-48.jar \ 74 | run ~/sandbox-8/hc-break1.wdl \ 75 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 76 | 77 | $ java -jar $BIN/womtool-48.jar \ 78 | validate ~/sandbox-8/hc-break1.wdl 79 | 80 | $ java -jar $BIN/cromwell-48.jar \ 81 | run ~/sandbox-8/hc-break2.wdl \ 82 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 83 | 84 | $ cat /home/username/cromwell-executions/HelloHaplotypeCaller/dd77316f-7c18-4eb1-aa86-e307113c1668/call-HaplotypeCallerGVCF/execution/stderr 85 | 86 | $ nano $WF/scatter-hc/scatter-haplotypecaller.wdl 87 | 88 | $ java -jar $BIN/cromwell-48.jar \ 89 | run $WF/scatter-hc/scatter-haplotypecaller.wdl \ 90 | -i $WF/scatter-hc/scatter-haplotypecaller.local.inputs.json 91 | 92 | $ java -jar $BIN/womtool-48.jar \ 93 | graph $WF/scatter-hc/scatter-haplotypecaller.wdl \ 94 | > ~/sandbox-8/scatter-haplotypecaller.dot 95 | -------------------------------------------------------------------------------- /commands/09-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 9. We created this file to prevent readers from having 5 | to type out commands from our book. 
This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ export CASE1=~/book/code/workflows/mystery-1 20 | 21 | $ mkdir ~/sandbox-9 22 | 23 | $ cat ~/sandbox-9/haplotypecaller-gvcf-gatk4.dot 24 | $ java -jar $BIN/womtool-48.jar graph $CASE1/haplotypecaller-gvcf-gatk4.wdl \ 25 | > ~/sandbox-9/haplotypecaller-gvcf-gatk4.dot 26 | 27 | $ export CASE2=~/book/code/workflows/mystery-2 28 | 29 | $ java -jar $BIN/womtool-48.jar graph $CASE2/WholeGenomeGermlineSingleSample.wdl \ 30 | > ~/sandbox-9/WholeGenomeGermlineSingleSample.dot 31 | 32 | $ cat ~/sandbox-9/WholeGenomeGermlineSingleSample.dot 33 | 34 | $ java -jar $BIN/womtool-48.jar graph $CASE2/tasks/VariantCalling.wdl \ 35 | > ~/sandbox-9/VariantCalling.dot 36 | -------------------------------------------------------------------------------- /commands/10-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 10. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 
16 | 17 | Commands: 18 | 19 | $ cat ~/book/code/config/google.conf 20 | 21 | $ mkdir ~/sandbox-10 22 | $ cp ~/book/code/config/google.conf ~/sandbox-10/my-google.conf 23 | 24 | $ export CONF=~/sandbox-10 25 | $ export BIN=~/book/bin 26 | $ export WF=~/book/code/workflows 27 | 28 | $ export BUCKET="gs://my-bucket" 29 | 30 | $ nano ~/sandbox-10/my-google.conf 31 | 32 | $ gcloud auth application-default login 33 | 34 | $ cat $WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json 35 | 36 | $ java -Dconfig.file=$CONF/my-google.conf -jar $BIN/cromwell-48.jar \ 37 | run $WF/scatter-hc/scatter-haplotypecaller.wdl \ 38 | -i $WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json 39 | 40 | $ export WR_CONF=~/book/code/config 41 | $ export WR_PIPE=~/book/wdl_runner/wdl_runner 42 | 43 | $ gcloud alpha genomics pipelines run \ 44 | --pipeline-file $WR_PIPE/wdl_pipeline.yaml \ 45 | --regions us-east4 \ 46 | --inputs-from-file WDL=$WF/scatter-hc/scatter-haplotypecaller.wdl,\ 47 | WORKFLOW_INPUTS=$WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json,\ 48 | WORKFLOW_OPTIONS=$WR_CONF/empty.options.json \ 49 | --env-vars WORKSPACE=$BUCKET/wdl_runner/test/work,\ 50 | OUTPUTS=$BUCKET/wdl_runner/test/output \ 51 | --logging $BUCKET/wdl_runner/test/logging 52 | 53 | $ gcloud config set compute/zone "" 54 | 55 | $ cd ~/book/wdl_runner 56 | $ bash monitoring_tools/monitor_wdl_pipeline.sh 7973899330424684165 57 | -------------------------------------------------------------------------------- /commands/12-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 12. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | gs://genomics-in-the-cloud/v1/scripts/install_GATK_4130_with_igv.sh -------------------------------------------------------------------------------- /config/empty.options.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_runtime_attributes": { 3 | } 4 | } 5 | 6 | -------------------------------------------------------------------------------- /config/google.conf: -------------------------------------------------------------------------------- 1 | # This is an example configuration file that directs Cromwell to execute 2 | # workflow tasks via the Google Pipelines API backend and allows it to retrieve 3 | # input files from GCS buckets. It is intended only as a relatively simple example 4 | # and leaves out many options that are useful or important for production-scale 5 | # work. See https://cromwell.readthedocs.io/en/stable/backends/Google/ for more 6 | # complete documentation. 
7 | 8 | engine { 9 | filesystems { 10 | gcs { 11 | auth = "application-default" 12 | project = "" 13 | } 14 | } 15 | } 16 | 17 | backend { 18 | default = PAPIv2 19 | 20 | providers { 21 | PAPIv2 { 22 | actor-factory = "cromwell.backend.google.pipelines.v2alpha1.PipelinesApiLifecycleActorFactory" 23 | config { 24 | # Google project 25 | project = "" 26 | 27 | # Base bucket for workflow executions 28 | root = "gs:///cromwell-execution" 29 | 30 | # Polling for completion backs-off gradually for slower-running jobs. 31 | # This is the maximum polling interval (in seconds): 32 | maximum-polling-interval = 600 33 | 34 | # Optional Dockerhub Credentials. Can be used to access private docker images. 35 | dockerhub { 36 | # account = "" 37 | # token = "" 38 | } 39 | 40 | # Number of workers to assign to PAPI requests 41 | request-workers = 3 42 | 43 | genomics { 44 | # A reference to an auth defined in the `google` stanza at the top. This auth is used to create 45 | # Pipelines and manipulate auth JSONs. 46 | auth = "application-default" 47 | 48 | # Endpoint for APIs, no reason to change this unless directed by Google. 49 | endpoint-url = "https://genomics.googleapis.com/" 50 | 51 | # Pipelines v2 only: specify the number of times localization and delocalization operations should be attempted 52 | # There is no logic to determine if the error was transient or not, everything is retried upon failure 53 | # Defaults to 3 54 | localization-attempts = 3 55 | 56 | } 57 | 58 | filesystems { 59 | gcs { 60 | auth = "application-default" 61 | project = "" 62 | } 63 | } 64 | 65 | default-runtime-attributes { 66 | cpu: 1 67 | failOnStderr: false 68 | continueOnReturnCode: 0 69 | memory: "2048 MB" 70 | bootDiskSizeGb: 10 71 | # Allowed to be a String, or a list of Strings 72 | disks: "local-disk 10 SSD" 73 | noAddress: false 74 | preemptible: 0 75 | zones: ["us-east4-a", "us-east4-b"] 76 | } 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /figures/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/figures/.keep -------------------------------------------------------------------------------- /figures/README.md: -------------------------------------------------------------------------------- 1 | You can find all the figures in full color in the [figures](https://console.cloud.google.com/storage/browser/genomics-in-the-cloud/figures/) directory of the GCS bucket. 2 | 3 | You may use all figures except 3-3 and 6-15 in your own non-commercial work, preferably with a notice of attribution referring to the book. For commercial use, please contact permissions@oreilly.com. Figures 3-3 and 6-15 do not belong to us, so you must request permission from their respective owners, which are noted in the book. 4 | 5 | We also put together a [companion booklet](https://storage.googleapis.com/genomics-in-the-cloud/figures/Genomics_in_the_Cloud___Figures_Booklet.pdf) that contains the figures and their captions for more convenient browsing or printing. It's "semi-official" in the sense that we created and maintain it, but it is not published by O'Reilly, so it does not go through their quality control process. Think of it as an artisanal, locally sourced side dish.
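If you prefer to keep local copies, one option (assuming you have the `gsutil` command-line tool installed and the bucket layout is unchanged) is to download the entire figures directory in one go:

```bash
# Copy the full-color figures (and the companion booklet PDF) from the public
# bucket to a local folder; the -m flag parallelizes the transfer.
gsutil -m cp -r gs://genomics-in-the-cloud/figures ./figures-local
```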
6 | -------------------------------------------------------------------------------- /metadata/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /metadata/book_sample-metadata.tsv: -------------------------------------------------------------------------------- 1 | entity:book_sample_id input_bam input_bam_index 2 | father gs://genomics-in-the-cloud/v1/data/germline/bams/father.bam gs://genomics-in-the-cloud/v1/data/germline/bams/father.bai 3 | mother gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai 4 | son gs://genomics-in-the-cloud/v1/data/germline/bams/son.bam gs://genomics-in-the-cloud/v1/data/germline/bams/son.bai -------------------------------------------------------------------------------- /metadata/workspace-metadata.tsv: -------------------------------------------------------------------------------- 1 | workspace:ref_fasta intervals_list_min ref_fasta_index gatk_docker ref_dict intervals_list_full 2 | gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai broadinstitute/gatk:4.1.3.0 gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-full.list -------------------------------------------------------------------------------- /notebooks/Basic-genomics-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Content summary** \n", 8 | "This notebook provides a short introduction to basic Jupyter Notebook functionality and illustrates some options for working with genomic data in cloud storage. It is based on [source code](https://github.com/broadinstitute/genomics-in-the-cloud/tree/main/notebooks) provided with the [Genomics in the Cloud](https://oreil.ly/genomics-cloud) book (Van der Auwera & O'Connor, O'Reilly 2020).\n", 9 | "\n", 10 | "\n", 11 | "**Environment configuration** \n", 12 | "This notebook requires a custom [Terra](https://app.terra.bio/) Cloud Environment image provided as the container `gcr.io/broad-dsde-outreach/terra-base:ipyigv1`, complemented by a startup script (gs://genomics-in-the-cloud/v1/scripts/install_GATK_4130_with_igv.sh) that installs GATK version 4.1.3.0. \n", 13 | "\n", 14 | "You must customize your environment using the Cloud Environment configuration panel to match this notebook's requirements; SOME COMMANDS WILL NOT WORK IF YOU DO NOT DO THIS. \n", 15 | "\n", 16 | "- In the configuration panel, set the `Application Configuration` to `Custom Environment` (all the way at the bottom of the menu) and paste the container address given above into the `Container image` field. \n", 17 | "- Then (still in the config panel), in the `Cloud compute profile` box, paste the startup script link given above into the `Startup Script` field. \n", 18 | "\n", 19 | "Refer to [Terra documentation on customizing your environment](https://support.terra.bio/hc/en-us/articles/360038125912) to learn more about environment customization options.\n", 20 | "\n", 21 | "**Kernel** \n", 22 | "By default this notebook opens on a Python 3 kernel. 
When you have the notebook running in EDIT mode, the upper right corner of the notebook (under the Notebook Runtime widget) should display the label `Python3`. \n", 23 | "\n", 24 | "----" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Getting started with Jupyter in Terra" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "In this section, we run through some exercises to familiarize you with the basic usage of Jupyter notebooks in the Terra environment.\n", 39 | "\n", 40 | "\n", 41 | "## Run the Hello World cells\n", 42 | "We start with some simple Hello World examples, first in Python, then with a command-line tool call." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Run the basic Hello World in Python" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "print(\"Hello World!\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Run the command-line tool `echo` using `!`" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "scrolled": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "! echo \"Hello World!\"" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Interact with local storage" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "List contents of local storage (persistent disk)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "! ls ." 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Make a sandbox directory to keep project files organized" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "! mkdir -p sandbox/\n", 116 | "! ls" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Access data in cloud storage buckets " 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "List the contents of a public cloud storage bucket called `genomics-in-the-cloud`" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "! gsutil ls gs://genomics-in-the-cloud/" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Copy a file from the bucket to the sandbox (on persistent disk)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "! gsutil cp gs://genomics-in-the-cloud/hello.txt sandbox/" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Read the contents of the locally-stored text file" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "! 
cat sandbox/hello.txt" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Save local files to the workspace's storage bucket" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Import the `os` package, look up the value of the `WORKSPACE_BUCKET` environment variable (set by Terra at the kernel level) and store it in a Python variable for easy access" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "import os\n", 195 | "WS_BUCKET = os.environ['WORKSPACE_BUCKET']\n", 196 | "print(WS_BUCKET)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Back up the sandbox directory from the persistent disk to the workspace bucket " 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "! gsutil cp -r sandbox {WS_BUCKET}" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Verify that it worked as expected" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "! gsutil ls -r {WS_BUCKET}" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Set up variables pointing to genomic data in the bucket\n", 236 | "We're going to want to access the data in the bucket multiple times, so we make a variable to avoid hardcoding and repeating file paths." 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Create Python variables" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "BAMS = \"gs://genomics-in-the-cloud/v1/data/germline/bams\"\n", 253 | "REF = \"gs://genomics-in-the-cloud/v1/data/germline/ref\"" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "Use the variable to list the bucket contents and verify they work as expected" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "scrolled": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "! gsutil ls {BAMS}" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "! gsutil ls {REF}" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "This completes the \"getting started\" portion of this notebook.\n", 288 | "\n", 289 | "----" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "# Visualizing genomic data in an embedded IGV window\n", 297 | "In this section, we embed IGV windows in the notebook in order to visualize genomic data without leaving the notebook environment.\n", 298 | "\n", 299 | "## Set up the embedded IGV browser\n", 300 | "First we need to import the `ipyigv` package and initialize a browser window." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import ipyigv as igv\n", 310 | "from ipywidgets.widgets.trait_types import InstanceDict\n", 311 | "from ipyigv.options import ReferenceGenome, Track\n", 312 | "from ipywidgets import Output" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Initialize the browser instance with a genome reference" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "genomeDict = igv.PUBLIC_GENOMES.hg19\n", 329 | "genome = ReferenceGenome(**genomeDict)\n", 330 | "browser = igv.IgvBrowser(genome=genome)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Display the browser window" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "browser" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Add data to the IGV browser\n", 354 | "Now we can add data by pointing to files in a bucket." 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "Define data tracks for two BAM files (whole genome and exome versions of the mother sample)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "wgs_track = {\n", 371 | " 'name': 'Mother WGS',\n", 372 | " 'format': 'bam',\n", 373 | " 'url': BAMS + '/mother.bam',\n", 374 | " 'indexURL': BAMS + '/mother.bai',\n", 375 | " 'height': 200\n", 376 | "}\n", 377 | "browser.add_track(Track(**wgs_track))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "exome_track = {\n", 387 | " 'name': 'Mother Exome',\n", 388 | " 'format': 'bam',\n", 389 | " 'url': BAMS + '/motherNEX.bam',\n", 390 | " 'indexURL': BAMS + '/motherNEX.bai',\n", 391 | " 'height': 200\n", 392 | "}\n", 393 | "browser.add_track(Track(**exome_track))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "Zoom in to region of interest" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "browser.search('chr20:10,025,584-10,036,143')" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "## Set up an access token to view private data\n", 417 | "IGV needs an access token to retrieve data from private buckets (including your workspace's own bucket)." 
418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "Emit an access token and save it to a file, then read it into a variable" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "!gcloud auth print-access-token > token.txt\n", 434 | "\n", 435 | "token_file = open(\"token.txt\",\"r\") \n", 436 | "token = token_file.readline()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "**Important note:** As long as this file is saved only to your notebook’s local storage, it is secure because your cloud environment is strictly personal to you and cannot be accessed by others, even if you share your workspace or your notebook with them. But don’t save this\n", 444 | "file to your workspace bucket! Saving it to the bucket would make it visible to anyone\n", 445 | "with whom you share the workspace." 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "Copy a BAM file and its index to the workspace bucket" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "! gsutil cp {BAMS}/mother.ba* {WS_BUCKET}/bams\n", 462 | "! gsutil ls {WS_BUCKET}/bams" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Include the token in the track definition of any private files" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "private_track = {\n", 479 | " 'name': 'Workspace bucket copy of Mother WGS',\n", 480 | " 'format': 'bam',\n", 481 | " 'url': WS_BUCKET + '/bams/mother.bam',\n", 482 | " 'indexURL': WS_BUCKET + '/bams/mother.bai',\n", 483 | " 'height': 200,\n", 484 | " 'oauthToken': token\n", 485 | "}\n", 486 | "\n", 487 | "browser.add_track(Track(**private_track))" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "This concludes the section on visualizing genomic data.\n", 495 | "\n", 496 | "----" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "# Running GATK Commands to Learn, Test, or Troubleshoot\n", 504 | "Now let's look at how we can run GATK commands inside the notebook.\n", 505 | "\n", 506 | "## Running a Basic GATK Command: HaplotypeCaller\n", 507 | "First we run a simple command. Note that we can run GATK directly on the files located in cloud storage — no need to copy them to local storage first." 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "Run HaplotypeCaller on files in cloud storage" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "! gatk HaplotypeCaller \\\n", 524 | "-R {REF}/ref.fasta \\\n", 525 | "-I {BAMS}/mother.bam \\\n", 526 | "-O sandbox/mother_variants.200k.vcf.gz \\\n", 527 | "-L 20:10,000,000-10,200,000" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "Verify that the output file is in the sandbox" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "! 
ls sandbox" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "**Note:** This works with GATK from anywhere with an internet connection! We could even write the output directly to a bucket if we wanted to; the output filepath just has to start with a valid `gs://` bucket address. " 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Loading the Data (BAM and VCF) into IGV\n", 558 | "Now we do a simple visual check of the result." 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Initialize a new IGV window" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "second_browser = igv.IgvBrowser(genome=genome)\n", 575 | "\n", 576 | "second_browser" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "Load the variant calls produced by the HaplotypeCaller above\n", 584 | "\n", 585 | "*Adding `'color': \"#000000\"` as a workaround to [this issue](https://github.com/QuantStack/ipyigv/issues/21).*" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "var_track = {\n", 595 | " 'name': 'Mother variants',\n", 596 | " 'format': 'vcf',\n", 597 | " 'url': 'files/sandbox/mother_variants.200k.vcf.gz',\n", 598 | " 'indexURL': 'files/sandbox/mother_variants.200k.vcf.gz.tbi',\n", 599 | " 'color': \"#000000\"\n", 600 | "}\n", 601 | "second_browser.add_track(Track(**var_track))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "second_browser.search('chr20:10,002,000-10,003,000')" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "Load the original BAM file on which we ran HaplotypeCaller" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "wgs_track = {\n", 627 | " 'name': 'Mother WGS',\n", 628 | " 'format': 'bam',\n", 629 | " 'url': BAMS + '/mother.bam',\n", 630 | " 'indexURL': BAMS + '/mother.bai',\n", 631 | " 'height': 200\n", 632 | "}\n", 633 | "second_browser.add_track(Track(**wgs_track))" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "## Troubleshooting a Questionable Variant Call in the Embedded IGV Browser\n", 641 | "Something looks odd so we do some systematic troubleshooting..." 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "Run HaplotypeCaller on the problem region to produce an output BAM, the `bamout`" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "! 
gatk HaplotypeCaller \\\n", 658 | "-R {REF}/ref.fasta \\\n", 659 | "-I {BAMS}/mother.bam \\\n", 660 | "-O sandbox/motherHCdebug.vcf \\\n", 661 | "-bamout sandbox/motherHCdebug.bam \\\n", 662 | "-L 20:10,002,000-10,003,000" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "Load the `bamout` file into the IGV window" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "bamout_track = {\n", 679 | "\"name\": \"Mother HC bamout\",\n", 680 | "\"url\": \"files/sandbox/motherHCdebug.bam\",\n", 681 | "\"indexURL\": \"files/sandbox/motherHCdebug.bai\",\n", 682 | "\"height\": 500,\n", 683 | "\"format\": \"bam\"\n", 684 | "}\n", 685 | "second_browser.add_track(Track(**bamout_track))" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "This concludes the GATK variant calling section of this notebook. \n", 693 | "\n", 694 | "----" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [] 703 | } 704 | ], 705 | "metadata": { 706 | "kernelspec": { 707 | "display_name": "Python 3", 708 | "language": "python", 709 | "name": "python3" 710 | }, 711 | "language_info": { 712 | "codemirror_mode": { 713 | "name": "ipython", 714 | "version": 3 715 | }, 716 | "file_extension": ".py", 717 | "mimetype": "text/x-python", 718 | "name": "python", 719 | "nbconvert_exporter": "python", 720 | "pygments_lexer": "ipython3", 721 | "version": "3.7.12" 722 | }, 723 | "toc": { 724 | "base_numbering": 1, 725 | "nav_menu": {}, 726 | "number_sections": true, 727 | "sideBar": true, 728 | "skip_h1_title": false, 729 | "title_cell": "Table of Contents", 730 | "title_sidebar": "Contents", 731 | "toc_cell": false, 732 | "toc_position": {}, 733 | "toc_section_display": true, 734 | "toc_window_display": true 735 | } 736 | }, 737 | "nbformat": 4, 738 | "nbformat_minor": 2 739 | } 740 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /notebooks/install_GATK_4130_with_igv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip3 install igv-jupyter 4 | 5 | jupyter serverextension enable --py igv --sys-prefix 6 | jupyter nbextension install --py igv --sys-prefix 7 | jupyter nbextension enable --py igv --sys-prefix 8 | 9 | 10 | pip3 install rpy2==3.0.4 11 | pip3 install singledispatch 12 | pip3 install tzlocal 13 | 14 | 15 | echo "install.packages(c(\"optparse\",\"data.table\"),repos=\"http://cran.us.r-project.org\")" | R --no-save 16 | 17 | 18 | set -e 19 | 20 | GATK_VERSION=4.1.3.0 21 | GATK_ZIP_PATH=/tmp/gatk-$GATK_VERSION.zip 22 | 23 | # remove pre-existing GATK version 24 | rm -rf /bin/gatk 25 | 26 | # download the gatk zip if it doesn't already exist 27 | 28 | if ! 
[ -f $GATK_ZIP_PATH ]; then 29 | # curl and follow redirects and output to a temp file 30 | curl -L -o $GATK_ZIP_PATH https://github.com/broadinstitute/gatk/releases/download/$GATK_VERSION/gatk-$GATK_VERSION.zip 31 | fi 32 | 33 | # unzip with forced overwrite (if necessary) to /bin 34 | unzip -o $GATK_ZIP_PATH -d /etc/ 35 | 36 | # make a symlink to gatk right inside bin so it's available from the existing PATH 37 | 38 | ln -s /etc/gatk-$GATK_VERSION/gatk /bin/gatk 39 | 40 | pip3 install /etc/gatk-$GATK_VERSION/gatkPythonPackageArchive.zip 41 | 42 | 43 | export PATH=$PATH:/home/jupyter-user/.local/bin -------------------------------------------------------------------------------- /notebooks/plotting.R: -------------------------------------------------------------------------------- 1 | # plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations 2 | library(ggplot2) 3 | install.packages("gridExtra") 4 | library(gridExtra) 5 | 6 | require(ggplot2, quietly = TRUE) 7 | require(gridExtra, quietly = TRUE) 8 | 9 | get_legend<-function(myggplot){ 10 | tmp <- ggplot_gtable(ggplot_build(myggplot)) 11 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 12 | legend <- tmp$grobs[[leg]] 13 | return(legend) 14 | } 15 | 16 | 17 | # Function for making density plots of a single annotation 18 | makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), alpha=0.5) { 19 | 20 | if(missing(split)) { 21 | return(ggplot(data=dataframe, aes_string(x=xvar)) + xlim(xmin,xmax) + geom_density() ) 22 | } 23 | else { 24 | return(ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + xlim(xmin,xmax) + geom_density(alpha=alpha) ) 25 | } 26 | } 27 | 28 | # Function for making scatter plots of two annotations 29 | makeScatterPlot <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, alpha=0.6) { 30 | if(missing(split)) { 31 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 32 | } 33 | else { 34 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + aes_string(color=split) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 35 | } 36 | } 37 | 38 | # Function for making scatter plots of two annotations with marginal density plots of each 39 | makeScatterPlotWithMarginalDensity <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, ptAlpha=0.6, fillAlpha=0.5) { 40 | empty <- ggplot()+geom_point(aes(1,1), colour="white") + 41 | theme( 42 | plot.background = element_blank(), 43 | panel.grid.major = element_blank(), 44 | panel.grid.minor = element_blank(), 45 | panel.border = element_blank(), 46 | panel.background = element_blank(), 47 | axis.title.x = element_blank(), 48 | axis.title.y = element_blank(), 49 | axis.text.x = element_blank(), 50 | axis.text.y = element_blank(), 51 | axis.ticks = element_blank() 52 | ) 53 | 54 | if(missing(split)){ 55 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha) + xlim(xmin,xmax) + ylim(ymin,ymax) 56 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar)) + 
geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 57 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 58 | } 59 | else{ 60 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha, aes_string(color=split)) + xlim(xmin,xmax) + ylim(ymin,ymax) 61 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 62 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar, fill=split)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 63 | } 64 | legend <- get_legend(scatter) 65 | scatter <- scatter + theme(legend.position="none") 66 | temp <- grid.arrange(plot_top, legend, scatter, plot_right, ncol=2, nrow=2, widths=c(4,1), heights=c(1,4)) 67 | return(temp) 68 | } -------------------------------------------------------------------------------- /production/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is the directory we use for some production 4 | tools and elements for the book, namely our 5 | Pygments WDL lexer, which is used to color-code 6 | the WDL code used throughout the book. 7 | 8 | If you're a reader of the book, this directory 9 | probably isn't interesting to you. 10 | -------------------------------------------------------------------------------- /production/notebook_images/cell_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_27.png -------------------------------------------------------------------------------- /production/notebook_images/cell_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_28.png -------------------------------------------------------------------------------- /production/notebook_images/cell_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_29.png -------------------------------------------------------------------------------- /production/notebook_images/cell_32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_32.png -------------------------------------------------------------------------------- /production/notebook_images/cell_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_35.png -------------------------------------------------------------------------------- /production/notebook_images/cell_36.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_36.png -------------------------------------------------------------------------------- /production/notebook_images/cell_37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_37.png -------------------------------------------------------------------------------- /production/notebook_images/cell_39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_39.png -------------------------------------------------------------------------------- /production/pygments_lexer/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is a very basic [Pygments](https://pygments.org) WDL lexer based on the Objective-C lexer. 4 | See the [WDL spec](https://github.com/openwdl/wdl/blob/master/versions/1.0/SPEC.md) 5 | for more information about the language. Also see the 6 | [wdl-sublime-syntax-highlighter](https://github.com/broadinstitute/wdl-sublime-syntax-highlighter) 7 | for syntax highlighting in Sublime and Visual Studio and 8 | [language-wdl](https://github.com/broadinstitute/language-wdl) for 9 | syntax highlighting in the Atom editor. 10 | 11 | ## Using 12 | 13 | This lexer is easy to use. First, install Pygments using pip (or whatever 14 | mechanism you like): 15 | 16 | $ pip install Pygments 17 | 18 | Then use this wdl_lexer.py file with the [Pygments Command Line](https://pygments.org/docs/cmdline/): 19 | 20 | $ pygmentize -f html -O full,style=colorful -o test.html -l wdl_lexer.py -x hello-world.wdl 21 | 22 | You can then open the test.html file in your browser. Take a look at the 23 | command line docs for information on how this works and what options 24 | are available. The style argument is useful for quickly changing the 25 | look of the syntax highlighting. See the [live demo](https://pygments.org/demo/#try) 26 | page for the list of possible styles. 27 | 28 | To generate HTML for all WDL in the repo use this script: 29 | 30 | $ bash run_wdl_lexer.sh 31 | 32 | And look in the `html_output` directory. 33 | 34 | ## Known Limitations 35 | 36 | Here are the known issues: 37 | * when command blocks include parameters that overlap with WDL keywords (such as `--create-output-variant-index`) you'll get `output` highlighted. I'm looking at ways of excluding keywords that include '-' but I haven't been able to get this to work yet. 38 | 39 | ## Future 40 | 41 | This is a very, very rough lexer for WDL and, being based on the 42 | Objective-C lexer, there's a ton of room for cleanup and improvements. 
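If you want to experiment with changes to the lexer, a quick way to check a
tweak is to render a single file straight to the terminal instead of
regenerating all of the HTML (a sketch, assuming Pygments is installed and
you run the command from this directory):

    $ pygmentize -l wdl_lexer.py -x hello-world.wdl

With no output file or formatter specified, Pygments writes colorized text to
the terminal, so you can see right away whether keywords and types are being
tokenized the way you expect.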
43 | -------------------------------------------------------------------------------- /production/pygments_lexer/hello-world.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | command { 9 | echo "Hello World" 10 | } 11 | output { 12 | File output_greeting = stdout() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /production/pygments_lexer/run_wdl_lexer.sh: -------------------------------------------------------------------------------- 1 | mkdir html_output 2 | cd .. 3 | for i in `find . | grep '.wdl$'`; 4 | do f="$(basename -- $i)" 5 | echo $f 6 | pygmentize -f html -O full,style=default -o pygments_lexer/html_output/$f.html -l pygments_lexer/wdl_lexer.py -x $i 7 | done; 8 | -------------------------------------------------------------------------------- /production/pygments_lexer/test.html: -------------------------------------------------------------------------------- 1 | 3 | 8 | 9 | 10 | 11 | 12 | 92 | 93 | 94 |

95 | 96 |
version 1.0
 97 | 
 98 | workflow HelloWorld {
 99 |   call WriteGreeting
100 | }
101 | 
102 | task WriteGreeting {
103 |   command {
104 |      echo "Hello World"
105 |   }
106 |   output {
107 |      File output_greeting = stdout()
108 |   }
109 | }
110 | 
111 | 112 | 113 | -------------------------------------------------------------------------------- /production/pygments_lexer/wdl_lexer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | pygments.lexers.objective 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | Lexers for WDL language based on Objective-C by Pygments team 7 | 8 | :copyright: Original copyright 2006-2019 by the Pygments team, see AUTHORS. 2020 copyright Brian O'Connor 9 | :license: BSD, see LICENSE for details. 10 | """ 11 | 12 | import re 13 | 14 | from pygments.lexer import RegexLexer, include, bygroups, using, this, words, \ 15 | inherit, default 16 | from pygments.token import Text, Keyword, Name, String, Operator, \ 17 | Number, Punctuation, Literal, Comment 18 | 19 | from pygments.lexers.c_cpp import CLexer, CppLexer 20 | 21 | __all__ = ['ObjectiveCLexer', 'ObjectiveCppLexer', 'LogosLexer', 'SwiftLexer'] 22 | 23 | 24 | def objective(baselexer): 25 | """ 26 | Generate a subclass of baselexer that accepts the WDL syntax 27 | extensions. 28 | """ 29 | 30 | # Have to be careful not to accidentally match JavaDoc/Doxygen syntax here, 31 | # since that's quite common in ordinary C/C++ files. It's OK to match 32 | # JavaDoc/Doxygen keywords that only apply to WDL, mind. 33 | # 34 | # The upshot of this is that we CANNOT match @class or @interface 35 | _oc_keywords = re.compile(r'@(?:end|implementation|protocol)') 36 | 37 | # Matches [ ? identifier ( identifier ? ] | identifier? : ) 38 | # (note the identifier is *optional* when there is a ':'!) 39 | _oc_message = re.compile(r'\[\s*[a-zA-Z_]\w*\s+' 40 | r'(?:[a-zA-Z_]\w*\s*\]|' 41 | r'(?:[a-zA-Z_]\w*)?:)') 42 | 43 | class GeneratedObjectiveCVariant(baselexer): 44 | """ 45 | Implements WDL syntax based on C family lexer. 46 | """ 47 | 48 | tokens = { 49 | 'statements': [ 50 | (r'@"', String, 'string'), 51 | (r"@'", String, 'string'), 52 | (r'@\\$', String, 'string'), 53 | (r'@(YES|NO)', Number), 54 | (r"@('\\$|\\.|\\[0-7]{1,3}|\\x[a-fA-F0-9]{1,2}|[^\\\'\n])'", String.Char), 55 | (r'@(\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+[lL]?', Number.Float), 56 | (r'@(\d+\.\d*|\.\d+|\d+[fF])[fF]?', Number.Float), 57 | (r'@0x[0-9a-fA-F]+[Ll]?', Number.Hex), 58 | (r'@0[0-7]+[Ll]?', Number.Oct), 59 | (r'@\d+[Ll]?', Number.Integer), 60 | (r'@\(', Literal, 'literal_number'), 61 | (r'@\[', Literal, 'literal_array'), 62 | (r'@\{', Literal, 'literal_dictionary'), 63 | (words(( 64 | 'version', 'workflow', 'task', 'command', 'output', 'input', 65 | 'runtime', 'call', 'parameter_meta', 'meta', 'scatter', 'as', 66 | 'input:', 'import', 'if', 'struct'), suffix=r'\b'), 67 | Keyword), 68 | (words(('File', 'Array', 'Int', 'Float', 'Boolean', 'String', 69 | 'Map', 'Pair', 'Object'), suffix=r'\b'), 70 | Keyword.Type), 71 | (r'@(true|false|YES|NO)\n', Name.Builtin), 72 | (r'(YES|NO|nil|self|super)\b', Name.Builtin), 73 | # Carbon types 74 | (r'(Boolean|UInt8|SInt8|UInt16|SInt16|UInt32|SInt32)\b', Keyword.Type), 75 | # Carbon built-ins 76 | (r'(true|false)\b', Name.Builtin), 77 | (r'(@interface|@implementation)(\s+)', bygroups(Keyword, Text), 78 | ('#pop', 'oc_classname')), 79 | (r'(@class|@protocol)(\s+)', bygroups(Keyword, Text), 80 | ('#pop', 'oc_forward_classname')), 81 | # @ can also prefix other expressions like @{...} or @(...) 
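                # NOTE: the keyword rule above uses words(..., suffix=r'\b') with no
                # prefix boundary, so a keyword such as 'output' also matches inside
                # hyphenated flags like --create-output-variant-index and gets
                # highlighted as a keyword (see "Known Limitations" in the README).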
82 | (r'\\$', Punctuation), 83 | (r"['$@%#]+", Name.Variable), 84 | #(r'@', Punctuation), 85 | inherit, 86 | ], 87 | 'oc_classname': [ 88 | # interface definition that inherits 89 | (r'([a-zA-Z$_][\w$]*)(\s*:\s*)([a-zA-Z$_][\w$]*)?(\s*)(\{)', 90 | bygroups(Name.Class, Text, Name.Class, Text, Punctuation), 91 | ('#pop', 'oc_ivars')), 92 | (r'([a-zA-Z$_][\w$]*)(\s*:\s*)([a-zA-Z$_][\w$]*)?', 93 | bygroups(Name.Class, Text, Name.Class), '#pop'), 94 | # interface definition for a category 95 | (r'([a-zA-Z$_][\w$]*)(\s*)(\([a-zA-Z$_][\w$]*\))(\s*)(\{)', 96 | bygroups(Name.Class, Text, Name.Label, Text, Punctuation), 97 | ('#pop', 'oc_ivars')), 98 | (r'([a-zA-Z$_][\w$]*)(\s*)(\([a-zA-Z$_][\w$]*\))', 99 | bygroups(Name.Class, Text, Name.Label), '#pop'), 100 | # simple interface / implementation 101 | (r'([a-zA-Z$_][\w$]*)(\s*)(\{)', 102 | bygroups(Name.Class, Text, Punctuation), ('#pop', 'oc_ivars')), 103 | (r'([a-zA-Z$_][\w$]*)', Name.Class, '#pop') 104 | ], 105 | 'oc_forward_classname': [ 106 | (r'([a-zA-Z$_][\w$]*)(\s*,\s*)', 107 | bygroups(Name.Class, Text), 'oc_forward_classname'), 108 | (r'([a-zA-Z$_][\w$]*)(\s*;?)', 109 | bygroups(Name.Class, Text), '#pop') 110 | ], 111 | 'oc_ivars': [ 112 | include('whitespace'), 113 | include('statements'), 114 | (';', Punctuation), 115 | (r'\{', Punctuation, '#push'), 116 | (r'\}', Punctuation, '#pop'), 117 | ], 118 | 'root': [ 119 | # methods 120 | (r'^([-+])(\s*)' # method marker 121 | r'(\(.*?\))?(\s*)' # return type 122 | r'([a-zA-Z$_][\w$]*:?)', # begin of method name 123 | bygroups(Punctuation, Text, using(this), 124 | Text, Name.Function), 125 | 'method'), 126 | inherit, 127 | ], 128 | 'method': [ 129 | include('whitespace'), 130 | # TODO unsure if ellipses are allowed elsewhere, see 131 | # discussion in Issue 789 132 | (r',', Punctuation), 133 | (r'\.\.\.', Punctuation), 134 | (r'(\(.*?\))(\s*)([a-zA-Z$_][\w$]*)', 135 | bygroups(using(this), Text, Name.Variable)), 136 | (r'[a-zA-Z$_][\w$]*:', Name.Function), 137 | (';', Punctuation, '#pop'), 138 | (r'\{', Punctuation, 'function'), 139 | default('#pop'), 140 | ], 141 | 'literal_number': [ 142 | (r'\(', Punctuation, 'literal_number_inner'), 143 | (r'\)', Literal, '#pop'), 144 | include('statement'), 145 | ], 146 | 'literal_number_inner': [ 147 | (r'\(', Punctuation, '#push'), 148 | (r'\)', Punctuation, '#pop'), 149 | include('statement'), 150 | ], 151 | 'literal_array': [ 152 | (r'\[', Punctuation, 'literal_array_inner'), 153 | (r'\]', Literal, '#pop'), 154 | include('statement'), 155 | ], 156 | 'literal_array_inner': [ 157 | (r'\[', Punctuation, '#push'), 158 | (r'\]', Punctuation, '#pop'), 159 | include('statement'), 160 | ], 161 | 'literal_dictionary': [ 162 | (r'\}', Literal, '#pop'), 163 | include('statement'), 164 | ], 165 | } 166 | 167 | def analyse_text(text): 168 | if _oc_keywords.search(text): 169 | return 1.0 170 | elif '@"' in text: # strings 171 | return 0.8 172 | elif re.search('@[0-9]+', text): 173 | return 0.7 174 | elif _oc_message.search(text): 175 | return 0.8 176 | return 0 177 | 178 | def get_tokens_unprocessed(self, text): 179 | from pygments.lexers._cocoa_builtins import COCOA_INTERFACES, \ 180 | COCOA_PROTOCOLS, COCOA_PRIMITIVES 181 | 182 | for index, token, value in \ 183 | baselexer.get_tokens_unprocessed(self, text): 184 | if token is Name or token is Name.Class: 185 | if value in COCOA_INTERFACES or value in COCOA_PROTOCOLS \ 186 | or value in COCOA_PRIMITIVES: 187 | token = Name.Builtin.Pseudo 188 | 189 | yield index, token, value 190 | 191 | return 
GeneratedObjectiveCVariant 192 | 193 | 194 | class CustomLexer(objective(CLexer)): 195 | """ 196 | For WDL workflow code. 197 | """ 198 | 199 | name = 'WDL' 200 | aliases = ['wdl'] 201 | filenames = ['*.wdl'] 202 | mimetypes = ['text/x-wdl'] 203 | priority = 0.05 # Lower than C 204 | -------------------------------------------------------------------------------- /temp/05-plotting.R: -------------------------------------------------------------------------------- 1 | # plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations 2 | 3 | library(ggplot2) 4 | install.packages("gridExtra") 5 | library(gridExtra) 6 | 7 | get_legend<-function(myggplot){ 8 | tmp <- ggplot_gtable(ggplot_build(myggplot)) 9 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 10 | legend <- tmp$grobs[[leg]] 11 | return(legend) 12 | } 13 | 14 | # Function for making density plots of a single annotation 15 | makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), alpha=0.5) { 16 | 17 | if(missing(split)) { 18 | return(ggplot(data=dataframe, aes_string(x=xvar)) + xlim(xmin,xmax) + geom_density() ) 19 | } 20 | else { 21 | return(ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + xlim(xmin,xmax) + geom_density(alpha=alpha) ) 22 | } 23 | } 24 | 25 | # Function for making scatter plots of two annotations 26 | makeScatterPlot <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, alpha=0.6) { 27 | if(missing(split)) { 28 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 29 | } 30 | else { 31 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + aes_string(color=split) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 32 | } 33 | } 34 | 35 | # Function for making scatter plots of two annotations with marginal density plots of each 36 | makeScatterPlotWithMarginalDensity <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, ptAlpha=0.6, fillAlpha=0.5) { 37 | empty <- ggplot()+geom_point(aes(1,1), colour="white") + 38 | theme( 39 | plot.background = element_blank(), 40 | panel.grid.major = element_blank(), 41 | panel.grid.minor = element_blank(), 42 | panel.border = element_blank(), 43 | panel.background = element_blank(), 44 | axis.title.x = element_blank(), 45 | axis.title.y = element_blank(), 46 | axis.text.x = element_blank(), 47 | axis.text.y = element_blank(), 48 | axis.ticks = element_blank() 49 | ) 50 | 51 | if(missing(split)){ 52 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha) + xlim(xmin,xmax) + ylim(ymin,ymax) 53 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 54 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 55 | } 56 | else{ 57 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha, aes_string(color=split)) + xlim(xmin,xmax) + ylim(ymin,ymax) 58 | 
plot_top <- ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 59 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar, fill=split)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 60 | } 61 | legend <- get_legend(scatter) 62 | scatter <- scatter + theme(legend.position="none") 63 | temp <- grid.arrange(plot_top, legend, scatter, plot_right, ncol=2, nrow=2, widths=c(4,1), heights=c(1,4)) 64 | return(temp) 65 | } 66 | -------------------------------------------------------------------------------- /workflows/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /workflows/hello-hc/hc-break1.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow is intentionally broken! 2 | 3 | version 1.0 4 | 5 | workflow HelloHaplotypeCaller { 6 | 7 | call HaplotypeCallerGVCF 8 | } 9 | 10 | task HaplotypeCallerGVCF { 11 | 12 | input { 13 | String docker_image 14 | String java_opt 15 | 16 | File ref_fasta 17 | File ref_index 18 | File ref_dict 19 | File input_bam 20 | File input_bam_index 21 | File intervals 22 | } 23 | 24 | # The basename() function is missing its right parenthesis (rparen) 25 | String gvcf_name = basename(input_bam, ".bam" + ".g.vcf" 26 | 27 | command { 28 | gatk --java-options ${java_opt} HaploCaller \ 29 | -R ${ref_fasta} \ 30 | -I ${input_bam} \ 31 | -O ${gvcf_name} \ 32 | -L ${intervals} \ 33 | -ERC GVCF 34 | } 35 | 36 | output { 37 | File output_gvcf = "${gvcf_name}" 38 | } 39 | 40 | runtime { 41 | docker: docker_image 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /workflows/hello-hc/hc-break2.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow is intentionally broken! 
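## The working version lives in this same directory as hello-haplotypecaller.wdl.
## As a sketch of the usual first check (assuming a local copy of Womtool, with
## "womtool-XX.jar" standing in for whatever version you downloaded):
##   java -jar womtool-XX.jar validate hc-break2.wdl
## passes for this file, because the bug here only shows up at runtime; compare
## hc-break1.wdl, where validation itself fails on the syntax error.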
2 | 3 | version 1.0 4 | 5 | workflow HelloHaplotypeCaller { 6 | 7 | call HaplotypeCallerGVCF 8 | } 9 | 10 | task HaplotypeCallerGVCF { 11 | 12 | input { 13 | String docker_image 14 | String java_opt 15 | 16 | File ref_fasta 17 | File ref_index 18 | File ref_dict 19 | File input_bam 20 | File input_bam_index 21 | File intervals 22 | } 23 | 24 | String gvcf_name = basename(input_bam, ".bam") + ".g.vcf" 25 | 26 | # The tool name in this command is wrong 27 | # (HaploCaller instead of HaplotypeCaller) 28 | command { 29 | gatk --java-options ${java_opt} HaploCaller \ 30 | -R ${ref_fasta} \ 31 | -I ${input_bam} \ 32 | -O ${gvcf_name} \ 33 | -L ${intervals} \ 34 | -ERC GVCF 35 | } 36 | 37 | output { 38 | File output_gvcf = "${gvcf_name}" 39 | } 40 | 41 | runtime { 42 | docker: docker_image 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /workflows/hello-hc/hello-haplotypecaller.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.input_bam_index": "book/data/germline/bams/mother.bai", 3 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.input_bam": "book/data/germline/bams/mother.bam", 4 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_fasta": "book/data/germline/ref/ref.fasta", 5 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_index": "book/data/germline/ref/ref.fasta.fai", 6 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_dict": "book/data/germline/ref/ref.dict", 7 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.intervals": "book/data/germline/intervals/snippet-intervals-min.list", 8 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 9 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.java_opt": "-Xmx8G" 10 | } 11 | -------------------------------------------------------------------------------- /workflows/hello-hc/hello-haplotypecaller.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow runs the HaplotypeCaller tool from GATK4 in GVCF mode 2 | ## on a single sample in BAM format and produces a single GVCF file, 3 | ## which can then be used by the joint-discovery workflow according 4 | ## to the GATK Best Practices for germline short variant discovery. 
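##
## Example invocation with Cromwell, as a sketch rather than the only way to
## run it ("cromwell-XX.jar" is a placeholder for your local Cromwell jar, and
## the companion inputs file refers to data under book/data/, so launch this
## from the directory that contains that folder):
##   java -jar cromwell-XX.jar run hello-haplotypecaller.wdl \
##     --inputs hello-haplotypecaller.inputs.json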
5 | 6 | version 1.0 7 | 8 | workflow HelloHaplotypeCaller { 9 | 10 | call HaplotypeCallerGVCF 11 | } 12 | 13 | task HaplotypeCallerGVCF { 14 | 15 | input { 16 | String docker_image 17 | String java_opt 18 | 19 | File ref_fasta 20 | File ref_index 21 | File ref_dict 22 | File input_bam 23 | File input_bam_index 24 | File intervals 25 | } 26 | 27 | String gvcf_name = basename(input_bam, ".bam") + ".g.vcf" 28 | 29 | command { 30 | gatk --java-options ${java_opt} HaplotypeCaller \ 31 | -R ${ref_fasta} \ 32 | -I ${input_bam} \ 33 | -O ${gvcf_name} \ 34 | -L ${intervals} \ 35 | -ERC GVCF 36 | } 37 | 38 | output { 39 | File output_gvcf = "${gvcf_name}" 40 | } 41 | 42 | runtime { 43 | docker: docker_image 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world-again.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | 5 | call WriteGreeting 6 | 7 | call ReadItBackToMe { 8 | input: 9 | written_greeting = WriteGreeting.output_greeting 10 | } 11 | 12 | output { 13 | File outfile = ReadItBackToMe.repeated_greeting 14 | } 15 | } 16 | 17 | task WriteGreeting { 18 | 19 | input { 20 | String greeting 21 | } 22 | 23 | command { 24 | echo "${greeting}" 25 | } 26 | output { 27 | File output_greeting = stdout() 28 | } 29 | } 30 | 31 | task ReadItBackToMe { 32 | 33 | input { 34 | File written_greeting 35 | String original_greeting = read_string(written_greeting) 36 | } 37 | 38 | command { 39 | echo "${original_greeting} to you too" 40 | } 41 | output { 42 | File repeated_greeting = stdout() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world-var.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | 9 | input { 10 | String greeting 11 | } 12 | 13 | command { 14 | echo "${greeting}" 15 | } 16 | 17 | output { 18 | File output_greeting = stdout() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloWorld.WriteGreeting.greeting": "Hello Variable World" 3 | } -------------------------------------------------------------------------------- /workflows/hello-world/hello-world.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | command { 9 | echo "Hello World" 10 | } 11 | output { 12 | File output_greeting = stdout() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /workflows/mystery-1/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "INPUT BAM", 3 | "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", 4 | "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", 5 | "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", 6 | "HaplotypeCallerGvcf_GATK4.input_bam_index": 
"gs://broad-public-datasets/NA12878/NA12878.cram.crai", 7 | 8 | "##_COMMENT2": "REFERENCE FILES", 9 | "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", 10 | "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", 11 | "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 12 | 13 | "##_COMMENT3": "INTERVALS", 14 | "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", 15 | 16 | "##_COMMENT4": "MISCELLANEOUS PARAMETERS", 17 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", 18 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", 19 | 20 | "##_COMMENT5": "DOCKERS", 21 | "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", 22 | "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", 23 | 24 | "##_COMMENT6": "PATHS", 25 | "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", 26 | "#HaplotypeCallerGvcf_GATK4.samtools_path_override": "String? (optional)", 27 | 28 | "##_COMMENT7": "JAVA OPTIONS", 29 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", 30 | 31 | "##_COMMENT8": "MEMORY ALLOCATION", 32 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", 33 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", 34 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", 35 | 36 | "##_COMMENT9": "DISK SIZE ALLOCATION", 37 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", 38 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", 39 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? (optional)", 40 | 41 | "##_COMMENT10": "PREEMPTION", 42 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", 43 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", 44 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)" 45 | } 46 | -------------------------------------------------------------------------------- /workflows/mystery-1/haplotypecaller-gvcf-gatk4.wdl: -------------------------------------------------------------------------------- 1 | ## Copyright Broad Institute, 2020 2 | ## 3 | ## Adapted from https://github.com/gatk-workflows/gatk4-germline-snps-indels/blob/master/haplotypecaller-gvcf-gatk4.wdl 4 | ## 5 | ## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool 6 | ## from GATK4 in GVCF mode on a single sample according to GATK Best Practices. 7 | ## When executed, the workflow scatters the HaplotypeCaller tool over a sample 8 | ## using an intervals list file. The output file produced will be a 9 | ## single GVCF file that can be used by the joint-discovery workflow. 10 | ## 11 | ## Requirements/expectations : 12 | ## - One analysis-ready BAM file for a single sample (as identified in RG:SM) 13 | ## - Set of variant calling intervals lists for the scatter, provided in a file 14 | ## 15 | ## Outputs : 16 | ## - One GVCF file and its index 17 | ## 18 | ## Cromwell version support 19 | ## - Successfully tested on v48 20 | ## - Requires WDL 1.0 support 21 | ## 22 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 
23 | ## 24 | ## LICENSING : 25 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 26 | ## https://github.com/openwdl/wdl). Note however that the programs it calls may 27 | ## be subject to different licenses. Users are responsible for checking that they are 28 | ## authorized to run all programs before running this script. Please see the dockers 29 | ## for detailed licensing information pertaining to the included programs. 30 | 31 | version 1.0 32 | 33 | # WORKFLOW DEFINITION 34 | workflow HaplotypeCallerGvcf_GATK4 { 35 | 36 | input { 37 | File input_bam 38 | File input_bam_index 39 | File ref_dict 40 | File ref_fasta 41 | File ref_fasta_index 42 | File scattered_calling_intervals_list 43 | 44 | Boolean? make_gvcf 45 | Boolean making_gvcf = select_first([make_gvcf,true]) 46 | 47 | String? gatk_docker_override 48 | String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:4.1.0.0"]) 49 | String? gatk_path_override 50 | String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) 51 | String? gitc_docker_override 52 | String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) 53 | String? samtools_path_override 54 | String samtools_path = select_first([samtools_path_override, "samtools"]) 55 | } 56 | 57 | Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) 58 | 59 | #is the input a cram file? 60 | Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" 61 | 62 | String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") 63 | String vcf_basename = sample_basename 64 | String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" 65 | String output_filename = vcf_basename + output_suffix 66 | 67 | if ( is_cram ) { 68 | call CramToBamTask { 69 | input: 70 | input_cram = input_bam, 71 | sample_name = sample_basename, 72 | ref_dict = ref_dict, 73 | ref_fasta = ref_fasta, 74 | ref_fasta_index = ref_fasta_index, 75 | docker = gitc_docker, 76 | samtools_path = samtools_path 77 | } 78 | } 79 | 80 | # Call variants in parallel over grouped calling intervals 81 | scatter (interval_file in scattered_calling_intervals) { 82 | 83 | # Generate GVCF by interval 84 | call HaplotypeCaller { 85 | input: 86 | input_bam = select_first([CramToBamTask.output_bam, input_bam]), 87 | input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), 88 | interval_list = interval_file, 89 | output_filename = output_filename, 90 | ref_dict = ref_dict, 91 | ref_fasta = ref_fasta, 92 | ref_fasta_index = ref_fasta_index, 93 | make_gvcf = making_gvcf, 94 | docker = gatk_docker, 95 | gatk_path = gatk_path 96 | } 97 | } 98 | 99 | # Merge per-interval GVCFs 100 | call MergeGVCFs { 101 | input: 102 | input_vcfs = HaplotypeCaller.output_vcf, 103 | input_vcfs_indexes = HaplotypeCaller.output_vcf_index, 104 | output_filename = output_filename, 105 | docker = gatk_docker, 106 | gatk_path = gatk_path 107 | } 108 | 109 | # Outputs that will be retained when execution is complete 110 | output { 111 | File output_vcf = MergeGVCFs.output_vcf 112 | File output_vcf_index = MergeGVCFs.output_vcf_index 113 | } 114 | } 115 | 116 | # TASK DEFINITIONS 117 | 118 | task CramToBamTask { 119 | 120 | input { 121 | # Command parameters 122 | File ref_fasta 123 | File ref_fasta_index 124 | File ref_dict 125 | File input_cram 126 | String sample_name 127 | 128 | # Runtime parameters 129 | String docker 130 | Int? 
machine_mem_gb 131 | Int? disk_space_gb 132 | Boolean use_ssd = false 133 | Int? preemptible_attempts 134 | String samtools_path 135 | } 136 | 137 | Float output_bam_size = size(input_cram, "GB") / 0.60 138 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 139 | Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 140 | 141 | command { 142 | set -e 143 | set -o pipefail 144 | 145 | ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | 146 | ${samtools_path} view -b -o ${sample_name}.bam - 147 | ${samtools_path} index -b ${sample_name}.bam 148 | mv ${sample_name}.bam.bai ${sample_name}.bai 149 | } 150 | runtime { 151 | docker: docker 152 | memory: select_first([machine_mem_gb, 15]) + " GB" 153 | disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" 154 | preemptible: select_first([preemptible_attempts, 3]) 155 | } 156 | output { 157 | File output_bam = "${sample_name}.bam" 158 | File output_bai = "${sample_name}.bai" 159 | } 160 | } 161 | 162 | # HaplotypeCaller per-sample in GVCF mode 163 | task HaplotypeCaller { 164 | 165 | input { 166 | File input_bam 167 | File input_bam_index 168 | File interval_list 169 | String output_filename 170 | File ref_dict 171 | File ref_fasta 172 | File ref_fasta_index 173 | Float? contamination 174 | Boolean make_gvcf 175 | 176 | String gatk_path 177 | String? java_options 178 | String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) 179 | 180 | # Runtime parameters 181 | String docker 182 | Int? mem_gb 183 | Int? disk_space_gb 184 | Boolean use_ssd = false 185 | Int? preemptible_attempts 186 | } 187 | 188 | Int machine_mem_gb = select_first([mem_gb, 7]) 189 | Int command_mem_gb = machine_mem_gb - 1 190 | 191 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 192 | Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 193 | 194 | command <<< 195 | set -e 196 | 197 | ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ 198 | HaplotypeCaller \ 199 | -R ${ref_fasta} \ 200 | -I ${input_bam} \ 201 | -L ${interval_list} \ 202 | -O ${output_filename} \ 203 | -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} 204 | >>> 205 | 206 | runtime { 207 | docker: docker 208 | memory: machine_mem_gb + " GB" 209 | disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" 210 | preemptible: select_first([preemptible_attempts, 3]) 211 | } 212 | 213 | output { 214 | File output_vcf = "${output_filename}" 215 | File output_vcf_index = "${output_filename}.tbi" 216 | } 217 | } 218 | # Merge GVCFs generated per-interval for the same sample 219 | task MergeGVCFs { 220 | 221 | input { 222 | Array[File] input_vcfs 223 | Array[File] input_vcfs_indexes 224 | String output_filename 225 | 226 | String gatk_path 227 | 228 | # Runtime parameters 229 | String docker 230 | Int? mem_gb 231 | Int? disk_space_gb 232 | Boolean use_ssd = false 233 | Int? 
preemptible_attempts 234 | } 235 | 236 | Int machine_mem_gb = select_first([mem_gb, 3]) 237 | Int command_mem_gb = machine_mem_gb - 1 238 | 239 | command <<< 240 | set -e 241 | 242 | ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ 243 | MergeVcfs \ 244 | --INPUT ${sep=' --INPUT ' input_vcfs} \ 245 | --OUTPUT ${output_filename} 246 | >>> 247 | 248 | runtime { 249 | docker: docker 250 | memory: machine_mem_gb + " GB" 251 | disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" 252 | preemptible: select_first([preemptible_attempts, 3]) 253 | } 254 | 255 | 256 | output { 257 | File output_vcf = "${output_filename}" 258 | File output_vcf_index = "${output_filename}.tbi" 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /workflows/mystery-2/WholeGenomeGermlineSingleSample.hg38.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": { 3 | "sample_name": "NA12878 PLUMBING", 4 | "base_file_name": "NA12878_PLUMBING", 5 | "flowcell_unmapped_bams": [ 6 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 7 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 8 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 9 | ], 10 | "final_gvcf_base_name": "NA12878_PLUMBING", 11 | "unmapped_bam_suffix": ".bam" 12 | }, 13 | 14 | "WholeGenomeGermlineSingleSample.references": { 15 | "fingerprint_genotypes_file": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf", 16 | "fingerprint_genotypes_index": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf.idx", 17 | "contamination_sites_ud": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "contamination_sites_bed": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "contamination_sites_mu": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "calling_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "haplotype_scatter_count": 10, 22 | "break_bands_at_multiples_of": 100000, 23 | "reference_fasta" : { 24 | "ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", 25 | "ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", 26 | "ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 27 | "ref_alt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 28 | "ref_sa": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 29 | "ref_amb": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 30 | "ref_bwt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 31 | "ref_ann": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 32 | "ref_pac": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac" 33 | }, 34 | "known_indels_sites_vcfs": [ 35 | "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 36 | "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 37 | ], 38 | "known_indels_sites_indices": [ 39 | "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 40 | 
"gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 41 | ], 42 | "dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 43 | "dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 44 | "evaluation_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list" 45 | }, 46 | 47 | "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", 48 | 49 | "WholeGenomeGermlineSingleSample.papi_settings": { 50 | "preemptible_tries": 3, 51 | "agg_preemptible_tries": 3 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /workflows/mystery-2/WholeGenomeGermlineSingleSample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data pre-processing and initial variant calling (GVCF 6 | ## generation) according to the GATK Best Practices (June 2016) for germline SNP and 7 | ## Indel discovery in human whole-genome data. 8 | ## 9 | ## Requirements/expectations : 10 | ## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format 11 | ## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) 12 | ## - Input uBAM files must additionally comply with the following requirements: 13 | ## - - filenames all have the same suffix (we use ".unmapped.bam") 14 | ## - - files must pass validation by ValidateSamFile 15 | ## - - reads are provided in query-sorted order 16 | ## - - all reads must have an RG tag 17 | ## - GVCF output names must end in ".g.vcf.gz" 18 | ## - Reference genome must be Hg38 with ALT contigs 19 | ## 20 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 21 | ## For program versions, see docker containers. 22 | ## 23 | ## LICENSING : 24 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 25 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 26 | ## be subject to different licenses. Users are responsible for checking that they are 27 | ## authorized to run all programs before running this script. Please see the docker 28 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 29 | ## licensing information pertaining to the included programs. 
30 | 31 | # Local import 32 | #import "../../../../pipelines/dna_seq/UnmappedBamToAlignedBam.wdl" as ToBam 33 | #import "../../../../tasks/AggregatedBamQC.wdl" as AggregatedQC 34 | #import "../../../../tasks/GermlineVariantDiscovery.wdl" as Calling 35 | #import "../../../../tasks/Qc.wdl" as QC 36 | #import "../../../../tasks/Utilities.wdl" as Utils 37 | #import "../../../../tasks/BamToCram.wdl" as ToCram 38 | #import "../../../../tasks/VariantCalling.wdl" as ToGvcf 39 | #import "../../../../structs/dna_seq/germline/GermlineStructs.wdl" 40 | 41 | # Git URL import 42 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/UnmappedBamToAlignedBam.wdl" as ToBam 43 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/AggregatedBamQC.wdl" as AggregatedQC 44 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 45 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 46 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 47 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamToCram.wdl" as ToCram 48 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/VariantCalling.wdl" as ToGvcf 49 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 50 | 51 | # WORKFLOW DEFINITION 52 | workflow WholeGenomeGermlineSingleSample { 53 | input { 54 | SampleAndUnmappedBams sample_and_unmapped_bams 55 | GermlineSingleSampleReferences references 56 | PapiSettings papi_settings 57 | File wgs_coverage_interval_list 58 | 59 | File? 
haplotype_database_file 60 | Boolean provide_bam_output = false 61 | Boolean use_gatk3_haplotype_caller = true 62 | } 63 | 64 | # Not overridable: 65 | Int read_length = 250 66 | Float lod_threshold = -20.0 67 | String cross_check_fingerprints_by = "READGROUP" 68 | String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" 69 | 70 | call ToBam.UnmappedBamToAlignedBam { 71 | input: 72 | sample_and_unmapped_bams = sample_and_unmapped_bams, 73 | references = references, 74 | papi_settings = papi_settings, 75 | 76 | cross_check_fingerprints_by = cross_check_fingerprints_by, 77 | haplotype_database_file = haplotype_database_file, 78 | lod_threshold = lod_threshold, 79 | recalibrated_bam_basename = recalibrated_bam_basename 80 | } 81 | 82 | call AggregatedQC.AggregatedBamQC { 83 | input: 84 | base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, 85 | base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, 86 | base_name = sample_and_unmapped_bams.base_file_name, 87 | sample_name = sample_and_unmapped_bams.sample_name, 88 | recalibrated_bam_base_name = recalibrated_bam_basename, 89 | haplotype_database_file = haplotype_database_file, 90 | references = references, 91 | papi_settings = papi_settings 92 | } 93 | 94 | call ToCram.BamToCram as BamToCram { 95 | input: 96 | input_bam = UnmappedBamToAlignedBam.output_bam, 97 | ref_fasta = references.reference_fasta.ref_fasta, 98 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 99 | ref_dict = references.reference_fasta.ref_dict, 100 | duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, 101 | chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, 102 | base_file_name = sample_and_unmapped_bams.base_file_name, 103 | agg_preemptible_tries = papi_settings.agg_preemptible_tries 104 | } 105 | 106 | # QC the sample WGS metrics (stringent thresholds) 107 | call QC.CollectWgsMetrics as CollectWgsMetrics { 108 | input: 109 | input_bam = UnmappedBamToAlignedBam.output_bam, 110 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 111 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", 112 | ref_fasta = references.reference_fasta.ref_fasta, 113 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 114 | wgs_coverage_interval_list = wgs_coverage_interval_list, 115 | read_length = read_length, 116 | preemptible_tries = papi_settings.agg_preemptible_tries 117 | } 118 | 119 | # QC the sample raw WGS metrics (common thresholds) 120 | call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { 121 | input: 122 | input_bam = UnmappedBamToAlignedBam.output_bam, 123 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 124 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", 125 | ref_fasta = references.reference_fasta.ref_fasta, 126 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 127 | wgs_coverage_interval_list = wgs_coverage_interval_list, 128 | read_length = read_length, 129 | preemptible_tries = papi_settings.agg_preemptible_tries 130 | } 131 | 132 | call ToGvcf.VariantCalling as BamToGvcf { 133 | input: 134 | calling_interval_list = references.calling_interval_list, 135 | evaluation_interval_list = references.evaluation_interval_list, 136 | haplotype_scatter_count = references.haplotype_scatter_count, 137 | break_bands_at_multiples_of = references.break_bands_at_multiples_of, 138 | contamination = UnmappedBamToAlignedBam.contamination, 139 | input_bam = 
UnmappedBamToAlignedBam.output_bam, 140 | ref_fasta = references.reference_fasta.ref_fasta, 141 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 142 | ref_dict = references.reference_fasta.ref_dict, 143 | dbsnp_vcf = references.dbsnp_vcf, 144 | dbsnp_vcf_index = references.dbsnp_vcf_index, 145 | base_file_name = sample_and_unmapped_bams.base_file_name, 146 | final_vcf_base_name = sample_and_unmapped_bams.final_gvcf_base_name, 147 | agg_preemptible_tries = papi_settings.agg_preemptible_tries, 148 | use_gatk3_haplotype_caller = use_gatk3_haplotype_caller 149 | } 150 | 151 | if (provide_bam_output) { 152 | File provided_output_bam = UnmappedBamToAlignedBam.output_bam 153 | File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index 154 | } 155 | 156 | # Outputs that will be retained when execution is complete 157 | output { 158 | Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics 159 | 160 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf 161 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics 162 | Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf 163 | Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics 164 | Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf 165 | Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics 166 | Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf 167 | Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics 168 | 169 | File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics 170 | File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics 171 | File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf 172 | File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics 173 | 174 | File? 
cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics 175 | 176 | File selfSM = UnmappedBamToAlignedBam.selfSM 177 | Float contamination = UnmappedBamToAlignedBam.contamination 178 | 179 | File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 180 | 181 | File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics 182 | File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics 183 | File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics 184 | File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics 185 | File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf 186 | File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics 187 | File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf 188 | File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics 189 | File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics 190 | File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics 191 | File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf 192 | File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics 193 | File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics 194 | 195 | File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics 196 | File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics 197 | 198 | File wgs_metrics = CollectWgsMetrics.metrics 199 | File raw_wgs_metrics = CollectRawWgsMetrics.metrics 200 | 201 | File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics 202 | File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports 203 | 204 | File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics 205 | File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics 206 | 207 | File? output_bam = provided_output_bam 208 | File? output_bam_index = provided_output_bam_index 209 | 210 | File output_cram = BamToCram.output_cram 211 | File output_cram_index = BamToCram.output_cram_index 212 | File output_cram_md5 = BamToCram.output_cram_md5 213 | 214 | File validate_cram_file_report = BamToCram.validate_cram_file_report 215 | 216 | File output_vcf = BamToGvcf.output_vcf 217 | File output_vcf_index = BamToGvcf.output_vcf_index 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /workflows/mystery-2/structs/GermlineStructs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | struct SampleAndUnmappedBams { 4 | String base_file_name 5 | String final_gvcf_base_name 6 | Array[File] flowcell_unmapped_bams 7 | String sample_name 8 | String unmapped_bam_suffix 9 | } 10 | 11 | struct ReferenceFasta { 12 | File ref_dict 13 | File ref_fasta 14 | File ref_fasta_index 15 | File ref_alt 16 | File ref_sa 17 | File ref_amb 18 | File ref_bwt 19 | File ref_ann 20 | File ref_pac 21 | } 22 | 23 | struct GermlineSingleSampleReferences { 24 | File? fingerprint_genotypes_file 25 | File? 
fingerprint_genotypes_index 26 | 27 | File contamination_sites_ud 28 | File contamination_sites_bed 29 | File contamination_sites_mu 30 | File calling_interval_list 31 | 32 | Int haplotype_scatter_count 33 | Int break_bands_at_multiples_of 34 | 35 | ReferenceFasta reference_fasta 36 | 37 | Array[File] known_indels_sites_vcfs 38 | Array[File] known_indels_sites_indices 39 | 40 | File dbsnp_vcf 41 | File dbsnp_vcf_index 42 | 43 | File evaluation_interval_list 44 | } 45 | 46 | struct ExomeGermlineSingleSampleOligos { 47 | File target_interval_list 48 | File bait_interval_list 49 | String bait_set_name 50 | } 51 | 52 | struct CrossSpeciesContaminationReferences { 53 | File filter_bwa_image 54 | File kmer_file 55 | File meats_bwa_image 56 | File meats_fasta 57 | File meats_fasta_dict 58 | File meats_taxonomy_file 59 | File microbe_bwa_image 60 | File microbe_fasta 61 | File microbe_fasta_dict 62 | File microbe_taxonomy_file 63 | File normalization_file 64 | File metrics_script_file 65 | Float score_min_identity 66 | Int reads_after_downsampling 67 | } 68 | 69 | struct PapiSettings { 70 | Int preemptible_tries 71 | Int agg_preemptible_tries 72 | } 73 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/AggregatedBamQC.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | ## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 5 | ## for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Local import 19 | #import "./Qc.wdl" as QC 20 | #import "../structs/GermlineStructs.wdl" 21 | 22 | # Git URL import 23 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 24 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 25 | 26 | # WORKFLOW DEFINITION 27 | workflow AggregatedBamQC { 28 | input { 29 | File base_recalibrated_bam 30 | File base_recalibrated_bam_index 31 | String base_name 32 | String sample_name 33 | String recalibrated_bam_base_name 34 | File? 
haplotype_database_file 35 | GermlineSingleSampleReferences references 36 | PapiSettings papi_settings 37 | } 38 | 39 | # QC the final BAM (consolidated after scattered BQSR) 40 | call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { 41 | input: 42 | input_bam = base_recalibrated_bam, 43 | input_bam_index = base_recalibrated_bam_index, 44 | output_bam_prefix = base_name + ".readgroup", 45 | ref_dict = references.reference_fasta.ref_dict, 46 | ref_fasta = references.reference_fasta.ref_fasta, 47 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 48 | preemptible_tries = papi_settings.agg_preemptible_tries 49 | } 50 | 51 | # QC the final BAM some more (no such thing as too much QC) 52 | call QC.CollectAggregationMetrics as CollectAggregationMetrics { 53 | input: 54 | input_bam = base_recalibrated_bam, 55 | input_bam_index = base_recalibrated_bam_index, 56 | output_bam_prefix = base_name, 57 | ref_dict = references.reference_fasta.ref_dict, 58 | ref_fasta = references.reference_fasta.ref_fasta, 59 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 60 | preemptible_tries = papi_settings.agg_preemptible_tries 61 | } 62 | 63 | if (defined(haplotype_database_file) && defined(references.fingerprint_genotypes_file)) { 64 | # Check the sample BAM fingerprint against the sample array 65 | call QC.CheckFingerprint as CheckFingerprint { 66 | input: 67 | input_bam = base_recalibrated_bam, 68 | input_bam_index = base_recalibrated_bam_index, 69 | haplotype_database_file = haplotype_database_file, 70 | genotypes = references.fingerprint_genotypes_file, 71 | genotypes_index = references.fingerprint_genotypes_index, 72 | output_basename = base_name, 73 | sample = sample_name, 74 | preemptible_tries = papi_settings.agg_preemptible_tries 75 | } 76 | } 77 | 78 | # Generate a checksum per readgroup in the final BAM 79 | call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { 80 | input: 81 | input_bam = base_recalibrated_bam, 82 | input_bam_index = base_recalibrated_bam_index, 83 | read_group_md5_filename = recalibrated_bam_base_name + ".bam.read_group_md5", 84 | preemptible_tries = papi_settings.agg_preemptible_tries 85 | } 86 | 87 | output { 88 | File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics 89 | File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics 90 | File read_group_gc_bias_pdf = CollectReadgroupBamQualityMetrics.gc_bias_pdf 91 | File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics 92 | 93 | File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file 94 | 95 | File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics 96 | File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics 97 | File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics 98 | File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics 99 | File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf 100 | File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics 101 | File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf 102 | File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics 103 | File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics 104 | File 
agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics 105 | File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf 106 | File agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics 107 | File agg_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics 108 | 109 | File? fingerprint_summary_metrics = CheckFingerprint.summary_metrics 110 | File? fingerprint_detail_metrics = CheckFingerprint.detail_metrics 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/Alignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | ## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data. 5 | ## 6 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 7 | ## For program versions, see docker containers. 8 | ## 9 | ## LICENSING : 10 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 11 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 12 | ## be subject to different licenses. Users are responsible for checking that they are 13 | ## authorized to run all programs before running this script. Please see the docker 14 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 15 | ## licensing information pertaining to the included programs. 16 | 17 | # Local Import 18 | #import "../structs/GermlineStructs.wdl" 19 | 20 | # Git URL Import 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 22 | 23 | # Get version of BWA 24 | task GetBwaVersion { 25 | command { 26 | # not setting set -o pipefail here because /bwa has a rc=1 and we dont want to allow rc=1 to succeed because 27 | # the sed may also fail with that error and that is something we actually want to fail on. 28 | /usr/gitc/bwa 2>&1 | \ 29 | grep -e '^Version' | \ 30 | sed 's/Version: //' 31 | } 32 | runtime { 33 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 34 | memory: "1 GiB" 35 | } 36 | output { 37 | String bwa_version = read_string(stdout()) 38 | } 39 | } 40 | 41 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 42 | task SamToFastqAndBwaMemAndMba { 43 | input { 44 | File input_bam 45 | String bwa_commandline 46 | String bwa_version 47 | String output_bam_basename 48 | 49 | # reference_fasta.ref_alt is the .alt file from bwa-kit 50 | # (https://github.com/lh3/bwa/tree/master/bwakit), 51 | # listing the reference contigs that are "alternative". 52 | ReferenceFasta reference_fasta 53 | 54 | Int compression_level 55 | Int preemptible_tries 56 | } 57 | 58 | Float unmapped_bam_size = size(input_bam, "GiB") 59 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 60 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 61 | # Sometimes the output is larger than the input, or a task can spill to disk. 
62 | # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). 63 | Float disk_multiplier = 2.5 64 | Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 65 | 66 | command <<< 67 | set -o pipefail 68 | set -e 69 | 70 | # set the bash variable needed for the command-line 71 | bash_ref_fasta=~{reference_fasta.ref_fasta} 72 | # if reference_fasta.ref_alt has data in it, 73 | if [ -s ~{reference_fasta.ref_alt} ]; then 74 | java -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ 75 | SamToFastq \ 76 | INPUT=~{input_bam} \ 77 | FASTQ=/dev/stdout \ 78 | INTERLEAVE=true \ 79 | NON_PF=true | \ 80 | /usr/gitc/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \ 81 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ 82 | MergeBamAlignment \ 83 | VALIDATION_STRINGENCY=SILENT \ 84 | EXPECTED_ORIENTATIONS=FR \ 85 | ATTRIBUTES_TO_RETAIN=X0 \ 86 | ATTRIBUTES_TO_REMOVE=NM \ 87 | ATTRIBUTES_TO_REMOVE=MD \ 88 | ALIGNED_BAM=/dev/stdin \ 89 | UNMAPPED_BAM=~{input_bam} \ 90 | OUTPUT=~{output_bam_basename}.bam \ 91 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 92 | PAIRED_RUN=true \ 93 | SORT_ORDER="unsorted" \ 94 | IS_BISULFITE_SEQUENCE=false \ 95 | ALIGNED_READS_ONLY=false \ 96 | CLIP_ADAPTERS=false \ 97 | MAX_RECORDS_IN_RAM=2000000 \ 98 | ADD_MATE_CIGAR=true \ 99 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 100 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 101 | PROGRAM_RECORD_ID="bwamem" \ 102 | PROGRAM_GROUP_VERSION="~{bwa_version}" \ 103 | PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \ 104 | PROGRAM_GROUP_NAME="bwamem" \ 105 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 106 | ALIGNER_PROPER_PAIR_FLAGS=true \ 107 | UNMAP_CONTAMINANT_READS=true \ 108 | ADD_PG_TAG_TO_READS=false 109 | 110 | grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \ 111 | grep -v "read 0 ALT contigs" 112 | 113 | # else reference_fasta.ref_alt is empty or could not be found 114 | else 115 | exit 1; 116 | fi 117 | >>> 118 | runtime { 119 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 120 | preemptible: preemptible_tries 121 | memory: "14 GiB" 122 | cpu: "16" 123 | disks: "local-disk " + disk_size + " HDD" 124 | } 125 | output { 126 | File output_bam = "~{output_bam_basename}.bam" 127 | File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log" 128 | } 129 | } 130 | 131 | task SamSplitter { 132 | input { 133 | File input_bam 134 | Int n_reads 135 | Int preemptible_tries 136 | Int compression_level 137 | } 138 | 139 | Float unmapped_bam_size = size(input_bam, "GiB") 140 | # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 
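    # Illustrative arithmetic (hypothetical input size): a 30 GiB unmapped BAM gives
    # disk_size = ceil(2.5 * 30 + 20) = 95 GiB of local disk for the split outputs.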
141 | Float disk_multiplier = 2.5 142 | Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20) 143 | 144 | command { 145 | set -e 146 | mkdir output_dir 147 | 148 | total_reads=$(samtools view -c ~{input_bam}) 149 | 150 | java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -jar /usr/gitc/picard.jar SplitSamByNumberOfReads \ 151 | INPUT=~{input_bam} \ 152 | OUTPUT=output_dir \ 153 | SPLIT_TO_N_READS=~{n_reads} \ 154 | TOTAL_READS_IN_INPUT=$total_reads 155 | } 156 | output { 157 | Array[File] split_bams = glob("output_dir/*.bam") 158 | } 159 | runtime { 160 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 161 | preemptible: preemptible_tries 162 | memory: "3.75 GiB" 163 | disks: "local-disk " + disk_size + " HDD" 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/BamProcessing.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Sort BAM file by coordinate order 19 | task SortSam { 20 | input { 21 | File input_bam 22 | String output_bam_basename 23 | Int preemptible_tries 24 | Int compression_level 25 | } 26 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 27 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 28 | Float sort_sam_disk_multiplier = 3.25 29 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 30 | 31 | command { 32 | java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ 33 | SortSam \ 34 | INPUT=~{input_bam} \ 35 | OUTPUT=~{output_bam_basename}.bam \ 36 | SORT_ORDER="coordinate" \ 37 | CREATE_INDEX=true \ 38 | CREATE_MD5_FILE=true \ 39 | MAX_RECORDS_IN_RAM=300000 40 | 41 | } 42 | runtime { 43 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 44 | disks: "local-disk " + disk_size + " HDD" 45 | cpu: "1" 46 | memory: "5000 MiB" 47 | preemptible: preemptible_tries 48 | } 49 | output { 50 | File output_bam = "~{output_bam_basename}.bam" 51 | File output_bam_index = "~{output_bam_basename}.bai" 52 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 53 | } 54 | } 55 | 56 | # Sort BAM file by coordinate order -- using Spark! 
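# Note: this variant runs Spark in local mode on a single large VM
# (--spark-master 'local[16]' below), not on a distributed cluster.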
57 | task SortSamSpark { 58 | input { 59 | File input_bam 60 | String output_bam_basename 61 | Int preemptible_tries 62 | Int compression_level 63 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 64 | } 65 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 66 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 67 | Float sort_sam_disk_multiplier = 3.25 68 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 69 | 70 | command { 71 | set -e 72 | 73 | gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xms100g -Xmx100g" \ 74 | SortSamSpark \ 75 | -I ~{input_bam} \ 76 | -O ~{output_bam_basename}.bam \ 77 | -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' 78 | 79 | samtools index ~{output_bam_basename}.bam ~{output_bam_basename}.bai 80 | } 81 | runtime { 82 | docker: gatk_docker 83 | disks: "local-disk " + disk_size + " HDD" 84 | bootDiskSizeGb: "15" 85 | cpu: "16" 86 | memory: "102 GiB" 87 | preemptible: preemptible_tries 88 | } 89 | output { 90 | File output_bam = "~{output_bam_basename}.bam" 91 | File output_bam_index = "~{output_bam_basename}.bai" 92 | } 93 | } 94 | 95 | # Mark duplicate reads to avoid counting non-independent observations 96 | task MarkDuplicates { 97 | input { 98 | Array[File] input_bams 99 | String output_bam_basename 100 | String metrics_filename 101 | Float total_input_size 102 | Int compression_level 103 | Int preemptible_tries 104 | 105 | # The program default for READ_NAME_REGEX is appropriate in nearly every case. 106 | # Sometimes we wish to supply "null" in order to turn off optical duplicate detection 107 | # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing 108 | String? read_name_regex 109 | Int memory_multiplier = 1 110 | } 111 | 112 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 113 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom 114 | Float md_disk_multiplier = 3 115 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 116 | 117 | Int memory_size = ceil(8 * memory_multiplier) 118 | Int java_memory_size = (memory_size - 2) 119 | 120 | # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly 121 | # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 
122 | # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" 123 | 124 | command { 125 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/gitc/picard.jar \ 126 | MarkDuplicates \ 127 | INPUT=~{sep=' INPUT=' input_bams} \ 128 | OUTPUT=~{output_bam_basename}.bam \ 129 | METRICS_FILE=~{metrics_filename} \ 130 | VALIDATION_STRINGENCY=SILENT \ 131 | ~{"READ_NAME_REGEX=" + read_name_regex} \ 132 | OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ 133 | ASSUME_SORT_ORDER="queryname" \ 134 | CLEAR_DT="false" \ 135 | ADD_PG_TAG_TO_READS=false 136 | } 137 | runtime { 138 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 139 | preemptible: preemptible_tries 140 | memory: "~{memory_size} GiB" 141 | disks: "local-disk " + disk_size + " HDD" 142 | } 143 | output { 144 | File output_bam = "~{output_bam_basename}.bam" 145 | File duplicate_metrics = "~{metrics_filename}" 146 | } 147 | } 148 | 149 | task MarkDuplicatesSpark { 150 | input { 151 | Array[File] input_bams 152 | String output_bam_basename 153 | String metrics_filename 154 | Float total_input_size 155 | Int compression_level 156 | Int preemptible_tries 157 | 158 | String? read_name_regex 159 | Int memory_multiplier = 3 160 | Int cpu_size = 6 161 | } 162 | 163 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 164 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving 2.5 as wiggleroom 165 | Float md_disk_multiplier = 2.5 166 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 167 | 168 | Int memory_size = ceil(16 * memory_multiplier) 169 | Int java_memory_size = (memory_size - 6) 170 | 171 | String output_bam_location = "~{output_bam_basename}.bam" 172 | 173 | # Removed options ASSUME_SORT_ORDER, CLEAR_DT, and ADD_PG_TAG_TO_READS as it seems like they are a) not implemented 174 | # in MarkDuplicatesSpark, and/or b) are set to "false" aka "don't do" anyhow. 
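  # The runtime block below also mounts a LOCAL disk at /mnt/tmp and points spark.local.dir at it
  # for shuffle spill; the PAPIv2 note that follows presumably relates to this multi-disk setup.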
175 | # MarkDuplicatesSpark requires PAPIv2 176 | command <<< 177 | set -e 178 | export GATK_LOCAL_JAR=/root/gatk.jar 179 | gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xmx~{java_memory_size}g" \ 180 | MarkDuplicatesSpark \ 181 | --input ~{sep=' --input ' input_bams} \ 182 | --output ~{output_bam_location} \ 183 | --metrics-file ~{metrics_filename} \ 184 | --read-validation-stringency SILENT \ 185 | ~{"--read-name-regex " + read_name_regex} \ 186 | --optical-duplicate-pixel-distance 2500 \ 187 | --treat-unsorted-as-querygroup-ordered \ 188 | --create-output-bam-index false \ 189 | -- --conf spark.local.dir=/mnt/tmp --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' 190 | >>> 191 | 192 | runtime { 193 | docker: "jamesemery/gatknightly:gatkMasterSnapshot44ca2e9e84a" 194 | disks: "/mnt/tmp " + ceil(2.1 * total_input_size) + " LOCAL, local-disk " + disk_size + " HDD" 195 | bootDiskSizeGb: "50" 196 | cpu: cpu_size 197 | memory: "~{memory_size} GiB" 198 | preemptible: preemptible_tries 199 | } 200 | 201 | output { 202 | File output_bam = output_bam_location 203 | File duplicate_metrics = metrics_filename 204 | } 205 | } 206 | 207 | # Generate Base Quality Score Recalibration (BQSR) model 208 | task BaseRecalibrator { 209 | input { 210 | File input_bam 211 | String recalibration_report_filename 212 | Array[String] sequence_group_interval 213 | File dbsnp_vcf 214 | File dbsnp_vcf_index 215 | Array[File] known_indels_sites_vcfs 216 | Array[File] known_indels_sites_indices 217 | File ref_dict 218 | File ref_fasta 219 | File ref_fasta_index 220 | Int bqsr_scatter 221 | Int preemptible_tries 222 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 223 | } 224 | 225 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 226 | Float dbsnp_size = size(dbsnp_vcf, "GiB") 227 | Int disk_size = ceil((size(input_bam, "GiB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 228 | 229 | parameter_meta { 230 | input_bam: { 231 | localization_optional: true 232 | } 233 | } 234 | 235 | command { 236 | gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ 237 | -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ 238 | -Xloggc:gc_log.log -Xms4000m" \ 239 | BaseRecalibrator \ 240 | -R ~{ref_fasta} \ 241 | -I ~{input_bam} \ 242 | --use-original-qualities \ 243 | -O ~{recalibration_report_filename} \ 244 | --known-sites ~{dbsnp_vcf} \ 245 | --known-sites ~{sep=" -known-sites " known_indels_sites_vcfs} \ 246 | -L ~{sep=" -L " sequence_group_interval} 247 | } 248 | runtime { 249 | docker: gatk_docker 250 | preemptible: preemptible_tries 251 | memory: "6 GiB" 252 | disks: "local-disk " + disk_size + " HDD" 253 | } 254 | output { 255 | File recalibration_report = "~{recalibration_report_filename}" 256 | } 257 | } 258 | 259 | # Apply Base Quality Score Recalibration (BQSR) model 260 | task ApplyBQSR { 261 | input { 262 | File input_bam 263 | String output_bam_basename 264 | File recalibration_report 265 | Array[String] sequence_group_interval 266 | File ref_dict 267 | File ref_fasta 268 | File ref_fasta_index 269 | Int compression_level 270 | Int bqsr_scatter 271 | Int preemptible_tries 272 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 273 | Int memory_multiplier = 1 274 | } 275 | 276 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 277 | Int disk_size = ceil((size(input_bam, "GiB") * 3 / bqsr_scatter) + ref_size) + 20 278 | 
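  # Illustrative (hypothetical sizes): a 60 GiB input BAM with bqsr_scatter = 10 and ~5 GiB of
  # reference files gives disk_size = ceil(60 * 3 / 10 + 5) + 20 = 43 GiB per shard.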
279 | Int memory_size = ceil(3500 * memory_multiplier) 280 | 281 | parameter_meta { 282 | input_bam: { 283 | localization_optional: true 284 | } 285 | } 286 | 287 | command { 288 | gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ 289 | -XX:+PrintGCDetails -Xloggc:gc_log.log \ 290 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms3000m" \ 291 | ApplyBQSR \ 292 | --create-output-bam-md5 \ 293 | --add-output-sam-program-record \ 294 | -R ~{ref_fasta} \ 295 | -I ~{input_bam} \ 296 | --use-original-qualities \ 297 | -O ~{output_bam_basename}.bam \ 298 | -bqsr ~{recalibration_report} \ 299 | --static-quantized-quals 10 \ 300 | --static-quantized-quals 20 \ 301 | --static-quantized-quals 30 \ 302 | -L ~{sep=" -L " sequence_group_interval} 303 | } 304 | runtime { 305 | docker: gatk_docker 306 | preemptible: preemptible_tries 307 | memory: "~{memory_size} MiB" 308 | disks: "local-disk " + disk_size + " HDD" 309 | } 310 | output { 311 | File recalibrated_bam = "~{output_bam_basename}.bam" 312 | File recalibrated_bam_checksum = "~{output_bam_basename}.bam.md5" 313 | } 314 | } 315 | 316 | # Combine multiple recalibration tables from scattered BaseRecalibrator runs 317 | task GatherBqsrReports { 318 | input { 319 | Array[File] input_bqsr_reports 320 | String output_report_filename 321 | Int preemptible_tries 322 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 323 | } 324 | 325 | command { 326 | gatk --java-options "-Xms3000m" \ 327 | GatherBQSRReports \ 328 | -I ~{sep=' -I ' input_bqsr_reports} \ 329 | -O ~{output_report_filename} 330 | } 331 | runtime { 332 | docker: gatk_docker 333 | preemptible: preemptible_tries 334 | memory: "3500 MiB" 335 | disks: "local-disk 20 HDD" 336 | } 337 | output { 338 | File output_bqsr_report = "~{output_report_filename}" 339 | } 340 | } 341 | 342 | # Combine multiple *sorted* BAM files 343 | task GatherSortedBamFiles { 344 | input { 345 | Array[File] input_bams 346 | String output_bam_basename 347 | Float total_input_size 348 | Int compression_level 349 | Int preemptible_tries 350 | } 351 | 352 | # Multiply the input bam size by two to account for the input and output 353 | Int disk_size = ceil(2 * total_input_size) + 20 354 | 355 | command { 356 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ 357 | GatherBamFiles \ 358 | INPUT=~{sep=' INPUT=' input_bams} \ 359 | OUTPUT=~{output_bam_basename}.bam \ 360 | CREATE_INDEX=true \ 361 | CREATE_MD5_FILE=true 362 | } 363 | runtime { 364 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 365 | preemptible: preemptible_tries 366 | memory: "3 GiB" 367 | disks: "local-disk " + disk_size + " HDD" 368 | } 369 | output { 370 | File output_bam = "~{output_bam_basename}.bam" 371 | File output_bam_index = "~{output_bam_basename}.bai" 372 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 373 | } 374 | } 375 | 376 | # Combine multiple *unsorted* BAM files 377 | # Note that if/when WDL supports optional outputs, we should merge this task with the sorted version 378 | task GatherUnsortedBamFiles { 379 | input { 380 | Array[File] input_bams 381 | String output_bam_basename 382 | Float total_input_size 383 | Int compression_level 384 | Int preemptible_tries 385 | } 386 | 387 | # Multiply the input bam size by two to account for the input and output 388 | Int disk_size = ceil(2 * total_input_size) + 20 389 | 390 | command { 391 | java 
-Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ 392 | GatherBamFiles \ 393 | INPUT=~{sep=' INPUT=' input_bams} \ 394 | OUTPUT=~{output_bam_basename}.bam \ 395 | CREATE_INDEX=false \ 396 | CREATE_MD5_FILE=false 397 | } 398 | runtime { 399 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 400 | preemptible: preemptible_tries 401 | memory: "3 GiB" 402 | disks: "local-disk " + disk_size + " HDD" 403 | } 404 | output { 405 | File output_bam = "~{output_bam_basename}.bam" 406 | } 407 | } 408 | 409 | # Notes on the contamination estimate: 410 | # The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId 411 | # 412 | # In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM 413 | # 414 | # Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER 415 | # But first, it is divided by an underestimation factor thusly: 416 | # float(FREEMIX) / ContaminationUnderestimationFactor 417 | # where the denominator is hardcoded in Zamboni: 418 | # val ContaminationUnderestimationFactor = 0.75f 419 | # 420 | # Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted 421 | # contamination estimate for use in variant calling 422 | task CheckContamination { 423 | input { 424 | File input_bam 425 | File input_bam_index 426 | File contamination_sites_ud 427 | File contamination_sites_bed 428 | File contamination_sites_mu 429 | File ref_fasta 430 | File ref_fasta_index 431 | String output_prefix 432 | Int preemptible_tries 433 | Float contamination_underestimation_factor 434 | Boolean disable_sanity_check = false 435 | } 436 | 437 | Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 30 438 | 439 | command <<< 440 | set -e 441 | 442 | # creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. 
443 | # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values 444 | /usr/gitc/VerifyBamID \ 445 | --Verbose \ 446 | --NumPC 4 \ 447 | --Output ~{output_prefix} \ 448 | --BamFile ~{input_bam} \ 449 | --Reference ~{ref_fasta} \ 450 | --UDPath ~{contamination_sites_ud} \ 451 | --MeanPath ~{contamination_sites_mu} \ 452 | --BedPath ~{contamination_sites_bed} \ 453 | ~{true="--DisableSanityCheck" false="" disable_sanity_check} \ 454 | 1>/dev/null 455 | 456 | # used to read from the selfSM file and calculate contamination, which gets printed out 457 | python3 <>> 479 | runtime { 480 | preemptible: preemptible_tries 481 | memory: "4 GiB" 482 | disks: "local-disk " + disk_size + " HDD" 483 | docker: "us.gcr.io/broad-gotc-prod/verify-bam-id:c1cba76e979904eb69c31520a0d7f5be63c72253-1553018888" 484 | cpu: "2" 485 | } 486 | output { 487 | File selfSM = "~{output_prefix}.selfSM" 488 | Float contamination = read_float(stdout()) 489 | } 490 | } 491 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/BamToCram.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Local Import 4 | #import "Utilities.wdl" as Utils 5 | #import "Qc.wdl" as QC 6 | 7 | # Git URL Import 8 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 9 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 10 | 11 | workflow BamToCram { 12 | 13 | input { 14 | File input_bam 15 | File ref_fasta 16 | File ref_fasta_index 17 | File ref_dict 18 | File duplication_metrics 19 | File chimerism_metrics 20 | String base_file_name 21 | Int agg_preemptible_tries 22 | } 23 | 24 | 25 | # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation 26 | # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 
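  # For example (hypothetical sample), a duplication rate of 0.35 would exceed the 0.30 cutoff below,
  # so CheckPreValidation marks the data as outlier and ValidateCram skips mate validation.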
27 | Float max_duplication_in_reasonable_sample = 0.30 28 | Float max_chimerism_in_reasonable_sample = 0.15 29 | 30 | # Convert the final merged recalibrated BAM file to CRAM format 31 | call Utils.ConvertToCram as ConvertToCram { 32 | input: 33 | input_bam = input_bam, 34 | ref_fasta = ref_fasta, 35 | ref_fasta_index = ref_fasta_index, 36 | output_basename = base_file_name, 37 | preemptible_tries = agg_preemptible_tries 38 | } 39 | 40 | # Check whether the data has massively high duplication or chimerism rates 41 | call QC.CheckPreValidation as CheckPreValidation { 42 | input: 43 | duplication_metrics = duplication_metrics, 44 | chimerism_metrics = chimerism_metrics, 45 | max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, 46 | max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, 47 | preemptible_tries = agg_preemptible_tries 48 | } 49 | 50 | # Validate the CRAM file 51 | call QC.ValidateSamFile as ValidateCram { 52 | input: 53 | input_bam = ConvertToCram.output_cram, 54 | input_bam_index = ConvertToCram.output_cram_index, 55 | report_filename = base_file_name + ".cram.validation_report", 56 | ref_dict = ref_dict, 57 | ref_fasta = ref_fasta, 58 | ref_fasta_index = ref_fasta_index, 59 | ignore = ["MISSING_TAG_NM"], 60 | max_output = 1000000000, 61 | is_outlier_data = CheckPreValidation.is_outlier_data, 62 | preemptible_tries = agg_preemptible_tries 63 | } 64 | 65 | output { 66 | File output_cram = ConvertToCram.output_cram 67 | File output_cram_index = ConvertToCram.output_cram_index 68 | File output_cram_md5 = ConvertToCram.output_cram_md5 69 | File validate_cram_file_report = ValidateCram.report 70 | } 71 | } 72 | 73 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/GermlineVariantDiscovery.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | task HaplotypeCaller_GATK35_GVCF { 19 | input { 20 | File input_bam 21 | File interval_list 22 | String gvcf_basename 23 | File ref_dict 24 | File ref_fasta 25 | File ref_fasta_index 26 | Float? contamination 27 | Int preemptible_tries 28 | Int hc_scatter 29 | } 30 | 31 | parameter_meta { 32 | input_bam: { 33 | localization_optional: true 34 | } 35 | } 36 | 37 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 38 | Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 39 | 40 | # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around 41 | # the interval because the assembly uses them. 
42 | # 43 | # Using PrintReads is a temporary solution until we update HaploypeCaller to use GATK4. Once that is done, 44 | # HaplotypeCaller can stream the required intervals directly from the cloud. 45 | command { 46 | /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms2g" \ 47 | PrintReads \ 48 | -I ~{input_bam} \ 49 | --interval_padding 500 \ 50 | -L ~{interval_list} \ 51 | -O local.sharded.bam \ 52 | && \ 53 | java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m \ 54 | -jar /usr/gitc/GATK35.jar \ 55 | -T HaplotypeCaller \ 56 | -R ~{ref_fasta} \ 57 | -o ~{gvcf_basename}.vcf.gz \ 58 | -I local.sharded.bam \ 59 | -L ~{interval_list} \ 60 | -ERC GVCF \ 61 | --max_alternate_alleles 3 \ 62 | -variant_index_parameter 128000 \ 63 | -variant_index_type LINEAR \ 64 | -contamination ~{default=0 contamination} \ 65 | --read_filter OverclippedRead 66 | } 67 | runtime { 68 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" 69 | preemptible: preemptible_tries 70 | memory: "10 GiB" 71 | cpu: "1" 72 | disks: "local-disk " + disk_size + " HDD" 73 | } 74 | output { 75 | File output_gvcf = "~{gvcf_basename}.vcf.gz" 76 | File output_gvcf_index = "~{gvcf_basename}.vcf.gz.tbi" 77 | } 78 | } 79 | 80 | task HaplotypeCaller_GATK4_VCF { 81 | input { 82 | File input_bam 83 | File interval_list 84 | String vcf_basename 85 | File ref_dict 86 | File ref_fasta 87 | File ref_fasta_index 88 | Float? contamination 89 | Boolean make_gvcf 90 | Boolean make_bamout 91 | Int preemptible_tries 92 | Int hc_scatter 93 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 94 | } 95 | 96 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 97 | String output_file_name = vcf_basename + output_suffix 98 | 99 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 100 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 101 | 102 | String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" 103 | 104 | parameter_meta { 105 | input_bam: { 106 | localization_optional: true 107 | } 108 | } 109 | 110 | command <<< 111 | set -e 112 | gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ 113 | HaplotypeCaller \ 114 | -R ~{ref_fasta} \ 115 | -I ~{input_bam} \ 116 | -L ~{interval_list} \ 117 | -O ~{output_file_name} \ 118 | -contamination ~{default=0 contamination} \ 119 | -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ 120 | -new-qual \ 121 | -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ 122 | ~{true="-ERC GVCF" false="" make_gvcf} \ 123 | ~{bamout_arg} 124 | 125 | # Cromwell doesn't like optional task outputs, so we have to touch this file. 
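    # When make_bamout is false, bamout_arg above is empty and no bamout BAM is written;
    # the touch below creates an empty placeholder so the declared 'bamout' output always exists.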
126 | touch ~{vcf_basename}.bamout.bam 127 | >>> 128 | 129 | runtime { 130 | docker: gatk_docker 131 | preemptible: preemptible_tries 132 | memory: "6.5 GiB" 133 | cpu: "2" 134 | disks: "local-disk " + disk_size + " HDD" 135 | } 136 | 137 | output { 138 | File output_vcf = "~{output_file_name}" 139 | File output_vcf_index = "~{output_file_name}.tbi" 140 | File bamout = "~{vcf_basename}.bamout.bam" 141 | } 142 | } 143 | 144 | # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs 145 | task MergeVCFs { 146 | input { 147 | Array[File] input_vcfs 148 | Array[File] input_vcfs_indexes 149 | String output_vcf_name 150 | Int preemptible_tries 151 | } 152 | 153 | Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 154 | 155 | # Using MergeVcfs instead of GatherVcfs so we can create indices 156 | # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket 157 | command { 158 | java -Xms2000m -jar /usr/gitc/picard.jar \ 159 | MergeVcfs \ 160 | INPUT=~{sep=' INPUT=' input_vcfs} \ 161 | OUTPUT=~{output_vcf_name} 162 | } 163 | runtime { 164 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 165 | preemptible: preemptible_tries 166 | memory: "3 GiB" 167 | disks: "local-disk ~{disk_size} HDD" 168 | } 169 | output { 170 | File output_vcf = "~{output_vcf_name}" 171 | File output_vcf_index = "~{output_vcf_name}.tbi" 172 | } 173 | } 174 | 175 | task HardFilterVcf { 176 | input { 177 | File input_vcf 178 | File input_vcf_index 179 | String vcf_basename 180 | File interval_list 181 | Int preemptible_tries 182 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 183 | } 184 | 185 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 186 | String output_vcf_name = vcf_basename + ".filtered.vcf.gz" 187 | 188 | command { 189 | gatk --java-options "-Xms3000m" \ 190 | VariantFiltration \ 191 | -V ~{input_vcf} \ 192 | -L ~{interval_list} \ 193 | --filter-expression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ 194 | --filter-name "HardFiltered" \ 195 | -O ~{output_vcf_name} 196 | } 197 | output { 198 | File output_vcf = "~{output_vcf_name}" 199 | File output_vcf_index = "~{output_vcf_name}.tbi" 200 | } 201 | runtime { 202 | docker: gatk_docker 203 | preemptible: preemptible_tries 204 | memory: "3 GiB" 205 | disks: "local-disk " + disk_size + " HDD" 206 | } 207 | } 208 | 209 | task CNNScoreVariants { 210 | 211 | input { 212 | File? bamout 213 | File? 
bamout_index 214 | File input_vcf 215 | File input_vcf_index 216 | String vcf_basename 217 | File ref_fasta 218 | File ref_fasta_index 219 | File ref_dict 220 | Int preemptible_tries 221 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" 222 | } 223 | 224 | Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) 225 | 226 | String base_vcf = basename(input_vcf) 227 | Boolean is_compressed = basename(base_vcf, "gz") != base_vcf 228 | String vcf_suffix = if is_compressed then ".vcf.gz" else ".vcf" 229 | String vcf_index_suffix = if is_compressed then ".tbi" else ".idx" 230 | String output_vcf = base_vcf + ".scored" + vcf_suffix 231 | String output_vcf_index = output_vcf + vcf_index_suffix 232 | 233 | String bamout_param = if defined(bamout) then "-I ~{bamout}" else "" 234 | String tensor_type = if defined(bamout) then "read-tensor" else "reference" 235 | 236 | command { 237 | gatk --java-options -Xmx10g CNNScoreVariants \ 238 | -V ~{input_vcf} \ 239 | -R ~{ref_fasta} \ 240 | -O ~{output_vcf} \ 241 | ~{bamout_param} \ 242 | -tensor-type ~{tensor_type} 243 | } 244 | 245 | output { 246 | File scored_vcf = "~{output_vcf}" 247 | File scored_vcf_index = "~{output_vcf_index}" 248 | } 249 | 250 | runtime { 251 | docker: gatk_docker 252 | preemptible: preemptible_tries 253 | memory: "15 GiB" 254 | cpu: "2" 255 | disks: "local-disk " + disk_size + " HDD" 256 | } 257 | } 258 | 259 | task FilterVariantTranches { 260 | 261 | input { 262 | File input_vcf 263 | File input_vcf_index 264 | String vcf_basename 265 | Array[String] snp_tranches 266 | Array[String] indel_tranches 267 | File hapmap_resource_vcf 268 | File hapmap_resource_vcf_index 269 | File omni_resource_vcf 270 | File omni_resource_vcf_index 271 | File one_thousand_genomes_resource_vcf 272 | File one_thousand_genomes_resource_vcf_index 273 | File dbsnp_resource_vcf 274 | File dbsnp_resource_vcf_index 275 | String info_key 276 | Int preemptible_tries 277 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" 278 | } 279 | 280 | Int disk_size = ceil(size(hapmap_resource_vcf, "GiB") + 281 | size(omni_resource_vcf, "GiB") + 282 | size(one_thousand_genomes_resource_vcf, "GiB") + 283 | size(dbsnp_resource_vcf, "GiB") + 284 | (size(input_vcf, "GiB") * 2) 285 | ) + 20 286 | 287 | command { 288 | 289 | gatk --java-options -Xmx6g FilterVariantTranches \ 290 | -V ~{input_vcf} \ 291 | -O ~{vcf_basename}.filtered.vcf.gz \ 292 | ~{sep=" " prefix("--snp-tranche ", snp_tranches)} \ 293 | ~{sep=" " prefix("--indel-tranche ", indel_tranches)} \ 294 | --resource ~{hapmap_resource_vcf} \ 295 | --resource ~{omni_resource_vcf} \ 296 | --resource ~{one_thousand_genomes_resource_vcf} \ 297 | --resource ~{dbsnp_resource_vcf} \ 298 | --info-key ~{info_key} \ 299 | --create-output-variant-index true 300 | } 301 | 302 | output { 303 | File filtered_vcf = "~{vcf_basename}.filtered.vcf.gz" 304 | File filtered_vcf_index = "~{vcf_basename}.filtered.vcf.gz.tbi" 305 | } 306 | 307 | runtime { 308 | memory: "7 GiB" 309 | cpu: "2" 310 | disks: "local-disk " + disk_size + " HDD" 311 | preemptible: preemptible_tries 312 | docker: gatk_docker 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/SplitLargeReadGroup.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements a split of large readgroups for human whole-genome and 
exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment 19 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing 20 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs 22 | 23 | workflow SplitLargeReadGroup { 24 | input { 25 | File input_bam 26 | 27 | String bwa_commandline 28 | String bwa_version 29 | String output_bam_basename 30 | 31 | # reference_fasta.ref_alt is the .alt file from bwa-kit 32 | # (https://github.com/lh3/bwa/tree/master/bwakit), 33 | # listing the reference contigs that are "alternative". 34 | ReferenceFasta reference_fasta 35 | 36 | Int compression_level 37 | Int preemptible_tries 38 | Int reads_per_file = 48000000 39 | } 40 | 41 | call Alignment.SamSplitter as SamSplitter { 42 | input : 43 | input_bam = input_bam, 44 | n_reads = reads_per_file, 45 | preemptible_tries = preemptible_tries, 46 | compression_level = compression_level 47 | } 48 | 49 | scatter(unmapped_bam in SamSplitter.split_bams) { 50 | Float current_unmapped_bam_size = size(unmapped_bam, "GiB") 51 | String current_name = basename(unmapped_bam, ".bam") 52 | 53 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 54 | input: 55 | input_bam = unmapped_bam, 56 | bwa_commandline = bwa_commandline, 57 | output_bam_basename = current_name, 58 | reference_fasta = reference_fasta, 59 | bwa_version = bwa_version, 60 | compression_level = compression_level, 61 | preemptible_tries = preemptible_tries 62 | } 63 | 64 | Float current_mapped_size = size(SamToFastqAndBwaMemAndMba.output_bam, "GiB") 65 | } 66 | 67 | call Utils.SumFloats as SumSplitAlignedSizes { 68 | input: 69 | sizes = current_mapped_size, 70 | preemptible_tries = preemptible_tries 71 | } 72 | 73 | call Processing.GatherUnsortedBamFiles as GatherMonolithicBamFile { 74 | input: 75 | input_bams = SamToFastqAndBwaMemAndMba.output_bam, 76 | total_input_size = SumSplitAlignedSizes.total_size, 77 | output_bam_basename = output_bam_basename, 78 | preemptible_tries = preemptible_tries, 79 | compression_level = compression_level 80 | } 81 | output { 82 | File aligned_bam = GatherMonolithicBamFile.output_bam 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/UnmappedBamToAlignedBam.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline 
implements data processing according to the GATK Best Practices (June 2016) 6 | ## for human whole-genome and exome sequencing data. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment 20 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/SplitLargeReadGroup.wdl" as SplitRG 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 22 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing 23 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 24 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs 25 | 26 | # WORKFLOW DEFINITION 27 | workflow UnmappedBamToAlignedBam { 28 | input { 29 | SampleAndUnmappedBams sample_and_unmapped_bams 30 | GermlineSingleSampleReferences references 31 | PapiSettings papi_settings 32 | 33 | String cross_check_fingerprints_by 34 | File? haplotype_database_file 35 | Float lod_threshold 36 | String recalibrated_bam_basename 37 | } 38 | 39 | Float cutoff_for_large_rg_in_gb = 20.0 40 | 41 | String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" 42 | 43 | Int compression_level = 2 44 | 45 | # Get the version of BWA to include in the PG record in the header of the BAM produced 46 | # by MergeBamAlignment. 
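  # The version string returned here is passed to SamToFastqAndBwaMemAndMba (Alignment.wdl),
  # where MergeBamAlignment records it as PROGRAM_GROUP_VERSION in the BAM's @PG header.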
47 | call Alignment.GetBwaVersion 48 | 49 | # Get the size of the standard reference files as well as the additional reference files needed for BWA 50 | 51 | # Align flowcell-level unmapped input bams in parallel 52 | scatter (unmapped_bam in sample_and_unmapped_bams.flowcell_unmapped_bams) { 53 | 54 | Float unmapped_bam_size = size(unmapped_bam, "GiB") 55 | 56 | String unmapped_bam_basename = basename(unmapped_bam, sample_and_unmapped_bams.unmapped_bam_suffix) 57 | 58 | # QC the unmapped BAM 59 | call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { 60 | input: 61 | input_bam = unmapped_bam, 62 | metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", 63 | preemptible_tries = papi_settings.preemptible_tries 64 | } 65 | 66 | if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { 67 | # Split bam into multiple smaller bams, 68 | # map reads to reference and recombine into one bam 69 | call SplitRG.SplitLargeReadGroup as SplitRG { 70 | input: 71 | input_bam = unmapped_bam, 72 | bwa_commandline = bwa_commandline, 73 | bwa_version = GetBwaVersion.bwa_version, 74 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 75 | reference_fasta = references.reference_fasta, 76 | compression_level = compression_level, 77 | preemptible_tries = papi_settings.preemptible_tries 78 | } 79 | } 80 | 81 | if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { 82 | # Map reads to reference 83 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 84 | input: 85 | input_bam = unmapped_bam, 86 | bwa_commandline = bwa_commandline, 87 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 88 | reference_fasta = references.reference_fasta, 89 | bwa_version = GetBwaVersion.bwa_version, 90 | compression_level = compression_level, 91 | preemptible_tries = papi_settings.preemptible_tries 92 | } 93 | } 94 | 95 | File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SplitRG.aligned_bam]) 96 | 97 | Float mapped_bam_size = size(output_aligned_bam, "GiB") 98 | 99 | # QC the aligned but unsorted readgroup BAM 100 | # no reference as the input here is unsorted, providing a reference would cause an error 101 | call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { 102 | input: 103 | input_bam = output_aligned_bam, 104 | output_bam_prefix = unmapped_bam_basename + ".readgroup", 105 | preemptible_tries = papi_settings.preemptible_tries 106 | } 107 | } 108 | 109 | # Sum the read group bam sizes to approximate the aggregated bam size 110 | call Utils.SumFloats as SumFloats { 111 | input: 112 | sizes = mapped_bam_size, 113 | preemptible_tries = papi_settings.preemptible_tries 114 | } 115 | 116 | # MarkDuplicates and SortSam currently take too long for preemptibles if the input data is too large 117 | Float gb_size_cutoff_for_preemptibles = 110.0 118 | Boolean data_too_large_for_preemptibles = SumFloats.total_size > gb_size_cutoff_for_preemptibles 119 | 120 | # Aggregate aligned+merged flowcell BAM files and mark duplicates 121 | # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output 122 | # to avoid having to spend time just merging BAM files. 
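  # The per-readgroup aligned BAMs from the scatter above are passed as an array; inside the task
  # the ~{sep=' INPUT=' input_bams} expression expands to e.g. (hypothetical names)
  # INPUT=rg1.aligned.unsorted.bam INPUT=rg2.aligned.unsorted.bam on the Picard command line.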
123 | call Processing.MarkDuplicates as MarkDuplicates { 124 | input: 125 | input_bams = output_aligned_bam, 126 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.unsorted.duplicates_marked", 127 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".duplicate_metrics", 128 | total_input_size = SumFloats.total_size, 129 | compression_level = compression_level, 130 | preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries 131 | } 132 | 133 | # Sort aggregated+deduped BAM file and fix tags 134 | call Processing.SortSam as SortSampleBam { 135 | input: 136 | input_bam = MarkDuplicates.output_bam, 137 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicate_marked.sorted", 138 | compression_level = compression_level, 139 | preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries 140 | } 141 | 142 | Float agg_bam_size = size(SortSampleBam.output_bam, "GiB") 143 | 144 | if (defined(haplotype_database_file)) { 145 | # Check identity of fingerprints across readgroups 146 | call QC.CrossCheckFingerprints as CrossCheckFingerprints { 147 | input: 148 | input_bams = [ SortSampleBam.output_bam ], 149 | input_bam_indexes = [SortSampleBam.output_bam_index], 150 | haplotype_database_file = haplotype_database_file, 151 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".crosscheck", 152 | total_input_size = agg_bam_size, 153 | lod_threshold = lod_threshold, 154 | cross_check_by = cross_check_fingerprints_by, 155 | preemptible_tries = papi_settings.agg_preemptible_tries 156 | } 157 | } 158 | 159 | # Create list of sequences for scatter-gather parallelization 160 | call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { 161 | input: 162 | ref_dict = references.reference_fasta.ref_dict, 163 | preemptible_tries = papi_settings.preemptible_tries 164 | } 165 | 166 | # Estimate level of cross-sample contamination 167 | call Processing.CheckContamination as CheckContamination { 168 | input: 169 | input_bam = SortSampleBam.output_bam, 170 | input_bam_index = SortSampleBam.output_bam_index, 171 | contamination_sites_ud = references.contamination_sites_ud, 172 | contamination_sites_bed = references.contamination_sites_bed, 173 | contamination_sites_mu = references.contamination_sites_mu, 174 | ref_fasta = references.reference_fasta.ref_fasta, 175 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 176 | output_prefix = sample_and_unmapped_bams.base_file_name + ".preBqsr", 177 | preemptible_tries = papi_settings.agg_preemptible_tries, 178 | contamination_underestimation_factor = 0.75 179 | } 180 | 181 | # We need disk to localize the sharded input and output due to the scatter for BQSR. 182 | # If we take the number we are scattering by and reduce by 3 we will have enough disk space 183 | # to account for the fact that the data is not split evenly. 
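  # Illustrative (hypothetical count): 18 sequence groups give bqsr_divisor = 18 - 10 = 8;
  # with 11 or fewer groups the divisor is simply 1.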
184 | Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) 185 | Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 186 | Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 187 | 188 | # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel 189 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { 190 | # Generate the recalibration model by interval 191 | call Processing.BaseRecalibrator as BaseRecalibrator { 192 | input: 193 | input_bam = SortSampleBam.output_bam, 194 | recalibration_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 195 | sequence_group_interval = subgroup, 196 | dbsnp_vcf = references.dbsnp_vcf, 197 | dbsnp_vcf_index = references.dbsnp_vcf_index, 198 | known_indels_sites_vcfs = references.known_indels_sites_vcfs, 199 | known_indels_sites_indices = references.known_indels_sites_indices, 200 | ref_dict = references.reference_fasta.ref_dict, 201 | ref_fasta = references.reference_fasta.ref_fasta, 202 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 203 | bqsr_scatter = bqsr_divisor, 204 | preemptible_tries = papi_settings.agg_preemptible_tries 205 | } 206 | } 207 | 208 | # Merge the recalibration reports resulting from by-interval recalibration 209 | # The reports are always the same size 210 | call Processing.GatherBqsrReports as GatherBqsrReports { 211 | input: 212 | input_bqsr_reports = BaseRecalibrator.recalibration_report, 213 | output_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 214 | preemptible_tries = papi_settings.preemptible_tries 215 | } 216 | 217 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { 218 | # Apply the recalibration model by interval 219 | call Processing.ApplyBQSR as ApplyBQSR { 220 | input: 221 | input_bam = SortSampleBam.output_bam, 222 | output_bam_basename = recalibrated_bam_basename, 223 | recalibration_report = GatherBqsrReports.output_bqsr_report, 224 | sequence_group_interval = subgroup, 225 | ref_dict = references.reference_fasta.ref_dict, 226 | ref_fasta = references.reference_fasta.ref_fasta, 227 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 228 | bqsr_scatter = bqsr_divisor, 229 | compression_level = compression_level, 230 | preemptible_tries = papi_settings.agg_preemptible_tries 231 | } 232 | } 233 | 234 | # Merge the recalibrated BAM files resulting from by-interval recalibration 235 | call Processing.GatherSortedBamFiles as GatherBamFiles { 236 | input: 237 | input_bams = ApplyBQSR.recalibrated_bam, 238 | output_bam_basename = sample_and_unmapped_bams.base_file_name, 239 | total_input_size = agg_bam_size, 240 | compression_level = compression_level, 241 | preemptible_tries = papi_settings.agg_preemptible_tries 242 | } 243 | 244 | # Outputs that will be retained when execution is complete 245 | output { 246 | Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics 247 | 248 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf 249 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics 250 | Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf 251 | Array[File] unsorted_read_group_insert_size_metrics = 
CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics 252 | Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf 253 | Array[File] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics 254 | Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf 255 | Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics 256 | 257 | File? cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics 258 | 259 | File selfSM = CheckContamination.selfSM 260 | Float contamination = CheckContamination.contamination 261 | 262 | File duplicate_metrics = MarkDuplicates.duplicate_metrics 263 | File output_bqsr_reports = GatherBqsrReports.output_bqsr_report 264 | 265 | File output_bam = GatherBamFiles.output_bam 266 | File output_bam_index = GatherBamFiles.output_bam_index 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/Utilities.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines utility tasks used for processing of sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Generate sets of intervals for scatter-gathering over chromosomes 19 | task CreateSequenceGroupingTSV { 20 | input { 21 | File ref_dict 22 | Int preemptible_tries 23 | } 24 | # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. 25 | # It outputs to stdout where it is parsed into a wdl Array[Array[String]] 26 | # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] 27 | command <<< 28 | python <>> 63 | runtime { 64 | preemptible: preemptible_tries 65 | docker: "us.gcr.io/broad-gotc-prod/python:2.7" 66 | memory: "2 GiB" 67 | } 68 | output { 69 | Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") 70 | Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") 71 | } 72 | } 73 | 74 | # This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists 75 | # Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. 76 | # Thus we have the block of python to count the number of generated sub interval lists. 
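# A minimal sketch of that counting step, assuming only that the shards land under out/ as
# globbed by the task below (the command block in the task remains the authoritative version):
#   python3 -c 'import glob; print(len(glob.glob("out/*/*.interval_list")))'
# Printing the count to stdout is enough, because the task's outputs read it back with
# read_int(stdout()).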
77 | task ScatterIntervalList { 78 | input { 79 | File interval_list 80 | Int scatter_count 81 | Int break_bands_at_multiples_of 82 | } 83 | 84 | command <<< 85 | set -e 86 | mkdir out 87 | java -Xms1g -jar /usr/gitc/picard.jar \ 88 | IntervalListTools \ 89 | SCATTER_COUNT=~{scatter_count} \ 90 | SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ 91 | UNIQUE=true \ 92 | SORT=true \ 93 | BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ 94 | INPUT=~{interval_list} \ 95 | OUTPUT=out 96 | 97 | python3 <>> 108 | output { 109 | Array[File] out = glob("out/*/*.interval_list") 110 | Int interval_count = read_int(stdout()) 111 | } 112 | runtime { 113 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 114 | memory: "2 GiB" 115 | } 116 | } 117 | 118 | # Convert BAM file to CRAM format 119 | # Note that reading CRAMs directly with Picard is not yet supported 120 | task ConvertToCram { 121 | input { 122 | File input_bam 123 | File ref_fasta 124 | File ref_fasta_index 125 | String output_basename 126 | Int preemptible_tries 127 | } 128 | 129 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 130 | Int disk_size = ceil(2 * size(input_bam, "GiB") + ref_size) + 20 131 | 132 | command <<< 133 | set -e 134 | set -o pipefail 135 | 136 | samtools view -C -T ~{ref_fasta} ~{input_bam} | \ 137 | tee ~{output_basename}.cram | \ 138 | md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 139 | 140 | # Create REF_CACHE. Used when indexing a CRAM 141 | seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} 142 | export REF_PATH=: 143 | export REF_CACHE=./ref/cache/%2s/%2s/%s 144 | 145 | samtools index ~{output_basename}.cram 146 | >>> 147 | runtime { 148 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 149 | preemptible: preemptible_tries 150 | memory: "3 GiB" 151 | cpu: "1" 152 | disks: "local-disk " + disk_size + " HDD" 153 | } 154 | output { 155 | File output_cram = "~{output_basename}.cram" 156 | File output_cram_index = "~{output_basename}.cram.crai" 157 | File output_cram_md5 = "~{output_basename}.cram.md5" 158 | } 159 | } 160 | 161 | # Convert CRAM file to BAM format 162 | task ConvertToBam { 163 | input { 164 | File input_cram 165 | File ref_fasta 166 | File ref_fasta_index 167 | String output_basename 168 | } 169 | 170 | command <<< 171 | set -e 172 | set -o pipefail 173 | 174 | samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} 175 | 176 | samtools index ~{output_basename}.bam 177 | >>> 178 | runtime { 179 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 180 | preemptible: 3 181 | memory: "3 GiB" 182 | cpu: "1" 183 | disks: "local-disk 200 HDD" 184 | } 185 | output { 186 | File output_bam = "~{output_basename}.bam" 187 | File output_bam_index = "~{output_basename}.bam.bai" 188 | } 189 | } 190 | 191 | # Calculates sum of a list of floats 192 | task SumFloats { 193 | input { 194 | Array[Float] sizes 195 | Int preemptible_tries 196 | } 197 | 198 | command <<< 199 | python -c "print ~{sep="+" sizes}" 200 | >>> 201 | output { 202 | Float total_size = read_float(stdout()) 203 | } 204 | runtime { 205 | docker: "us.gcr.io/broad-gotc-prod/python:2.7" 206 | preemptible: preemptible_tries 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/VariantCalling-local.wdl: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Local Import 4 | import 
"tasks/GermlineVariantDiscovery.wdl" as Calling 5 | import "tasks/Qc.wdl" as QC 6 | import "tasks/Utilities.wdl" as Utils 7 | import "tasks/BamProcessing.wdl" as BamProcessing 8 | 9 | # Git URL Import 10 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 11 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 12 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 13 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as BamProcessing 14 | 15 | workflow VariantCalling { 16 | 17 | 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? contamination 23 | File input_bam 24 | File ref_fasta 25 | File ref_fasta_index 26 | File ref_dict 27 | File dbsnp_vcf 28 | File dbsnp_vcf_index 29 | String base_file_name 30 | String final_vcf_base_name 31 | Int agg_preemptible_tries 32 | Boolean make_gvcf = true 33 | Boolean make_bamout = false 34 | Boolean use_gatk3_haplotype_caller = false 35 | 36 | 37 | 38 | # Break the calling interval_list into sub-intervals 39 | # Perform variant calling on the sub-intervals, and then gather the results 40 | call Utils.ScatterIntervalList as ScatterIntervalList { 41 | input: 42 | interval_list = calling_interval_list, 43 | scatter_count = haplotype_scatter_count, 44 | break_bands_at_multiples_of = break_bands_at_multiples_of 45 | } 46 | 47 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 48 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 49 | # to account for the fact that the data is quite uneven across the shards. 
50 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 51 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 52 | 53 | # Call variants in parallel over WGS calling intervals 54 | scatter (scattered_interval_list in ScatterIntervalList.out) { 55 | 56 | if (use_gatk3_haplotype_caller) { 57 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 58 | input: 59 | input_bam = input_bam, 60 | interval_list = scattered_interval_list, 61 | gvcf_basename = base_file_name, 62 | ref_dict = ref_dict, 63 | ref_fasta = ref_fasta, 64 | ref_fasta_index = ref_fasta_index, 65 | contamination = contamination, 66 | preemptible_tries = agg_preemptible_tries, 67 | hc_scatter = hc_divisor 68 | } 69 | } 70 | 71 | if (!use_gatk3_haplotype_caller) { 72 | 73 | # Generate GVCF by interval 74 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 75 | input: 76 | contamination = contamination, 77 | input_bam = input_bam, 78 | interval_list = scattered_interval_list, 79 | vcf_basename = base_file_name, 80 | ref_dict = ref_dict, 81 | ref_fasta = ref_fasta, 82 | ref_fasta_index = ref_fasta_index, 83 | hc_scatter = hc_divisor, 84 | make_gvcf = make_gvcf, 85 | make_bamout = make_bamout, 86 | preemptible_tries = agg_preemptible_tries 87 | } 88 | 89 | # If bamout files were created, we need to sort and gather them into one bamout 90 | if (make_bamout) { 91 | call BamProcessing.SortSam as SortBamout { 92 | input: 93 | input_bam = HaplotypeCallerGATK4.bamout, 94 | output_bam_basename = final_vcf_base_name, 95 | preemptible_tries = agg_preemptible_tries, 96 | compression_level = 2 97 | } 98 | } 99 | } 100 | 101 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, HaplotypeCallerGATK4.output_vcf]) 102 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, HaplotypeCallerGATK4.output_vcf_index]) 103 | } 104 | 105 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 106 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 107 | call Calling.MergeVCFs as MergeVCFs { 108 | input: 109 | input_vcfs = vcfs_to_merge, 110 | input_vcfs_indexes = vcf_indices_to_merge, 111 | output_vcf_name = final_vcf_base_name + merge_suffix, 112 | preemptible_tries = agg_preemptible_tries 113 | } 114 | 115 | if (make_bamout) { 116 | call MergeBamouts { 117 | input: 118 | bams = select_all(SortBamout.output_bam), 119 | output_base_name = final_vcf_base_name 120 | } 121 | } 122 | 123 | # Validate the (g)VCF output of HaplotypeCaller 124 | call QC.ValidateVCF as ValidateVCF { 125 | input: 126 | input_vcf = MergeVCFs.output_vcf, 127 | input_vcf_index = MergeVCFs.output_vcf_index, 128 | dbsnp_vcf = dbsnp_vcf, 129 | dbsnp_vcf_index = dbsnp_vcf_index, 130 | ref_fasta = ref_fasta, 131 | ref_fasta_index = ref_fasta_index, 132 | ref_dict = ref_dict, 133 | calling_interval_list = calling_interval_list, 134 | is_gvcf = make_gvcf, 135 | preemptible_tries = agg_preemptible_tries 136 | } 137 | 138 | # QC the (g)VCF 139 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 140 | input: 141 | input_vcf = MergeVCFs.output_vcf, 142 | input_vcf_index = MergeVCFs.output_vcf_index, 143 | metrics_basename = final_vcf_base_name, 144 | dbsnp_vcf = dbsnp_vcf, 145 | dbsnp_vcf_index = dbsnp_vcf_index, 146 | ref_dict = ref_dict, 147 | evaluation_interval_list = evaluation_interval_list, 148 | is_gvcf = make_gvcf, 149 | preemptible_tries = agg_preemptible_tries 150 | } 151 | 152 | output { 153 | File vcf_summary_metrics = 
CollectVariantCallingMetrics.summary_metrics 154 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 155 | File output_vcf = MergeVCFs.output_vcf 156 | File output_vcf_index = MergeVCFs.output_vcf_index 157 | File? bamout = MergeBamouts.output_bam 158 | File? bamout_index = MergeBamouts.output_bam_index 159 | } 160 | } 161 | 162 | # This task is here because merging bamout files using Picard produces an error. 163 | task MergeBamouts { 164 | 165 | 166 | Array[File] bams 167 | String output_base_name 168 | 169 | 170 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 171 | 172 | command { 173 | samtools merge ${output_base_name}.bam ${sep=" " bams} 174 | samtools index ${output_base_name}.bam 175 | mv ${output_base_name}.bam.bai ${output_base_name}.bai 176 | } 177 | 178 | output { 179 | File output_bam = "${output_base_name}.bam" 180 | File output_bam_index = "${output_base_name}.bai" 181 | } 182 | 183 | runtime { 184 | docker: "biocontainers/samtools:1.3.1" 185 | memory: "4 GiB" 186 | disks: "local-disk ${disk_size} HDD" 187 | preemptible: 3 188 | cpu: 1 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/VariantCalling.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Local Import 4 | #import "../tasks/GermlineVariantDiscovery.wdl" as Calling 5 | #import "../tasks/Qc.wdl" as QC 6 | #import "../tasks/Utilities.wdl" as Utils 7 | #import "../tasks/BamProcessing.wdl" as BamProcessing 8 | 9 | # Git URL Import 10 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 11 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 12 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 13 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as BamProcessing 14 | 15 | workflow VariantCalling { 16 | 17 | input { 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? contamination 23 | File input_bam 24 | File ref_fasta 25 | File ref_fasta_index 26 | File ref_dict 27 | File dbsnp_vcf 28 | File dbsnp_vcf_index 29 | String base_file_name 30 | String final_vcf_base_name 31 | Int agg_preemptible_tries 32 | Boolean make_gvcf = true 33 | Boolean make_bamout = false 34 | Boolean use_gatk3_haplotype_caller = false 35 | } 36 | 37 | parameter_meta { 38 | make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details." 
39 | } 40 | 41 | # Break the calling interval_list into sub-intervals 42 | # Perform variant calling on the sub-intervals, and then gather the results 43 | call Utils.ScatterIntervalList as ScatterIntervalList { 44 | input: 45 | interval_list = calling_interval_list, 46 | scatter_count = haplotype_scatter_count, 47 | break_bands_at_multiples_of = break_bands_at_multiples_of 48 | } 49 | 50 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 51 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 52 | # to account for the fact that the data is quite uneven across the shards. 53 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 54 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 55 | 56 | # Call variants in parallel over WGS calling intervals 57 | scatter (scattered_interval_list in ScatterIntervalList.out) { 58 | 59 | if (use_gatk3_haplotype_caller) { 60 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 61 | input: 62 | input_bam = input_bam, 63 | interval_list = scattered_interval_list, 64 | gvcf_basename = base_file_name, 65 | ref_dict = ref_dict, 66 | ref_fasta = ref_fasta, 67 | ref_fasta_index = ref_fasta_index, 68 | contamination = contamination, 69 | preemptible_tries = agg_preemptible_tries, 70 | hc_scatter = hc_divisor 71 | } 72 | } 73 | 74 | if (!use_gatk3_haplotype_caller) { 75 | 76 | # Generate GVCF by interval 77 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 78 | input: 79 | contamination = contamination, 80 | input_bam = input_bam, 81 | interval_list = scattered_interval_list, 82 | vcf_basename = base_file_name, 83 | ref_dict = ref_dict, 84 | ref_fasta = ref_fasta, 85 | ref_fasta_index = ref_fasta_index, 86 | hc_scatter = hc_divisor, 87 | make_gvcf = make_gvcf, 88 | make_bamout = make_bamout, 89 | preemptible_tries = agg_preemptible_tries 90 | } 91 | 92 | # If bamout files were created, we need to sort and gather them into one bamout 93 | if (make_bamout) { 94 | call BamProcessing.SortSam as SortBamout { 95 | input: 96 | input_bam = HaplotypeCallerGATK4.bamout, 97 | output_bam_basename = final_vcf_base_name, 98 | preemptible_tries = agg_preemptible_tries, 99 | compression_level = 2 100 | } 101 | } 102 | } 103 | 104 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, HaplotypeCallerGATK4.output_vcf]) 105 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, HaplotypeCallerGATK4.output_vcf_index]) 106 | } 107 | 108 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 109 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 110 | call Calling.MergeVCFs as MergeVCFs { 111 | input: 112 | input_vcfs = vcfs_to_merge, 113 | input_vcfs_indexes = vcf_indices_to_merge, 114 | output_vcf_name = final_vcf_base_name + merge_suffix, 115 | preemptible_tries = agg_preemptible_tries 116 | } 117 | 118 | if (make_bamout) { 119 | call MergeBamouts { 120 | input: 121 | bams = select_all(SortBamout.output_bam), 122 | output_base_name = final_vcf_base_name 123 | } 124 | } 125 | 126 | # Validate the (g)VCF output of HaplotypeCaller 127 | call QC.ValidateVCF as ValidateVCF { 128 | input: 129 | input_vcf = MergeVCFs.output_vcf, 130 | input_vcf_index = MergeVCFs.output_vcf_index, 131 | dbsnp_vcf = dbsnp_vcf, 132 | dbsnp_vcf_index = dbsnp_vcf_index, 133 | ref_fasta = ref_fasta, 134 | ref_fasta_index = ref_fasta_index, 135 | ref_dict = ref_dict, 136 | 
calling_interval_list = calling_interval_list, 137 | is_gvcf = make_gvcf, 138 | preemptible_tries = agg_preemptible_tries 139 | } 140 | 141 | # QC the (g)VCF 142 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 143 | input: 144 | input_vcf = MergeVCFs.output_vcf, 145 | input_vcf_index = MergeVCFs.output_vcf_index, 146 | metrics_basename = final_vcf_base_name, 147 | dbsnp_vcf = dbsnp_vcf, 148 | dbsnp_vcf_index = dbsnp_vcf_index, 149 | ref_dict = ref_dict, 150 | evaluation_interval_list = evaluation_interval_list, 151 | is_gvcf = make_gvcf, 152 | preemptible_tries = agg_preemptible_tries 153 | } 154 | 155 | output { 156 | File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics 157 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 158 | File output_vcf = MergeVCFs.output_vcf 159 | File output_vcf_index = MergeVCFs.output_vcf_index 160 | File? bamout = MergeBamouts.output_bam 161 | File? bamout_index = MergeBamouts.output_bam_index 162 | } 163 | } 164 | 165 | # This task is here because merging bamout files using Picard produces an error. 166 | task MergeBamouts { 167 | 168 | input { 169 | Array[File] bams 170 | String output_base_name 171 | } 172 | 173 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 174 | 175 | command { 176 | samtools merge ~{output_base_name}.bam ~{sep=" " bams} 177 | samtools index ~{output_base_name}.bam 178 | mv ~{output_base_name}.bam.bai ~{output_base_name}.bai 179 | } 180 | 181 | output { 182 | File output_bam = "~{output_base_name}.bam" 183 | File output_bam_index = "~{output_base_name}.bai" 184 | } 185 | 186 | runtime { 187 | docker: "biocontainers/samtools:1.3.1" 188 | memory: "4 GiB" 189 | disks: "local-disk ~{disk_size} HDD" 190 | preemptible: 3 191 | cpu: 1 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.gcs.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_index": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai", 3 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_dict": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict", 4 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | 14 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.gcs.inputs.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "HaplotypeCallerGVCF.ref_index": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai", 3 | "HaplotypeCallerGVCF.ref_dict": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict", 4 | 
"HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | 14 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.local.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_index": "book/data/germline/ref/ref.fasta.fai", 3 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_dict": "book/data/germline/ref/ref.dict", 4 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "book/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "book/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "book/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "book/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow runs the HaplotypeCaller tool from GATK4 in GVCF mode 2 | ## on a single sample in BAM format. The execution of the HaplotypeCaller 3 | ## tool is parallelized using an intervals list file. The per-interval 4 | ## output GVCF files are then merged to produce a single GVCF file for 5 | ## the sample, which can then be used by the joint-discovery workflow 6 | ## according to the GATK Best Practices for germline short variant 7 | ## discovery. 
8 | 9 | version 1.0 10 | 11 | workflow ScatterHaplotypeCallerGVCF { 12 | 13 | input { 14 | File input_bam 15 | File input_bam_index 16 | File intervals_list 17 | } 18 | 19 | String output_basename = basename(input_bam, ".bam") 20 | 21 | Array[String] calling_intervals = read_lines(intervals_list) 22 | 23 | scatter(interval in calling_intervals) { 24 | call HaplotypeCallerGVCF { 25 | input: 26 | input_bam = input_bam, 27 | input_bam_index = input_bam_index, 28 | intervals = interval, 29 | gvcf_name = output_basename + ".scatter.g.vcf" 30 | } 31 | } 32 | call MergeVCFs { 33 | input: 34 | vcfs = HaplotypeCallerGVCF.output_gvcf, 35 | merged_vcf_name = output_basename + ".merged.g.vcf" 36 | } 37 | 38 | output { 39 | File output_gvcf = MergeVCFs.merged_vcf 40 | } 41 | } 42 | 43 | task HaplotypeCallerGVCF { 44 | 45 | input { 46 | String docker_image 47 | String java_opt 48 | 49 | File ref_fasta 50 | File ref_index 51 | File ref_dict 52 | File input_bam 53 | File input_bam_index 54 | String intervals 55 | String gvcf_name 56 | } 57 | 58 | command { 59 | gatk --java-options ${java_opt} HaplotypeCaller \ 60 | -R ${ref_fasta} \ 61 | -I ${input_bam} \ 62 | -O ${gvcf_name} \ 63 | -L ${intervals} \ 64 | -ERC GVCF 65 | } 66 | 67 | output { 68 | File output_gvcf = "${gvcf_name}" 69 | } 70 | 71 | runtime { 72 | docker: docker_image 73 | } 74 | } 75 | 76 | task MergeVCFs { 77 | 78 | input { 79 | String docker_image 80 | String java_opt 81 | 82 | Array[File] vcfs 83 | String merged_vcf_name 84 | } 85 | 86 | command { 87 | gatk --java-options ${java_opt} MergeVcfs \ 88 | -I ${sep=' -I ' vcfs} \ 89 | -O ${merged_vcf_name} 90 | } 91 | 92 | output { 93 | File merged_vcf = "${merged_vcf_name}" 94 | } 95 | 96 | runtime { 97 | docker: docker_image 98 | } 99 | } 100 | --------------------------------------------------------------------------------