├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── commands ├── 04-commands.txt ├── 05-commands.txt ├── 06-commands.txt ├── 07-commands.txt ├── 08-commands.txt ├── 09-commands.txt ├── 10-commands.txt └── 12-commands.txt ├── config ├── empty.options.json └── google.conf ├── figures ├── .keep └── README.md ├── metadata ├── README.md ├── book_sample-metadata.tsv └── workspace-metadata.tsv ├── notebooks ├── Basic-genomics-notebook.ipynb ├── README.md ├── Working-IGV-example.ipynb ├── install_GATK_4130_with_igv.sh ├── notebooks_Genomics-Notebook-executed.ipynb ├── notebooks_Genomics-Notebook.ipynb └── plotting.R ├── production ├── README.md ├── notebook_images │ ├── Genomics-Notebook-executed-igv.ipynb │ ├── cell_27.png │ ├── cell_28.png │ ├── cell_29.png │ ├── cell_32.png │ ├── cell_35.png │ ├── cell_36.png │ ├── cell_37.png │ └── cell_39.png └── pygments_lexer │ ├── README.md │ ├── hello-world.wdl │ ├── run_wdl_lexer.sh │ ├── test.html │ └── wdl_lexer.py ├── temp └── 05-plotting.R └── workflows ├── README.md ├── hello-hc ├── hc-break1.wdl ├── hc-break2.wdl ├── hello-haplotypecaller.inputs.json └── hello-haplotypecaller.wdl ├── hello-world ├── hello-world-again.wdl ├── hello-world-var.wdl ├── hello-world.inputs.json └── hello-world.wdl ├── mystery-1 ├── haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json └── haplotypecaller-gvcf-gatk4.wdl ├── mystery-2 ├── WholeGenomeGermlineSingleSample.hg38.inputs.json ├── WholeGenomeGermlineSingleSample.wdl ├── structs │ └── GermlineStructs.wdl └── tasks │ ├── AggregatedBamQC.wdl │ ├── Alignment.wdl │ ├── BamProcessing.wdl │ ├── BamToCram.wdl │ ├── GermlineVariantDiscovery.wdl │ ├── Qc.wdl │ ├── SplitLargeReadGroup.wdl │ ├── UnmappedBamToAlignedBam.wdl │ ├── Utilities.wdl │ ├── VariantCalling-local.wdl │ └── VariantCalling.wdl └── scatter-hc ├── scatter-haplotypecaller.gcs.inputs.json ├── scatter-haplotypecaller.gcs.inputs.test.json ├── scatter-haplotypecaller.local.inputs.json └── scatter-haplotypecaller.wdl /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | pygments_lexer/html_output 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Broad Institute 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # genomics-in-the-cloud 2 | 3 | Source code and related materials for Genomics in the Cloud, an O'Reilly book by [Geraldine A. Van der Auwera](https://www.linkedin.com/in/geraldine-van-der-auwera-5a5811) and [Brian D. O'Connor](https://www.linkedin.com/in/briandoconnor/). 4 | 5 | You can find the book in the O'Reilly Learning Library at [https://oreil.ly/genomics-cloud](https://oreil.ly/genomics-cloud), on Amazon ([Kindle](https://www.amazon.com/Genomics-Cloud-Using-Docker-Terra-ebook-dp-B086Q7D47V/dp/B086Q7D47V/) or [paperback](https://www.amazon.com/Genomics-Cloud-GATK-Spark-Docker-dp-1491975199/dp/1491975199/)), and in both ebook and print formats from a variety of other booksellers. We do encourage you to get it through your local independent bookstore if you’re able. 6 | 7 | ## Book overview 8 | 9 | Data in the genomics field is booming. In just a few years, organizations such as the National Institutes of Health (NIH) will host 50+ petabytes—or 50 million gigabytes—of genomic data, and they’re turning to cloud infrastructure to make that data available to the research community. How do you adapt analysis tools and protocols to access and analyze that data in the cloud? 10 | 11 | With this practical book, researchers will learn how to work with genomics algorithms using open source tools including the Genome Analysis Toolkit (GATK), Docker, WDL, and Terra. Geraldine Van der Auwera, longtime custodian of the GATK user community, and Brian O’Connor of the UC Santa Cruz Genomics Institute guide you through the process. You’ll learn by working with real data and genomics algorithms from the field. 12 | 13 | This book takes you through: 14 | 15 | - Essential genomics and computing technology background 16 | - Basic cloud computing operations 17 | - Getting started with GATK 18 | - Three major GATK Best Practices pipelines for variant discovery 19 | - Automating analysis with scripted workflows using WDL and Cromwell 20 | - Scaling up workflow execution in the cloud, including parallelization and cost optimization 21 | - Interactive analysis in the cloud using Jupyter notebooks 22 | - Secure collaboration and computational reproducibility using Terra 23 | 24 | For more information about the book and why you might find it useful, please see the [Genomics in the Cloud blog](https://broadinstitute.github.io/genomics-in-the-cloud). 25 | 26 | ---- 27 | 28 | ## Resources 29 | 30 | ### List of commands 31 | 32 | See the [commands](commands/) folder for text files that let you easily copy and paste the commands from the hands-on exercises. 
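For example, assuming you have `git` available, you can clone this repository (using the same URL as in the Chapter 4 exercises) and page through a chapter's command file while you read:

```bash
# Clone the book's code repository (adjust the destination directory to taste)
git clone https://github.com/broadinstitute/genomics-in-the-cloud.git

# Browse the Chapter 4 commands so you can copy and paste them as you go
less genomics-in-the-cloud/commands/04-commands.txt
```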
33 | 34 | ### Figures and semi-official companion booklet 35 | 36 | For those of you reading the print version of the book, which does not include color figures, we've made the figures available in full color in the [figures](https://console.cloud.google.com/storage/browser/genomics-in-the-cloud/figures/) directory of the GCS bucket. 37 | You may use all figures except 3-3 and 6-15 in your own non-commercial work, preferably with a notice of attribution referring to the book. For commercial use, please contact permissions@oreilly.com. Figures 3-3 and 6-15 do not belong to us, so you must request permission from their respective owners, which are noted in the book. 38 | 39 | We also put together a [companion booklet](https://console.cloud.google.com/storage/browser/_details/genomics-in-the-cloud/figures/Genomics_in_the_Cloud___Figures_Booklet.pdf) that contains the figures and their captions for more convenient browsing or printing. It's "semi-official" in the sense that we created and maintain it, but it is not published by O'Reilly, so it does not go through their quality control process. Think of it as an artisanal, locally sourced side dish. 40 | 41 | ### Blog 42 | 43 | We have a blog for the book at [https://broadinstitute.github.io/genomics-in-the-cloud/](https://broadinstitute.github.io/genomics-in-the-cloud/) where we cover various topics including additional tutorials, errata for the book, and regular updates on new features that you may be interested in. Feel free to suggest blog topics by reaching out to us on Twitter or LinkedIn (see contact info below). 44 | 45 | ### Reporting errors 46 | 47 | If you encounter errors or broken links in the book, please file an issue on O'Reilly's [Errata page](https://www.oreilly.com/catalog/errata.csp?isbn=0636920058182). Anything reported there that we can verify will get fixed and updated in both the electronic versions and subsequent printing runs of the book, so others won't run into the same problems. 48 | 49 | *We don't use GitHub Issues for this project to avoid confusion and redundancy with the O'Reilly Errata page.* 50 | 51 | ### Getting help 52 | 53 | If you run into problems while working through the hands-on exercises, or if you have follow-up questions about the topics we discuss in the book, please post your questions in either the [GATK forum](https://gatk.broadinstitute.org/hc/en-us/community/topics) or the [Terra forum](https://support.terra.bio/hc/en-us/community/topics). The frontline support team will most likely be able to address your questions, and for anything else they will loop us into the conversation if you mention that your question is related to our book. If you're not sure which forum to use, just flip a coin; it's the same team that maintains both communities. 54 | 55 | Remember also that you can often save yourself some time by searching the [GATK documentation](https://gatk.broadinstitute.org/hc/en-us) or [Terra documentation](https://support.terra.bio/hc/en-us) before posting a question -- that way you don't have to wait for someone to get back to you. 56 | 57 | ### Getting in touch with us 58 | If you'd like to get in touch, you can reach us on Twitter ([@VdAGeraldine](https://twitter.com/VdaGeraldine) and [@boconnor](https://twitter.com/boconnor)) and on LinkedIn ([Geraldine](https://www.linkedin.com/in/geraldine-van-der-auwera-5a5811) and [Brian](https://www.linkedin.com/in/briandoconnor/)). We look forward to hearing what you think of the book! If you like it, please consider posting a review on Amazon.
59 | 60 | 61 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /commands/04-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 4. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ gcloud config set project ferrous-layout-260200 20 | 21 | $ ls 22 | 23 | $ cat README-cloudshell.txt 24 | 25 | $ gsutil ls gs://genomics-in-the-cloud 26 | 27 | $ gsutil cat gs://genomics-in-the-cloud/hello.txt 28 | 29 | $ gsutil cp gs://genomics-in-the-cloud/hello.txt . 30 | 31 | $ ls 32 | 33 | $ gsutil mb gs://my-bucket 34 | 35 | $ export BUCKET="gs://my-bucket" 36 | 37 | $ echo $BUCKET 38 | 39 | $ gsutil cp gs://genomics-in-the-cloud/hello.txt $BUCKET/ 40 | 41 | $ gsutil cp hello.txt $BUCKET/ 42 | 43 | $ gsutil cp $BUCKET/hello.txt $BUCKET/my-directory/ 44 | 45 | $ docker pull ubuntu 46 | 47 | $ docker run ubuntu 48 | 49 | $ docker run ubuntu echo "Hello World!" 50 | 51 | $ docker run -it ubuntu /bin/bash 52 | 53 | $ docker ps -a 54 | 55 | $ mkdir book 56 | 57 | $ mv hello.txt book/ 58 | 59 | $ ls book 60 | 61 | $ docker run -v ~/book:/home/book -it ubuntu /bin/bash 62 | 63 | # ls home/book 64 | 65 | $ gcloud init 66 | 67 | $ mkdir ~/book 68 | 69 | $ gsutil -m cp -r gs://genomics-in-the-cloud/v1/* ~/book/ 70 | 71 | $ cd ~/book 72 | 73 | $ git clone https://github.com/broadinstitute/genomics-in-the-cloud.git code 74 | 75 | $ cd ~/book/code 76 | 77 | $ git pull 78 | 79 | $ docker 80 | 81 | $ curl -sSL https://get.docker.com/ | sh 82 | 83 | $ sudo usermod -aG docker $USER 84 | 85 | $ exit 86 | 87 | $ docker pull us.gcr.io/broad-gatk/gatk:4.1.3.0 88 | 89 | $ docker run -v ~/book:/home/book -it us.gcr.io/broad-gatk/gatk:4.1.3.0 /bin/bash 90 | 91 | # gatk 92 | -------------------------------------------------------------------------------- /commands/05-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 5. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 
16 | 17 | Commands: 18 | 19 | $ java -jar program.jar [program arguments] 20 | 21 | $ gatk ToolName [tool arguments] 22 | 23 | $ java -Xmx4G -XX:+PrintGCDetails -jar program.jar [program arguments] 24 | 25 | $ gatk --java-options "-Xmx4G -XX:+PrintGCDetails" ToolName [tool arguments] 26 | 27 | gatk MySparkTool \ 28 | -R data/reference.fasta \ 29 | -I data/sample1.bam \ 30 | -O data/variants.vcf \ 31 | -- \ 32 | --spark-master 'local[4]' 33 | 34 | --spark-runner SPARK --spark-master spark://23.195.26.187:7077 35 | 36 | --spark-runner GCS --cluster my_cluster 37 | 38 | $ docker run -v ~/book:/home/book -it us.gcr.io/broad-gatk/gatk:4.1.3.0 /bin/bash 39 | 40 | # ls 41 | 42 | # gatk 43 | 44 | # gatk HaplotypeCaller --help 45 | 46 | # cd /home/book/data/germline 47 | # mkdir sandbox 48 | 49 | # gatk HaplotypeCaller \ 50 | -R ref/ref.fasta \ 51 | -I bams/mother.bam \ 52 | -O sandbox/mother_variants.vcf 53 | 54 | # gatk ValidateSamFile \ 55 | -R ref/ref.fasta \ 56 | -I bams/mother.bam \ 57 | -O sandbox/mother_validation.txt 58 | 59 | # gatk HaplotypeCaller \ 60 | -R ref/ref.fasta \ 61 | -I bams/mother.bam \ 62 | -O sandbox/mother_variants.200k.vcf \ 63 | -L 20:10,000,000-10,200,000 64 | 65 | $ cd ~/book/data/germline/sandbox 66 | 67 | $ export BUCKET="gs://my-bucket" 68 | 69 | $ echo $BUCKET 70 | 71 | $ gsutil cp mother_variants.200k.vcf* $BUCKET/germline-sandbox/ 72 | 73 | # gatk HaplotypeCaller \ 74 | -R ref/ref.fasta \ 75 | -I bams/mother.bam \ 76 | -O sandbox/mother_variants.snippet.debug.vcf \ 77 | -bamout sandbox/mother_variants.snippet.debug.bam \ 78 | -L 20:10,002,000-10,003,000 79 | 80 | # zcat vcfs/motherSNP.vcf.gz | grep -v '##' | head -3 81 | 82 | # zcat vcfs/motherSNP.giab.vcf.gz | grep -v '##' | head -3 83 | 84 | # gatk VariantFiltration \ 85 | -R ref/ref.fasta \ 86 | -V vcfs/motherSNP.vcf.gz \ 87 | --filter-expression "QD < 2.0" \ 88 | --filter-name "QD2" \ 89 | -O sandbox/motherSNP.QD2.vcf.gz 90 | 91 | # zcat sandbox/motherSNP.QD2.vcf.gz | grep -v '##' | head -3 92 | 93 | # gatk VariantFiltration \ 94 | -R ref/ref.fasta \ 95 | -V vcfs/motherSNP.vcf.gz \ 96 | --filter-expression "QD < 2.0 || DP > 100.0" \ 97 | --filter-name "lowQD_highDP" \ 98 | -O sandbox/motherSNP.QD2.DP100.vcf.gz 99 | -------------------------------------------------------------------------------- /commands/06-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 6. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them.
16 | 17 | Commands: 18 | 19 | bwa mem -M -t 7 -p reference.fasta unmapped_reads.fq > mapped_reads.sam 20 | 21 | gatk MarkDuplicates \ 22 | -R reference.fasta \ 23 | -I mapped_reads_rg1.bam \ 24 | -I mapped_reads_rg2.bam \ 25 | -I mapped_reads_rg3.bam \ 26 | -O sample_markdups.bam 27 | 28 | gatk BaseRecalibrator \ 29 | -R reference.fasta \ 30 | -I sample_markdups.bam \ 31 | --known-sites known_variation.vcf \ 32 | -O recal_data.table 33 | 34 | gatk ApplyBQSR \ 35 | -R reference.fasta \ 36 | -I sample_markdups.bam \ 37 | --bqsr-recal-file recal_data.table \ 38 | -O sample_markdups_recal.bam 39 | 40 | # gatk HaplotypeCaller \ 41 | -R ref/ref.fasta \ 42 | -I bams/mother.bam \ 43 | -O sandbox/mother_variants.200k.g.vcf.gz \ 44 | -L 20:10,000,000-10,200,000 \ 45 | -ERC GVCF 46 | 47 | # zcat sandbox/mother_variants.200k.g.vcf.gz | grep -v '##' | head -3 48 | 49 | # gatk GenomicsDBImport \ 50 | -V gvcfs/mother.g.vcf.gz \ 51 | -V gvcfs/father.g.vcf.gz \ 52 | --genomicsdb-workspace-path sandbox/trio-gdb \ 53 | --intervals 20:10,000,000-10,200,000 54 | 55 | # gatk SelectVariants \ 56 | -R ref/ref.fasta \ 57 | -V gendb://sandbox/trio-gdb \ 58 | -O sandbox/duo_selectvariants.g.vcf.gz 59 | 60 | $ zcat sandbox/duo_selectvariants.g.vcf.gz | grep -v '##' | head -3 61 | 62 | # gatk GenomicsDBImport \ 63 | -V gvcfs/son.g.vcf.gz \ 64 | --genomicsdb-update-workspace-path sandbox/trio-gdb 65 | 66 | # gatk SelectVariants \ 67 | -R ref/ref.fasta \ 68 | -V gendb://sandbox/trio-gdb \ 69 | -O sandbox/trio_selectvariants.g.vcf.gz 70 | 71 | $ zcat sandbox/trio_selectvariants.g.vcf.gz | grep -v '##' | head -3 72 | 73 | # gatk GenotypeGVCFs \ 74 | -R ref/ref.fasta \ 75 | -V gendb://sandbox/trio-gdb \ 76 | -O sandbox/trio-jointcalls.vcf.gz \ 77 | -L 20:10,000,000-10,200,000 78 | 79 | # zcat sandbox/trio-jointcalls.vcf.gz | grep -v '##' | head -3 80 | 81 | # gatk HaplotypeCaller \ 82 | -R ref/ref.fasta \ 83 | -I bams/mother.bam \ 84 | -I bams/father.bam \ 85 | -I bams/son.bam \ 86 | -O sandbox/trio_jointcalls_hc.vcf.gz \ 87 | -L 20:10,000,000-10,200,000 88 | 89 | gatk VariantRecalibrator \ 90 | -R reference.fasta \ 91 | -V jointcalls_hc.vcf.gz \ 92 | --resource:hapmap,known=false,training=true,truth=true,prior=15.0 \ 93 | hapmap_sites.vcf.gz \ 94 | --resource:omni,known=false,training=true,truth=false,prior=12.0 \ 95 | 1000G_omni2.5.sites.vcf.gz \ 96 | --resource:1000G,known=false,training=true,truth=false,prior=10.0 \ 97 | 1000G_phase1.snps.high_conf.vcf.gz \ 98 | --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf.gz \ 99 | -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \ 100 | -mode SNP \ 101 | -O output.recal \ 102 | --tranches-file output.tranches 103 | 104 | gatk ApplyVQSR \ 105 | -R reference.fasta \ 106 | -V jointcalls_hc.vcf.gz \ 107 | -O jointcalls_filtered.vcf.gz \ 108 | --truth-sensitivity-filter-level 99.9 \ 109 | --tranches-file output.tranches \ 110 | --recal-file output.recal \ 111 | -mode SNP 112 | 113 | # gatk CalculateGenotypePosteriors \ 114 | -V sandbox/trio-jointcalls.vcf.gz \ 115 | -ped resources/trio-pedigree.ped \ 116 | --supporting-callsets resources/af-only-gnomad.vcf.gz \ 117 | -O sandbox/trio-refined.vcf.gz 118 | 119 | # cd ../cnn 120 | # mkdir sandbox 121 | # ls 122 | 123 | # gatk CNNScoreVariants \ 124 | -R ref/Homo_sapiens_assembly19.fasta \ 125 | -V vcfs/g94982_b37_chr20_1m_15871.vcf.gz \ 126 | -O sandbox/my_1d_cnn_scored.vcf 127 | 128 | 129 | # gatk FilterVariantTranches \ 130 | -V sandbox/my_1d_cnn_scored.vcf \ 131 | -O sandbox/my_1d_cnn_filtered.vcf 
\ 132 | --resource resources/1000G_omni2.5.b37.vcf.gz \ 133 | --resource resources/hapmap_3.3.b37.vcf.gz \ 134 | --info-key CNN_1D \ 135 | --snp-tranche 99.9 \ 136 | --indel-tranche 95.0 137 | 138 | # cat sandbox/my_1d_cnn_filtered.vcf | grep -v '##' | head -3 139 | 140 | # gatk CNNScoreVariants \ 141 | -R ref/Homo_sapiens_assembly19.fasta \ 142 | -I bams/g94982_chr20_1m_10m_bamout.bam \ 143 | -V vcfs/g94982_b37_chr20_1m_895.vcf \ 144 | -O sandbox/my_2d_cnn_scored.vcf \ 145 | --tensor-type read_tensor \ 146 | --transfer-batch-size 8 \ 147 | --inference-batch-size 8 148 | 149 | # gatk FilterVariantTranches \ 150 | -V sandbox/my_2d_cnn_scored.vcf \ 151 | -O sandbox/my_2d_cnn_filtered.vcf \ 152 | --resource resources/1000G_omni2.5.b37.vcf.gz \ 153 | --resource resources/hapmap_3.3.b37.vcf.gz \ 154 | --info-key CNN_2D \ 155 | --snp-tranche 99.9 \ 156 | --indel-tranche 95.0 157 | 158 | # cat sandbox/my_2d_cnn_filtered.vcf | grep -v '##' | head -3 159 | -------------------------------------------------------------------------------- /commands/07-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 7. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | gatk Mutect2 \ 20 | -R reference.fasta \ 21 | -I normal_1.bam \ 22 | -O normal_1.vcf.gz \ 23 | --max-mnp-distance 0 24 | 25 | gatk GenomicsDBImport \ 26 | -R reference.fasta \ 27 | -L intervals.interval_list \ 28 | -V normal_1.vcf.gz \ 29 | -V normal_2.vcf.gz \ 30 | -V normal_3.vcf.gz \ 31 | --genomicsdb-workspace-path pon_db 32 | 33 | gatk CreateSomaticPanelOfNormals \ 34 | -R reference.fasta \ 35 | -V gendb://pon_db \ 36 | --germline-resource af-only-gnomad.vcf.gz \ 37 | -O pon.vcf.gz 38 | 39 | # zcat resources/chr17_m2pon.vcf.gz | grep -v '##' | head -3 40 | 41 | # cd /home/book/data/somatic 42 | # mkdir sandbox 43 | 44 | # gatk Mutect2 \ 45 | -R ref/Homo_sapiens_assembly38.fasta \ 46 | -I bams/tumor.bam \ 47 | -I bams/normal.bam \ 48 | -normal HCC1143_normal \ 49 | -L resources/chr17plus.interval_list \ 50 | -pon resources/chr17_m2pon.vcf.gz \ 51 | --germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \ 52 | -bamout sandbox/m2_tumor_normal.bam \ 53 | -O sandbox/m2_somatic_calls.vcf.gz 54 | 55 | # gatk GetPileupSummaries \ 56 | -I bams/normal.bam \ 57 | -V resources/chr17_small_exac_common_3_grch38.vcf.gz \ 58 | -L resources/chr17_small_exac_common_3_grch38.vcf.gz \ 59 | -O sandbox/normal_getpileupsummaries.table 60 | 61 | # gatk GetPileupSummaries \ 62 | -I bams/tumor.bam \ 63 | -V resources/chr17_small_exac_common_3_grch38.vcf.gz \ 64 | -L resources/chr17_small_exac_common_3_grch38.vcf.gz \ 65 | -O sandbox/tumor_getpileupsummaries.table 66 | 67 | # head -5 sandbox/normal_getpileupsummaries.table 68 | 69 | # head -5 sandbox/tumor_getpileupsummaries.table 70 | 71 | # gatk CalculateContamination \ 72 | -I sandbox/tumor_getpileupsummaries.table \ 73 | -matched sandbox/normal_getpileupsummaries.table \ 74 | 
-tumor-segmentation sandbox/segments.table \ 75 | -O sandbox/pair_calculatecontamination.table 76 | 77 | $ cat sandbox/pair_calculatecontamination.table 78 | 79 | # gatk FilterMutectCalls \ 80 | -R ref/Homo_sapiens_assembly38.fasta \ 81 | -V sandbox/m2_somatic_calls.vcf.gz \ 82 | --contamination-table sandbox/pair_calculatecontamination.table \ 83 | -O sandbox/m2_somatic_calls.filtered.vcf.gz \ 84 | --stats sandbox/m2_somatic_calls.vcf.gz.stats \ 85 | --tumor-segmentation sandbox/segments.table 86 | 87 | # gatk Funcotator \ 88 | --data-sources-path resources/funcotator_dataSources_GATK_Workshop_20181205/ \ 89 | --ref-version hg38 \ 90 | -R ref/Homo_sapiens_assembly38.fasta \ 91 | -V sandbox/m2_somatic_calls.filtered.vcf.gz \ 92 | -O sandbox/m2_somatic_calls.funcotated.vcf.gz \ 93 | --output-file-format VCF 94 | 95 | # zcat sandbox/m2_somatic_calls.funcotated.vcf.gz | grep -v '##' | head -3 96 | 97 | # zcat sandbox/m2_somatic_calls.funcotated.vcf.gz | grep 7674220 98 | 99 | # gatk PreprocessIntervals \ 100 | -R ref/Homo_sapiens_assembly38.fasta \ 101 | -L resources/targets_chr17.interval_list \ 102 | -O sandbox/targets_chr17.preprocessed.interval_list \ 103 | --padding 250 \ 104 | --bin-length 0 \ 105 | --interval-merging-rule OVERLAPPING_ONLY 106 | 107 | # gatk CollectReadCounts \ 108 | -I bams/tumor.bam \ 109 | -L sandbox/targets_chr17.preprocessed.interval_list \ 110 | -R ref/Homo_sapiens_assembly38.fasta \ 111 | -O sandbox/tumor.counts.tsv \ 112 | --format TSV \ 113 | -imr OVERLAPPING_ONLY 114 | 115 | # head -5 sandbox/tumor.counts.tsv 116 | 117 | # tail -5 sandbox/tumor.counts.tsv 118 | 119 | gatk CreateReadCountPanelOfNormals \ 120 | -I file1_clean.counts.hdf5 \ 121 | … 122 | -I file40_clean.counts.hdf5 \ 123 | -O cnaponC.pon.hdf5 124 | 125 | # gatk DenoiseReadCounts \ 126 | -I cna_inputs/hcc1143_T_clean.counts.hdf5 \ 127 | --count-panel-of-normals cna_inputs/cnaponC.pon.hdf5 \ 128 | --standardized-copy-ratios sandbox/hcc1143_T_clean.standardizedCR.tsv \ 129 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv 130 | 131 | # gatk PlotDenoisedCopyRatios \ 132 | --sequence-dictionary ref/Homo_sapiens_assembly38.dict \ 133 | --standardized-copy-ratios sandbox/hcc1143_T_clean.standardizedCR.tsv \ 134 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 135 | --minimum-contig-length 46709983 \ 136 | --output sandbox/cna_plots \ 137 | --output-prefix hcc1143_T_clean 138 | 139 | $ export BUCKET="gs://my-bucket" 140 | $ gsutil -m cp -R sandbox/cna_plots $BUCKET/somatic-sandbox/ 141 | 142 | # gatk ModelSegments \ 143 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 144 | --output sandbox \ 145 | --output-prefix hcc1143_T_clean 146 | 147 | # gatk PlotModeledSegments \ 148 | --denoised-copy-ratios sandbox/hcc1143_T_clean.denoisedCR.tsv \ 149 | --segments sandbox/hcc1143_T_clean.modelFinal.seg \ 150 | --sequence-dictionary ref/Homo_sapiens_assembly38.dict \ 151 | --minimum-contig-length 46709983 \ 152 | --output sandbox/cna_plots \ 153 | --output-prefix hcc1143_T_clean 154 | 155 | # gatk CallCopyRatioSegments \ 156 | -I sandbox/hcc1143_T_clean.cr.seg \ 157 | -O sandbox/hcc1143_T_clean.called.seg 158 | 159 | # tail -5 sandbox/hcc1143_T_clean.called.seg 160 | -------------------------------------------------------------------------------- /commands/08-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 8. 
We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ java -version 20 | 21 | $ sudo apt install openjdk-8-jre-headless 22 | 23 | $ java -version 24 | 25 | $ export BIN=~/book/bin 26 | 27 | $ curl -L -o ~/book/bin/cromwell-48.jar \ 28 | https://github.com/broadinstitute/cromwell/releases/download/48/cromwell-48.jar 29 | 30 | $ java -jar $BIN/cromwell-48.jar --help 31 | 32 | $ java -jar $BIN/womtool-48.jar --help 33 | 34 | $ export WF=~/book/code/workflows 35 | 36 | $ nano $WF/hello-world/hello-world.wdl 37 | 38 | $ echo "Hello World" 39 | 40 | $ java -jar $BIN/cromwell-48.jar run $WF/hello-world/hello-world.wdl 41 | 42 | $ cat ~/cromwell-executions/HelloWorld/b6d224b0-ccee-468f-83fa- 43 | ab2ce7e62ab7/call-WriteGreeting/execution/stdout 44 | 45 | $ nano $WF/hello-world/hello-world-var.wdl 46 | 47 | $ java -jar $BIN/cromwell-48.jar run $WF/hello-world/hello-world-var.wdl \ 48 | -i $WF/hello-world/hello-world.inputs.json 49 | 50 | $ nano $WF/hello-world/hello-world-again.wdl 51 | 52 | $ nano $WF/hello-hc/hello-haplotypecaller.wdl 53 | 54 | $ mkdir ~/sandbox-8 55 | 56 | $ java -jar $BIN/womtool-48.jar \ 57 | inputs $WF/hello-hc/hello-haplotypecaller.wdl \ 58 | > ~/sandbox-8/hello-haplotypecaller.inputs.json 59 | 60 | $ cat ~/sandbox-8/hello-haplotypecaller.inputs.json 61 | 62 | $ cat $WF/hello-hc/hello-haplotypecaller.inputs.json 63 | 64 | $ java -jar $BIN/cromwell-48.jar \ 65 | run $WF/hello-hc/hello-haplotypecaller.wdl \ 66 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 67 | 68 | $ head ~/cromwell-executions/HelloHaplotypeCaller/9a6a9c97-7453-455c 69 | -8cd8-be8af8cb6f7c/call-HaplotypeCallerGVCF/execution/mother.g.vcf 70 | 71 | $ cp $WF/hello-hc/hello-haplotypecaller.wdl ~/sandbox-8/hc-break1.wdl 72 | 73 | $ java -jar $BIN/cromwell-48.jar \ 74 | run ~/sandbox-8/hc-break1.wdl \ 75 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 76 | 77 | $ java -jar $BIN/womtool-48.jar \ 78 | validate ~/sandbox-8/hc-break1.wdl 79 | 80 | $ java -jar $BIN/cromwell-48.jar \ 81 | run ~/sandbox-8/hc-break2.wdl \ 82 | -i $WF/hello-hc/hello-haplotypecaller.inputs.json 83 | 84 | $ cat /home/username/cromwell-executions/HelloHaplotypeCaller/dd77316f-7c18-4eb1-aa86-e307113c1668/call-HaplotypeCallerGVCF/execution/stderr 85 | 86 | $ nano $WF/scatter-hc/scatter-haplotypecaller.wdl 87 | 88 | $ java -jar $BIN/cromwell-48.jar \ 89 | run $WF/scatter-hc/scatter-haplotypecaller.wdl \ 90 | -i $WF/scatter-hc/scatter-haplotypecaller.local.inputs.json 91 | 92 | $ java -jar $BIN/womtool-48.jar \ 93 | graph $WF/scatter-hc/scatter-haplotypecaller.wdl \ 94 | > ~/sandbox-8/scatter-haplotypecaller.dot 95 | -------------------------------------------------------------------------------- /commands/09-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 9. We created this file to prevent readers from having 5 | to type out commands from our book. 
This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | $ export CASE1=~/book/code/workflows/mystery-1 20 | 21 | $ mkdir ~/sandbox-9 22 | 23 | $ cat ~/sandbox-9/haplotypecaller-gvcf-gatk4.dot 24 | $ java -jar $BIN/womtool-48.jar graph $CASE1/haplotypecaller-gvcf-gatk4.wdl \ 25 | > ~/sandbox-9/haplotypecaller-gvcf-gatk4.dot 26 | 27 | $ export CASE2=~/book/code/workflows/mystery-2 28 | 29 | $ java -jar $BIN/womtool-48.jar graph $CASE2/WholeGenomeGermlineSingleSample.wdl \ 30 | > ~/sandbox-9/WholeGenomeGermlineSingleSample.dot 31 | 32 | $ cat ~/sandbox-9/WholeGenomeGermlineSingleSample.dot 33 | 34 | $ java -jar $BIN/womtool-48.jar graph $CASE2/tasks/VariantCalling.wdl \ 35 | > ~/sandbox-9/VariantCalling.dot 36 | -------------------------------------------------------------------------------- /commands/10-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 10. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 
16 | 17 | Commands: 18 | 19 | $ cat ~/book/code/config/google.conf 20 | 21 | $ mkdir ~/sandbox-10 22 | $ cp ~/book/code/config/google.conf ~/sandbox-10/my-google.conf 23 | 24 | $ export CONF=~/sandbox-10 25 | $ export BIN=~/book/bin 26 | $ export WF=~/book/code/workflows 27 | 28 | $ export BUCKET="gs://my-bucket" 29 | 30 | $ nano ~/sandbox-10/my-google.conf 31 | 32 | $ gcloud auth application-default login 33 | 34 | $ cat $WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json 35 | 36 | $ java -Dconfig.file=$CONF/my-google.conf -jar $BIN/cromwell-48.jar \ 37 | run $WF/scatter-hc/scatter-haplotypecaller.wdl \ 38 | -i $WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json 39 | 40 | $ export WR_CONF=~/book/code/config 41 | $ export WR_PIPE=~/book/wdl_runner/wdl_runner 42 | 43 | $ gcloud alpha genomics pipelines run \ 44 | --pipeline-file $WR_PIPE/wdl_pipeline.yaml \ 45 | --regions us-east4 \ 46 | --inputs-from-file WDL=$WF/scatter-hc/scatter-haplotypecaller.wdl,\ 47 | WORKFLOW_INPUTS=$WF/scatter-hc/scatter-haplotypecaller.gcs.inputs.json,\ 48 | WORKFLOW_OPTIONS=$WR_CONF/empty.options.json \ 49 | --env-vars WORKSPACE=$BUCKET/wdl_runner/test/work,\ 50 | OUTPUTS=$BUCKET/wdl_runner/test/output \ 51 | --logging $BUCKET/wdl_runner/test/logging 52 | 53 | $ gcloud config set compute/zone "" 54 | 55 | $ cd ~/book/wdl_runner 56 | $ bash monitoring_tools/monitor_wdl_pipeline.sh 7973899330424684165 57 | -------------------------------------------------------------------------------- /commands/12-commands.txt: -------------------------------------------------------------------------------- 1 | About: 2 | 3 | This is a simple text file that contains the commands present in 4 | chapter 12. We created this file to prevent readers from having 5 | to type out commands from our book. This is not a stand-alone 6 | tutorial, you'll want to follow along in the chapter to get 7 | context on what these commands do. 8 | 9 | Conventions: 10 | 11 | Commands you run on your cloud shell or VM begin with "$". 12 | Commands you run within a Docker container begin with "#". 13 | Commands lacking an initial "$" or "#" are typically 14 | provided for illustration purposes and we don't expect you 15 | to run them. 16 | 17 | Commands: 18 | 19 | gs://genomics-in-the-cloud/v1/scripts/install_GATK_4130_with_igv.sh -------------------------------------------------------------------------------- /config/empty.options.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_runtime_attributes": { 3 | } 4 | } 5 | 6 | -------------------------------------------------------------------------------- /config/google.conf: -------------------------------------------------------------------------------- 1 | # This is an example configuration file that directs Cromwell to execute 2 | # workflow tasks via the Google Pipelines API backend and allows it to retrieve 3 | # input files from GCS buckets. It is intended only as a relatively simple example 4 | # and leaves out many options that are useful or important for production-scale 5 | # work. See https://cromwell.readthedocs.io/en/stable/backends/Google/ for more 6 | # complete documentation. 
7 | 8 | engine { 9 | filesystems { 10 | gcs { 11 | auth = "application-default" 12 | project = "" 13 | } 14 | } 15 | } 16 | 17 | backend { 18 | default = PAPIv2 19 | 20 | providers { 21 | PAPIv2 { 22 | actor-factory = "cromwell.backend.google.pipelines.v2alpha1.PipelinesApiLifecycleActorFactory" 23 | config { 24 | # Google project 25 | project = "" 26 | 27 | # Base bucket for workflow executions 28 | root = "gs:///cromwell-execution" 29 | 30 | # Polling for completion backs-off gradually for slower-running jobs. 31 | # This is the maximum polling interval (in seconds): 32 | maximum-polling-interval = 600 33 | 34 | # Optional Dockerhub Credentials. Can be used to access private docker images. 35 | dockerhub { 36 | # account = "" 37 | # token = "" 38 | } 39 | 40 | # Number of workers to assign to PAPI requests 41 | request-workers = 3 42 | 43 | genomics { 44 | # A reference to an auth defined in the `google` stanza at the top. This auth is used to create 45 | # Pipelines and manipulate auth JSONs. 46 | auth = "application-default" 47 | 48 | # Endpoint for APIs, no reason to change this unless directed by Google. 49 | endpoint-url = "https://genomics.googleapis.com/" 50 | 51 | # Pipelines v2 only: specify the number of times localization and delocalization operations should be attempted 52 | # There is no logic to determine if the error was transient or not, everything is retried upon failure 53 | # Defaults to 3 54 | localization-attempts = 3 55 | 56 | } 57 | 58 | filesystems { 59 | gcs { 60 | auth = "application-default" 61 | project = "" 62 | } 63 | } 64 | 65 | default-runtime-attributes { 66 | cpu: 1 67 | failOnStderr: false 68 | continueOnReturnCode: 0 69 | memory: "2048 MB" 70 | bootDiskSizeGb: 10 71 | # Allowed to be a String, or a list of Strings 72 | disks: "local-disk 10 SSD" 73 | noAddress: false 74 | preemptible: 0 75 | zones: ["us-east4-a", "us-east4-b"] 76 | } 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /figures/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/figures/.keep -------------------------------------------------------------------------------- /figures/README.md: -------------------------------------------------------------------------------- 1 | You can find all the figures in full color in the [figures](https://console.cloud.google.com/storage/browser/genomics-in-the-cloud/figures/) directory of the GCS bucket. 2 | 3 | You may use all figures except 3-3 and 6-15 in your own non-commercial work, preferably with a notice of attribution referring to the book. For commercial use, please contact permissions@oreilly.com. Figures 3-3 and 6-15 do not belong to us, so you must request permission from their respective owners, which are noted in the book. 4 | 5 | We also put together a [companion booklet](https://storage.googleapis.com/genomics-in-the-cloud/figures/Genomics_in_the_Cloud___Figures_Booklet.pdf) that contains the figures and their captions for more convenient browsing or printing. It's "semi-official" in the sense that we created and maintain it, but it is not published by O'Reilly, so it does not go through their quality control process. Think of it as an artisanal, locally sourced side dish.
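If you prefer to keep local copies, one option (assuming you have the `gsutil` command-line tool installed and the bucket layout is unchanged) is to download the entire figures directory in one go:

```bash
# Copy the full-color figures (and the companion booklet PDF) from the public
# bucket to a local folder; the -m flag parallelizes the transfer.
gsutil -m cp -r gs://genomics-in-the-cloud/figures ./figures-local
```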
6 | -------------------------------------------------------------------------------- /metadata/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /metadata/book_sample-metadata.tsv: -------------------------------------------------------------------------------- 1 | entity:book_sample_id input_bam input_bam_index 2 | father gs://genomics-in-the-cloud/v1/data/germline/bams/father.bam gs://genomics-in-the-cloud/v1/data/germline/bams/father.bai 3 | mother gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai 4 | son gs://genomics-in-the-cloud/v1/data/germline/bams/son.bam gs://genomics-in-the-cloud/v1/data/germline/bams/son.bai -------------------------------------------------------------------------------- /metadata/workspace-metadata.tsv: -------------------------------------------------------------------------------- 1 | workspace:ref_fasta intervals_list_min ref_fasta_index gatk_docker ref_dict intervals_list_full 2 | gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai broadinstitute/gatk:4.1.3.0 gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-full.list -------------------------------------------------------------------------------- /notebooks/Basic-genomics-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Content summary** \n", 8 | "This notebook provides a short introduction to basic Jupyter Notebook functionality and illustrates some options for working with genomic data in cloud storage. It is based on [source code](https://github.com/broadinstitute/genomics-in-the-cloud/tree/main/notebooks) provided with the [Genomics in the Cloud](https://oreil.ly/genomics-cloud) book (Van der Auwera & O'Connor, O'Reilly 2020).\n", 9 | "\n", 10 | "\n", 11 | "**Environment configuration** \n", 12 | "This notebook requires a custom [Terra](https://app.terra.bio/) Cloud Environment image provided as the container `gcr.io/broad-dsde-outreach/terra-base:ipyigv1`, complemented by a startup script (gs://genomics-in-the-cloud/v1/scripts/install_GATK_4130_with_igv.sh) that installs GATK version 4.1.3.0. \n", 13 | "\n", 14 | "You must customize your environment using the Cloud Environment configuration panel to match this notebook's requirements; SOME COMMANDS WILL NOT WORK IF YOU DO NOT DO THIS. \n", 15 | "\n", 16 | "- In the configuration panel, set the `Application Configuration` to `Custom Environment` (all the way at the bottom of the menu) and paste the container address given above into the `Container image` field. \n", 17 | "- Then (still in the config panel), in the `Cloud compute profile` box, paste the startup script link given above into the `Startup Script` field. \n", 18 | "\n", 19 | "Refer to [Terra documentation on customizing your environment](https://support.terra.bio/hc/en-us/articles/360038125912) to learn more about environment customization options.\n", 20 | "\n", 21 | "**Kernel** \n", 22 | "By default this notebook opens on a Python 3 kernel. 
When you have the notebook running in EDIT mode, the upper right corner of the notebook (under the Notebook Runtime widget) should display the label `Python3`. \n", 23 | "\n", 24 | "----" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Getting started with Jupyter in Terra" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "In this section, we run through some exercises to familiarize you with the basic usage of Jupyter notebooks in the Terra environment.\n", 39 | "\n", 40 | "\n", 41 | "## Run the Hello World cells\n", 42 | "We start with some simple Hello World examples, first in Python, then with a command-line tool call." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Run the basic Hello World in Python" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "print(\"Hello World!\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Run the command-line tool `echo` using `!`" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "scrolled": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "! echo \"Hello World!\"" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Interact with local storage" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "List contents of local storage (persistent disk)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "! ls ." 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Make a sandbox directory to keep project files organized" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "! mkdir -p sandbox/\n", 116 | "! ls" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Access data in cloud storage buckets " 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "List the contents of a public cloud storage bucket called `genomics-in-the-cloud`" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "! gsutil ls gs://genomics-in-the-cloud/" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Copy a file from the bucket to the sandbox (on persistent disk)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "! gsutil cp gs://genomics-in-the-cloud/hello.txt sandbox/" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Read the contents of the locally-stored text file" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "! 
cat sandbox/hello.txt" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Save local files to the workspace's storage bucket" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Import the `os` package, look up the value of the `WORKSPACE_BUCKET` environment variable (set by Terra at the kernel level) and store it in a Python variable for easy access" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "import os\n", 195 | "WS_BUCKET = os.environ['WORKSPACE_BUCKET']\n", 196 | "print(WS_BUCKET)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Back up the sandbox directory from the persistent disk to the workspace bucket " 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "! gsutil cp -r sandbox {WS_BUCKET}" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Verify that it worked as expected" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "! gsutil ls -r {WS_BUCKET}" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Set up variables pointing to genomic data in the bucket\n", 236 | "We're going to want to access the data in the bucket multiple times, so we make a variable to avoid hardcoding and repeating file paths." 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Create Python variables" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "BAMS = \"gs://genomics-in-the-cloud/v1/data/germline/bams\"\n", 253 | "REF = \"gs://genomics-in-the-cloud/v1/data/germline/ref\"" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "Use the variable to list the bucket contents and verify they work as expected" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "scrolled": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "! gsutil ls {BAMS}" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "! gsutil ls {REF}" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "This completes the \"getting started\" portion of this notebook.\n", 288 | "\n", 289 | "----" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "# Visualizing genomic data in an embedded IGV window\n", 297 | "In this section, we embed IGV windows in the notebook in order to visualize genomic data without leaving the notebook environment.\n", 298 | "\n", 299 | "## Set up the embedded IGV browser\n", 300 | "First we need to import the `ipyigv` package and initialize a browser window." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import ipyigv as igv\n", 310 | "from ipywidgets.widgets.trait_types import InstanceDict\n", 311 | "from ipyigv.options import ReferenceGenome, Track\n", 312 | "from ipywidgets import Output" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Initialize the browser instance with a genome reference" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "genomeDict = igv.PUBLIC_GENOMES.hg19\n", 329 | "genome = ReferenceGenome(**genomeDict)\n", 330 | "browser = igv.IgvBrowser(genome=genome)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Display the browser window" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "browser" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Add data to the IGV browser\n", 354 | "Now we can add data by pointing to files in a bucket." 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "Define data tracks for two BAM files (whole genome and exome versions of the mother sample)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "wgs_track = {\n", 371 | " 'name': 'Mother WGS',\n", 372 | " 'format': 'bam',\n", 373 | " 'url': BAMS + '/mother.bam',\n", 374 | " 'indexURL': BAMS + '/mother.bai',\n", 375 | " 'height': 200\n", 376 | "}\n", 377 | "browser.add_track(Track(**wgs_track))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "exome_track = {\n", 387 | " 'name': 'Mother Exome',\n", 388 | " 'format': 'bam',\n", 389 | " 'url': BAMS + '/motherNEX.bam',\n", 390 | " 'indexURL': BAMS + '/motherNEX.bai',\n", 391 | " 'height': 200\n", 392 | "}\n", 393 | "browser.add_track(Track(**exome_track))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "Zoom in to region of interest" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "browser.search('chr20:10,025,584-10,036,143')" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "## Set up an access token to view private data\n", 417 | "IGV needs an access token to retrieve data from private buckets (including your workspace's own bucket)." 
418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "Emit an access token and save it to a file, then read it into a variable" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "!gcloud auth print-access-token > token.txt\n", 434 | "\n", 435 | "token_file = open(\"token.txt\",\"r\") \n", 436 | "token = token_file.readline()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "**Important note:** As long as this file is saved only to your notebook’s local storage, it is secure because your cloud environment is strictly personal to you and cannot be accessed by others, even if you share your workspace or your notebook with them. But don’t save this\n", 444 | "file to your workspace bucket! Saving it to the bucket would make it visible to anyone\n", 445 | "with whom you share the workspace." 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "Copy a BAM file and its index to the workspace bucket" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "! gsutil cp {BAMS}/mother.ba* {WS_BUCKET}/bams\n", 462 | "! gsutil ls {WS_BUCKET}/bams" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Include the token in the track definition of any private files" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "private_track = {\n", 479 | " 'name': 'Workspace bucket copy of Mother WGS',\n", 480 | " 'format': 'bam',\n", 481 | " 'url': WS_BUCKET + '/bams/mother.bam',\n", 482 | " 'indexURL': WS_BUCKET + '/bams/mother.bai',\n", 483 | " 'height': 200,\n", 484 | " 'oauthToken': token\n", 485 | "}\n", 486 | "\n", 487 | "browser.add_track(Track(**private_track))" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "This concludes the section on visualizing genomic data.\n", 495 | "\n", 496 | "----" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "# Running GATK Commands to Learn, Test, or Troubleshoot\n", 504 | "Now let's look at how we can run GATK commands inside the notebook.\n", 505 | "\n", 506 | "## Running a Basic GATK Command: HaplotypeCaller\n", 507 | "First we run a simple command. Note that we can run GATK directly on the files located in cloud storage — no need to copy them to local storage first." 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "Run HaplotypeCaller on files in cloud storage" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "! gatk HaplotypeCaller \\\n", 524 | "-R {REF}/ref.fasta \\\n", 525 | "-I {BAMS}/mother.bam \\\n", 526 | "-O sandbox/mother_variants.200k.vcf.gz \\\n", 527 | "-L 20:10,000,000-10,200,000" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "Verify that the output file is in the sandbox" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "! 
ls sandbox" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "**Note:** This works with GATK from anywhere with an internet connection! We could even write the output directly to a bucket if we wanted to; the output filepath just has to start with a valid `gs://` bucket address. " 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Loading the Data (BAM and VCF) into IGV\n", 558 | "Now we do a simple visual check of the result." 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Initialize a new IGV window" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "second_browser = igv.IgvBrowser(genome=genome)\n", 575 | "\n", 576 | "second_browser" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "Load the variant calls produced by the HaplotypeCaller above\n", 584 | "\n", 585 | "*Adding `'color': \"#000000\"` as a workaround to [this issue](https://github.com/QuantStack/ipyigv/issues/21).*" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "var_track = {\n", 595 | " 'name': 'Mother variants',\n", 596 | " 'format': 'vcf',\n", 597 | " 'url': 'files/sandbox/mother_variants.200k.vcf.gz',\n", 598 | " 'indexURL': 'files/sandbox/mother_variants.200k.vcf.gz.tbi',\n", 599 | " 'color': \"#000000\"\n", 600 | "}\n", 601 | "second_browser.add_track(Track(**var_track))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "second_browser.search('chr20:10,002,000-10,003,000')" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "Load the original BAM file on which we ran HaplotypeCaller" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "wgs_track = {\n", 627 | " 'name': 'Mother WGS',\n", 628 | " 'format': 'bam',\n", 629 | " 'url': BAMS + '/mother.bam',\n", 630 | " 'indexURL': BAMS + '/mother.bai',\n", 631 | " 'height': 200\n", 632 | "}\n", 633 | "second_browser.add_track(Track(**wgs_track))" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "## Troubleshooting a Questionable Variant Call in the Embedded IGV Browser\n", 641 | "Something looks odd so we do some systematic troubleshooting..." 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "Run HaplotypeCaller on the problem region to produce an output BAM, the `bamout`" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "! 
gatk HaplotypeCaller \\\n", 658 | "-R {REF}/ref.fasta \\\n", 659 | "-I {BAMS}/mother.bam \\\n", 660 | "-O sandbox/motherHCdebug.vcf \\\n", 661 | "-bamout sandbox/motherHCdebug.bam \\\n", 662 | "-L 20:10,002,000-10,003,000" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "Load the `bamout` file into the IGV window" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "bamout_track = {\n", 679 | "\"name\": \"Mother HC bamout\",\n", 680 | "\"url\": \"files/sandbox/motherHCdebug.bam\",\n", 681 | "\"indexURL\": \"files/sandbox/motherHCdebug.bai\",\n", 682 | "\"height\": 500,\n", 683 | "\"format\": \"bam\"\n", 684 | "}\n", 685 | "second_browser.add_track(Track(**bamout_track))" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "This concludes the GATK variant calling section of this notebook. \n", 693 | "\n", 694 | "----" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [] 703 | } 704 | ], 705 | "metadata": { 706 | "kernelspec": { 707 | "display_name": "Python 3", 708 | "language": "python", 709 | "name": "python3" 710 | }, 711 | "language_info": { 712 | "codemirror_mode": { 713 | "name": "ipython", 714 | "version": 3 715 | }, 716 | "file_extension": ".py", 717 | "mimetype": "text/x-python", 718 | "name": "python", 719 | "nbconvert_exporter": "python", 720 | "pygments_lexer": "ipython3", 721 | "version": "3.7.12" 722 | }, 723 | "toc": { 724 | "base_numbering": 1, 725 | "nav_menu": {}, 726 | "number_sections": true, 727 | "sideBar": true, 728 | "skip_h1_title": false, 729 | "title_cell": "Table of Contents", 730 | "title_sidebar": "Contents", 731 | "toc_cell": false, 732 | "toc_position": {}, 733 | "toc_section_display": true, 734 | "toc_window_display": true 735 | } 736 | }, 737 | "nbformat": 4, 738 | "nbformat_minor": 2 739 | } 740 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /notebooks/install_GATK_4130_with_igv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip3 install igv-jupyter 4 | 5 | jupyter serverextension enable --py igv --sys-prefix 6 | jupyter nbextension install --py igv --sys-prefix 7 | jupyter nbextension enable --py igv --sys-prefix 8 | 9 | 10 | pip3 install rpy2==3.0.4 11 | pip3 install singledispatch 12 | pip3 install tzlocal 13 | 14 | 15 | echo "install.packages(c(\"optparse\",\"data.table\"),repos=\"http://cran.us.r-project.org\")" | R --no-save 16 | 17 | 18 | set -e 19 | 20 | GATK_VERSION=4.1.3.0 21 | GATK_ZIP_PATH=/tmp/gatk-$GATK_VERSION.zip 22 | 23 | # remove pre-existing GATK version 24 | rm -rf /bin/gatk 25 | 26 | # download the gatk zip if it doesn't already exist 27 | 28 | if ! 
[ -f $GATK_ZIP_PATH ]; then 29 | # curl and follow redirects and output to a temp file 30 | curl -L -o $GATK_ZIP_PATH https://github.com/broadinstitute/gatk/releases/download/$GATK_VERSION/gatk-$GATK_VERSION.zip 31 | fi 32 | 33 | # unzip with forced overwrite (if necessary) to /bin 34 | unzip -o $GATK_ZIP_PATH -d /etc/ 35 | 36 | # make a symlink to gatk right inside bin so it's available from the existing PATH 37 | 38 | ln -s /etc/gatk-$GATK_VERSION/gatk /bin/gatk 39 | 40 | pip3 install /etc/gatk-$GATK_VERSION/gatkPythonPackageArchive.zip 41 | 42 | 43 | export PATH=$PATH:/home/jupyter-user/.local/bin -------------------------------------------------------------------------------- /notebooks/plotting.R: -------------------------------------------------------------------------------- 1 | # plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations 2 | library(ggplot2) 3 | install.packages("gridExtra") 4 | library(gridExtra) 5 | 6 | require(ggplot2, quietly = TRUE) 7 | require(gridExtra, quietly = TRUE) 8 | 9 | get_legend<-function(myggplot){ 10 | tmp <- ggplot_gtable(ggplot_build(myggplot)) 11 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 12 | legend <- tmp$grobs[[leg]] 13 | return(legend) 14 | } 15 | 16 | 17 | # Function for making density plots of a single annotation 18 | makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), alpha=0.5) { 19 | 20 | if(missing(split)) { 21 | return(ggplot(data=dataframe, aes_string(x=xvar)) + xlim(xmin,xmax) + geom_density() ) 22 | } 23 | else { 24 | return(ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + xlim(xmin,xmax) + geom_density(alpha=alpha) ) 25 | } 26 | } 27 | 28 | # Function for making scatter plots of two annotations 29 | makeScatterPlot <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, alpha=0.6) { 30 | if(missing(split)) { 31 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 32 | } 33 | else { 34 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + aes_string(color=split) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 35 | } 36 | } 37 | 38 | # Function for making scatter plots of two annotations with marginal density plots of each 39 | makeScatterPlotWithMarginalDensity <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, ptAlpha=0.6, fillAlpha=0.5) { 40 | empty <- ggplot()+geom_point(aes(1,1), colour="white") + 41 | theme( 42 | plot.background = element_blank(), 43 | panel.grid.major = element_blank(), 44 | panel.grid.minor = element_blank(), 45 | panel.border = element_blank(), 46 | panel.background = element_blank(), 47 | axis.title.x = element_blank(), 48 | axis.title.y = element_blank(), 49 | axis.text.x = element_blank(), 50 | axis.text.y = element_blank(), 51 | axis.ticks = element_blank() 52 | ) 53 | 54 | if(missing(split)){ 55 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha) + xlim(xmin,xmax) + ylim(ymin,ymax) 56 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar)) + 
geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 57 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 58 | } 59 | else{ 60 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha, aes_string(color=split)) + xlim(xmin,xmax) + ylim(ymin,ymax) 61 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 62 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar, fill=split)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 63 | } 64 | legend <- get_legend(scatter) 65 | scatter <- scatter + theme(legend.position="none") 66 | temp <- grid.arrange(plot_top, legend, scatter, plot_right, ncol=2, nrow=2, widths=c(4,1), heights=c(1,4)) 67 | return(temp) 68 | } -------------------------------------------------------------------------------- /production/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is the directory we use for some production 4 | tools and elements for the book, namely our 5 | Pygments WDL lexer, which is used to color-code 6 | the WDL code used throughout the book. 7 | 8 | If you're a reader of the book, this directory 9 | probably isn't interesting to you. 10 | -------------------------------------------------------------------------------- /production/notebook_images/cell_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_27.png -------------------------------------------------------------------------------- /production/notebook_images/cell_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_28.png -------------------------------------------------------------------------------- /production/notebook_images/cell_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_29.png -------------------------------------------------------------------------------- /production/notebook_images/cell_32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_32.png -------------------------------------------------------------------------------- /production/notebook_images/cell_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_35.png -------------------------------------------------------------------------------- /production/notebook_images/cell_36.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_36.png -------------------------------------------------------------------------------- /production/notebook_images/cell_37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_37.png -------------------------------------------------------------------------------- /production/notebook_images/cell_39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genomics-in-the-cloud/e329fa7f0668f6bea7f4176736d686e79175ce8b/production/notebook_images/cell_39.png -------------------------------------------------------------------------------- /production/pygments_lexer/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is a very basic [Pygments](https://pygments.org) WDL lexer based on the Objective-C lexer. 4 | See the [WDL spec](https://github.com/openwdl/wdl/blob/master/versions/1.0/SPEC.md) 5 | for more information about the language. Also see the 6 | [wdl-sublime-syntax-highlighter](https://github.com/broadinstitute/wdl-sublime-syntax-highlighter) 7 | for syntax highlighting in Sublime and Visual Studio and 8 | [language-wdl](https://github.com/broadinstitute/language-wdl) for 9 | syntax highlighting in the Atom editor. 10 | 11 | ## Using 12 | 13 | This lexer is easy to use. First, install Pygments using pip (or whatever 14 | mechanism you like): 15 | 16 | $ pip install Pygments 17 | 18 | Then use this wdl_lexer.py file with the [Pygments Command Line](https://pygments.org/docs/cmdline/): 19 | 20 | $ pygmentize -f html -O full,style=colorful -o test.html -l wdl_lexer.py -x hello-world.wdl 21 | 22 | You can then open the test.html file in your browser. Take a look at the 23 | command line docs for information on how this works and what options 24 | are available. The style argument is useful for quickly changing the 25 | look of the syntax highlighting. See the [live demo](https://pygments.org/demo/#try) 26 | page for the list of possible styles. 27 | 28 | To generate HTML for all WDL in the repo use this script: 29 | 30 | $ bash run_wdl_lexer.sh 31 | 32 | And look in the `html_output` directory. 33 | 34 | ## Known Limitations 35 | 36 | Here are the known issues: 37 | * when command blocks include parameters that overlap with WDL keywords (such as `--create-output-variant-index`) you'll get `output` highlighted. I'm looking at ways of excluding keywords that include '-' but I haven't been able to get this to work yet. 38 | 39 | ## Future 40 | 41 | This is a very, very rough lexer for WDL and, being based on the 42 | Objective-C lexer, there's a ton of room for cleanup and improvements. 
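If you want to experiment with changes to the lexer, a quick way to check a
tweak is to render a single file straight to the terminal instead of
regenerating all of the HTML (a sketch, assuming Pygments is installed and
you run the command from this directory):

    $ pygmentize -l wdl_lexer.py -x hello-world.wdl

With no output file or formatter specified, Pygments writes colorized text to
the terminal, so you can see right away whether keywords and types are being
tokenized the way you expect.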
43 | -------------------------------------------------------------------------------- /production/pygments_lexer/hello-world.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | command { 9 | echo "Hello World" 10 | } 11 | output { 12 | File output_greeting = stdout() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /production/pygments_lexer/run_wdl_lexer.sh: -------------------------------------------------------------------------------- 1 | mkdir html_output 2 | cd .. 3 | for i in `find . | grep '.wdl$'`; 4 | do f="$(basename -- $i)" 5 | echo $f 6 | pygmentize -f html -O full,style=default -o pygments_lexer/html_output/$f.html -l pygments_lexer/wdl_lexer.py -x $i 7 | done; 8 | -------------------------------------------------------------------------------- /production/pygments_lexer/test.html: -------------------------------------------------------------------------------- 1 | 3 | 8 | 9 | 10 | 11 | 12 | 92 | 93 | 94 |

95 | 96 |
version 1.0
 97 | 
 98 | workflow HelloWorld {
 99 |   call WriteGreeting
100 | }
101 | 
102 | task WriteGreeting {
103 |   command {
104 |      echo "Hello World"
105 |   }
106 |   output {
107 |      File output_greeting = stdout()
108 |   }
109 | }
110 | 
111 | 112 | 113 | -------------------------------------------------------------------------------- /production/pygments_lexer/wdl_lexer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | pygments.lexers.objective 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | Lexers for WDL language based on Objective-C by Pygments team 7 | 8 | :copyright: Original copyright 2006-2019 by the Pygments team, see AUTHORS. 2020 copyright Brian O'Connor 9 | :license: BSD, see LICENSE for details. 10 | """ 11 | 12 | import re 13 | 14 | from pygments.lexer import RegexLexer, include, bygroups, using, this, words, \ 15 | inherit, default 16 | from pygments.token import Text, Keyword, Name, String, Operator, \ 17 | Number, Punctuation, Literal, Comment 18 | 19 | from pygments.lexers.c_cpp import CLexer, CppLexer 20 | 21 | __all__ = ['ObjectiveCLexer', 'ObjectiveCppLexer', 'LogosLexer', 'SwiftLexer'] 22 | 23 | 24 | def objective(baselexer): 25 | """ 26 | Generate a subclass of baselexer that accepts the WDL syntax 27 | extensions. 28 | """ 29 | 30 | # Have to be careful not to accidentally match JavaDoc/Doxygen syntax here, 31 | # since that's quite common in ordinary C/C++ files. It's OK to match 32 | # JavaDoc/Doxygen keywords that only apply to WDL, mind. 33 | # 34 | # The upshot of this is that we CANNOT match @class or @interface 35 | _oc_keywords = re.compile(r'@(?:end|implementation|protocol)') 36 | 37 | # Matches [ ? identifier ( identifier ? ] | identifier? : ) 38 | # (note the identifier is *optional* when there is a ':'!) 39 | _oc_message = re.compile(r'\[\s*[a-zA-Z_]\w*\s+' 40 | r'(?:[a-zA-Z_]\w*\s*\]|' 41 | r'(?:[a-zA-Z_]\w*)?:)') 42 | 43 | class GeneratedObjectiveCVariant(baselexer): 44 | """ 45 | Implements WDL syntax based on C family lexer. 46 | """ 47 | 48 | tokens = { 49 | 'statements': [ 50 | (r'@"', String, 'string'), 51 | (r"@'", String, 'string'), 52 | (r'@\\$', String, 'string'), 53 | (r'@(YES|NO)', Number), 54 | (r"@('\\$|\\.|\\[0-7]{1,3}|\\x[a-fA-F0-9]{1,2}|[^\\\'\n])'", String.Char), 55 | (r'@(\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+[lL]?', Number.Float), 56 | (r'@(\d+\.\d*|\.\d+|\d+[fF])[fF]?', Number.Float), 57 | (r'@0x[0-9a-fA-F]+[Ll]?', Number.Hex), 58 | (r'@0[0-7]+[Ll]?', Number.Oct), 59 | (r'@\d+[Ll]?', Number.Integer), 60 | (r'@\(', Literal, 'literal_number'), 61 | (r'@\[', Literal, 'literal_array'), 62 | (r'@\{', Literal, 'literal_dictionary'), 63 | (words(( 64 | 'version', 'workflow', 'task', 'command', 'output', 'input', 65 | 'runtime', 'call', 'parameter_meta', 'meta', 'scatter', 'as', 66 | 'input:', 'import', 'if', 'struct'), suffix=r'\b'), 67 | Keyword), 68 | (words(('File', 'Array', 'Int', 'Float', 'Boolean', 'String', 69 | 'Map', 'Pair', 'Object'), suffix=r'\b'), 70 | Keyword.Type), 71 | (r'@(true|false|YES|NO)\n', Name.Builtin), 72 | (r'(YES|NO|nil|self|super)\b', Name.Builtin), 73 | # Carbon types 74 | (r'(Boolean|UInt8|SInt8|UInt16|SInt16|UInt32|SInt32)\b', Keyword.Type), 75 | # Carbon built-ins 76 | (r'(true|false)\b', Name.Builtin), 77 | (r'(@interface|@implementation)(\s+)', bygroups(Keyword, Text), 78 | ('#pop', 'oc_classname')), 79 | (r'(@class|@protocol)(\s+)', bygroups(Keyword, Text), 80 | ('#pop', 'oc_forward_classname')), 81 | # @ can also prefix other expressions like @{...} or @(...) 
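                # NOTE: the keyword rule above uses words(..., suffix=r'\b') with no
                # prefix boundary, so a keyword such as 'output' also matches inside
                # hyphenated flags like --create-output-variant-index and gets
                # highlighted as a keyword (see "Known Limitations" in the README).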
82 | (r'\\$', Punctuation), 83 | (r"['$@%#]+", Name.Variable), 84 | #(r'@', Punctuation), 85 | inherit, 86 | ], 87 | 'oc_classname': [ 88 | # interface definition that inherits 89 | (r'([a-zA-Z$_][\w$]*)(\s*:\s*)([a-zA-Z$_][\w$]*)?(\s*)(\{)', 90 | bygroups(Name.Class, Text, Name.Class, Text, Punctuation), 91 | ('#pop', 'oc_ivars')), 92 | (r'([a-zA-Z$_][\w$]*)(\s*:\s*)([a-zA-Z$_][\w$]*)?', 93 | bygroups(Name.Class, Text, Name.Class), '#pop'), 94 | # interface definition for a category 95 | (r'([a-zA-Z$_][\w$]*)(\s*)(\([a-zA-Z$_][\w$]*\))(\s*)(\{)', 96 | bygroups(Name.Class, Text, Name.Label, Text, Punctuation), 97 | ('#pop', 'oc_ivars')), 98 | (r'([a-zA-Z$_][\w$]*)(\s*)(\([a-zA-Z$_][\w$]*\))', 99 | bygroups(Name.Class, Text, Name.Label), '#pop'), 100 | # simple interface / implementation 101 | (r'([a-zA-Z$_][\w$]*)(\s*)(\{)', 102 | bygroups(Name.Class, Text, Punctuation), ('#pop', 'oc_ivars')), 103 | (r'([a-zA-Z$_][\w$]*)', Name.Class, '#pop') 104 | ], 105 | 'oc_forward_classname': [ 106 | (r'([a-zA-Z$_][\w$]*)(\s*,\s*)', 107 | bygroups(Name.Class, Text), 'oc_forward_classname'), 108 | (r'([a-zA-Z$_][\w$]*)(\s*;?)', 109 | bygroups(Name.Class, Text), '#pop') 110 | ], 111 | 'oc_ivars': [ 112 | include('whitespace'), 113 | include('statements'), 114 | (';', Punctuation), 115 | (r'\{', Punctuation, '#push'), 116 | (r'\}', Punctuation, '#pop'), 117 | ], 118 | 'root': [ 119 | # methods 120 | (r'^([-+])(\s*)' # method marker 121 | r'(\(.*?\))?(\s*)' # return type 122 | r'([a-zA-Z$_][\w$]*:?)', # begin of method name 123 | bygroups(Punctuation, Text, using(this), 124 | Text, Name.Function), 125 | 'method'), 126 | inherit, 127 | ], 128 | 'method': [ 129 | include('whitespace'), 130 | # TODO unsure if ellipses are allowed elsewhere, see 131 | # discussion in Issue 789 132 | (r',', Punctuation), 133 | (r'\.\.\.', Punctuation), 134 | (r'(\(.*?\))(\s*)([a-zA-Z$_][\w$]*)', 135 | bygroups(using(this), Text, Name.Variable)), 136 | (r'[a-zA-Z$_][\w$]*:', Name.Function), 137 | (';', Punctuation, '#pop'), 138 | (r'\{', Punctuation, 'function'), 139 | default('#pop'), 140 | ], 141 | 'literal_number': [ 142 | (r'\(', Punctuation, 'literal_number_inner'), 143 | (r'\)', Literal, '#pop'), 144 | include('statement'), 145 | ], 146 | 'literal_number_inner': [ 147 | (r'\(', Punctuation, '#push'), 148 | (r'\)', Punctuation, '#pop'), 149 | include('statement'), 150 | ], 151 | 'literal_array': [ 152 | (r'\[', Punctuation, 'literal_array_inner'), 153 | (r'\]', Literal, '#pop'), 154 | include('statement'), 155 | ], 156 | 'literal_array_inner': [ 157 | (r'\[', Punctuation, '#push'), 158 | (r'\]', Punctuation, '#pop'), 159 | include('statement'), 160 | ], 161 | 'literal_dictionary': [ 162 | (r'\}', Literal, '#pop'), 163 | include('statement'), 164 | ], 165 | } 166 | 167 | def analyse_text(text): 168 | if _oc_keywords.search(text): 169 | return 1.0 170 | elif '@"' in text: # strings 171 | return 0.8 172 | elif re.search('@[0-9]+', text): 173 | return 0.7 174 | elif _oc_message.search(text): 175 | return 0.8 176 | return 0 177 | 178 | def get_tokens_unprocessed(self, text): 179 | from pygments.lexers._cocoa_builtins import COCOA_INTERFACES, \ 180 | COCOA_PROTOCOLS, COCOA_PRIMITIVES 181 | 182 | for index, token, value in \ 183 | baselexer.get_tokens_unprocessed(self, text): 184 | if token is Name or token is Name.Class: 185 | if value in COCOA_INTERFACES or value in COCOA_PROTOCOLS \ 186 | or value in COCOA_PRIMITIVES: 187 | token = Name.Builtin.Pseudo 188 | 189 | yield index, token, value 190 | 191 | return 
GeneratedObjectiveCVariant 192 | 193 | 194 | class CustomLexer(objective(CLexer)): 195 | """ 196 | For WDL workflow code. 197 | """ 198 | 199 | name = 'WDL' 200 | aliases = ['wdl'] 201 | filenames = ['*.wdl'] 202 | mimetypes = ['text/x-wdl'] 203 | priority = 0.05 # Lower than C 204 | -------------------------------------------------------------------------------- /temp/05-plotting.R: -------------------------------------------------------------------------------- 1 | # plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations 2 | 3 | library(ggplot2) 4 | install.packages("gridExtra") 5 | library(gridExtra) 6 | 7 | get_legend<-function(myggplot){ 8 | tmp <- ggplot_gtable(ggplot_build(myggplot)) 9 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 10 | legend <- tmp$grobs[[leg]] 11 | return(legend) 12 | } 13 | 14 | # Function for making density plots of a single annotation 15 | makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), alpha=0.5) { 16 | 17 | if(missing(split)) { 18 | return(ggplot(data=dataframe, aes_string(x=xvar)) + xlim(xmin,xmax) + geom_density() ) 19 | } 20 | else { 21 | return(ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + xlim(xmin,xmax) + geom_density(alpha=alpha) ) 22 | } 23 | } 24 | 25 | # Function for making scatter plots of two annotations 26 | makeScatterPlot <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, alpha=0.6) { 27 | if(missing(split)) { 28 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 29 | } 30 | else { 31 | return(ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + aes_string(color=split) + xlim(xmin,xmax) + ylim(ymin,ymax) + geom_point(size=ptSize, alpha=alpha) ) 32 | } 33 | } 34 | 35 | # Function for making scatter plots of two annotations with marginal density plots of each 36 | makeScatterPlotWithMarginalDensity <- function(dataframe, xvar, yvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), ymin=min(dataframe[yvar], na.rm=TRUE), ymax=max(dataframe[yvar], na.rm=TRUE), ptSize=1, ptAlpha=0.6, fillAlpha=0.5) { 37 | empty <- ggplot()+geom_point(aes(1,1), colour="white") + 38 | theme( 39 | plot.background = element_blank(), 40 | panel.grid.major = element_blank(), 41 | panel.grid.minor = element_blank(), 42 | panel.border = element_blank(), 43 | panel.background = element_blank(), 44 | axis.title.x = element_blank(), 45 | axis.title.y = element_blank(), 46 | axis.text.x = element_blank(), 47 | axis.text.y = element_blank(), 48 | axis.ticks = element_blank() 49 | ) 50 | 51 | if(missing(split)){ 52 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha) + xlim(xmin,xmax) + ylim(ymin,ymax) 53 | plot_top <- ggplot(data=dataframe, aes_string(x=xvar)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 54 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 55 | } 56 | else{ 57 | scatter <- ggplot(data=dataframe) + aes_string(x=xvar, y=yvar) + geom_point(size=ptSize, alpha=ptAlpha, aes_string(color=split)) + xlim(xmin,xmax) + ylim(ymin,ymax) 58 | 
plot_top <- ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + geom_density(alpha=fillAlpha) + theme(legend.position="none") + xlim(xmin,xmax) 59 | plot_right <- ggplot(data=dataframe, aes_string(x=yvar, fill=split)) + geom_density(alpha=fillAlpha) + coord_flip() + theme(legend.position="none") + xlim(ymin,ymax) 60 | } 61 | legend <- get_legend(scatter) 62 | scatter <- scatter + theme(legend.position="none") 63 | temp <- grid.arrange(plot_top, legend, scatter, plot_right, ncol=2, nrow=2, widths=c(4,1), heights=c(1,4)) 64 | return(temp) 65 | } 66 | -------------------------------------------------------------------------------- /workflows/README.md: -------------------------------------------------------------------------------- 1 | Chapter materials -------------------------------------------------------------------------------- /workflows/hello-hc/hc-break1.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow is intentionally broken! 2 | 3 | version 1.0 4 | 5 | workflow HelloHaplotypeCaller { 6 | 7 | call HaplotypeCallerGVCF 8 | } 9 | 10 | task HaplotypeCallerGVCF { 11 | 12 | input { 13 | String docker_image 14 | String java_opt 15 | 16 | File ref_fasta 17 | File ref_index 18 | File ref_dict 19 | File input_bam 20 | File input_bam_index 21 | File intervals 22 | } 23 | 24 | # The basename() function is missing its right parenthesis (rparen) 25 | String gvcf_name = basename(input_bam, ".bam" + ".g.vcf" 26 | 27 | command { 28 | gatk --java-options ${java_opt} HaploCaller \ 29 | -R ${ref_fasta} \ 30 | -I ${input_bam} \ 31 | -O ${gvcf_name} \ 32 | -L ${intervals} \ 33 | -ERC GVCF 34 | } 35 | 36 | output { 37 | File output_gvcf = "${gvcf_name}" 38 | } 39 | 40 | runtime { 41 | docker: docker_image 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /workflows/hello-hc/hc-break2.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow is intentionally broken! 
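## The working version lives in this same directory as hello-haplotypecaller.wdl.
## As a sketch of the usual first check (assuming a local copy of Womtool, with
## "womtool-XX.jar" standing in for whatever version you downloaded):
##   java -jar womtool-XX.jar validate hc-break2.wdl
## passes for this file, because the bug here only shows up at runtime; compare
## hc-break1.wdl, where validation itself fails on the syntax error.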
2 | 3 | version 1.0 4 | 5 | workflow HelloHaplotypeCaller { 6 | 7 | call HaplotypeCallerGVCF 8 | } 9 | 10 | task HaplotypeCallerGVCF { 11 | 12 | input { 13 | String docker_image 14 | String java_opt 15 | 16 | File ref_fasta 17 | File ref_index 18 | File ref_dict 19 | File input_bam 20 | File input_bam_index 21 | File intervals 22 | } 23 | 24 | String gvcf_name = basename(input_bam, ".bam") + ".g.vcf" 25 | 26 | # The tool name in this command is wrong 27 | # (HaploCaller instead of HaplotypeCaller) 28 | command { 29 | gatk --java-options ${java_opt} HaploCaller \ 30 | -R ${ref_fasta} \ 31 | -I ${input_bam} \ 32 | -O ${gvcf_name} \ 33 | -L ${intervals} \ 34 | -ERC GVCF 35 | } 36 | 37 | output { 38 | File output_gvcf = "${gvcf_name}" 39 | } 40 | 41 | runtime { 42 | docker: docker_image 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /workflows/hello-hc/hello-haplotypecaller.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.input_bam_index": "book/data/germline/bams/mother.bai", 3 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.input_bam": "book/data/germline/bams/mother.bam", 4 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_fasta": "book/data/germline/ref/ref.fasta", 5 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_index": "book/data/germline/ref/ref.fasta.fai", 6 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.ref_dict": "book/data/germline/ref/ref.dict", 7 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.intervals": "book/data/germline/intervals/snippet-intervals-min.list", 8 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 9 | "HelloHaplotypeCaller.HaplotypeCallerGVCF.java_opt": "-Xmx8G" 10 | } 11 | -------------------------------------------------------------------------------- /workflows/hello-hc/hello-haplotypecaller.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow runs the HaplotypeCaller tool from GATK4 in GVCF mode 2 | ## on a single sample in BAM format and produces a single GVCF file, 3 | ## which can then be used by the joint-discovery workflow according 4 | ## to the GATK Best Practices for germline short variant discovery. 
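##
## Example invocation with Cromwell, as a sketch rather than the only way to
## run it ("cromwell-XX.jar" is a placeholder for your local Cromwell jar, and
## the companion inputs file refers to data under book/data/, so launch this
## from the directory that contains that folder):
##   java -jar cromwell-XX.jar run hello-haplotypecaller.wdl \
##     --inputs hello-haplotypecaller.inputs.json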
5 | 6 | version 1.0 7 | 8 | workflow HelloHaplotypeCaller { 9 | 10 | call HaplotypeCallerGVCF 11 | } 12 | 13 | task HaplotypeCallerGVCF { 14 | 15 | input { 16 | String docker_image 17 | String java_opt 18 | 19 | File ref_fasta 20 | File ref_index 21 | File ref_dict 22 | File input_bam 23 | File input_bam_index 24 | File intervals 25 | } 26 | 27 | String gvcf_name = basename(input_bam, ".bam") + ".g.vcf" 28 | 29 | command { 30 | gatk --java-options ${java_opt} HaplotypeCaller \ 31 | -R ${ref_fasta} \ 32 | -I ${input_bam} \ 33 | -O ${gvcf_name} \ 34 | -L ${intervals} \ 35 | -ERC GVCF 36 | } 37 | 38 | output { 39 | File output_gvcf = "${gvcf_name}" 40 | } 41 | 42 | runtime { 43 | docker: docker_image 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world-again.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | 5 | call WriteGreeting 6 | 7 | call ReadItBackToMe { 8 | input: 9 | written_greeting = WriteGreeting.output_greeting 10 | } 11 | 12 | output { 13 | File outfile = ReadItBackToMe.repeated_greeting 14 | } 15 | } 16 | 17 | task WriteGreeting { 18 | 19 | input { 20 | String greeting 21 | } 22 | 23 | command { 24 | echo "${greeting}" 25 | } 26 | output { 27 | File output_greeting = stdout() 28 | } 29 | } 30 | 31 | task ReadItBackToMe { 32 | 33 | input { 34 | File written_greeting 35 | String original_greeting = read_string(written_greeting) 36 | } 37 | 38 | command { 39 | echo "${original_greeting} to you too" 40 | } 41 | output { 42 | File repeated_greeting = stdout() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world-var.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | 9 | input { 10 | String greeting 11 | } 12 | 13 | command { 14 | echo "${greeting}" 15 | } 16 | 17 | output { 18 | File output_greeting = stdout() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /workflows/hello-world/hello-world.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloWorld.WriteGreeting.greeting": "Hello Variable World" 3 | } -------------------------------------------------------------------------------- /workflows/hello-world/hello-world.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow HelloWorld { 4 | call WriteGreeting 5 | } 6 | 7 | task WriteGreeting { 8 | command { 9 | echo "Hello World" 10 | } 11 | output { 12 | File output_greeting = stdout() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /workflows/mystery-1/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "##_COMMENT1": "INPUT BAM", 3 | "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", 4 | "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", 5 | "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", 6 | "HaplotypeCallerGvcf_GATK4.input_bam_index": 
"gs://broad-public-datasets/NA12878/NA12878.cram.crai", 7 | 8 | "##_COMMENT2": "REFERENCE FILES", 9 | "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", 10 | "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", 11 | "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 12 | 13 | "##_COMMENT3": "INTERVALS", 14 | "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", 15 | 16 | "##_COMMENT4": "MISCELLANEOUS PARAMETERS", 17 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", 18 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", 19 | 20 | "##_COMMENT5": "DOCKERS", 21 | "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", 22 | "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", 23 | 24 | "##_COMMENT6": "PATHS", 25 | "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", 26 | "#HaplotypeCallerGvcf_GATK4.samtools_path_override": "String? (optional)", 27 | 28 | "##_COMMENT7": "JAVA OPTIONS", 29 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", 30 | 31 | "##_COMMENT8": "MEMORY ALLOCATION", 32 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", 33 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", 34 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", 35 | 36 | "##_COMMENT9": "DISK SIZE ALLOCATION", 37 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", 38 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", 39 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? (optional)", 40 | 41 | "##_COMMENT10": "PREEMPTION", 42 | "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", 43 | "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", 44 | "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)" 45 | } 46 | -------------------------------------------------------------------------------- /workflows/mystery-1/haplotypecaller-gvcf-gatk4.wdl: -------------------------------------------------------------------------------- 1 | ## Copyright Broad Institute, 2020 2 | ## 3 | ## Adapted from https://github.com/gatk-workflows/gatk4-germline-snps-indels/blob/master/haplotypecaller-gvcf-gatk4.wdl 4 | ## 5 | ## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool 6 | ## from GATK4 in GVCF mode on a single sample according to GATK Best Practices. 7 | ## When executed, the workflow scatters the HaplotypeCaller tool over a sample 8 | ## using an intervals list file. The output file produced will be a 9 | ## single GVCF file that can be used by the joint-discovery workflow. 10 | ## 11 | ## Requirements/expectations : 12 | ## - One analysis-ready BAM file for a single sample (as identified in RG:SM) 13 | ## - Set of variant calling intervals lists for the scatter, provided in a file 14 | ## 15 | ## Outputs : 16 | ## - One GVCF file and its index 17 | ## 18 | ## Cromwell version support 19 | ## - Successfully tested on v48 20 | ## - Requires WDL 1.0 support 21 | ## 22 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 
23 | ## 24 | ## LICENSING : 25 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 26 | ## https://github.com/openwdl/wdl). Note however that the programs it calls may 27 | ## be subject to different licenses. Users are responsible for checking that they are 28 | ## authorized to run all programs before running this script. Please see the dockers 29 | ## for detailed licensing information pertaining to the included programs. 30 | 31 | version 1.0 32 | 33 | # WORKFLOW DEFINITION 34 | workflow HaplotypeCallerGvcf_GATK4 { 35 | 36 | input { 37 | File input_bam 38 | File input_bam_index 39 | File ref_dict 40 | File ref_fasta 41 | File ref_fasta_index 42 | File scattered_calling_intervals_list 43 | 44 | Boolean? make_gvcf 45 | Boolean making_gvcf = select_first([make_gvcf,true]) 46 | 47 | String? gatk_docker_override 48 | String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:4.1.0.0"]) 49 | String? gatk_path_override 50 | String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) 51 | String? gitc_docker_override 52 | String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) 53 | String? samtools_path_override 54 | String samtools_path = select_first([samtools_path_override, "samtools"]) 55 | } 56 | 57 | Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) 58 | 59 | #is the input a cram file? 60 | Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" 61 | 62 | String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") 63 | String vcf_basename = sample_basename 64 | String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" 65 | String output_filename = vcf_basename + output_suffix 66 | 67 | if ( is_cram ) { 68 | call CramToBamTask { 69 | input: 70 | input_cram = input_bam, 71 | sample_name = sample_basename, 72 | ref_dict = ref_dict, 73 | ref_fasta = ref_fasta, 74 | ref_fasta_index = ref_fasta_index, 75 | docker = gitc_docker, 76 | samtools_path = samtools_path 77 | } 78 | } 79 | 80 | # Call variants in parallel over grouped calling intervals 81 | scatter (interval_file in scattered_calling_intervals) { 82 | 83 | # Generate GVCF by interval 84 | call HaplotypeCaller { 85 | input: 86 | input_bam = select_first([CramToBamTask.output_bam, input_bam]), 87 | input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), 88 | interval_list = interval_file, 89 | output_filename = output_filename, 90 | ref_dict = ref_dict, 91 | ref_fasta = ref_fasta, 92 | ref_fasta_index = ref_fasta_index, 93 | make_gvcf = making_gvcf, 94 | docker = gatk_docker, 95 | gatk_path = gatk_path 96 | } 97 | } 98 | 99 | # Merge per-interval GVCFs 100 | call MergeGVCFs { 101 | input: 102 | input_vcfs = HaplotypeCaller.output_vcf, 103 | input_vcfs_indexes = HaplotypeCaller.output_vcf_index, 104 | output_filename = output_filename, 105 | docker = gatk_docker, 106 | gatk_path = gatk_path 107 | } 108 | 109 | # Outputs that will be retained when execution is complete 110 | output { 111 | File output_vcf = MergeGVCFs.output_vcf 112 | File output_vcf_index = MergeGVCFs.output_vcf_index 113 | } 114 | } 115 | 116 | # TASK DEFINITIONS 117 | 118 | task CramToBamTask { 119 | 120 | input { 121 | # Command parameters 122 | File ref_fasta 123 | File ref_fasta_index 124 | File ref_dict 125 | File input_cram 126 | String sample_name 127 | 128 | # Runtime parameters 129 | String docker 130 | Int? 
machine_mem_gb 131 | Int? disk_space_gb 132 | Boolean use_ssd = false 133 | Int? preemptible_attempts 134 | String samtools_path 135 | } 136 | 137 | Float output_bam_size = size(input_cram, "GB") / 0.60 138 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 139 | Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 140 | 141 | command { 142 | set -e 143 | set -o pipefail 144 | 145 | ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | 146 | ${samtools_path} view -b -o ${sample_name}.bam - 147 | ${samtools_path} index -b ${sample_name}.bam 148 | mv ${sample_name}.bam.bai ${sample_name}.bai 149 | } 150 | runtime { 151 | docker: docker 152 | memory: select_first([machine_mem_gb, 15]) + " GB" 153 | disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" 154 | preemptible: select_first([preemptible_attempts, 3]) 155 | } 156 | output { 157 | File output_bam = "${sample_name}.bam" 158 | File output_bai = "${sample_name}.bai" 159 | } 160 | } 161 | 162 | # HaplotypeCaller per-sample in GVCF mode 163 | task HaplotypeCaller { 164 | 165 | input { 166 | File input_bam 167 | File input_bam_index 168 | File interval_list 169 | String output_filename 170 | File ref_dict 171 | File ref_fasta 172 | File ref_fasta_index 173 | Float? contamination 174 | Boolean make_gvcf 175 | 176 | String gatk_path 177 | String? java_options 178 | String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) 179 | 180 | # Runtime parameters 181 | String docker 182 | Int? mem_gb 183 | Int? disk_space_gb 184 | Boolean use_ssd = false 185 | Int? preemptible_attempts 186 | } 187 | 188 | Int machine_mem_gb = select_first([mem_gb, 7]) 189 | Int command_mem_gb = machine_mem_gb - 1 190 | 191 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 192 | Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 193 | 194 | command <<< 195 | set -e 196 | 197 | ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ 198 | HaplotypeCaller \ 199 | -R ${ref_fasta} \ 200 | -I ${input_bam} \ 201 | -L ${interval_list} \ 202 | -O ${output_filename} \ 203 | -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} 204 | >>> 205 | 206 | runtime { 207 | docker: docker 208 | memory: machine_mem_gb + " GB" 209 | disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" 210 | preemptible: select_first([preemptible_attempts, 3]) 211 | } 212 | 213 | output { 214 | File output_vcf = "${output_filename}" 215 | File output_vcf_index = "${output_filename}.tbi" 216 | } 217 | } 218 | # Merge GVCFs generated per-interval for the same sample 219 | task MergeGVCFs { 220 | 221 | input { 222 | Array[File] input_vcfs 223 | Array[File] input_vcfs_indexes 224 | String output_filename 225 | 226 | String gatk_path 227 | 228 | # Runtime parameters 229 | String docker 230 | Int? mem_gb 231 | Int? disk_space_gb 232 | Boolean use_ssd = false 233 | Int? 
preemptible_attempts 234 | } 235 | 236 | Int machine_mem_gb = select_first([mem_gb, 3]) 237 | Int command_mem_gb = machine_mem_gb - 1 238 | 239 | command <<< 240 | set -e 241 | 242 | ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ 243 | MergeVcfs \ 244 | --INPUT ${sep=' --INPUT ' input_vcfs} \ 245 | --OUTPUT ${output_filename} 246 | >>> 247 | 248 | runtime { 249 | docker: docker 250 | memory: machine_mem_gb + " GB" 251 | disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" 252 | preemptible: select_first([preemptible_attempts, 3]) 253 | } 254 | 255 | 256 | output { 257 | File output_vcf = "${output_filename}" 258 | File output_vcf_index = "${output_filename}.tbi" 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /workflows/mystery-2/WholeGenomeGermlineSingleSample.hg38.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": { 3 | "sample_name": "NA12878 PLUMBING", 4 | "base_file_name": "NA12878_PLUMBING", 5 | "flowcell_unmapped_bams": [ 6 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", 7 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", 8 | "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" 9 | ], 10 | "final_gvcf_base_name": "NA12878_PLUMBING", 11 | "unmapped_bam_suffix": ".bam" 12 | }, 13 | 14 | "WholeGenomeGermlineSingleSample.references": { 15 | "fingerprint_genotypes_file": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf", 16 | "fingerprint_genotypes_index": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf.idx", 17 | "contamination_sites_ud": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD", 18 | "contamination_sites_bed": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed", 19 | "contamination_sites_mu": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu", 20 | "calling_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", 21 | "haplotype_scatter_count": 10, 22 | "break_bands_at_multiples_of": 100000, 23 | "reference_fasta" : { 24 | "ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", 25 | "ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", 26 | "ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", 27 | "ref_alt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", 28 | "ref_sa": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", 29 | "ref_amb": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", 30 | "ref_bwt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", 31 | "ref_ann": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", 32 | "ref_pac": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac" 33 | }, 34 | "known_indels_sites_vcfs": [ 35 | "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", 36 | "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" 37 | ], 38 | "known_indels_sites_indices": [ 39 | "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", 40 | 
"gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" 41 | ], 42 | "dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", 43 | "dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", 44 | "evaluation_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list" 45 | }, 46 | 47 | "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", 48 | 49 | "WholeGenomeGermlineSingleSample.papi_settings": { 50 | "preemptible_tries": 3, 51 | "agg_preemptible_tries": 3 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /workflows/mystery-2/WholeGenomeGermlineSingleSample.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements data pre-processing and initial variant calling (GVCF 6 | ## generation) according to the GATK Best Practices (June 2016) for germline SNP and 7 | ## Indel discovery in human whole-genome data. 8 | ## 9 | ## Requirements/expectations : 10 | ## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format 11 | ## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) 12 | ## - Input uBAM files must additionally comply with the following requirements: 13 | ## - - filenames all have the same suffix (we use ".unmapped.bam") 14 | ## - - files must pass validation by ValidateSamFile 15 | ## - - reads are provided in query-sorted order 16 | ## - - all reads must have an RG tag 17 | ## - GVCF output names must end in ".g.vcf.gz" 18 | ## - Reference genome must be Hg38 with ALT contigs 19 | ## 20 | ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 21 | ## For program versions, see docker containers. 22 | ## 23 | ## LICENSING : 24 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 25 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 26 | ## be subject to different licenses. Users are responsible for checking that they are 27 | ## authorized to run all programs before running this script. Please see the docker 28 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 29 | ## licensing information pertaining to the included programs. 
30 | 31 | # Local import 32 | #import "../../../../pipelines/dna_seq/UnmappedBamToAlignedBam.wdl" as ToBam 33 | #import "../../../../tasks/AggregatedBamQC.wdl" as AggregatedQC 34 | #import "../../../../tasks/GermlineVariantDiscovery.wdl" as Calling 35 | #import "../../../../tasks/Qc.wdl" as QC 36 | #import "../../../../tasks/Utilities.wdl" as Utils 37 | #import "../../../../tasks/BamToCram.wdl" as ToCram 38 | #import "../../../../tasks/VariantCalling.wdl" as ToGvcf 39 | #import "../../../../structs/dna_seq/germline/GermlineStructs.wdl" 40 | 41 | # Git URL import 42 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/UnmappedBamToAlignedBam.wdl" as ToBam 43 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/AggregatedBamQC.wdl" as AggregatedQC 44 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 45 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 46 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 47 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamToCram.wdl" as ToCram 48 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/VariantCalling.wdl" as ToGvcf 49 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 50 | 51 | # WORKFLOW DEFINITION 52 | workflow WholeGenomeGermlineSingleSample { 53 | input { 54 | SampleAndUnmappedBams sample_and_unmapped_bams 55 | GermlineSingleSampleReferences references 56 | PapiSettings papi_settings 57 | File wgs_coverage_interval_list 58 | 59 | File? 
haplotype_database_file 60 | Boolean provide_bam_output = false 61 | Boolean use_gatk3_haplotype_caller = true 62 | } 63 | 64 | # Not overridable: 65 | Int read_length = 250 66 | Float lod_threshold = -20.0 67 | String cross_check_fingerprints_by = "READGROUP" 68 | String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" 69 | 70 | call ToBam.UnmappedBamToAlignedBam { 71 | input: 72 | sample_and_unmapped_bams = sample_and_unmapped_bams, 73 | references = references, 74 | papi_settings = papi_settings, 75 | 76 | cross_check_fingerprints_by = cross_check_fingerprints_by, 77 | haplotype_database_file = haplotype_database_file, 78 | lod_threshold = lod_threshold, 79 | recalibrated_bam_basename = recalibrated_bam_basename 80 | } 81 | 82 | call AggregatedQC.AggregatedBamQC { 83 | input: 84 | base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, 85 | base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, 86 | base_name = sample_and_unmapped_bams.base_file_name, 87 | sample_name = sample_and_unmapped_bams.sample_name, 88 | recalibrated_bam_base_name = recalibrated_bam_basename, 89 | haplotype_database_file = haplotype_database_file, 90 | references = references, 91 | papi_settings = papi_settings 92 | } 93 | 94 | call ToCram.BamToCram as BamToCram { 95 | input: 96 | input_bam = UnmappedBamToAlignedBam.output_bam, 97 | ref_fasta = references.reference_fasta.ref_fasta, 98 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 99 | ref_dict = references.reference_fasta.ref_dict, 100 | duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, 101 | chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, 102 | base_file_name = sample_and_unmapped_bams.base_file_name, 103 | agg_preemptible_tries = papi_settings.agg_preemptible_tries 104 | } 105 | 106 | # QC the sample WGS metrics (stringent thresholds) 107 | call QC.CollectWgsMetrics as CollectWgsMetrics { 108 | input: 109 | input_bam = UnmappedBamToAlignedBam.output_bam, 110 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 111 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", 112 | ref_fasta = references.reference_fasta.ref_fasta, 113 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 114 | wgs_coverage_interval_list = wgs_coverage_interval_list, 115 | read_length = read_length, 116 | preemptible_tries = papi_settings.agg_preemptible_tries 117 | } 118 | 119 | # QC the sample raw WGS metrics (common thresholds) 120 | call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { 121 | input: 122 | input_bam = UnmappedBamToAlignedBam.output_bam, 123 | input_bam_index = UnmappedBamToAlignedBam.output_bam_index, 124 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", 125 | ref_fasta = references.reference_fasta.ref_fasta, 126 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 127 | wgs_coverage_interval_list = wgs_coverage_interval_list, 128 | read_length = read_length, 129 | preemptible_tries = papi_settings.agg_preemptible_tries 130 | } 131 | 132 | call ToGvcf.VariantCalling as BamToGvcf { 133 | input: 134 | calling_interval_list = references.calling_interval_list, 135 | evaluation_interval_list = references.evaluation_interval_list, 136 | haplotype_scatter_count = references.haplotype_scatter_count, 137 | break_bands_at_multiples_of = references.break_bands_at_multiples_of, 138 | contamination = UnmappedBamToAlignedBam.contamination, 139 | input_bam = 
UnmappedBamToAlignedBam.output_bam, 140 | ref_fasta = references.reference_fasta.ref_fasta, 141 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 142 | ref_dict = references.reference_fasta.ref_dict, 143 | dbsnp_vcf = references.dbsnp_vcf, 144 | dbsnp_vcf_index = references.dbsnp_vcf_index, 145 | base_file_name = sample_and_unmapped_bams.base_file_name, 146 | final_vcf_base_name = sample_and_unmapped_bams.final_gvcf_base_name, 147 | agg_preemptible_tries = papi_settings.agg_preemptible_tries, 148 | use_gatk3_haplotype_caller = use_gatk3_haplotype_caller 149 | } 150 | 151 | if (provide_bam_output) { 152 | File provided_output_bam = UnmappedBamToAlignedBam.output_bam 153 | File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index 154 | } 155 | 156 | # Outputs that will be retained when execution is complete 157 | output { 158 | Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics 159 | 160 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf 161 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics 162 | Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf 163 | Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics 164 | Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf 165 | Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics 166 | Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf 167 | Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics 168 | 169 | File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics 170 | File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics 171 | File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf 172 | File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics 173 | 174 | File? 
cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics 175 | 176 | File selfSM = UnmappedBamToAlignedBam.selfSM 177 | Float contamination = UnmappedBamToAlignedBam.contamination 178 | 179 | File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 180 | 181 | File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics 182 | File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics 183 | File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics 184 | File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics 185 | File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf 186 | File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics 187 | File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf 188 | File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics 189 | File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics 190 | File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics 191 | File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf 192 | File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics 193 | File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics 194 | 195 | File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics 196 | File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics 197 | 198 | File wgs_metrics = CollectWgsMetrics.metrics 199 | File raw_wgs_metrics = CollectRawWgsMetrics.metrics 200 | 201 | File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics 202 | File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports 203 | 204 | File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics 205 | File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics 206 | 207 | File? output_bam = provided_output_bam 208 | File? output_bam_index = provided_output_bam_index 209 | 210 | File output_cram = BamToCram.output_cram 211 | File output_cram_index = BamToCram.output_cram_index 212 | File output_cram_md5 = BamToCram.output_cram_md5 213 | 214 | File validate_cram_file_report = BamToCram.validate_cram_file_report 215 | 216 | File output_vcf = BamToGvcf.output_vcf 217 | File output_vcf_index = BamToGvcf.output_vcf_index 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /workflows/mystery-2/structs/GermlineStructs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | struct SampleAndUnmappedBams { 4 | String base_file_name 5 | String final_gvcf_base_name 6 | Array[File] flowcell_unmapped_bams 7 | String sample_name 8 | String unmapped_bam_suffix 9 | } 10 | 11 | struct ReferenceFasta { 12 | File ref_dict 13 | File ref_fasta 14 | File ref_fasta_index 15 | File ref_alt 16 | File ref_sa 17 | File ref_amb 18 | File ref_bwt 19 | File ref_ann 20 | File ref_pac 21 | } 22 | 23 | struct GermlineSingleSampleReferences { 24 | File? fingerprint_genotypes_file 25 | File? 
fingerprint_genotypes_index 26 | 27 | File contamination_sites_ud 28 | File contamination_sites_bed 29 | File contamination_sites_mu 30 | File calling_interval_list 31 | 32 | Int haplotype_scatter_count 33 | Int break_bands_at_multiples_of 34 | 35 | ReferenceFasta reference_fasta 36 | 37 | Array[File] known_indels_sites_vcfs 38 | Array[File] known_indels_sites_indices 39 | 40 | File dbsnp_vcf 41 | File dbsnp_vcf_index 42 | 43 | File evaluation_interval_list 44 | } 45 | 46 | struct ExomeGermlineSingleSampleOligos { 47 | File target_interval_list 48 | File bait_interval_list 49 | String bait_set_name 50 | } 51 | 52 | struct CrossSpeciesContaminationReferences { 53 | File filter_bwa_image 54 | File kmer_file 55 | File meats_bwa_image 56 | File meats_fasta 57 | File meats_fasta_dict 58 | File meats_taxonomy_file 59 | File microbe_bwa_image 60 | File microbe_fasta 61 | File microbe_fasta_dict 62 | File microbe_taxonomy_file 63 | File normalization_file 64 | File metrics_script_file 65 | Float score_min_identity 66 | Int reads_after_downsampling 67 | } 68 | 69 | struct PapiSettings { 70 | Int preemptible_tries 71 | Int agg_preemptible_tries 72 | } 73 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/AggregatedBamQC.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | ## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) 5 | ## for human whole-genome and exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Local import 19 | #import "./Qc.wdl" as QC 20 | #import "../structs/GermlineStructs.wdl" 21 | 22 | # Git URL import 23 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 24 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 25 | 26 | # WORKFLOW DEFINITION 27 | workflow AggregatedBamQC { 28 | input { 29 | File base_recalibrated_bam 30 | File base_recalibrated_bam_index 31 | String base_name 32 | String sample_name 33 | String recalibrated_bam_base_name 34 | File? 
haplotype_database_file 35 | GermlineSingleSampleReferences references 36 | PapiSettings papi_settings 37 | } 38 | 39 | # QC the final BAM (consolidated after scattered BQSR) 40 | call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { 41 | input: 42 | input_bam = base_recalibrated_bam, 43 | input_bam_index = base_recalibrated_bam_index, 44 | output_bam_prefix = base_name + ".readgroup", 45 | ref_dict = references.reference_fasta.ref_dict, 46 | ref_fasta = references.reference_fasta.ref_fasta, 47 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 48 | preemptible_tries = papi_settings.agg_preemptible_tries 49 | } 50 | 51 | # QC the final BAM some more (no such thing as too much QC) 52 | call QC.CollectAggregationMetrics as CollectAggregationMetrics { 53 | input: 54 | input_bam = base_recalibrated_bam, 55 | input_bam_index = base_recalibrated_bam_index, 56 | output_bam_prefix = base_name, 57 | ref_dict = references.reference_fasta.ref_dict, 58 | ref_fasta = references.reference_fasta.ref_fasta, 59 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 60 | preemptible_tries = papi_settings.agg_preemptible_tries 61 | } 62 | 63 | if (defined(haplotype_database_file) && defined(references.fingerprint_genotypes_file)) { 64 | # Check the sample BAM fingerprint against the sample array 65 | call QC.CheckFingerprint as CheckFingerprint { 66 | input: 67 | input_bam = base_recalibrated_bam, 68 | input_bam_index = base_recalibrated_bam_index, 69 | haplotype_database_file = haplotype_database_file, 70 | genotypes = references.fingerprint_genotypes_file, 71 | genotypes_index = references.fingerprint_genotypes_index, 72 | output_basename = base_name, 73 | sample = sample_name, 74 | preemptible_tries = papi_settings.agg_preemptible_tries 75 | } 76 | } 77 | 78 | # Generate a checksum per readgroup in the final BAM 79 | call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { 80 | input: 81 | input_bam = base_recalibrated_bam, 82 | input_bam_index = base_recalibrated_bam_index, 83 | read_group_md5_filename = recalibrated_bam_base_name + ".bam.read_group_md5", 84 | preemptible_tries = papi_settings.agg_preemptible_tries 85 | } 86 | 87 | output { 88 | File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics 89 | File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics 90 | File read_group_gc_bias_pdf = CollectReadgroupBamQualityMetrics.gc_bias_pdf 91 | File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics 92 | 93 | File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file 94 | 95 | File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics 96 | File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics 97 | File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics 98 | File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics 99 | File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf 100 | File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics 101 | File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf 102 | File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics 103 | File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics 104 | File 
agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics 105 | File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf 106 | File agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics 107 | File agg_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics 108 | 109 | File? fingerprint_summary_metrics = CheckFingerprint.summary_metrics 110 | File? fingerprint_detail_metrics = CheckFingerprint.detail_metrics 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/Alignment.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | ## Copyright Broad Institute, 2018 3 | ## 4 | ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data. 5 | ## 6 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 7 | ## For program versions, see docker containers. 8 | ## 9 | ## LICENSING : 10 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 11 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 12 | ## be subject to different licenses. Users are responsible for checking that they are 13 | ## authorized to run all programs before running this script. Please see the docker 14 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 15 | ## licensing information pertaining to the included programs. 16 | 17 | # Local Import 18 | #import "../structs/GermlineStructs.wdl" 19 | 20 | # Git URL Import 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" 22 | 23 | # Get version of BWA 24 | task GetBwaVersion { 25 | command { 26 | # not setting set -o pipefail here because /bwa has a rc=1 and we dont want to allow rc=1 to succeed because 27 | # the sed may also fail with that error and that is something we actually want to fail on. 28 | /usr/gitc/bwa 2>&1 | \ 29 | grep -e '^Version' | \ 30 | sed 's/Version: //' 31 | } 32 | runtime { 33 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 34 | memory: "1 GiB" 35 | } 36 | output { 37 | String bwa_version = read_string(stdout()) 38 | } 39 | } 40 | 41 | # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment 42 | task SamToFastqAndBwaMemAndMba { 43 | input { 44 | File input_bam 45 | String bwa_commandline 46 | String bwa_version 47 | String output_bam_basename 48 | 49 | # reference_fasta.ref_alt is the .alt file from bwa-kit 50 | # (https://github.com/lh3/bwa/tree/master/bwakit), 51 | # listing the reference contigs that are "alternative". 52 | ReferenceFasta reference_fasta 53 | 54 | Int compression_level 55 | Int preemptible_tries 56 | } 57 | 58 | Float unmapped_bam_size = size(input_bam, "GiB") 59 | Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") 60 | Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") 61 | # Sometimes the output is larger than the input, or a task can spill to disk. 
62 | # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). 63 | Float disk_multiplier = 2.5 64 | Int disk_size = ceil(unmapped_bam_size + bwa_ref_size + (disk_multiplier * unmapped_bam_size) + 20) 65 | 66 | command <<< 67 | set -o pipefail 68 | set -e 69 | 70 | # set the bash variable needed for the command-line 71 | bash_ref_fasta=~{reference_fasta.ref_fasta} 72 | # if reference_fasta.ref_alt has data in it, 73 | if [ -s ~{reference_fasta.ref_alt} ]; then 74 | java -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ 75 | SamToFastq \ 76 | INPUT=~{input_bam} \ 77 | FASTQ=/dev/stdout \ 78 | INTERLEAVE=true \ 79 | NON_PF=true | \ 80 | /usr/gitc/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \ 81 | java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ 82 | MergeBamAlignment \ 83 | VALIDATION_STRINGENCY=SILENT \ 84 | EXPECTED_ORIENTATIONS=FR \ 85 | ATTRIBUTES_TO_RETAIN=X0 \ 86 | ATTRIBUTES_TO_REMOVE=NM \ 87 | ATTRIBUTES_TO_REMOVE=MD \ 88 | ALIGNED_BAM=/dev/stdin \ 89 | UNMAPPED_BAM=~{input_bam} \ 90 | OUTPUT=~{output_bam_basename}.bam \ 91 | REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ 92 | PAIRED_RUN=true \ 93 | SORT_ORDER="unsorted" \ 94 | IS_BISULFITE_SEQUENCE=false \ 95 | ALIGNED_READS_ONLY=false \ 96 | CLIP_ADAPTERS=false \ 97 | MAX_RECORDS_IN_RAM=2000000 \ 98 | ADD_MATE_CIGAR=true \ 99 | MAX_INSERTIONS_OR_DELETIONS=-1 \ 100 | PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ 101 | PROGRAM_RECORD_ID="bwamem" \ 102 | PROGRAM_GROUP_VERSION="~{bwa_version}" \ 103 | PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \ 104 | PROGRAM_GROUP_NAME="bwamem" \ 105 | UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ 106 | ALIGNER_PROPER_PAIR_FLAGS=true \ 107 | UNMAP_CONTAMINANT_READS=true \ 108 | ADD_PG_TAG_TO_READS=false 109 | 110 | grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \ 111 | grep -v "read 0 ALT contigs" 112 | 113 | # else reference_fasta.ref_alt is empty or could not be found 114 | else 115 | exit 1; 116 | fi 117 | >>> 118 | runtime { 119 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 120 | preemptible: preemptible_tries 121 | memory: "14 GiB" 122 | cpu: "16" 123 | disks: "local-disk " + disk_size + " HDD" 124 | } 125 | output { 126 | File output_bam = "~{output_bam_basename}.bam" 127 | File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log" 128 | } 129 | } 130 | 131 | task SamSplitter { 132 | input { 133 | File input_bam 134 | Int n_reads 135 | Int preemptible_tries 136 | Int compression_level 137 | } 138 | 139 | Float unmapped_bam_size = size(input_bam, "GiB") 140 | # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 
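    # Illustrative arithmetic (hypothetical input size): a 30 GiB unmapped BAM gives
    # disk_size = ceil(2.5 * 30 + 20) = 95 GiB of local disk for the split outputs.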
141 | Float disk_multiplier = 2.5 142 | Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20) 143 | 144 | command { 145 | set -e 146 | mkdir output_dir 147 | 148 | total_reads=$(samtools view -c ~{input_bam}) 149 | 150 | java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -jar /usr/gitc/picard.jar SplitSamByNumberOfReads \ 151 | INPUT=~{input_bam} \ 152 | OUTPUT=output_dir \ 153 | SPLIT_TO_N_READS=~{n_reads} \ 154 | TOTAL_READS_IN_INPUT=$total_reads 155 | } 156 | output { 157 | Array[File] split_bams = glob("output_dir/*.bam") 158 | } 159 | runtime { 160 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 161 | preemptible: preemptible_tries 162 | memory: "3.75 GiB" 163 | disks: "local-disk " + disk_size + " HDD" 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/BamProcessing.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Sort BAM file by coordinate order 19 | task SortSam { 20 | input { 21 | File input_bam 22 | String output_bam_basename 23 | Int preemptible_tries 24 | Int compression_level 25 | } 26 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 27 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 28 | Float sort_sam_disk_multiplier = 3.25 29 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 30 | 31 | command { 32 | java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ 33 | SortSam \ 34 | INPUT=~{input_bam} \ 35 | OUTPUT=~{output_bam_basename}.bam \ 36 | SORT_ORDER="coordinate" \ 37 | CREATE_INDEX=true \ 38 | CREATE_MD5_FILE=true \ 39 | MAX_RECORDS_IN_RAM=300000 40 | 41 | } 42 | runtime { 43 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 44 | disks: "local-disk " + disk_size + " HDD" 45 | cpu: "1" 46 | memory: "5000 MiB" 47 | preemptible: preemptible_tries 48 | } 49 | output { 50 | File output_bam = "~{output_bam_basename}.bam" 51 | File output_bam_index = "~{output_bam_basename}.bai" 52 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 53 | } 54 | } 55 | 56 | # Sort BAM file by coordinate order -- using Spark! 
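# Note: this variant runs Spark in local mode on a single large VM
# (--spark-master 'local[16]' below), not on a distributed cluster.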
57 | task SortSamSpark { 58 | input { 59 | File input_bam 60 | String output_bam_basename 61 | Int preemptible_tries 62 | Int compression_level 63 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 64 | } 65 | # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs 66 | # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier 67 | Float sort_sam_disk_multiplier = 3.25 68 | Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 69 | 70 | command { 71 | set -e 72 | 73 | gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xms100g -Xmx100g" \ 74 | SortSamSpark \ 75 | -I ~{input_bam} \ 76 | -O ~{output_bam_basename}.bam \ 77 | -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' 78 | 79 | samtools index ~{output_bam_basename}.bam ~{output_bam_basename}.bai 80 | } 81 | runtime { 82 | docker: gatk_docker 83 | disks: "local-disk " + disk_size + " HDD" 84 | bootDiskSizeGb: "15" 85 | cpu: "16" 86 | memory: "102 GiB" 87 | preemptible: preemptible_tries 88 | } 89 | output { 90 | File output_bam = "~{output_bam_basename}.bam" 91 | File output_bam_index = "~{output_bam_basename}.bai" 92 | } 93 | } 94 | 95 | # Mark duplicate reads to avoid counting non-independent observations 96 | task MarkDuplicates { 97 | input { 98 | Array[File] input_bams 99 | String output_bam_basename 100 | String metrics_filename 101 | Float total_input_size 102 | Int compression_level 103 | Int preemptible_tries 104 | 105 | # The program default for READ_NAME_REGEX is appropriate in nearly every case. 106 | # Sometimes we wish to supply "null" in order to turn off optical duplicate detection 107 | # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing 108 | String? read_name_regex 109 | Int memory_multiplier = 1 110 | } 111 | 112 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 113 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom 114 | Float md_disk_multiplier = 3 115 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 116 | 117 | Int memory_size = ceil(8 * memory_multiplier) 118 | Int java_memory_size = (memory_size - 2) 119 | 120 | # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly 121 | # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 
122 | # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" 123 | 124 | command { 125 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/gitc/picard.jar \ 126 | MarkDuplicates \ 127 | INPUT=~{sep=' INPUT=' input_bams} \ 128 | OUTPUT=~{output_bam_basename}.bam \ 129 | METRICS_FILE=~{metrics_filename} \ 130 | VALIDATION_STRINGENCY=SILENT \ 131 | ~{"READ_NAME_REGEX=" + read_name_regex} \ 132 | OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ 133 | ASSUME_SORT_ORDER="queryname" \ 134 | CLEAR_DT="false" \ 135 | ADD_PG_TAG_TO_READS=false 136 | } 137 | runtime { 138 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 139 | preemptible: preemptible_tries 140 | memory: "~{memory_size} GiB" 141 | disks: "local-disk " + disk_size + " HDD" 142 | } 143 | output { 144 | File output_bam = "~{output_bam_basename}.bam" 145 | File duplicate_metrics = "~{metrics_filename}" 146 | } 147 | } 148 | 149 | task MarkDuplicatesSpark { 150 | input { 151 | Array[File] input_bams 152 | String output_bam_basename 153 | String metrics_filename 154 | Float total_input_size 155 | Int compression_level 156 | Int preemptible_tries 157 | 158 | String? read_name_regex 159 | Int memory_multiplier = 3 160 | Int cpu_size = 6 161 | } 162 | 163 | # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. 164 | # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving 2.5 as wiggleroom 165 | Float md_disk_multiplier = 2.5 166 | Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 167 | 168 | Int memory_size = ceil(16 * memory_multiplier) 169 | Int java_memory_size = (memory_size - 6) 170 | 171 | String output_bam_location = "~{output_bam_basename}.bam" 172 | 173 | # Removed options ASSUME_SORT_ORDER, CLEAR_DT, and ADD_PG_TAG_TO_READS as it seems like they are a) not implemented 174 | # in MarkDuplicatesSpark, and/or b) are set to "false" aka "don't do" anyhow. 
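  # The runtime block below also mounts a LOCAL disk at /mnt/tmp and points spark.local.dir at it
  # for shuffle spill; the PAPIv2 note that follows presumably relates to this multi-disk setup.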
175 | # MarkDuplicatesSpark requires PAPIv2 176 | command <<< 177 | set -e 178 | export GATK_LOCAL_JAR=/root/gatk.jar 179 | gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xmx~{java_memory_size}g" \ 180 | MarkDuplicatesSpark \ 181 | --input ~{sep=' --input ' input_bams} \ 182 | --output ~{output_bam_location} \ 183 | --metrics-file ~{metrics_filename} \ 184 | --read-validation-stringency SILENT \ 185 | ~{"--read-name-regex " + read_name_regex} \ 186 | --optical-duplicate-pixel-distance 2500 \ 187 | --treat-unsorted-as-querygroup-ordered \ 188 | --create-output-bam-index false \ 189 | -- --conf spark.local.dir=/mnt/tmp --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' 190 | >>> 191 | 192 | runtime { 193 | docker: "jamesemery/gatknightly:gatkMasterSnapshot44ca2e9e84a" 194 | disks: "/mnt/tmp " + ceil(2.1 * total_input_size) + " LOCAL, local-disk " + disk_size + " HDD" 195 | bootDiskSizeGb: "50" 196 | cpu: cpu_size 197 | memory: "~{memory_size} GiB" 198 | preemptible: preemptible_tries 199 | } 200 | 201 | output { 202 | File output_bam = output_bam_location 203 | File duplicate_metrics = metrics_filename 204 | } 205 | } 206 | 207 | # Generate Base Quality Score Recalibration (BQSR) model 208 | task BaseRecalibrator { 209 | input { 210 | File input_bam 211 | String recalibration_report_filename 212 | Array[String] sequence_group_interval 213 | File dbsnp_vcf 214 | File dbsnp_vcf_index 215 | Array[File] known_indels_sites_vcfs 216 | Array[File] known_indels_sites_indices 217 | File ref_dict 218 | File ref_fasta 219 | File ref_fasta_index 220 | Int bqsr_scatter 221 | Int preemptible_tries 222 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 223 | } 224 | 225 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 226 | Float dbsnp_size = size(dbsnp_vcf, "GiB") 227 | Int disk_size = ceil((size(input_bam, "GiB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 228 | 229 | parameter_meta { 230 | input_bam: { 231 | localization_optional: true 232 | } 233 | } 234 | 235 | command { 236 | gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ 237 | -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ 238 | -Xloggc:gc_log.log -Xms4000m" \ 239 | BaseRecalibrator \ 240 | -R ~{ref_fasta} \ 241 | -I ~{input_bam} \ 242 | --use-original-qualities \ 243 | -O ~{recalibration_report_filename} \ 244 | --known-sites ~{dbsnp_vcf} \ 245 | --known-sites ~{sep=" -known-sites " known_indels_sites_vcfs} \ 246 | -L ~{sep=" -L " sequence_group_interval} 247 | } 248 | runtime { 249 | docker: gatk_docker 250 | preemptible: preemptible_tries 251 | memory: "6 GiB" 252 | disks: "local-disk " + disk_size + " HDD" 253 | } 254 | output { 255 | File recalibration_report = "~{recalibration_report_filename}" 256 | } 257 | } 258 | 259 | # Apply Base Quality Score Recalibration (BQSR) model 260 | task ApplyBQSR { 261 | input { 262 | File input_bam 263 | String output_bam_basename 264 | File recalibration_report 265 | Array[String] sequence_group_interval 266 | File ref_dict 267 | File ref_fasta 268 | File ref_fasta_index 269 | Int compression_level 270 | Int bqsr_scatter 271 | Int preemptible_tries 272 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 273 | Int memory_multiplier = 1 274 | } 275 | 276 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 277 | Int disk_size = ceil((size(input_bam, "GiB") * 3 / bqsr_scatter) + ref_size) + 20 278 | 
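  # Illustrative (hypothetical sizes): a 60 GiB input BAM with bqsr_scatter = 10 and ~5 GiB of
  # reference files gives disk_size = ceil(60 * 3 / 10 + 5) + 20 = 43 GiB per shard.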
279 | Int memory_size = ceil(3500 * memory_multiplier) 280 | 281 | parameter_meta { 282 | input_bam: { 283 | localization_optional: true 284 | } 285 | } 286 | 287 | command { 288 | gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ 289 | -XX:+PrintGCDetails -Xloggc:gc_log.log \ 290 | -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms3000m" \ 291 | ApplyBQSR \ 292 | --create-output-bam-md5 \ 293 | --add-output-sam-program-record \ 294 | -R ~{ref_fasta} \ 295 | -I ~{input_bam} \ 296 | --use-original-qualities \ 297 | -O ~{output_bam_basename}.bam \ 298 | -bqsr ~{recalibration_report} \ 299 | --static-quantized-quals 10 \ 300 | --static-quantized-quals 20 \ 301 | --static-quantized-quals 30 \ 302 | -L ~{sep=" -L " sequence_group_interval} 303 | } 304 | runtime { 305 | docker: gatk_docker 306 | preemptible: preemptible_tries 307 | memory: "~{memory_size} MiB" 308 | disks: "local-disk " + disk_size + " HDD" 309 | } 310 | output { 311 | File recalibrated_bam = "~{output_bam_basename}.bam" 312 | File recalibrated_bam_checksum = "~{output_bam_basename}.bam.md5" 313 | } 314 | } 315 | 316 | # Combine multiple recalibration tables from scattered BaseRecalibrator runs 317 | task GatherBqsrReports { 318 | input { 319 | Array[File] input_bqsr_reports 320 | String output_report_filename 321 | Int preemptible_tries 322 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 323 | } 324 | 325 | command { 326 | gatk --java-options "-Xms3000m" \ 327 | GatherBQSRReports \ 328 | -I ~{sep=' -I ' input_bqsr_reports} \ 329 | -O ~{output_report_filename} 330 | } 331 | runtime { 332 | docker: gatk_docker 333 | preemptible: preemptible_tries 334 | memory: "3500 MiB" 335 | disks: "local-disk 20 HDD" 336 | } 337 | output { 338 | File output_bqsr_report = "~{output_report_filename}" 339 | } 340 | } 341 | 342 | # Combine multiple *sorted* BAM files 343 | task GatherSortedBamFiles { 344 | input { 345 | Array[File] input_bams 346 | String output_bam_basename 347 | Float total_input_size 348 | Int compression_level 349 | Int preemptible_tries 350 | } 351 | 352 | # Multiply the input bam size by two to account for the input and output 353 | Int disk_size = ceil(2 * total_input_size) + 20 354 | 355 | command { 356 | java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ 357 | GatherBamFiles \ 358 | INPUT=~{sep=' INPUT=' input_bams} \ 359 | OUTPUT=~{output_bam_basename}.bam \ 360 | CREATE_INDEX=true \ 361 | CREATE_MD5_FILE=true 362 | } 363 | runtime { 364 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 365 | preemptible: preemptible_tries 366 | memory: "3 GiB" 367 | disks: "local-disk " + disk_size + " HDD" 368 | } 369 | output { 370 | File output_bam = "~{output_bam_basename}.bam" 371 | File output_bam_index = "~{output_bam_basename}.bai" 372 | File output_bam_md5 = "~{output_bam_basename}.bam.md5" 373 | } 374 | } 375 | 376 | # Combine multiple *unsorted* BAM files 377 | # Note that if/when WDL supports optional outputs, we should merge this task with the sorted version 378 | task GatherUnsortedBamFiles { 379 | input { 380 | Array[File] input_bams 381 | String output_bam_basename 382 | Float total_input_size 383 | Int compression_level 384 | Int preemptible_tries 385 | } 386 | 387 | # Multiply the input bam size by two to account for the input and output 388 | Int disk_size = ceil(2 * total_input_size) + 20 389 | 390 | command { 391 | java 
-Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ 392 | GatherBamFiles \ 393 | INPUT=~{sep=' INPUT=' input_bams} \ 394 | OUTPUT=~{output_bam_basename}.bam \ 395 | CREATE_INDEX=false \ 396 | CREATE_MD5_FILE=false 397 | } 398 | runtime { 399 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 400 | preemptible: preemptible_tries 401 | memory: "3 GiB" 402 | disks: "local-disk " + disk_size + " HDD" 403 | } 404 | output { 405 | File output_bam = "~{output_bam_basename}.bam" 406 | } 407 | } 408 | 409 | # Notes on the contamination estimate: 410 | # The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId 411 | # 412 | # In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM 413 | # 414 | # Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER 415 | # But first, it is divided by an underestimation factor thusly: 416 | # float(FREEMIX) / ContaminationUnderestimationFactor 417 | # where the denominator is hardcoded in Zamboni: 418 | # val ContaminationUnderestimationFactor = 0.75f 419 | # 420 | # Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted 421 | # contamination estimate for use in variant calling 422 | task CheckContamination { 423 | input { 424 | File input_bam 425 | File input_bam_index 426 | File contamination_sites_ud 427 | File contamination_sites_bed 428 | File contamination_sites_mu 429 | File ref_fasta 430 | File ref_fasta_index 431 | String output_prefix 432 | Int preemptible_tries 433 | Float contamination_underestimation_factor 434 | Boolean disable_sanity_check = false 435 | } 436 | 437 | Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 30 438 | 439 | command <<< 440 | set -e 441 | 442 | # creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. 
443 | # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values 444 | /usr/gitc/VerifyBamID \ 445 | --Verbose \ 446 | --NumPC 4 \ 447 | --Output ~{output_prefix} \ 448 | --BamFile ~{input_bam} \ 449 | --Reference ~{ref_fasta} \ 450 | --UDPath ~{contamination_sites_ud} \ 451 | --MeanPath ~{contamination_sites_mu} \ 452 | --BedPath ~{contamination_sites_bed} \ 453 | ~{true="--DisableSanityCheck" false="" disable_sanity_check} \ 454 | 1>/dev/null 455 | 456 | # used to read from the selfSM file and calculate contamination, which gets printed out 457 | python3 <>> 479 | runtime { 480 | preemptible: preemptible_tries 481 | memory: "4 GiB" 482 | disks: "local-disk " + disk_size + " HDD" 483 | docker: "us.gcr.io/broad-gotc-prod/verify-bam-id:c1cba76e979904eb69c31520a0d7f5be63c72253-1553018888" 484 | cpu: "2" 485 | } 486 | output { 487 | File selfSM = "~{output_prefix}.selfSM" 488 | Float contamination = read_float(stdout()) 489 | } 490 | } 491 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/BamToCram.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Local Import 4 | #import "Utilities.wdl" as Utils 5 | #import "Qc.wdl" as QC 6 | 7 | # Git URL Import 8 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 9 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 10 | 11 | workflow BamToCram { 12 | 13 | input { 14 | File input_bam 15 | File ref_fasta 16 | File ref_fasta_index 17 | File ref_dict 18 | File duplication_metrics 19 | File chimerism_metrics 20 | String base_file_name 21 | Int agg_preemptible_tries 22 | } 23 | 24 | 25 | # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation 26 | # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 
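  # For example (hypothetical sample), a duplication rate of 0.35 would exceed the 0.30 cutoff below,
  # so CheckPreValidation marks the data as outlier and ValidateCram skips mate validation.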
27 | Float max_duplication_in_reasonable_sample = 0.30 28 | Float max_chimerism_in_reasonable_sample = 0.15 29 | 30 | # Convert the final merged recalibrated BAM file to CRAM format 31 | call Utils.ConvertToCram as ConvertToCram { 32 | input: 33 | input_bam = input_bam, 34 | ref_fasta = ref_fasta, 35 | ref_fasta_index = ref_fasta_index, 36 | output_basename = base_file_name, 37 | preemptible_tries = agg_preemptible_tries 38 | } 39 | 40 | # Check whether the data has massively high duplication or chimerism rates 41 | call QC.CheckPreValidation as CheckPreValidation { 42 | input: 43 | duplication_metrics = duplication_metrics, 44 | chimerism_metrics = chimerism_metrics, 45 | max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, 46 | max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, 47 | preemptible_tries = agg_preemptible_tries 48 | } 49 | 50 | # Validate the CRAM file 51 | call QC.ValidateSamFile as ValidateCram { 52 | input: 53 | input_bam = ConvertToCram.output_cram, 54 | input_bam_index = ConvertToCram.output_cram_index, 55 | report_filename = base_file_name + ".cram.validation_report", 56 | ref_dict = ref_dict, 57 | ref_fasta = ref_fasta, 58 | ref_fasta_index = ref_fasta_index, 59 | ignore = ["MISSING_TAG_NM"], 60 | max_output = 1000000000, 61 | is_outlier_data = CheckPreValidation.is_outlier_data, 62 | preemptible_tries = agg_preemptible_tries 63 | } 64 | 65 | output { 66 | File output_cram = ConvertToCram.output_cram 67 | File output_cram_index = ConvertToCram.output_cram_index 68 | File output_cram_md5 = ConvertToCram.output_cram_md5 69 | File validate_cram_file_report = ValidateCram.report 70 | } 71 | } 72 | 73 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/GermlineVariantDiscovery.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | task HaplotypeCaller_GATK35_GVCF { 19 | input { 20 | File input_bam 21 | File interval_list 22 | String gvcf_basename 23 | File ref_dict 24 | File ref_fasta 25 | File ref_fasta_index 26 | Float? contamination 27 | Int preemptible_tries 28 | Int hc_scatter 29 | } 30 | 31 | parameter_meta { 32 | input_bam: { 33 | localization_optional: true 34 | } 35 | } 36 | 37 | Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") 38 | Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 39 | 40 | # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around 41 | # the interval because the assembly uses them. 
42 | # 43 | # Using PrintReads is a temporary solution until we update HaploypeCaller to use GATK4. Once that is done, 44 | # HaplotypeCaller can stream the required intervals directly from the cloud. 45 | command { 46 | /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms2g" \ 47 | PrintReads \ 48 | -I ~{input_bam} \ 49 | --interval_padding 500 \ 50 | -L ~{interval_list} \ 51 | -O local.sharded.bam \ 52 | && \ 53 | java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m \ 54 | -jar /usr/gitc/GATK35.jar \ 55 | -T HaplotypeCaller \ 56 | -R ~{ref_fasta} \ 57 | -o ~{gvcf_basename}.vcf.gz \ 58 | -I local.sharded.bam \ 59 | -L ~{interval_list} \ 60 | -ERC GVCF \ 61 | --max_alternate_alleles 3 \ 62 | -variant_index_parameter 128000 \ 63 | -variant_index_type LINEAR \ 64 | -contamination ~{default=0 contamination} \ 65 | --read_filter OverclippedRead 66 | } 67 | runtime { 68 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" 69 | preemptible: preemptible_tries 70 | memory: "10 GiB" 71 | cpu: "1" 72 | disks: "local-disk " + disk_size + " HDD" 73 | } 74 | output { 75 | File output_gvcf = "~{gvcf_basename}.vcf.gz" 76 | File output_gvcf_index = "~{gvcf_basename}.vcf.gz.tbi" 77 | } 78 | } 79 | 80 | task HaplotypeCaller_GATK4_VCF { 81 | input { 82 | File input_bam 83 | File interval_list 84 | String vcf_basename 85 | File ref_dict 86 | File ref_fasta 87 | File ref_fasta_index 88 | Float? contamination 89 | Boolean make_gvcf 90 | Boolean make_bamout 91 | Int preemptible_tries 92 | Int hc_scatter 93 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 94 | } 95 | 96 | String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 97 | String output_file_name = vcf_basename + output_suffix 98 | 99 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") 100 | Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 101 | 102 | String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" 103 | 104 | parameter_meta { 105 | input_bam: { 106 | localization_optional: true 107 | } 108 | } 109 | 110 | command <<< 111 | set -e 112 | gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ 113 | HaplotypeCaller \ 114 | -R ~{ref_fasta} \ 115 | -I ~{input_bam} \ 116 | -L ~{interval_list} \ 117 | -O ~{output_file_name} \ 118 | -contamination ~{default=0 contamination} \ 119 | -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ 120 | -new-qual \ 121 | -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ 122 | ~{true="-ERC GVCF" false="" make_gvcf} \ 123 | ~{bamout_arg} 124 | 125 | # Cromwell doesn't like optional task outputs, so we have to touch this file. 
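    # When make_bamout is false, bamout_arg above is empty and no bamout BAM is written;
    # the touch below creates an empty placeholder so the declared 'bamout' output always exists.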
126 | touch ~{vcf_basename}.bamout.bam 127 | >>> 128 | 129 | runtime { 130 | docker: gatk_docker 131 | preemptible: preemptible_tries 132 | memory: "6.5 GiB" 133 | cpu: "2" 134 | disks: "local-disk " + disk_size + " HDD" 135 | } 136 | 137 | output { 138 | File output_vcf = "~{output_file_name}" 139 | File output_vcf_index = "~{output_file_name}.tbi" 140 | File bamout = "~{vcf_basename}.bamout.bam" 141 | } 142 | } 143 | 144 | # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs 145 | task MergeVCFs { 146 | input { 147 | Array[File] input_vcfs 148 | Array[File] input_vcfs_indexes 149 | String output_vcf_name 150 | Int preemptible_tries 151 | } 152 | 153 | Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 154 | 155 | # Using MergeVcfs instead of GatherVcfs so we can create indices 156 | # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket 157 | command { 158 | java -Xms2000m -jar /usr/gitc/picard.jar \ 159 | MergeVcfs \ 160 | INPUT=~{sep=' INPUT=' input_vcfs} \ 161 | OUTPUT=~{output_vcf_name} 162 | } 163 | runtime { 164 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 165 | preemptible: preemptible_tries 166 | memory: "3 GiB" 167 | disks: "local-disk ~{disk_size} HDD" 168 | } 169 | output { 170 | File output_vcf = "~{output_vcf_name}" 171 | File output_vcf_index = "~{output_vcf_name}.tbi" 172 | } 173 | } 174 | 175 | task HardFilterVcf { 176 | input { 177 | File input_vcf 178 | File input_vcf_index 179 | String vcf_basename 180 | File interval_list 181 | Int preemptible_tries 182 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" 183 | } 184 | 185 | Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 186 | String output_vcf_name = vcf_basename + ".filtered.vcf.gz" 187 | 188 | command { 189 | gatk --java-options "-Xms3000m" \ 190 | VariantFiltration \ 191 | -V ~{input_vcf} \ 192 | -L ~{interval_list} \ 193 | --filter-expression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ 194 | --filter-name "HardFiltered" \ 195 | -O ~{output_vcf_name} 196 | } 197 | output { 198 | File output_vcf = "~{output_vcf_name}" 199 | File output_vcf_index = "~{output_vcf_name}.tbi" 200 | } 201 | runtime { 202 | docker: gatk_docker 203 | preemptible: preemptible_tries 204 | memory: "3 GiB" 205 | disks: "local-disk " + disk_size + " HDD" 206 | } 207 | } 208 | 209 | task CNNScoreVariants { 210 | 211 | input { 212 | File? bamout 213 | File? 
bamout_index 214 | File input_vcf 215 | File input_vcf_index 216 | String vcf_basename 217 | File ref_fasta 218 | File ref_fasta_index 219 | File ref_dict 220 | Int preemptible_tries 221 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" 222 | } 223 | 224 | Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) 225 | 226 | String base_vcf = basename(input_vcf) 227 | Boolean is_compressed = basename(base_vcf, "gz") != base_vcf 228 | String vcf_suffix = if is_compressed then ".vcf.gz" else ".vcf" 229 | String vcf_index_suffix = if is_compressed then ".tbi" else ".idx" 230 | String output_vcf = base_vcf + ".scored" + vcf_suffix 231 | String output_vcf_index = output_vcf + vcf_index_suffix 232 | 233 | String bamout_param = if defined(bamout) then "-I ~{bamout}" else "" 234 | String tensor_type = if defined(bamout) then "read-tensor" else "reference" 235 | 236 | command { 237 | gatk --java-options -Xmx10g CNNScoreVariants \ 238 | -V ~{input_vcf} \ 239 | -R ~{ref_fasta} \ 240 | -O ~{output_vcf} \ 241 | ~{bamout_param} \ 242 | -tensor-type ~{tensor_type} 243 | } 244 | 245 | output { 246 | File scored_vcf = "~{output_vcf}" 247 | File scored_vcf_index = "~{output_vcf_index}" 248 | } 249 | 250 | runtime { 251 | docker: gatk_docker 252 | preemptible: preemptible_tries 253 | memory: "15 GiB" 254 | cpu: "2" 255 | disks: "local-disk " + disk_size + " HDD" 256 | } 257 | } 258 | 259 | task FilterVariantTranches { 260 | 261 | input { 262 | File input_vcf 263 | File input_vcf_index 264 | String vcf_basename 265 | Array[String] snp_tranches 266 | Array[String] indel_tranches 267 | File hapmap_resource_vcf 268 | File hapmap_resource_vcf_index 269 | File omni_resource_vcf 270 | File omni_resource_vcf_index 271 | File one_thousand_genomes_resource_vcf 272 | File one_thousand_genomes_resource_vcf_index 273 | File dbsnp_resource_vcf 274 | File dbsnp_resource_vcf_index 275 | String info_key 276 | Int preemptible_tries 277 | String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" 278 | } 279 | 280 | Int disk_size = ceil(size(hapmap_resource_vcf, "GiB") + 281 | size(omni_resource_vcf, "GiB") + 282 | size(one_thousand_genomes_resource_vcf, "GiB") + 283 | size(dbsnp_resource_vcf, "GiB") + 284 | (size(input_vcf, "GiB") * 2) 285 | ) + 20 286 | 287 | command { 288 | 289 | gatk --java-options -Xmx6g FilterVariantTranches \ 290 | -V ~{input_vcf} \ 291 | -O ~{vcf_basename}.filtered.vcf.gz \ 292 | ~{sep=" " prefix("--snp-tranche ", snp_tranches)} \ 293 | ~{sep=" " prefix("--indel-tranche ", indel_tranches)} \ 294 | --resource ~{hapmap_resource_vcf} \ 295 | --resource ~{omni_resource_vcf} \ 296 | --resource ~{one_thousand_genomes_resource_vcf} \ 297 | --resource ~{dbsnp_resource_vcf} \ 298 | --info-key ~{info_key} \ 299 | --create-output-variant-index true 300 | } 301 | 302 | output { 303 | File filtered_vcf = "~{vcf_basename}.filtered.vcf.gz" 304 | File filtered_vcf_index = "~{vcf_basename}.filtered.vcf.gz.tbi" 305 | } 306 | 307 | runtime { 308 | memory: "7 GiB" 309 | cpu: "2" 310 | disks: "local-disk " + disk_size + " HDD" 311 | preemptible: preemptible_tries 312 | docker: gatk_docker 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/SplitLargeReadGroup.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline implements a split of large readgroups for human whole-genome and 
exome sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment 19 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing 20 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs 22 | 23 | workflow SplitLargeReadGroup { 24 | input { 25 | File input_bam 26 | 27 | String bwa_commandline 28 | String bwa_version 29 | String output_bam_basename 30 | 31 | # reference_fasta.ref_alt is the .alt file from bwa-kit 32 | # (https://github.com/lh3/bwa/tree/master/bwakit), 33 | # listing the reference contigs that are "alternative". 34 | ReferenceFasta reference_fasta 35 | 36 | Int compression_level 37 | Int preemptible_tries 38 | Int reads_per_file = 48000000 39 | } 40 | 41 | call Alignment.SamSplitter as SamSplitter { 42 | input : 43 | input_bam = input_bam, 44 | n_reads = reads_per_file, 45 | preemptible_tries = preemptible_tries, 46 | compression_level = compression_level 47 | } 48 | 49 | scatter(unmapped_bam in SamSplitter.split_bams) { 50 | Float current_unmapped_bam_size = size(unmapped_bam, "GiB") 51 | String current_name = basename(unmapped_bam, ".bam") 52 | 53 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 54 | input: 55 | input_bam = unmapped_bam, 56 | bwa_commandline = bwa_commandline, 57 | output_bam_basename = current_name, 58 | reference_fasta = reference_fasta, 59 | bwa_version = bwa_version, 60 | compression_level = compression_level, 61 | preemptible_tries = preemptible_tries 62 | } 63 | 64 | Float current_mapped_size = size(SamToFastqAndBwaMemAndMba.output_bam, "GiB") 65 | } 66 | 67 | call Utils.SumFloats as SumSplitAlignedSizes { 68 | input: 69 | sizes = current_mapped_size, 70 | preemptible_tries = preemptible_tries 71 | } 72 | 73 | call Processing.GatherUnsortedBamFiles as GatherMonolithicBamFile { 74 | input: 75 | input_bams = SamToFastqAndBwaMemAndMba.output_bam, 76 | total_input_size = SumSplitAlignedSizes.total_size, 77 | output_bam_basename = output_bam_basename, 78 | preemptible_tries = preemptible_tries, 79 | compression_level = compression_level 80 | } 81 | output { 82 | File aligned_bam = GatherMonolithicBamFile.output_bam 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/UnmappedBamToAlignedBam.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL pipeline 
implements data processing according to the GATK Best Practices (June 2016) 6 | ## for human whole-genome and exome sequencing data. 7 | ## 8 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 9 | ## For program versions, see docker containers. 10 | ## 11 | ## LICENSING : 12 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 13 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 14 | ## be subject to different licenses. Users are responsible for checking that they are 15 | ## authorized to run all programs before running this script. Please see the docker 16 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 17 | ## licensing information pertaining to the included programs. 18 | 19 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment 20 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/SplitLargeReadGroup.wdl" as SplitRG 21 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 22 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing 23 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 24 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs 25 | 26 | # WORKFLOW DEFINITION 27 | workflow UnmappedBamToAlignedBam { 28 | input { 29 | SampleAndUnmappedBams sample_and_unmapped_bams 30 | GermlineSingleSampleReferences references 31 | PapiSettings papi_settings 32 | 33 | String cross_check_fingerprints_by 34 | File? haplotype_database_file 35 | Float lod_threshold 36 | String recalibrated_bam_basename 37 | } 38 | 39 | Float cutoff_for_large_rg_in_gb = 20.0 40 | 41 | String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" 42 | 43 | Int compression_level = 2 44 | 45 | # Get the version of BWA to include in the PG record in the header of the BAM produced 46 | # by MergeBamAlignment. 
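  # The version string returned here is passed to SamToFastqAndBwaMemAndMba (Alignment.wdl),
  # where MergeBamAlignment records it as PROGRAM_GROUP_VERSION in the BAM's @PG header.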
47 | call Alignment.GetBwaVersion 48 | 49 | # Get the size of the standard reference files as well as the additional reference files needed for BWA 50 | 51 | # Align flowcell-level unmapped input bams in parallel 52 | scatter (unmapped_bam in sample_and_unmapped_bams.flowcell_unmapped_bams) { 53 | 54 | Float unmapped_bam_size = size(unmapped_bam, "GiB") 55 | 56 | String unmapped_bam_basename = basename(unmapped_bam, sample_and_unmapped_bams.unmapped_bam_suffix) 57 | 58 | # QC the unmapped BAM 59 | call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { 60 | input: 61 | input_bam = unmapped_bam, 62 | metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", 63 | preemptible_tries = papi_settings.preemptible_tries 64 | } 65 | 66 | if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { 67 | # Split bam into multiple smaller bams, 68 | # map reads to reference and recombine into one bam 69 | call SplitRG.SplitLargeReadGroup as SplitRG { 70 | input: 71 | input_bam = unmapped_bam, 72 | bwa_commandline = bwa_commandline, 73 | bwa_version = GetBwaVersion.bwa_version, 74 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 75 | reference_fasta = references.reference_fasta, 76 | compression_level = compression_level, 77 | preemptible_tries = papi_settings.preemptible_tries 78 | } 79 | } 80 | 81 | if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { 82 | # Map reads to reference 83 | call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { 84 | input: 85 | input_bam = unmapped_bam, 86 | bwa_commandline = bwa_commandline, 87 | output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", 88 | reference_fasta = references.reference_fasta, 89 | bwa_version = GetBwaVersion.bwa_version, 90 | compression_level = compression_level, 91 | preemptible_tries = papi_settings.preemptible_tries 92 | } 93 | } 94 | 95 | File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SplitRG.aligned_bam]) 96 | 97 | Float mapped_bam_size = size(output_aligned_bam, "GiB") 98 | 99 | # QC the aligned but unsorted readgroup BAM 100 | # no reference as the input here is unsorted, providing a reference would cause an error 101 | call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { 102 | input: 103 | input_bam = output_aligned_bam, 104 | output_bam_prefix = unmapped_bam_basename + ".readgroup", 105 | preemptible_tries = papi_settings.preemptible_tries 106 | } 107 | } 108 | 109 | # Sum the read group bam sizes to approximate the aggregated bam size 110 | call Utils.SumFloats as SumFloats { 111 | input: 112 | sizes = mapped_bam_size, 113 | preemptible_tries = papi_settings.preemptible_tries 114 | } 115 | 116 | # MarkDuplicates and SortSam currently take too long for preemptibles if the input data is too large 117 | Float gb_size_cutoff_for_preemptibles = 110.0 118 | Boolean data_too_large_for_preemptibles = SumFloats.total_size > gb_size_cutoff_for_preemptibles 119 | 120 | # Aggregate aligned+merged flowcell BAM files and mark duplicates 121 | # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output 122 | # to avoid having to spend time just merging BAM files. 
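  # The per-readgroup aligned BAMs from the scatter above are passed as an array; inside the task
  # the ~{sep=' INPUT=' input_bams} expression expands to e.g. (hypothetical names)
  # INPUT=rg1.aligned.unsorted.bam INPUT=rg2.aligned.unsorted.bam on the Picard command line.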
123 | call Processing.MarkDuplicates as MarkDuplicates { 124 | input: 125 | input_bams = output_aligned_bam, 126 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.unsorted.duplicates_marked", 127 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".duplicate_metrics", 128 | total_input_size = SumFloats.total_size, 129 | compression_level = compression_level, 130 | preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries 131 | } 132 | 133 | # Sort aggregated+deduped BAM file and fix tags 134 | call Processing.SortSam as SortSampleBam { 135 | input: 136 | input_bam = MarkDuplicates.output_bam, 137 | output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicate_marked.sorted", 138 | compression_level = compression_level, 139 | preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries 140 | } 141 | 142 | Float agg_bam_size = size(SortSampleBam.output_bam, "GiB") 143 | 144 | if (defined(haplotype_database_file)) { 145 | # Check identity of fingerprints across readgroups 146 | call QC.CrossCheckFingerprints as CrossCheckFingerprints { 147 | input: 148 | input_bams = [ SortSampleBam.output_bam ], 149 | input_bam_indexes = [SortSampleBam.output_bam_index], 150 | haplotype_database_file = haplotype_database_file, 151 | metrics_filename = sample_and_unmapped_bams.base_file_name + ".crosscheck", 152 | total_input_size = agg_bam_size, 153 | lod_threshold = lod_threshold, 154 | cross_check_by = cross_check_fingerprints_by, 155 | preemptible_tries = papi_settings.agg_preemptible_tries 156 | } 157 | } 158 | 159 | # Create list of sequences for scatter-gather parallelization 160 | call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { 161 | input: 162 | ref_dict = references.reference_fasta.ref_dict, 163 | preemptible_tries = papi_settings.preemptible_tries 164 | } 165 | 166 | # Estimate level of cross-sample contamination 167 | call Processing.CheckContamination as CheckContamination { 168 | input: 169 | input_bam = SortSampleBam.output_bam, 170 | input_bam_index = SortSampleBam.output_bam_index, 171 | contamination_sites_ud = references.contamination_sites_ud, 172 | contamination_sites_bed = references.contamination_sites_bed, 173 | contamination_sites_mu = references.contamination_sites_mu, 174 | ref_fasta = references.reference_fasta.ref_fasta, 175 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 176 | output_prefix = sample_and_unmapped_bams.base_file_name + ".preBqsr", 177 | preemptible_tries = papi_settings.agg_preemptible_tries, 178 | contamination_underestimation_factor = 0.75 179 | } 180 | 181 | # We need disk to localize the sharded input and output due to the scatter for BQSR. 182 | # If we take the number we are scattering by and reduce by 3 we will have enough disk space 183 | # to account for the fact that the data is not split evenly. 
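  # Illustrative (hypothetical count): 18 sequence groups give bqsr_divisor = 18 - 10 = 8;
  # with 11 or fewer groups the divisor is simply 1.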
184 | Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) 185 | Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 186 | Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 187 | 188 | # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel 189 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { 190 | # Generate the recalibration model by interval 191 | call Processing.BaseRecalibrator as BaseRecalibrator { 192 | input: 193 | input_bam = SortSampleBam.output_bam, 194 | recalibration_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 195 | sequence_group_interval = subgroup, 196 | dbsnp_vcf = references.dbsnp_vcf, 197 | dbsnp_vcf_index = references.dbsnp_vcf_index, 198 | known_indels_sites_vcfs = references.known_indels_sites_vcfs, 199 | known_indels_sites_indices = references.known_indels_sites_indices, 200 | ref_dict = references.reference_fasta.ref_dict, 201 | ref_fasta = references.reference_fasta.ref_fasta, 202 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 203 | bqsr_scatter = bqsr_divisor, 204 | preemptible_tries = papi_settings.agg_preemptible_tries 205 | } 206 | } 207 | 208 | # Merge the recalibration reports resulting from by-interval recalibration 209 | # The reports are always the same size 210 | call Processing.GatherBqsrReports as GatherBqsrReports { 211 | input: 212 | input_bqsr_reports = BaseRecalibrator.recalibration_report, 213 | output_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", 214 | preemptible_tries = papi_settings.preemptible_tries 215 | } 216 | 217 | scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { 218 | # Apply the recalibration model by interval 219 | call Processing.ApplyBQSR as ApplyBQSR { 220 | input: 221 | input_bam = SortSampleBam.output_bam, 222 | output_bam_basename = recalibrated_bam_basename, 223 | recalibration_report = GatherBqsrReports.output_bqsr_report, 224 | sequence_group_interval = subgroup, 225 | ref_dict = references.reference_fasta.ref_dict, 226 | ref_fasta = references.reference_fasta.ref_fasta, 227 | ref_fasta_index = references.reference_fasta.ref_fasta_index, 228 | bqsr_scatter = bqsr_divisor, 229 | compression_level = compression_level, 230 | preemptible_tries = papi_settings.agg_preemptible_tries 231 | } 232 | } 233 | 234 | # Merge the recalibrated BAM files resulting from by-interval recalibration 235 | call Processing.GatherSortedBamFiles as GatherBamFiles { 236 | input: 237 | input_bams = ApplyBQSR.recalibrated_bam, 238 | output_bam_basename = sample_and_unmapped_bams.base_file_name, 239 | total_input_size = agg_bam_size, 240 | compression_level = compression_level, 241 | preemptible_tries = papi_settings.agg_preemptible_tries 242 | } 243 | 244 | # Outputs that will be retained when execution is complete 245 | output { 246 | Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics 247 | 248 | Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf 249 | Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics 250 | Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf 251 | Array[File] unsorted_read_group_insert_size_metrics = 
CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics 252 | Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf 253 | Array[File] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics 254 | Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf 255 | Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics 256 | 257 | File? cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics 258 | 259 | File selfSM = CheckContamination.selfSM 260 | Float contamination = CheckContamination.contamination 261 | 262 | File duplicate_metrics = MarkDuplicates.duplicate_metrics 263 | File output_bqsr_reports = GatherBqsrReports.output_bqsr_report 264 | 265 | File output_bam = GatherBamFiles.output_bam 266 | File output_bam_index = GatherBamFiles.output_bam_index 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/Utilities.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | ## Copyright Broad Institute, 2018 4 | ## 5 | ## This WDL defines utility tasks used for processing of sequencing data. 6 | ## 7 | ## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. 8 | ## For program versions, see docker containers. 9 | ## 10 | ## LICENSING : 11 | ## This script is released under the WDL source code license (BSD-3) (see LICENSE in 12 | ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may 13 | ## be subject to different licenses. Users are responsible for checking that they are 14 | ## authorized to run all programs before running this script. Please see the docker 15 | ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed 16 | ## licensing information pertaining to the included programs. 17 | 18 | # Generate sets of intervals for scatter-gathering over chromosomes 19 | task CreateSequenceGroupingTSV { 20 | input { 21 | File ref_dict 22 | Int preemptible_tries 23 | } 24 | # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. 25 | # It outputs to stdout where it is parsed into a wdl Array[Array[String]] 26 | # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] 27 | command <<< 28 | python <>> 63 | runtime { 64 | preemptible: preemptible_tries 65 | docker: "us.gcr.io/broad-gotc-prod/python:2.7" 66 | memory: "2 GiB" 67 | } 68 | output { 69 | Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") 70 | Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") 71 | } 72 | } 73 | 74 | # This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists 75 | # Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. 76 | # Thus we have the block of python to count the number of generated sub interval lists. 
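# A minimal sketch of that counting step, assuming only that the shards land under out/ as
# globbed by the task below (the command block in the task remains the authoritative version):
#   python3 -c 'import glob; print(len(glob.glob("out/*/*.interval_list")))'
# Printing the count to stdout is enough, because the task's outputs read it back with
# read_int(stdout()).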
77 | task ScatterIntervalList { 78 | input { 79 | File interval_list 80 | Int scatter_count 81 | Int break_bands_at_multiples_of 82 | } 83 | 84 | command <<< 85 | set -e 86 | mkdir out 87 | java -Xms1g -jar /usr/gitc/picard.jar \ 88 | IntervalListTools \ 89 | SCATTER_COUNT=~{scatter_count} \ 90 | SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ 91 | UNIQUE=true \ 92 | SORT=true \ 93 | BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ 94 | INPUT=~{interval_list} \ 95 | OUTPUT=out 96 | 97 | python3 <>> 108 | output { 109 | Array[File] out = glob("out/*/*.interval_list") 110 | Int interval_count = read_int(stdout()) 111 | } 112 | runtime { 113 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 114 | memory: "2 GiB" 115 | } 116 | } 117 | 118 | # Convert BAM file to CRAM format 119 | # Note that reading CRAMs directly with Picard is not yet supported 120 | task ConvertToCram { 121 | input { 122 | File input_bam 123 | File ref_fasta 124 | File ref_fasta_index 125 | String output_basename 126 | Int preemptible_tries 127 | } 128 | 129 | Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") 130 | Int disk_size = ceil(2 * size(input_bam, "GiB") + ref_size) + 20 131 | 132 | command <<< 133 | set -e 134 | set -o pipefail 135 | 136 | samtools view -C -T ~{ref_fasta} ~{input_bam} | \ 137 | tee ~{output_basename}.cram | \ 138 | md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 139 | 140 | # Create REF_CACHE. Used when indexing a CRAM 141 | seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} 142 | export REF_PATH=: 143 | export REF_CACHE=./ref/cache/%2s/%2s/%s 144 | 145 | samtools index ~{output_basename}.cram 146 | >>> 147 | runtime { 148 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 149 | preemptible: preemptible_tries 150 | memory: "3 GiB" 151 | cpu: "1" 152 | disks: "local-disk " + disk_size + " HDD" 153 | } 154 | output { 155 | File output_cram = "~{output_basename}.cram" 156 | File output_cram_index = "~{output_basename}.cram.crai" 157 | File output_cram_md5 = "~{output_basename}.cram.md5" 158 | } 159 | } 160 | 161 | # Convert CRAM file to BAM format 162 | task ConvertToBam { 163 | input { 164 | File input_cram 165 | File ref_fasta 166 | File ref_fasta_index 167 | String output_basename 168 | } 169 | 170 | command <<< 171 | set -e 172 | set -o pipefail 173 | 174 | samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} 175 | 176 | samtools index ~{output_basename}.bam 177 | >>> 178 | runtime { 179 | docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" 180 | preemptible: 3 181 | memory: "3 GiB" 182 | cpu: "1" 183 | disks: "local-disk 200 HDD" 184 | } 185 | output { 186 | File output_bam = "~{output_basename}.bam" 187 | File output_bam_index = "~{output_basename}.bam.bai" 188 | } 189 | } 190 | 191 | # Calculates sum of a list of floats 192 | task SumFloats { 193 | input { 194 | Array[Float] sizes 195 | Int preemptible_tries 196 | } 197 | 198 | command <<< 199 | python -c "print ~{sep="+" sizes}" 200 | >>> 201 | output { 202 | Float total_size = read_float(stdout()) 203 | } 204 | runtime { 205 | docker: "us.gcr.io/broad-gotc-prod/python:2.7" 206 | preemptible: preemptible_tries 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/VariantCalling-local.wdl: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Local Import 4 | import 
"tasks/GermlineVariantDiscovery.wdl" as Calling 5 | import "tasks/Qc.wdl" as QC 6 | import "tasks/Utilities.wdl" as Utils 7 | import "tasks/BamProcessing.wdl" as BamProcessing 8 | 9 | # Git URL Import 10 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 11 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 12 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 13 | #import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as BamProcessing 14 | 15 | workflow VariantCalling { 16 | 17 | 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? contamination 23 | File input_bam 24 | File ref_fasta 25 | File ref_fasta_index 26 | File ref_dict 27 | File dbsnp_vcf 28 | File dbsnp_vcf_index 29 | String base_file_name 30 | String final_vcf_base_name 31 | Int agg_preemptible_tries 32 | Boolean make_gvcf = true 33 | Boolean make_bamout = false 34 | Boolean use_gatk3_haplotype_caller = false 35 | 36 | 37 | 38 | # Break the calling interval_list into sub-intervals 39 | # Perform variant calling on the sub-intervals, and then gather the results 40 | call Utils.ScatterIntervalList as ScatterIntervalList { 41 | input: 42 | interval_list = calling_interval_list, 43 | scatter_count = haplotype_scatter_count, 44 | break_bands_at_multiples_of = break_bands_at_multiples_of 45 | } 46 | 47 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 48 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 49 | # to account for the fact that the data is quite uneven across the shards. 
50 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 51 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 52 | 53 | # Call variants in parallel over WGS calling intervals 54 | scatter (scattered_interval_list in ScatterIntervalList.out) { 55 | 56 | if (use_gatk3_haplotype_caller) { 57 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 58 | input: 59 | input_bam = input_bam, 60 | interval_list = scattered_interval_list, 61 | gvcf_basename = base_file_name, 62 | ref_dict = ref_dict, 63 | ref_fasta = ref_fasta, 64 | ref_fasta_index = ref_fasta_index, 65 | contamination = contamination, 66 | preemptible_tries = agg_preemptible_tries, 67 | hc_scatter = hc_divisor 68 | } 69 | } 70 | 71 | if (!use_gatk3_haplotype_caller) { 72 | 73 | # Generate GVCF by interval 74 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 75 | input: 76 | contamination = contamination, 77 | input_bam = input_bam, 78 | interval_list = scattered_interval_list, 79 | vcf_basename = base_file_name, 80 | ref_dict = ref_dict, 81 | ref_fasta = ref_fasta, 82 | ref_fasta_index = ref_fasta_index, 83 | hc_scatter = hc_divisor, 84 | make_gvcf = make_gvcf, 85 | make_bamout = make_bamout, 86 | preemptible_tries = agg_preemptible_tries 87 | } 88 | 89 | # If bamout files were created, we need to sort and gather them into one bamout 90 | if (make_bamout) { 91 | call BamProcessing.SortSam as SortBamout { 92 | input: 93 | input_bam = HaplotypeCallerGATK4.bamout, 94 | output_bam_basename = final_vcf_base_name, 95 | preemptible_tries = agg_preemptible_tries, 96 | compression_level = 2 97 | } 98 | } 99 | } 100 | 101 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, HaplotypeCallerGATK4.output_vcf]) 102 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, HaplotypeCallerGATK4.output_vcf_index]) 103 | } 104 | 105 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 106 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 107 | call Calling.MergeVCFs as MergeVCFs { 108 | input: 109 | input_vcfs = vcfs_to_merge, 110 | input_vcfs_indexes = vcf_indices_to_merge, 111 | output_vcf_name = final_vcf_base_name + merge_suffix, 112 | preemptible_tries = agg_preemptible_tries 113 | } 114 | 115 | if (make_bamout) { 116 | call MergeBamouts { 117 | input: 118 | bams = select_all(SortBamout.output_bam), 119 | output_base_name = final_vcf_base_name 120 | } 121 | } 122 | 123 | # Validate the (g)VCF output of HaplotypeCaller 124 | call QC.ValidateVCF as ValidateVCF { 125 | input: 126 | input_vcf = MergeVCFs.output_vcf, 127 | input_vcf_index = MergeVCFs.output_vcf_index, 128 | dbsnp_vcf = dbsnp_vcf, 129 | dbsnp_vcf_index = dbsnp_vcf_index, 130 | ref_fasta = ref_fasta, 131 | ref_fasta_index = ref_fasta_index, 132 | ref_dict = ref_dict, 133 | calling_interval_list = calling_interval_list, 134 | is_gvcf = make_gvcf, 135 | preemptible_tries = agg_preemptible_tries 136 | } 137 | 138 | # QC the (g)VCF 139 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 140 | input: 141 | input_vcf = MergeVCFs.output_vcf, 142 | input_vcf_index = MergeVCFs.output_vcf_index, 143 | metrics_basename = final_vcf_base_name, 144 | dbsnp_vcf = dbsnp_vcf, 145 | dbsnp_vcf_index = dbsnp_vcf_index, 146 | ref_dict = ref_dict, 147 | evaluation_interval_list = evaluation_interval_list, 148 | is_gvcf = make_gvcf, 149 | preemptible_tries = agg_preemptible_tries 150 | } 151 | 152 | output { 153 | File vcf_summary_metrics = 
CollectVariantCallingMetrics.summary_metrics 154 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 155 | File output_vcf = MergeVCFs.output_vcf 156 | File output_vcf_index = MergeVCFs.output_vcf_index 157 | File? bamout = MergeBamouts.output_bam 158 | File? bamout_index = MergeBamouts.output_bam_index 159 | } 160 | } 161 | 162 | # This task is here because merging bamout files using Picard produces an error. 163 | task MergeBamouts { 164 | 165 | 166 | Array[File] bams 167 | String output_base_name 168 | 169 | 170 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 171 | 172 | command { 173 | samtools merge ${output_base_name}.bam ${sep=" " bams} 174 | samtools index ${output_base_name}.bam 175 | mv ${output_base_name}.bam.bai ${output_base_name}.bai 176 | } 177 | 178 | output { 179 | File output_bam = "${output_base_name}.bam" 180 | File output_bam_index = "${output_base_name}.bai" 181 | } 182 | 183 | runtime { 184 | docker: "biocontainers/samtools:1.3.1" 185 | memory: "4 GiB" 186 | disks: "local-disk ${disk_size} HDD" 187 | preemptible: 3 188 | cpu: 1 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /workflows/mystery-2/tasks/VariantCalling.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Local Import 4 | #import "../tasks/GermlineVariantDiscovery.wdl" as Calling 5 | #import "../tasks/Qc.wdl" as QC 6 | #import "../tasks/Utilities.wdl" as Utils 7 | #import "../tasks/BamProcessing.wdl" as BamProcessing 8 | 9 | # Git URL Import 10 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling 11 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC 12 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils 13 | import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as BamProcessing 14 | 15 | workflow VariantCalling { 16 | 17 | input { 18 | File calling_interval_list 19 | File evaluation_interval_list 20 | Int haplotype_scatter_count 21 | Int break_bands_at_multiples_of 22 | Float? contamination 23 | File input_bam 24 | File ref_fasta 25 | File ref_fasta_index 26 | File ref_dict 27 | File dbsnp_vcf 28 | File dbsnp_vcf_index 29 | String base_file_name 30 | String final_vcf_base_name 31 | Int agg_preemptible_tries 32 | Boolean make_gvcf = true 33 | Boolean make_bamout = false 34 | Boolean use_gatk3_haplotype_caller = false 35 | } 36 | 37 | parameter_meta { 38 | make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details." 
39 | } 40 | 41 | # Break the calling interval_list into sub-intervals 42 | # Perform variant calling on the sub-intervals, and then gather the results 43 | call Utils.ScatterIntervalList as ScatterIntervalList { 44 | input: 45 | interval_list = calling_interval_list, 46 | scatter_count = haplotype_scatter_count, 47 | break_bands_at_multiples_of = break_bands_at_multiples_of 48 | } 49 | 50 | # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 51 | # If we take the number we are scattering by and reduce by 20 we will have enough disk space 52 | # to account for the fact that the data is quite uneven across the shards. 53 | Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 54 | Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 55 | 56 | # Call variants in parallel over WGS calling intervals 57 | scatter (scattered_interval_list in ScatterIntervalList.out) { 58 | 59 | if (use_gatk3_haplotype_caller) { 60 | call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { 61 | input: 62 | input_bam = input_bam, 63 | interval_list = scattered_interval_list, 64 | gvcf_basename = base_file_name, 65 | ref_dict = ref_dict, 66 | ref_fasta = ref_fasta, 67 | ref_fasta_index = ref_fasta_index, 68 | contamination = contamination, 69 | preemptible_tries = agg_preemptible_tries, 70 | hc_scatter = hc_divisor 71 | } 72 | } 73 | 74 | if (!use_gatk3_haplotype_caller) { 75 | 76 | # Generate GVCF by interval 77 | call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { 78 | input: 79 | contamination = contamination, 80 | input_bam = input_bam, 81 | interval_list = scattered_interval_list, 82 | vcf_basename = base_file_name, 83 | ref_dict = ref_dict, 84 | ref_fasta = ref_fasta, 85 | ref_fasta_index = ref_fasta_index, 86 | hc_scatter = hc_divisor, 87 | make_gvcf = make_gvcf, 88 | make_bamout = make_bamout, 89 | preemptible_tries = agg_preemptible_tries 90 | } 91 | 92 | # If bamout files were created, we need to sort and gather them into one bamout 93 | if (make_bamout) { 94 | call BamProcessing.SortSam as SortBamout { 95 | input: 96 | input_bam = HaplotypeCallerGATK4.bamout, 97 | output_bam_basename = final_vcf_base_name, 98 | preemptible_tries = agg_preemptible_tries, 99 | compression_level = 2 100 | } 101 | } 102 | } 103 | 104 | File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, HaplotypeCallerGATK4.output_vcf]) 105 | File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, HaplotypeCallerGATK4.output_vcf_index]) 106 | } 107 | 108 | # Combine by-interval (g)VCFs into a single sample (g)VCF file 109 | String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" 110 | call Calling.MergeVCFs as MergeVCFs { 111 | input: 112 | input_vcfs = vcfs_to_merge, 113 | input_vcfs_indexes = vcf_indices_to_merge, 114 | output_vcf_name = final_vcf_base_name + merge_suffix, 115 | preemptible_tries = agg_preemptible_tries 116 | } 117 | 118 | if (make_bamout) { 119 | call MergeBamouts { 120 | input: 121 | bams = select_all(SortBamout.output_bam), 122 | output_base_name = final_vcf_base_name 123 | } 124 | } 125 | 126 | # Validate the (g)VCF output of HaplotypeCaller 127 | call QC.ValidateVCF as ValidateVCF { 128 | input: 129 | input_vcf = MergeVCFs.output_vcf, 130 | input_vcf_index = MergeVCFs.output_vcf_index, 131 | dbsnp_vcf = dbsnp_vcf, 132 | dbsnp_vcf_index = dbsnp_vcf_index, 133 | ref_fasta = ref_fasta, 134 | ref_fasta_index = ref_fasta_index, 135 | ref_dict = ref_dict, 136 | 
calling_interval_list = calling_interval_list, 137 | is_gvcf = make_gvcf, 138 | preemptible_tries = agg_preemptible_tries 139 | } 140 | 141 | # QC the (g)VCF 142 | call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { 143 | input: 144 | input_vcf = MergeVCFs.output_vcf, 145 | input_vcf_index = MergeVCFs.output_vcf_index, 146 | metrics_basename = final_vcf_base_name, 147 | dbsnp_vcf = dbsnp_vcf, 148 | dbsnp_vcf_index = dbsnp_vcf_index, 149 | ref_dict = ref_dict, 150 | evaluation_interval_list = evaluation_interval_list, 151 | is_gvcf = make_gvcf, 152 | preemptible_tries = agg_preemptible_tries 153 | } 154 | 155 | output { 156 | File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics 157 | File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics 158 | File output_vcf = MergeVCFs.output_vcf 159 | File output_vcf_index = MergeVCFs.output_vcf_index 160 | File? bamout = MergeBamouts.output_bam 161 | File? bamout_index = MergeBamouts.output_bam_index 162 | } 163 | } 164 | 165 | # This task is here because merging bamout files using Picard produces an error. 166 | task MergeBamouts { 167 | 168 | input { 169 | Array[File] bams 170 | String output_base_name 171 | } 172 | 173 | Int disk_size = ceil(size(bams, "GiB") * 2) + 10 174 | 175 | command { 176 | samtools merge ~{output_base_name}.bam ~{sep=" " bams} 177 | samtools index ~{output_base_name}.bam 178 | mv ~{output_base_name}.bam.bai ~{output_base_name}.bai 179 | } 180 | 181 | output { 182 | File output_bam = "~{output_base_name}.bam" 183 | File output_bam_index = "~{output_base_name}.bai" 184 | } 185 | 186 | runtime { 187 | docker: "biocontainers/samtools:1.3.1" 188 | memory: "4 GiB" 189 | disks: "local-disk ~{disk_size} HDD" 190 | preemptible: 3 191 | cpu: 1 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.gcs.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_index": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai", 3 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_dict": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict", 4 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | 14 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.gcs.inputs.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "HaplotypeCallerGVCF.ref_index": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta.fai", 3 | "HaplotypeCallerGVCF.ref_dict": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.dict", 4 | 
"HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "gs://genomics-in-the-cloud/v1/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "gs://genomics-in-the-cloud/v1/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "gs://genomics-in-the-cloud/v1/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | 14 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.local.inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_index": "book/data/germline/ref/ref.fasta.fai", 3 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_dict": "book/data/germline/ref/ref.dict", 4 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.java_opt": "-Xmx8G", 5 | "ScatterHaplotypeCallerGVCF.input_bam_index": "book/data/germline/bams/mother.bai", 6 | "ScatterHaplotypeCallerGVCF.input_bam": "book/data/germline/bams/mother.bam", 7 | "ScatterHaplotypeCallerGVCF.MergeVCFs.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0", 8 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.ref_fasta": "book/data/germline/ref/ref.fasta", 9 | "ScatterHaplotypeCallerGVCF.MergeVCFs.java_opt": "-Xmx8G", 10 | "ScatterHaplotypeCallerGVCF.intervals_list": "book/data/germline/intervals/snippet-intervals-min.list", 11 | "ScatterHaplotypeCallerGVCF.HaplotypeCallerGVCF.docker_image": "us.gcr.io/broad-gatk/gatk:4.1.3.0" 12 | } 13 | -------------------------------------------------------------------------------- /workflows/scatter-hc/scatter-haplotypecaller.wdl: -------------------------------------------------------------------------------- 1 | ## This workflow runs the HaplotypeCaller tool from GATK4 in GVCF mode 2 | ## on a single sample in BAM format. The execution of the HaplotypeCaller 3 | ## tool is parallelized using an intervals list file. The per-interval 4 | ## output GVCF files are then merged to produce a single GVCF file for 5 | ## the sample, which can then be used by the joint-discovery workflow 6 | ## according to the GATK Best Practices for germline short variant 7 | ## discovery. 
8 | 9 | version 1.0 10 | 11 | workflow ScatterHaplotypeCallerGVCF { 12 | 13 | input { 14 | File input_bam 15 | File input_bam_index 16 | File intervals_list 17 | } 18 | 19 | String output_basename = basename(input_bam, ".bam") 20 | 21 | Array[String] calling_intervals = read_lines(intervals_list) 22 | 23 | scatter(interval in calling_intervals) { 24 | call HaplotypeCallerGVCF { 25 | input: 26 | input_bam = input_bam, 27 | input_bam_index = input_bam_index, 28 | intervals = interval, 29 | gvcf_name = output_basename + ".scatter.g.vcf" 30 | } 31 | } 32 | call MergeVCFs { 33 | input: 34 | vcfs = HaplotypeCallerGVCF.output_gvcf, 35 | merged_vcf_name = output_basename + ".merged.g.vcf" 36 | } 37 | 38 | output { 39 | File output_gvcf = MergeVCFs.merged_vcf 40 | } 41 | } 42 | 43 | task HaplotypeCallerGVCF { 44 | 45 | input { 46 | String docker_image 47 | String java_opt 48 | 49 | File ref_fasta 50 | File ref_index 51 | File ref_dict 52 | File input_bam 53 | File input_bam_index 54 | String intervals 55 | String gvcf_name 56 | } 57 | 58 | command { 59 | gatk --java-options ${java_opt} HaplotypeCaller \ 60 | -R ${ref_fasta} \ 61 | -I ${input_bam} \ 62 | -O ${gvcf_name} \ 63 | -L ${intervals} \ 64 | -ERC GVCF 65 | } 66 | 67 | output { 68 | File output_gvcf = "${gvcf_name}" 69 | } 70 | 71 | runtime { 72 | docker: docker_image 73 | } 74 | } 75 | 76 | task MergeVCFs { 77 | 78 | input { 79 | String docker_image 80 | String java_opt 81 | 82 | Array[File] vcfs 83 | String merged_vcf_name 84 | } 85 | 86 | command { 87 | gatk --java-options ${java_opt} MergeVcfs \ 88 | -I ${sep=' -I ' vcfs} \ 89 | -O ${merged_vcf_name} 90 | } 91 | 92 | output { 93 | File merged_vcf = "${merged_vcf_name}" 94 | } 95 | 96 | runtime { 97 | docker: docker_image 98 | } 99 | } 100 | --------------------------------------------------------------------------------