├── pipelines_pylib ├── __init__.py ├── poller.py └── defaults.py ├── wdl_runner └── README.md ├── samtools ├── Dockerfile ├── cloud │ ├── samtools.yaml │ └── run_samtools.py ├── local │ ├── test_index.sh │ └── test_view.sh └── README.md ├── fastqc ├── Dockerfile ├── local │ └── test_fastqc.sh ├── cloud │ └── run_fastqc.py └── README.md ├── LICENSE ├── cwl_runner ├── cwl_shutdown.sh ├── README.md ├── cwl_runner.sh └── cwl_startup.sh ├── bioconductor ├── countOverlapsFromBAM.R ├── README.md └── run_bioconductor.py ├── tools ├── get_yaml_value.py ├── operations_util.sh └── poll.sh ├── set_vcf_sample_id ├── set_vcf_sample_id.py ├── process_vcfs.sh ├── cloud │ └── run_set_vcf_sample_id.py └── README.md ├── CONTRIBUTING.rst ├── README.md └── compress ├── run_compress.py └── README.md /pipelines_pylib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wdl_runner/README.md: -------------------------------------------------------------------------------- 1 | # Run a WDL workflow 2 | 3 | The wdl_runner example and monitoring tool are now part of the 4 | [OpenWDL github repository](https://github.com/openwdl/wdl) as 5 | part of the Cromwell-on-Google toolset. 6 | 7 | Please go to 8 | [cromwell_on_google/wdl_runner](https://github.com/openwdl/wdl/tree/master/runners/cromwell_on_google/wdl_runner). 9 | -------------------------------------------------------------------------------- /samtools/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file or at 5 | # https://developers.google.com/open-source/licenses/bsd 6 | 7 | FROM ubuntu 8 | MAINTAINER Matt Bookman 9 | 10 | # Update the aptitude cache, install samtools, and clean up 11 | # the local aptitude repository 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | RUN apt-get update && \ 14 | apt-get install -y samtools && \ 15 | apt-get clean 16 | -------------------------------------------------------------------------------- /fastqc/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file or at 5 | # https://developers.google.com/open-source/licenses/bsd 6 | 7 | FROM java:8 8 | MAINTAINER Matt Bookman 9 | 10 | # Download FASTQC unzip it and link it to /usr/bin 11 | RUN wget http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.4.zip && \ 12 | unzip fastqc_v0.11.4.zip && \ 13 | chmod +x FastQC/fastqc && \ 14 | cp -r FastQC /usr/share/ && \ 15 | ln -s /usr/share/FastQC/fastqc /usr/bin/ 16 | 17 | -------------------------------------------------------------------------------- /pipelines_pylib/poller.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | import time 10 | 11 | def poll(service, operation, poll_interval): 12 | """Poll a genomics operation until completion. 
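  Illustrative usage (a sketch only; it assumes an authorized v1alpha2 genomics
  service and an operation dict returned by a prior
  pipelines().run(...).execute() call, as in run_bioconductor.py):

    operation = poll(service, operation, poll_interval=20)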
13 | 14 | Args: 15 | service: genomics service endpoint 16 | operation: operation object for the operation to poll 17 | poll_interval: polling interval (in seconds). 18 | 19 | Returns: 20 | The operation object when it has been marked "done". 21 | """ 22 | 23 | print 24 | print "Polling for completion of operation" 25 | 26 | while not operation['done']: 27 | print "Operation not complete. Sleeping %d seconds" % (poll_interval) 28 | 29 | time.sleep(poll_interval) 30 | 31 | operation = service.operations().get(name=operation['name']).execute() 32 | 33 | print 34 | print "Operation complete" 35 | print 36 | return operation 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017, Google Inc. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | 14 | * Neither the name of Google Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /cwl_runner/cwl_shutdown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # cwl_shutdown.sh 10 | # 11 | # This is the shutdown script that runs on Compute Engine before the VM shuts down. 
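#
# It reads the operation id and output path from the VM metadata server, copies
# the run's stdout/stderr logs to the Cloud Storage output path, and, if the
# status file was left as RUNNING, marks the operation FAILED.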
12 | 13 | echo "$(date)" 14 | echo "Running shutdown script" 15 | 16 | readonly METADATA_URL="http://metadata.google.internal/computeMetadata/v1/instance" 17 | readonly METADATA_HEADERS="Metadata-Flavor: Google" 18 | readonly OUTPUT=$(curl "${METADATA_URL}/attributes/output" -H "${METADATA_HEADERS}") 19 | readonly OPERATION_ID=$(curl "${METADATA_URL}/attributes/operation-id" -H "${METADATA_HEADERS}") 20 | readonly STATUS_LOCAL="/tmp/status-${OPERATION_ID}.txt" 21 | 22 | readonly STDOUT=/tmp/stdout-${OPERATION_ID}.txt 23 | readonly STDERR=/tmp/stderr-${OPERATION_ID}.txt 24 | 25 | echo "$(date)" 26 | echo "Copying stdout and stderr to Cloud Storage" 27 | CMD="gsutil -m cp ${STDOUT} ${STDERR} ${OUTPUT}/" 28 | echo "${CMD}" 29 | ${CMD} 30 | 31 | # Typically shutdown will cause a running job to fail and status will be set to FAILED 32 | # In case the status is left as RUNNING, set it to FAILED 33 | STATUS="$(cat ${STATUS_LOCAL})" 34 | echo "Status ${STATUS}" 35 | 36 | if [[ "${STATUS}" == "RUNNING" ]]; then 37 | echo "Setting status to FAILED" 38 | echo "FAILED" > ${STATUS_LOCAL} 39 | STATUS_FILE=$(curl "${METADATA_URL}/attributes/status-file" -H "${METADATA_HEADERS}") 40 | gsutil cp ${STATUS_LOCAL} ${STATUS_FILE} 41 | fi 42 | -------------------------------------------------------------------------------- /pipelines_pylib/defaults.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # A hard-coded list of the current Compute Engine zones 10 | # If this needs to live on, then it should be replaced by a call to. 11 | # compute.zones().list(). 12 | _ZONES = [ 13 | "asia-east1-a", "asia-east1-b", "asia-east1-c", 14 | "europe-west1-b", "europe-west1-c", "europe-west1-d", 15 | "us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f", 16 | "us-east1-b", "us-east1-c", "us-east1-d", 17 | "us-west1-a", "us-west1-b", 18 | ] 19 | 20 | def get_zones(input_list): 21 | """Returns a list of zones based on any wildcard input. 22 | 23 | This function is intended to provide an easy method for producing a list 24 | of desired zones for a pipeline to run in. 25 | 26 | Currently the API default zone list is "any zone". The problem with 27 | "any zone" is that it may lead to incurring Cloud Storage egress charges. 28 | A user with a bucket in "US" (multi-region) would typically want to 29 | restrict pipelines to run in either us-central1 or us-east1. The user 30 | typically cares more about region than zone. 31 | 32 | This function allows for a simple short-hand such as: 33 | [ "us-*" ] 34 | [ "us-central1-*" ] 35 | 36 | These examples will expand out to the full list of US and us-central1 zones 37 | respectively. 38 | """ 39 | 40 | output_list = [] 41 | 42 | for zone in input_list: 43 | if zone.endswith("*"): 44 | prefix = zone[:-1] 45 | output_list.extend(filter(lambda z: z.startswith(prefix), _ZONES)) 46 | else: 47 | output_list.append(zone) 48 | 49 | return output_list 50 | -------------------------------------------------------------------------------- /samtools/cloud/samtools.yaml: -------------------------------------------------------------------------------- 1 | name: samtools 2 | description: Run samtools on one or more files 3 | 4 | # Define the resources needed for this pipeline. 
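# The zone list below keeps VMs in us-central1 and us-east1; running in the
# same multi-region as a US Cloud Storage bucket helps avoid egress charges
# when localizing inputs and delocalizing outputs.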
5 | resources: 6 | zones: 7 | - us-central1-a 8 | - us-central1-b 9 | - us-central1-c 10 | - us-central1-f 11 | - us-east1-b 12 | - us-east1-c 13 | - us-east1-d 14 | 15 | # Create a data disk that is attached to the VM and destroyed when the 16 | # pipeline terminates. 17 | disks: 18 | - name: datadisk 19 | autoDelete: True 20 | 21 | # Within the Docker container, specify a mount point for the disk. 22 | mountPoint: /mnt/data 23 | 24 | # Specify the Docker image to use along with the command 25 | docker: 26 | imageName: gcr.io/YOUR-PROJECT-ID/samtools 27 | 28 | # The Pipelines API will create the input directory when localizing files, 29 | # but does not create the output directory. 30 | cmd: > 31 | mkdir /mnt/data/output && 32 | find /mnt/data/input && 33 | for file in $(/bin/ls /mnt/data/input); do 34 | samtools index /mnt/data/input/${file} /mnt/data/output/${file}.bai; 35 | done 36 | 37 | # The Pipelines API currently supports GCS paths, along with patterns (globs), 38 | # but it doesn't directly support a list of files being passed as a single input 39 | # parameter ("gs://bucket/foo.bam gs://bucket/bar.bam"). 40 | inputParameters: 41 | - name: inputPath 42 | description: Cloud Storage path or pattern to input file(s) 43 | localCopy: 44 | path: input/ 45 | disk: datadisk 46 | 47 | # By specifying an outputParameter, we instruct the pipelines API to 48 | # copy /mnt/data/output/* to the Cloud Storage location specified in 49 | # the pipelineArgs (see below). 50 | outputParameters: 51 | - name: outputPath 52 | description: Cloud Storage path for where to samtools output 53 | localCopy: 54 | path: output/* 55 | disk: datadisk 56 | -------------------------------------------------------------------------------- /bioconductor/countOverlapsFromBAM.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # This example is adapted from Bioconductor Vignette: 10 | # https://bioconductor.org/packages/release/bioc/vignettes/BiocParallel/inst/doc/Introduction_To_BiocParallel.pdf 11 | 12 | library(GenomicAlignments) ## for GenomicRanges and readGAlignments() 13 | 14 | # Update this to choose a different genomic region 15 | # or modify this script to take a parameter for this value. 16 | gRanges = GRanges("MT", IRanges((1000:3999)*10, width=1000)) 17 | 18 | # These file paths are relative to the Docker container. Do not update them 19 | # without making the corresponding changes in the pipeline definition. 20 | bamFile = "input.bam" 21 | bamIndex = "input.bam.bai" 22 | outputFile = "overlapsCount.tsv" 23 | 24 | param = ScanBamParam(which=range(gRanges)) 25 | 26 | # Retrieve the BAM header information. This information is added to the output 27 | # so that if we run this pipeline on many different BAM files, we can 28 | # differentiate between results 29 | header = scanBamHeader(bamFile, index=bamIndex, param=param) 30 | 31 | # Retrieve the alignments overlapping the desired regions. 32 | gal <- readGAlignments(file = bamFile, index = bamIndex, param = param) 33 | 34 | # This just a simple sum, but a more elaborate analysis could occur here. 
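# countOverlaps(gRanges, gal) returns one count per region in gRanges (the
# number of alignments in gal that overlap that region); summing gives a
# single total across all of the tiled MT regions.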
35 | count = sum(countOverlaps(gRanges, gal)) 36 | 37 | # In this case our output is simply one tab-separated row of data, 38 | # but it could be a dataframe, an image, a serialized R object, etc... 39 | result = paste( 40 | paste(header$input.bam$text$`@RG`, 41 | collapse="\t"), 42 | count, 43 | sep="\t") 44 | 45 | write(result, file = outputFile) 46 | -------------------------------------------------------------------------------- /tools/get_yaml_value.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # get_yaml_value.py 10 | # 11 | # Utility script for extracting values from YAML. 12 | # This will typically be called from shell scripts, which typically 13 | # have fairly hacky ways of extracting values from YAML. 14 | # 15 | # An example usage would be where a shell script has YAML output from 16 | # a gcloud command for an genomics operation and needs to extract fields: 17 | # 18 | # OP=$(gcloud --format=yaml alpha genomics operations describe "${OP_ID}") 19 | # CTIME=$(python tools/get_yaml_value.py "${OP}" "metadata.createTime") 20 | # ETIME=$(python tools/get_yaml_value.py "${OP}" "metadata.endTime") 21 | # 22 | # Note that gcloud directly supports extracting fields, so the above could also 23 | # be: 24 | # 25 | # CTIME=$(gcloud alpha genomics operations describe "${OP_ID}" 26 | # --format='value(metadata.createTime)') 27 | # ETIME=$(gcloud alpha genomics operations describe "${OP_ID}" 28 | # --format='value(metadata.endTime)') 29 | # 30 | # but then requires an API calls to get each value. 31 | # 32 | # Note that if the value requested does not exist in the YAML, this script 33 | # exits with an error code (1). 34 | 35 | from __future__ import print_function 36 | 37 | import sys 38 | import yaml 39 | 40 | if len(sys.argv) != 3: 41 | print("Usage: %s [yaml] [field]" % sys.argv[0], file=sys.stderr) 42 | sys.exit(1) 43 | 44 | def main(yaml_string, field): 45 | data = yaml.load(yaml_string) 46 | 47 | # field is expected to be period-separated: foo.bar.baz 48 | fields = field.split('.') 49 | 50 | # Walk the list of fields and check that the key exists. 51 | curr = data 52 | for key in fields: 53 | if key in curr: 54 | curr = curr[key] 55 | else: 56 | sys.exit(1) 57 | 58 | print(curr) 59 | 60 | if __name__ == '__main__': 61 | main(sys.argv[1], sys.argv[2]) 62 | -------------------------------------------------------------------------------- /fastqc/local/test_fastqc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # test_fastqc.sh 10 | # 11 | # Simple test script to exercise the fastqc Docker image. 12 | # The Docker image is assumed to have been tagged with the user id 13 | # upon creation: 14 | # 15 | # docker build -t ${USER}/fastqc PATH/TO/pipelines-api-examples/fastqc/Dockerfile 16 | # 17 | # This test script will download a small BAM file and run "fastqc ". 
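#
# On success the host's test_mnt/output directory contains the FastQC report
# for the BAM (typically an *_fastqc.html report plus an *_fastqc.zip archive).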
18 | 19 | set -o nounset 20 | set -o errexit 21 | 22 | readonly DOCKER_IMAGE=${USER}/fastqc 23 | 24 | # Use the smallest BAM file in the 1000genomes data (26534 bytes) 25 | readonly TEST_INPUT_URI=http://storage.googleapis.com/genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 26 | 27 | readonly TEST_INPUT_FILENAME=$(basename ${TEST_INPUT_URI}) 28 | 29 | # Set up the host path locations 30 | readonly HOST_SCRATCH_DIR=$(pwd)/test_mnt 31 | 32 | # Set up the local (container) path locations 33 | readonly LOCAL_SCRATCH=/scratch 34 | readonly LOCAL_INPUT_NAME=${LOCAL_SCRATCH}/input/${TEST_INPUT_FILENAME} 35 | readonly LOCAL_OUTPUT_DIR=${LOCAL_SCRATCH}/output 36 | 37 | # 38 | # BEGIN MAIN EXECUTION 39 | # 40 | 41 | # Create the test input/output directories on the host 42 | rm -rf ${HOST_SCRATCH_DIR} 43 | mkdir -p ${HOST_SCRATCH_DIR}/input 44 | mkdir -p ${HOST_SCRATCH_DIR}/output 45 | 46 | # Pull down the test BAM file 47 | echo "Copying test file ${TEST_INPUT_FILENAME} to ${HOST_SCRATCH_DIR}" 48 | (cd ${HOST_SCRATCH_DIR}/input && curl -O ${TEST_INPUT_URI}) 49 | 50 | echo 51 | echo "Running fastqc index via docker" 52 | docker run --rm \ 53 | -v ${HOST_SCRATCH_DIR}:${LOCAL_SCRATCH} \ 54 | ${DOCKER_IMAGE} \ 55 | fastqc ${LOCAL_INPUT_NAME} --outdir=${LOCAL_OUTPUT_DIR} 56 | 57 | echo 58 | echo "Execution completed" 59 | 60 | echo 61 | echo "Scratch directory:" 62 | cd ${HOST_SCRATCH_DIR} && find 63 | 64 | -------------------------------------------------------------------------------- /tools/operations_util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # operations_util.sh 10 | 11 | # get_operation_value 12 | # 13 | # Request just the specified value of the operation 14 | function get_operation_value() { 15 | local operation_id="${1}" 16 | local field="${2}" 17 | 18 | gcloud alpha genomics operations describe ${operation_id} \ 19 | --format='value('${field}')' 20 | } 21 | readonly -f get_operation_value 22 | 23 | # get_operation_done_status 24 | # 25 | # Request just the value of the operation top-level "done" field. 26 | # Returns the value in all lower-case. 
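# (gcloud may print the boolean as "True"/"False"; normalizing to lower-case
# keeps string comparisons, such as the loop test in poll.sh, simple.)
#
# Illustrative usage:
#   if [[ "$(get_operation_done_status "${OPERATION_ID}")" == "true" ]]; then
#     echo "Operation finished"
#   fi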
27 | function get_operation_done_status() { 28 | local operation_id="${1}" 29 | 30 | gcloud alpha genomics operations describe ${operation_id} \ 31 | --format='value(done)' \ 32 | | tr 'A-Z' 'a-z' 33 | } 34 | readonly -f get_operation_done_status 35 | 36 | # get_operation_status 37 | # 38 | # Return basic status information about the pipeline: 39 | # 40 | # * done 41 | # * error 42 | # * metadata.events 43 | # * name 44 | # 45 | function get_operation_status() { 46 | local operation_id="${1}" 47 | 48 | gcloud alpha genomics operations describe ${operation_id} \ 49 | --format='yaml(done, error, metadata.events, name)' 50 | } 51 | readonly -f get_operation_status 52 | 53 | # get_operation_compute_resources 54 | # 55 | # Return the Compute Engine resources for the operation (if present) 56 | # 57 | function get_operation_compute_resources() { 58 | local operation_id="${1}" 59 | 60 | gcloud alpha genomics operations describe ${operation_id} \ 61 | --format='yaml(metadata.runtimeMetadata.computeEngine)' 62 | } 63 | readonly -f get_operation_compute_resources 64 | 65 | # get_operation_all 66 | # 67 | # Requests the full details of the operation in YAML format 68 | function get_operation_all() { 69 | local operation_id="${1}" 70 | 71 | gcloud alpha genomics operations describe ${operation_id} \ 72 | --format yaml 73 | } 74 | readonly -f get_operation_all 75 | 76 | -------------------------------------------------------------------------------- /samtools/local/test_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # test_index.sh 10 | # 11 | # Simple test script to exercise the samtools docker image. 12 | # The docker image is assumed to have been tagged with the user id 13 | # upon creation: 14 | # 15 | # docker build -t ${USER}/samtools PATH/TO/pipelines-api-examples/samtools/Dockerfile 16 | # 17 | # This test script will download a small BAM file and run "samtools index" 18 | # create a BAI file. 
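#
# On success the BAM index (input file name + ".bai") appears under
# test_mnt/output/ on the host after the container exits.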
19 | 20 | set -o nounset 21 | set -o errexit 22 | 23 | readonly DOCKER_IMAGE=${USER}/samtools 24 | 25 | # Use the smallest BAM file in the 1000genomes data (26534 bytes) 26 | readonly TEST_INPUT_URI=http://storage.googleapis.com/genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 27 | 28 | readonly TEST_INPUT_FILENAME=$(basename ${TEST_INPUT_URI}) 29 | readonly TEST_OUTPUT_FILENAME=${TEST_INPUT_FILENAME}.bai 30 | 31 | # Set up the host path locations 32 | readonly HOST_SCRATCH_DIR=$(pwd)/test_mnt 33 | readonly HOST_OUTPUT_FILENAME=${HOST_SCRATCH_DIR}/output/${TEST_OUTPUT_FILENAME} 34 | 35 | # Set up the local (container) path locations 36 | readonly LOCAL_SCRATCH=/scratch 37 | readonly LOCAL_INPUT_NAME=${LOCAL_SCRATCH}/input/${TEST_INPUT_FILENAME} 38 | readonly LOCAL_OUTPUT_NAME=${LOCAL_SCRATCH}/output/${TEST_OUTPUT_FILENAME} 39 | 40 | # 41 | # BEGIN MAIN EXECUTION 42 | # 43 | 44 | # Create the test input/output directories on the host 45 | rm -rf ${HOST_SCRATCH_DIR} 46 | mkdir -p ${HOST_SCRATCH_DIR}/input 47 | mkdir -p ${HOST_SCRATCH_DIR}/output 48 | 49 | # Pull down the test BAM file 50 | echo "Copying test file ${TEST_INPUT_FILENAME} to ${HOST_SCRATCH_DIR}" 51 | (cd ${HOST_SCRATCH_DIR}/input && curl -O ${TEST_INPUT_URI}) 52 | 53 | echo 54 | echo "Running samtools index via docker" 55 | docker run --rm \ 56 | -v ${HOST_SCRATCH_DIR}:${LOCAL_SCRATCH} \ 57 | ${DOCKER_IMAGE} \ 58 | samtools index ${LOCAL_INPUT_NAME} ${LOCAL_OUTPUT_NAME} 59 | 60 | echo 61 | echo "Execution completed" 62 | 63 | echo 64 | echo "Scratch directory:" 65 | cd ${HOST_SCRATCH_DIR} && find 66 | 67 | -------------------------------------------------------------------------------- /tools/poll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # poll.sh 10 | # 11 | # Polls the completion status of a Google Genomics operation until 12 | # the operation is completed. 13 | # 14 | # When the operation is marked as "done: true", the script emits a 15 | # brief status summary of the operation such that one can easily determine 16 | # whether the operation was successful. For example: 17 | # 18 | # done: true 19 | # metadata: 20 | # events: 21 | # - description: start 22 | # startTime: '2016-08-05T23:08:26.432090867Z' 23 | # - description: pulling-image 24 | # startTime: '2016-08-05T23:08:26.432154840Z' 25 | # - description: localizing-files 26 | # startTime: '2016-08-05T23:09:03.947223371Z' 27 | # - description: running-docker 28 | # startTime: '2016-08-05T23:09:03.947277516Z' 29 | # - description: delocalizing-files 30 | # startTime: '2016-08-06T00:26:22.863609038Z' 31 | # - description: ok 32 | # startTime: '2016-08-06T00:26:24.296178476Z' 33 | # name: operations/OPERATION-ID 34 | # 35 | # If an error has occurred, then the top-level "errors" object will be present. 
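#
# Example usage (poll every 30 seconds until the operation completes):
#
#   ./tools/poll.sh OPERATION-ID 30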
36 | # 37 | # To have the script emit the entire operation, set the environment variable: 38 | # 39 | # OUTPUT_LEVEL="verbose" 40 | 41 | set -o errexit 42 | set -o nounset 43 | 44 | readonly SCRIPT_DIR=$(dirname "${0}") 45 | 46 | # Bring in operation utility functions 47 | source ${SCRIPT_DIR}/operations_util.sh 48 | 49 | # MAIN 50 | 51 | # Check usage 52 | if [[ $# -ne 1 ]] && [[ $# -ne 2 ]]; then 53 | 2>&1 echo "Usage: $0 OPERATION-ID " 54 | exit 1 55 | fi 56 | 57 | # Extract command-line arguments 58 | readonly OPERATION_ID="${1}" 59 | readonly POLL_INTERVAL_SECONDS="${2:-60}" # Default 60 seconds between requests 60 | 61 | # Loop until operation complete 62 | while [[ $(get_operation_done_status "${OPERATION_ID}") == "false" ]]; do 63 | echo "Operation not complete. Sleeping ${POLL_INTERVAL_SECONDS} seconds" 64 | sleep ${POLL_INTERVAL_SECONDS} 65 | done 66 | 67 | # Emit the operation details 68 | echo 69 | echo "Operation complete" 70 | if [[ ${OUTPUT_LEVEL:-} == "verbose" ]]; then 71 | get_operation_all "${OPERATION_ID}" 72 | else 73 | get_operation_status "${OPERATION_ID}" 74 | fi 75 | 76 | -------------------------------------------------------------------------------- /samtools/local/test_view.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # test_view.sh 10 | # 11 | # Simple test script to exercise the samtools docker image. 12 | # The docker image is assumed to have been tagged with the user id 13 | # upon creation: 14 | # 15 | # docker build -t ${USER}/samtools PATH/TO/pipelines-api-examples/samtools/Dockerfile 16 | # 17 | # This test script will download a small BAM file and run "samtools view" 18 | # to dump out the reads. 
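#
# The converted SAM file is written to test_mnt/output/ on the host; the script
# finishes by printing the SAM line count and its first two records.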
19 | 20 | set -o nounset 21 | set -o errexit 22 | 23 | readonly DOCKER_IMAGE=${USER}/samtools 24 | 25 | # Use the smallest BAM file in the 1000genomes data (26534 bytes) 26 | readonly TEST_INPUT_URI=http://storage.googleapis.com/genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 27 | 28 | readonly TEST_INPUT_FILENAME=$(basename ${TEST_INPUT_URI}) 29 | readonly TEST_OUTPUT_FILENAME=${TEST_INPUT_FILENAME/%.bam/.sam} 30 | 31 | # Set up the host path locations 32 | readonly HOST_SCRATCH_DIR=$(pwd)/test_mnt 33 | readonly HOST_OUTPUT_FILENAME=${HOST_SCRATCH_DIR}/output/${TEST_OUTPUT_FILENAME} 34 | 35 | # Set up the local (container) path locations 36 | readonly LOCAL_SCRATCH=/scratch 37 | readonly LOCAL_INPUT_NAME=${LOCAL_SCRATCH}/input/${TEST_INPUT_FILENAME} 38 | readonly LOCAL_OUTPUT_NAME=${LOCAL_SCRATCH}/output/${TEST_OUTPUT_FILENAME} 39 | 40 | # Create the test input/output directories on the host 41 | rm -rf ${HOST_SCRATCH_DIR} 42 | mkdir -p ${HOST_SCRATCH_DIR}/input 43 | mkdir -p ${HOST_SCRATCH_DIR}/output 44 | 45 | # Pull down the test BAM file 46 | echo "Copying test file ${TEST_INPUT_FILENAME} to ${HOST_SCRATCH_DIR}" 47 | (cd ${HOST_SCRATCH_DIR}/input && curl -O ${TEST_INPUT_URI}) 48 | 49 | echo 50 | echo "Running samtools view via docker" 51 | docker run --rm \ 52 | -v ${HOST_SCRATCH_DIR}:${LOCAL_SCRATCH} \ 53 | ${DOCKER_IMAGE} \ 54 | samtools view ${LOCAL_INPUT_NAME} -o ${LOCAL_OUTPUT_NAME} 55 | 56 | echo 57 | echo "Execution completed" 58 | 59 | echo 60 | echo "Scratch directory:" 61 | cd ${HOST_SCRATCH_DIR} && find 62 | 63 | echo 64 | echo "Output file is $(cat ${HOST_OUTPUT_FILENAME} | wc -l) lines" 65 | echo "First 2 lines:" 66 | head -n 2 ${HOST_OUTPUT_FILENAME} 67 | 68 | -------------------------------------------------------------------------------- /set_vcf_sample_id/set_vcf_sample_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # set_vcf_sample_id.py 10 | # 11 | # This script processes a single sample VCF file and replaces the 12 | # sample ID in the header line. 13 | # 14 | # This could be replaced (almost) with a one-line sed script: 15 | # 16 | # sed -e 's/\(^#CHROM\t.*\t\)original$/\1new/' \ 17 | # 18 | # What this script adds is a little more control, notably with error 19 | # handling. sed will not report the number of changes, so to determine 20 | # if a change was made, you'd need to make a second pass over the file. 21 | # 22 | # This script reads from stdin and writes to stdout. 23 | # 24 | # Usage: 25 | # python set_vcf_sample_id.py original_id new_id 26 | # 27 | # If the original_id is specified, it will be verified before making the change. 28 | # If the original_id is set to "", verification will be skipped. 
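#
# Illustrative invocation (hypothetical file names; the sample IDs match the
# example in process_vcfs.sh):
#
#   cat SAMPLE-001.vcf \
#     | python set_vcf_sample_id.py "SAMPLE-001" "SAMPLE-001-TEST-01" \
#     > SAMPLE-001-TEST-01.vcf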
29 | 30 | import sys 31 | 32 | def main(): 33 | """Entry point to the script.""" 34 | 35 | if len(sys.argv) != 3: 36 | print >> sys.stderr, "Usage: %s original_id new_id" % sys.argv[0] 37 | sys.exit(1) 38 | 39 | original_id = sys.argv[1] 40 | new_id = sys.argv[2] 41 | 42 | lines_processed = 0 43 | lines_changed = 0 44 | for line in sys.stdin: 45 | lines_processed = lines_processed + 1 46 | # Only line we care about is the #^CHROM line 47 | if line.startswith('#CHROM\t'): 48 | fields = line.rstrip('\n').split('\t') 49 | 50 | # If an "original_id" was specified, verify that is what is in the file 51 | if original_id: 52 | curr_id = fields[-1] 53 | if curr_id != original_id: 54 | print >> sys.stderr, \ 55 | "ERROR: Current sample ID does not match expected: %s != %s\n" % ( 56 | curr_id, original_id) 57 | sys.exit(1) 58 | 59 | # Set the new value into the fields array and recreate the line 60 | fields[-1] = new_id 61 | line = '\t'.join(fields) + '\n' 62 | 63 | lines_changed = lines_changed + 1 64 | 65 | # Emit the current line 66 | sys.stdout.write(line) 67 | 68 | # Emit some statistics to stderr 69 | print >> sys.stderr, "Total lines: %d" % lines_processed 70 | print >> sys.stderr, "Changed lines: %d" % lines_changed 71 | 72 | if lines_changed != 1: 73 | print >> sys.stderr, "ERROR: Changed lines is not 1" 74 | sys.exit(1) 75 | 76 | if __name__ == "__main__": 77 | main() 78 | 79 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | How to contribute 2 | =================================== 3 | 4 | First of all, thank you for contributing! 5 | 6 | The mailing list 7 | ---------------- 8 | 9 | For general questions or if you are having trouble getting started, try the 10 | `Google Genomics Discuss mailing list `_. 11 | It's a good way to sync up with other people who use googlegenomics including the core developers. You can subscribe 12 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 13 | the `web forum page `_. 14 | 15 | 16 | Submitting issues 17 | ----------------- 18 | 19 | If you are encountering a bug in the code or have a feature request in mind - file away! 20 | 21 | 22 | Submitting a pull request 23 | ------------------------- 24 | 25 | If you are ready to contribute code, Github provides a nice `overview on how to create a pull request 26 | `_. 27 | 28 | Some general rules to follow: 29 | 30 | * Do your work in `a fork `_ of this repo. 31 | * Create a branch for each update that you're working on. 32 | These branches are often called "feature" or "topic" branches. Any changes 33 | that you push to your feature branch will automatically be shown in the pull request. 34 | * Keep your pull requests as small as possible. Large pull requests are hard to review. 35 | Try to break up your changes into self-contained and incremental pull requests. 36 | * The first line of commit messages should be a short (<80 character) summary, 37 | followed by an empty line and then any details that you want to share about the commit. 38 | * Please try to follow the existing syntax style 39 | 40 | When you submit or change your pull request, the Travis build system will automatically run tests. 41 | If your pull request fails to pass tests, review the test log, make changes and 42 | then push them to your feature branch to be tested again. 
43 | 44 | 45 | Contributor License Agreements 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | All pull requests are welcome. Before we can submit them though, there is a legal hurdle we have to jump. 49 | You'll need to fill out either the individual or corporate Contributor License Agreement 50 | (CLA). 51 | 52 | * If you are an individual writing original source code and you're sure you 53 | own the intellectual property, then you'll need to sign an `individual CLA 54 | `_. 55 | * If you work for a company that wants to allow you to contribute your work, 56 | then you'll need to sign a `corporate CLA 57 | `_. 58 | 59 | Follow either of the two links above to access the appropriate CLA and 60 | instructions for how to sign and return it. Once we receive it, we'll be able to 61 | accept your pull requests. 62 | -------------------------------------------------------------------------------- /bioconductor/README.md: -------------------------------------------------------------------------------- 1 | # Use Bioconductor to count overlaps in a BAM file 2 | 3 | This example counts the number of reads overlapping a particular region in the genome. 4 | 5 | It uses: 6 | 7 | * a [Bioconductor Docker container](https://bioconductor.org/help/docker/) 8 | * an [R script](./countOverlapsFromBAM.R) 9 | * and BAM file and index. 10 | 11 | It emits a one-line TSV file containing the count of overlapping reads and metadata identifying the sample from which the result was obtained. 12 | 13 | This simplistic example could be extended to: 14 | 15 | * Run a more interesting Bioconductor analysis on the BAM file. 16 | * Loop over, for example, all the 1000 Genomes phase 3 BAMs in [gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/\*/high_coverage_alignment/\*.bam](https://console.cloud.google.com/storage/browser/genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/high_coverage_alignment/), kicking off the parallel execution of this pipeline on each sample and emitting a distinct output file for each result. 17 | 18 | ## (1) Fetch the Docker container. 19 | ``` 20 | docker pull bioconductor/release_core 21 | ``` 22 | ## (2) Test the pipeline locally. 23 | ``` 24 | wget -O input.bam \ 25 | ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 26 | 27 | wget -O input.bam.bai \ 28 | ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam.bai 29 | 30 | docker run -v `pwd`:/mnt/data bioconductor/release_core \ 31 | /bin/bash -c "cd /mnt/data/ ; R CMD BATCH countOverlapsFromBAM.R" 32 | ``` 33 | 34 | The result should a one-line TSV file and also the log from the R batch command: 35 | ``` 36 | ~$ cat overlapsCount.tsv 37 | ID:SRR003486 PL:ILLUMINA LB:Solexa-5005 PI:0 DS:SRP000033 SM:NA06986 CN:BI 20626 38 | 39 | ~$ cat countOverlapsFromBAM.Rout 40 | 41 | R version 3.2.3 RC (2015-12-03 r69731) -- "Wooden Christmas-Tree" 42 | Copyright (C) 2015 The R Foundation for Statistical Computing 43 | Platform: x86_64-pc-linux-gnu (64-bit) 44 | . . . 45 | ``` 46 | ## (3) Upload the script to Cloud Storage. 47 | ``` 48 | gsutil cp countOverlapsFromBAM.R gs://YOUR-BUCKET/pipelines-api-examples/bioconductor/script.R 49 | ``` 50 | ## (4) Run the pipeline on the cloud. 
51 | Edit your copy of [run_bioconductor.py](./run_bioconductor.py) to specify: 52 | 53 | 1. The id of the Google Cloud Platform project in which the pipeline should run. 54 | 1. The bucket in which the pipeline output file and log files should be placed. 55 | 56 | And then run the script: 57 | ``` 58 | python ./run_bioconductor.py 59 | ``` 60 | 61 | It will emit the operation id and poll for completion. 62 | 63 | ## (5) View the resultant files. 64 | Navigate to your bucket in the [Cloud Console](https://console.cloud.google.com/project/_/storage) to see the resultant TSV file and log files for the operation. 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pipelines-api-examples 2 | 3 | This repository contains examples for the [Google Genomics Pipelines API] 4 | (https://cloud.google.com/genomics/reference/rest/v1alpha2/pipelines). 5 | 6 | | Alpha | 7 | |-------| 8 | | This is an Alpha release of Google Genomics API. This feature might be changed in backward-incompatible ways and is not recommended for production use. It is not subject to any SLA or deprecation policy. | 9 | 10 | The API provides an easy way to create, run, and monitor command-line tools 11 | on Google Compute Engine running in a Docker container. You can use it like 12 | you would a job scheduler. 13 | 14 | The most common use case is to run an off-the-shelf tool or custom script 15 | that reads and writes files. You may want to run such a tool over files in 16 | Google Cloud Storage. You may want to run this independently over hundreds 17 | or thousands of files. 18 | 19 | The typical flow for a pipeline is: 20 | 21 | 1. Create a Compute Engine virtual machine 22 | 1. Copy one or more files from Cloud Storage to a disk 23 | 1. Run the tool on the file(s) 24 | 1. Copy the output to Cloud Storage 25 | 1. Destroy the Compute Engine virtual machine 26 | 27 | You can submit batch operations from your laptop, and have them run in the cloud. 28 | You can do the packaging to Docker yourself, or use existing Docker images. 29 | 30 | ## Prerequisites 31 | 32 | 1. Clone or fork this repository. 33 | 1. If you plan to create your own Docker images, then install docker: https://docs.docker.com/engine/installation/#installation 34 | 1. Follow the Google Genomics [getting started instructions](https://cloud.google.com/genomics/install-genomics-tools#create-project-and-authenticate) to set up your Google Cloud Project. The Pipelines API requires that the following are enabled in your project: 35 | 1. [Genomics API](https://console.cloud.google.com/project/_/apis/api/genomics) 36 | 2. [Cloud Storage API](https://console.cloud.google.com/project/_/apis/api/storage_api) 37 | 3. [Compute Engine API](https://console.cloud.google.com/project/_/apis/api/compute_component) 38 | 1. Follow the Google Genomics [getting started instructions](https://cloud.google.com/genomics/install-genomics-tools#install-genomics-tools) to install and authorize the Google Cloud SDK. 39 | 1. Install or update the python client via `pip install --upgrade google-api-python-client`. For more detail see https://cloud.google.com/genomics/v1/libraries. 
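To sanity-check the installation, you can build the same `v1alpha2` genomics client that the example scripts use. This is a minimal sketch; it assumes the Cloud SDK has been authorized with application default credentials as described above:

```
from oauth2client.client import GoogleCredentials
from apiclient.discovery import build

# Build an authorized client for the v1alpha2 Genomics (Pipelines) API.
# If this runs without error, the client library and credentials are set up.
credentials = GoogleCredentials.get_application_default()
service = build('genomics', 'v1alpha2', credentials=credentials)
```

The examples then call the `pipelines()` and `operations()` collections of this service to submit and monitor runs.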
40 | 41 | ## Examples 42 | 43 | * [Compress or Decompress files](./compress) 44 | * [Run FastQC over a list of BAM or FASTQ files](./fastqc) 45 | * [Use samtools to create a BAM index file](./samtools) 46 | * [Use a custom script in Cloud Storage to update a VCF header](./set_vcf_sample_id) 47 | * [Use Bioconductor to count overlaps in a BAM file](./bioconductor) 48 | * [Use Cromwell and WDL to orchestrate a multi-stage workflow](./wdl_runner) 49 | 50 | ## See Also 51 | 52 | * [Pipelines API docs](https://cloud.google.com/genomics/reference/rest/v1alpha2/pipelines) 53 | * [PyDocs](https://developers.google.com/resources/api-libraries/documentation/genomics/v1alpha2/python/latest/) 54 | -------------------------------------------------------------------------------- /set_vcf_sample_id/process_vcfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # process_vcfs.sh 10 | # 11 | # Simple shell script which can be used to change the "sample ID" in 12 | # a single-sample VCF file. For example, suppose your VCF "header line" 13 | # looks like: 14 | # 15 | # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE-001 16 | # 17 | # but you want it to look like: 18 | # 19 | # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE-001-TEST-01 20 | # 21 | # and you have note just a single VCF, but many (maybe one VCF per chromosome). 22 | # 23 | # This script can be run on a list of VCFs to update the header. 24 | # The VCFs can be uncompressed or compressed with gzip or bzip2. 25 | # If the input VCFs are compressed, then the output VCFs will be too. 26 | # 27 | # ** Note that this script will delete the input VCF from the local disk. ** 28 | # ** This script is intended to be run as part of a Pipeline on a VM in ** 29 | # ** the cloud. 
Deleting the local copy of the input file allows for the ** 30 | # ** disk to be sized at less than 2x all of the input VCF files, namely: ** 31 | # ** ** 32 | # ** disk size ~= 2*size(largest uncompressed VCF) ** 33 | # ** + size(remaining VCFs) ** 34 | # ** ** 35 | 36 | set -o errexit 37 | set -o nounset 38 | 39 | # Usage: 40 | # ./process_vcfs.sh \ 41 | # [original_sample_id] \ 42 | # [new_sample_id] \ 43 | # [input_path] \ 44 | # [output_path] 45 | # 46 | # original_sample_id: If set to a non-empty string, the sample ID in the 47 | # input VCF header will be verified before update 48 | # new_sample_id: Set to the new sample ID 49 | # input_path: on-disk directory or pattern of input VCF files 50 | # output_path: on-disk directory to copy output VCF files 51 | 52 | readonly ORIG_SAMPLE_ID="${1}" 53 | readonly NEW_SAMPLE_ID="${2}" 54 | readonly INPUT_PATH="${3%/}" # Trim trailing slash (if any) 55 | readonly OUTPUT_PATH="${4%/}" # Trim trailing slash (if any) 56 | 57 | function log() { 58 | echo "${1}" 59 | } 60 | readonly -f log 61 | 62 | # Dump out some details for of the script parameters and environment 63 | 64 | log "BEGIN: log runtime details:" 65 | log "Original sample id: ${ORIG_SAMPLE_ID}" 66 | log "New sample id: ${NEW_SAMPLE_ID}" 67 | log "Input path: ${INPUT_PATH}" 68 | log "Output path: ${OUTPUT_PATH}" 69 | 70 | log "find /mnt" 71 | find /mnt 72 | 73 | log "df -k -h" 74 | df -k -h 75 | 76 | log "env" 77 | env 78 | log "END: log runtime details" 79 | 80 | # Process the input files 81 | 82 | declare -i COUNT=0 83 | declare -i SKIPPED=0 84 | declare -i UPDATED=0 85 | 86 | readonly START=$(date +%s) 87 | for FILE in ${INPUT_PATH}; do 88 | # Check if the input file is compressed. 89 | # We'll need to decompress it for processing and then compress the output. 
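  # The case statement below records which codec was used (gz, bz2, or none)
  # so that the matching compressor can be re-applied to the output file.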
90 | COMPRESSION="" 91 | case "${FILE}" in 92 | *.gz) 93 | COMPRESSION=gz 94 | gunzip ${FILE} 95 | FILE=${FILE%.gz} 96 | ;; 97 | 98 | *.bz2) 99 | COMPRESSION=bz2 100 | bunzip2 ${FILE} 101 | FILE=${FILE%.bz2} 102 | ;; 103 | esac 104 | 105 | INPUT_DIR=$(dirname ${FILE}) 106 | FILE_NAME=$(basename ${FILE}) 107 | 108 | log "Updating header for file ${FILE_NAME}" 109 | cat ${FILE} | 110 | python \ 111 | "${SCRIPT_DIR}/set_vcf_sample_id.py" \ 112 | "${ORIG_SAMPLE_ID}" "${NEW_SAMPLE_ID}" \ 113 | > ${OUTPUT_PATH}/${FILE_NAME} 114 | 115 | # To minimize disk usage, remove the input file now 116 | rm -f ${FILE} 117 | 118 | UPDATED=$((UPDATED + 1)) 119 | 120 | # Compress the output file if the input was compressed 121 | case "${COMPRESSION}" in 122 | gz) 123 | gzip ${OUTPUT_PATH}/${FILE_NAME} 124 | ;; 125 | bz2) 126 | bzip2 ${OUTPUT_PATH}/${FILE_NAME} 127 | ;; 128 | esac 129 | 130 | COUNT=$((COUNT + 1)) 131 | done 132 | readonly END=$(date +%s) 133 | 134 | log "" 135 | log "Updated: ${UPDATED}" 136 | log "Skipped: ${SKIPPED}" 137 | log "Total: ${COUNT} files processed in $((END-START)) seconds" 138 | 139 | -------------------------------------------------------------------------------- /cwl_runner/README.md: -------------------------------------------------------------------------------- 1 | # Run a CWL workflow 2 | 3 | ## This example demonstrates running a multi-stage workflow on Google Cloud Platform 4 | 5 | * The workflow is launched with a bash script, [cwl_runner.sh](cwl_runner.sh), that calls the gcloud command-line tool that is included in the [Google Cloud SDK](https://cloud.google.com/sdk) 6 | * The workflow is defined using the [Common Workflow Language (CWL)](http://www.commonwl.org) 7 | * The workflow stages are orchestrated using [cwltool](https://github.com/common-workflow-language/cwltool/tree/master/cwltool) or [rabix](https://github.com/rabix/bunny). 8 | 9 | To run a CWL workflow, `cwl_runner.sh` will: 10 | 11 | 1. Create a disk 12 | 1. Create a Compute Engine VM with that disk 13 | 1. Run a startup script on the VM 14 | 15 | The startup script, [cwl_startup.sh](cwl_startup.sh), will run on the VM and: 16 | 17 | 1. Mount and format the disk 18 | 1. Download input files from Google Cloud Storage 19 | 1. Install Docker 20 | 1. Install cwltool 21 | 1. Run the CWL workflow and wait until completion 22 | 1. Copy output files to Google Cloud Storage 23 | 1. Copy stdout and stderr logs to Google Cloud Storage 24 | 1. Shutdown and delete the VM and disk 25 | 26 | __Note that the CWL runner does not use the [Pipelines API](https://cloud.google.com/genomics/reference/rest/v1alpha2/pipelines). If you don't have enough quota, the script will fail; it won't be queued to run when quota is available.__ 27 | 28 | ## Prerequisites 29 | 30 | 1. Download the required script files, `cwl_runner.sh`, `cwl_startup.sh` and `cwl_shutdown.sh`, or, if you prefer, clone or fork this github repository. 31 | 1. Enable the Genomics, Cloud Storage, and Compute Engine APIs on a new or existing Google Cloud Project using the [Cloud Console](https://console.cloud.google.com/flows/enableapi?apiid=storage_component,compute_component&redirect=https://console.cloud.google.com) 32 | 1. Install and initialize the [Google Cloud SDK](https://cloud.google.com/sdk). 33 | 1. 
Follow the Cloud Storage instructions for [Creating Storage Buckets](https://cloud.google.com/storage/docs/creating-buckets) to create a bucket to store workflow output and logging 34 | 35 | ## Running a sample workflow in the cloud 36 | 37 | This script should be able to support any CWL workflow supported by cwltool. 38 | 39 | You can run the script with `--help` to see all of the command-line options. 40 | 41 | ``` 42 | ./cwl_runner.sh --help 43 | ``` 44 | 45 | As an example, let's run a real workflow from the [Genome Data Commons](https://gdc.cancer.gov) stored in a [GDC github project](https://github.com/nci-gdc/gdc-dnaseq-cwl). 46 | 47 | This particular workflow requires: 48 | 49 | * a reference genome bundle 50 | * a DNA reads file in BAM format 51 | * several CWL tool definitions 52 | 53 | All of the required files have already been copied into Google Cloud Storage (at gs://genomics-public-data/cwl-examples/gdc-dnaseq-cwl), so we can just reference them when we run the CWL workflow. 54 | 55 | Here's an example command-line: 56 | 57 | ``` 58 | ./cwl_runner.sh \ 59 | --workflow-file gs://genomics-public-data/cwl-examples/gdc-dnaseq-cwl/workflows/dnaseq/transform.cwl \ 60 | --settings-file gs://genomics-public-data/cwl-examples/gdc-dnaseq-cwl/input/gdc-dnaseq-input.json \ 61 | --input-recursive gs://genomics-public-data/cwl-examples/gdc-dnaseq-cwl \ 62 | --output gs://MY-BUCKET/MY-PATH \ 63 | --machine-type n1-standard-4 64 | ``` 65 | 66 | Set `MY-BUCKET/MY-PATH` to a path in a Cloud Storage bucket that you have write access to. 67 | 68 | The workflow will start running. If all goes well, it should complete in a couple of hours. 69 | 70 | Here's some more information about what's happening: 71 | 72 | * The command will run the CWL workflow definition located at the `workflow-file` path in Cloud Storage, using the workflow settings in the `settings-file`. 73 | * All path parameters defined in the `settings-file` are relative to the location of the file. 74 | * A reference genome is required as input; the reference genome files are identified by the `input` wildcard path. 75 | * This particular GDC workflow uses lots of relative paths to the definition files for the individual workflow steps. In order to preserve relative paths, the GDC directory is recursively copied from the path passed to `input-recursive`. 76 | * Output files and logs will be written to the `output` folder. 77 | * The whole workflow will run on a single VM instance of the specified `machine-type`. 78 | 79 | ## Monitoring your workflow 80 | 81 | Once your job starts, it will have an `OPERATION-ID` assigned, which you can use to check status and find the VM and disk in your cloud project. 82 | 83 | To monitor your job, check the status to see if it's RUNNING, COMPLETED, or FAILED: 84 | ``` 85 | gsutil cat gs://MY-BUCKET/MY-PATH/status-OPERATION-ID.txt 86 | ``` 87 | 88 | While your job is running, you can see the VM in the [Cloud Console](https://console.cloud.google.com/compute/instances) and command-line. When the job completes, the VM will no longer be found unless `--keep-alive` is set. Command-line: 89 | 90 | ``` 91 | gcloud compute instances describe cwl-vm-OPERATION-ID 92 | ``` 93 | 94 | ## Canceling a job 95 | 96 | To cancel a running job, you can terminate the VM from the cloud console or command-line: 97 | ``` 98 | gcloud compute instances delete cwl-vm-OPERATION-ID 99 | ``` 100 | 101 | ## Debugging a job 102 | 103 | To debug a failed run, look at the log files in your output directory. 
104 | 105 | Cloud console: 106 | ``` 107 | https://console.cloud.google.com/storage/browser 108 | ``` 109 | 110 | Command-line: 111 | ``` 112 | gsutil cat gs://MY-BUCKET/MY-PATH/stderr-OPERATION-ID.txt | less 113 | gsutil cat gs://MY-BUCKET/MY-PATH/stdout-OPERATION-ID.txt | less 114 | ``` 115 | 116 | For additional debugging, you can rerun this script with --keep-alive and ssh into the VM. 117 | If you use --keep-alive, you will need to manually delete the VM to avoid charges. 118 | -------------------------------------------------------------------------------- /cwl_runner/cwl_runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # cwl_runner.sh 10 | # 11 | # From your shell prompt, launch a Google Compute Engine VM to run 12 | # a Common Workflow Language (CWL) workflow with cwltool. 13 | 14 | declare WORKFLOW_PATH= 15 | declare SETTINGS_PATH= 16 | declare INPUT= 17 | declare INPUT_RECURSIVE= 18 | declare OUTPUT= 19 | declare KEEP_ALIVE= 20 | declare DISK_SIZE=200 21 | declare MACHINE_TYPE="n1-standard-1" 22 | declare PREEMPTIBLE= 23 | declare RUNNER="cwltool" 24 | declare ZONE= 25 | readonly OPERATION_ID=$$ 26 | 27 | read -r -d '' HELP_MESSAGE << EOM 28 | 29 | USAGE: $0 [args] 30 | 31 | -h --help 32 | Show help message and exit 33 | 34 | Common options: 35 | -w --workflow-file PATH 36 | REQUIRED. The absolute path to the .cwl workflow definition file in Cloud Storage. 37 | -s --settings-file PATH 38 | REQUIRED. The absolute path to the .json settings file in Cloud Storage. 39 | -i --input GCS_PATH1[,GCS_PATH2,...] 40 | The absolute path(s) in Cloud Storage to the file(s) that must be copied to the VM's local disk. 41 | -I --input-recursive GCS_PATH1[,GCS_PATH2,...] 42 | The absolute path(s) in Cloud Storage to the folder(s) that must be copied to the VM's local disk. 43 | -m --machine-type STRING 44 | The Google Compute Engine VM machine type name. Default: ${MACHINE_TYPE}. 45 | -o --output GCS_PATH 46 | REQUIRED. The path where CWL outputs and logs will be copied after the workflow completes. 47 | 48 | Other options: 49 | -d --disk-size INT 50 | The disk size in Gb. Default: ${DISK_SIZE}. 51 | -k --keep-alive 52 | Leave the VM running after the workflow completes or fails so that you can ssh in for debugging. 53 | -p --preemptible 54 | Run with a preemptible VM that costs less but may be terminated before finishing. 55 | -r --runner STRING 56 | The CWL runner to use. Values can be "cwltool" or "rabix". Default: ${RUNNER}. 57 | -z --zone STRING 58 | The zone to launch the VM and disk in. If omitted, your default project zone will be used. 
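
Example (required arguments only; bucket and file names are placeholders):
  $0 -w gs://MY-BUCKET/workflow.cwl -s gs://MY-BUCKET/settings.json -o gs://MY-BUCKET/MY-PATH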
59 | 60 | EOM 61 | 62 | set -o errexit 63 | set -o nounset 64 | 65 | # Parse command-line 66 | while [[ $# -gt 0 ]]; do 67 | KEY="$1" 68 | 69 | case ${KEY} in 70 | -h|--help) 71 | echo "${HELP_MESSAGE}" 72 | exit 1 73 | ;; 74 | -w|--workflow-file) 75 | WORKFLOW_FILE="$2" 76 | shift 77 | ;; 78 | -s|--settings-file) 79 | SETTINGS_FILE="$2" 80 | shift 81 | ;; 82 | -i|--input) 83 | INPUT="$2" 84 | shift 85 | ;; 86 | -I|--input-recursive) 87 | INPUT_RECURSIVE="$2" 88 | shift 89 | ;; 90 | -o|--output) 91 | OUTPUT="$2" 92 | shift 93 | ;; 94 | -d|--disk-size) 95 | DISK_SIZE="$2" 96 | shift 97 | ;; 98 | -k|--keep-alive) 99 | KEEP_ALIVE="true" 100 | ;; 101 | -m|--machine-type) 102 | MACHINE_TYPE="$2" 103 | shift 104 | ;; 105 | -p|--preemptible) 106 | PREEMPTIBLE="--preemptible" 107 | ;; 108 | -z|--zone) 109 | ZONE="--zone $2" 110 | shift 111 | ;; 112 | -r|--runner) 113 | RUNNER="$2" 114 | shift 115 | ;; 116 | *) 117 | # unknown option 118 | ;; 119 | esac 120 | shift 121 | done 122 | 123 | if [[ -z "${WORKFLOW_FILE}" || -z "${SETTINGS_FILE}" || -z "${OUTPUT}" ]]; then 124 | >&2 echo "Error: Workflow file, settings file, and output are required." 125 | exit 1 126 | fi 127 | 128 | readonly DISK_NAME="cwl-disk-${OPERATION_ID}" 129 | readonly DISK_CMD="gcloud compute disks create ${DISK_NAME} ${ZONE} --size ${DISK_SIZE}" 130 | 131 | readonly SCRIPT_DIR="$( cd $( dirname ${BASH_SOURCE[0]} ) && pwd )" 132 | 133 | readonly STARTUP_SCRIPT_NAME="cwl_startup.sh" 134 | readonly STARTUP_SCRIPT="${SCRIPT_DIR}/${STARTUP_SCRIPT_NAME}" 135 | readonly STARTUP_SCRIPT_URL="${OUTPUT}/${STARTUP_SCRIPT_NAME%.*}-${OPERATION_ID}.sh" 136 | 137 | readonly SHUTDOWN_SCRIPT_NAME="cwl_shutdown.sh" 138 | readonly SHUTDOWN_SCRIPT="${SCRIPT_DIR}/${SHUTDOWN_SCRIPT_NAME}" 139 | readonly SHUTDOWN_SCRIPT_URL="${OUTPUT}/${SHUTDOWN_SCRIPT_NAME%.*}-${OPERATION_ID}.sh" 140 | 141 | readonly STATUS_FILE="${OUTPUT}/status-${OPERATION_ID}.txt" 142 | 143 | readonly VM_NAME="cwl-vm-${OPERATION_ID}" 144 | readonly VM_CMD="gcloud compute instances create ${VM_NAME} \ 145 | --disk name=${DISK_NAME},device-name=${DISK_NAME},auto-delete=yes \ 146 | --machine-type ${MACHINE_TYPE} \ 147 | --scopes storage-rw,compute-rw \ 148 | ${ZONE} \ 149 | ${PREEMPTIBLE} \ 150 | --metadata \ 151 | startup-script-url=${STARTUP_SCRIPT_URL},\ 152 | shutdown-script-url=${SHUTDOWN_SCRIPT_URL},\ 153 | operation-id=${OPERATION_ID},\ 154 | workflow-file=${WORKFLOW_FILE},\ 155 | settings-file=${SETTINGS_FILE},\ 156 | input=\"${INPUT}\",\ 157 | input-recursive=\"${INPUT_RECURSIVE}\",\ 158 | output=${OUTPUT},\ 159 | runner=${RUNNER},\ 160 | status-file=${STATUS_FILE},\ 161 | keep-alive=${KEEP_ALIVE}" 162 | 163 | >&2 echo $(date) 164 | >&2 echo "Generating script commands and writing to file" 165 | readonly TMP_SCRIPT=".$(basename ${0%.*} )-${OPERATION_ID}.sh" 166 | cat > "${TMP_SCRIPT}" << EOF 167 | #!/bin/bash 168 | >&2 ${DISK_CMD} 169 | >&2 ${VM_CMD} 170 | echo ${OPERATION_ID} 171 | EOF 172 | 173 | >&2 echo "Copying scripts to the output path in Cloud Storage" 174 | gsutil cp "${STARTUP_SCRIPT}" "${STARTUP_SCRIPT_URL}" 175 | gsutil cp "${SHUTDOWN_SCRIPT}" "${SHUTDOWN_SCRIPT_URL}" 176 | gsutil cp "${TMP_SCRIPT}" "${OUTPUT}/${TMP_SCRIPT/./}" 177 | rm "${TMP_SCRIPT}" 178 | 179 | >&2 echo "Creating Google Compute Engine VM and disk" 180 | >&2 echo "${DISK_CMD}" 181 | >&2 ${DISK_CMD} 182 | 183 | >&2 echo "${VM_CMD}" 184 | >&2 ${VM_CMD} 185 | 186 | echo ${OPERATION_ID} 187 | 188 | >&2 cat << EOM 189 | 190 | Congratulations! Your job is running. 
191 | 192 | To monitor your job, check the status to see if it's RUNNING, COMPLETED, or FAILED: 193 | gsutil cat "${STATUS_FILE}" 194 | 195 | While your job is running, you can see the VM in the cloud console and command-line. 196 | When the job completes, the VM will no longer be found unless --keep-alive is set. 197 | 198 | Cloud console: 199 | https://console.cloud.google.com/compute/instances 200 | 201 | Command-line: 202 | gcloud compute instances describe ${VM_NAME} ${ZONE} 203 | 204 | To cancel a running job, you can delete the VM from the cloud console or command-line: 205 | gcloud compute instances delete ${VM_NAME} ${ZONE} 206 | 207 | To debug a failed run, look at the log files in your output directory. 208 | 209 | Cloud console: 210 | https://console.cloud.google.com/storage/browser/${OUTPUT/gs:\/\//} 211 | 212 | Command-line: 213 | gsutil cat "${OUTPUT}/stderr-${OPERATION_ID}.txt" | less 214 | gsutil cat "${OUTPUT}/stdout-${OPERATION_ID}.txt" | less 215 | 216 | For additional debugging, you can rerun this script with --keep-alive and ssh into the VM. 217 | If you use --keep-alive, you will need to manually delete the VM to avoid charges. 218 | EOM 219 | -------------------------------------------------------------------------------- /bioconductor/run_bioconductor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | """Python sample demonstrating use of the Google Genomics Pipelines API. 10 | 11 | This sample demonstrates a pipeline that uses Bioconductor to analyze 12 | files in Google Cloud Storage. 13 | 14 | This pipeline is run in an "ephemeral" manner; no call to pipelines.create() 15 | is necessary. No pipeline is persisted in the pipelines list. 16 | """ 17 | 18 | import pprint 19 | import time 20 | 21 | from oauth2client.client import GoogleCredentials 22 | from apiclient.discovery import build 23 | 24 | PROJECT_ID='**FILL IN PROJECT ID**' 25 | BUCKET='**FILL IN BUCKET**' 26 | 27 | # Output will be written underneath gs://// 28 | PREFIX='pipelines-api-examples/bioconductor' 29 | 30 | # Update this path if you uploaded the script elsewhere in Cloud Storage. 31 | SCRIPT='gs://%s/%s/script.R' % (BUCKET, PREFIX) 32 | 33 | # This script will poll for completion of the pipeline. 34 | POLL_INTERVAL_SECONDS = 20 35 | 36 | # Create the genomics service. 37 | credentials = GoogleCredentials.get_application_default() 38 | service = build('genomics', 'v1alpha2', credentials=credentials) 39 | 40 | # Run the pipeline. 41 | operation = service.pipelines().run(body={ 42 | # The ephemeralPipeline provides the template for the pipeline. 43 | # The pipelineArgs provide the inputs specific to this run. 44 | 'ephemeralPipeline' : { 45 | 'projectId': PROJECT_ID, 46 | 'name': 'Bioconductor: count overlaps in a BAM', 47 | 'description': 'This sample demonstrates a subset of the vignette https://bioconductor.org/packages/release/bioc/vignettes/BiocParallel/inst/doc/Introduction_To_BiocParallel.pdf.', 48 | 49 | # Define the resources needed for this pipeline. 50 | 'resources' : { 51 | # Specify default VM parameters for the pipeline. 52 | 'minimumCpuCores': 1, # TODO: remove this when the API has a default. 53 | 'minimumRamGb': 3.75, # TODO: remove this when the API has a default. 
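      # 1 vCPU and 3.75 GB of RAM match an n1-standard-1 VM, which should be
      # ample for the small example BAM referenced in pipelineArgs below.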
54 | 55 | # Create a data disk that is attached to the VM and destroyed when the 56 | # pipeline terminates. 57 | 'disks': [ { 58 | 'name': 'data', 59 | 'autoDelete': True, 60 | 61 | # Within the docker container, specify a mount point for the disk. 62 | # The pipeline input argument below will specify that inputs should be 63 | # written to this disk. 64 | 'mountPoint': '/mnt/data', 65 | 66 | # Specify a default size and type. 67 | 'sizeGb': 100, # TODO: remove this when the API has a default 68 | 'type': 'PERSISTENT_HDD', # TODO: remove this when the API has a default 69 | } ], 70 | }, 71 | 72 | # Specify the docker image to use along with the command. See 73 | # http://www.bioconductor.org/help/docker/ for more detail. 74 | 'docker' : { 75 | 'imageName': 'bioconductor/release_core', 76 | 77 | # Change into the directory in which the script and input reside. Then 78 | # run the R script in batch mode to completion. 79 | 'cmd': '/bin/bash -c "cd /mnt/data/ ; R CMD BATCH script.R"', 80 | }, 81 | 82 | 'inputParameters' : [ { 83 | 'name': 'script', 84 | 'description': 'Cloud Storage path to the R script to run.', 85 | 'localCopy': { 86 | 'path': 'script.R', 87 | 'disk': 'data' 88 | } 89 | }, { 90 | 'name': 'bamFile', 91 | 'description': 'Cloud Storage path to the BAM file.', 92 | 'localCopy': { 93 | 'path': 'input.bam', 94 | 'disk': 'data' 95 | } 96 | }, { 97 | 'name': 'indexFile', 98 | 'description': 'Cloud Storage path to the BAM index file.', 99 | 'localCopy': { 100 | 'path': 'input.bam.bai', 101 | 'disk': 'data' 102 | } 103 | } ], 104 | 105 | 'outputParameters' : [ { 106 | 'name': 'outputFile', 107 | 'description': 'Cloud Storage path for where to write the result.', 108 | 'localCopy': { 109 | 'path': 'overlapsCount.tsv', 110 | 'disk': 'data' 111 | } 112 | }, { 113 | 'name': 'rBatchLogFile', 114 | 'description': 'Cloud Storage path for where to write the R batch log file.', 115 | 'localCopy': { 116 | 'path': 'script.Rout', 117 | 'disk': 'data' 118 | } 119 | } ] 120 | }, 121 | 122 | 'pipelineArgs' : { 123 | 'projectId': PROJECT_ID, 124 | 125 | # Here we use a very tiny BAM as an example but this pipeline could be invoked in 126 | # a loop to kick off parallel execution of this pipeline on, for example, all the 127 | # 1000 Genomes phase 3 BAMs in 128 | # gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/*/alignment/*.mapped.ILLUMINA.bwa.*.low_coverage.20120522.bam' 129 | # emitting a distinct output file for each result. Then you can: 130 | # gsutil cat gs:////output/*tsv > allOverlapsCount.tsv 131 | # to create the final consolidated TSV file. 132 | 'inputs': { 133 | 'script': SCRIPT, 134 | 'bamFile': 'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam', 135 | 'indexFile': 'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam.bai' 136 | }, 137 | # Pass the user-specified Cloud Storage destination for pipeline output. 138 | 'outputs': { 139 | # The R script explicitly writes out one file of results. 140 | 'outputFile': 'gs://%s/%s/output/overlapsCount.tsv' % (BUCKET, PREFIX), 141 | # R, when run in batch mode, writes console output to a file. 
142 | 'rBatchLogFile': 'gs://%s/%s/output/script.Rout' % (BUCKET, PREFIX) 143 | }, 144 | # Pass the user-specified Cloud Storage destination for pipeline logging. 145 | 'logging': { 146 | 'gcsPath': 'gs://%s/%s/logging' % (BUCKET, PREFIX) 147 | }, 148 | 149 | # TODO: remove this when the API has a default 150 | 'serviceAccount': { 151 | 'email': 'default', 152 | 'scopes': [ 153 | 'https://www.googleapis.com/auth/compute', 154 | 'https://www.googleapis.com/auth/devstorage.full_control', 155 | 'https://www.googleapis.com/auth/genomics' 156 | ] 157 | } 158 | } 159 | }).execute() 160 | 161 | # Emit the result of the pipeline run submission and poll for completion. 162 | pp = pprint.PrettyPrinter(indent=2) 163 | pp.pprint(operation) 164 | operation_name = operation['name'] 165 | print 166 | print "Polling for completion of operation" 167 | 168 | while not operation['done']: 169 | print "Operation not complete. Sleeping %d seconds" % (POLL_INTERVAL_SECONDS) 170 | time.sleep(POLL_INTERVAL_SECONDS) 171 | operation = service.operations().get(name=operation_name).execute() 172 | 173 | print 174 | print "Operation complete" 175 | print 176 | pp.pprint(operation) 177 | -------------------------------------------------------------------------------- /cwl_runner/cwl_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | # cwl_startup_script.sh 10 | # 11 | # This is the startup script that runs on Compute Engine to run 12 | # a Common Workflow Language (CWL) workflow with cwltool. 13 | 14 | readonly METADATA_URL="http://metadata.google.internal/computeMetadata/v1/instance" 15 | readonly METADATA_HEADERS="Metadata-Flavor: Google" 16 | readonly OPERATION_ID=$(curl "${METADATA_URL}/attributes/operation-id" -H "${METADATA_HEADERS}") 17 | readonly OUTPUT=$(curl "${METADATA_URL}/attributes/output" -H "${METADATA_HEADERS}") 18 | readonly STATUS_FILE=$(curl "${METADATA_URL}/attributes/status-file" -H "${METADATA_HEADERS}") 19 | readonly STATUS_LOCAL="/tmp/status-${OPERATION_ID}.txt" 20 | STATUS="RUNNING" 21 | 22 | echo "$(date)" 23 | echo "Status ${STATUS}" 24 | echo "${STATUS}" > ${STATUS_LOCAL} 25 | gsutil cp ${STATUS_LOCAL} ${STATUS_FILE} 26 | 27 | echo "Redirecting stdout and stderr" 28 | readonly STDOUT=/tmp/stdout-${OPERATION_ID}.txt 29 | readonly STDERR=/tmp/stderr-${OPERATION_ID}.txt 30 | exec > >(tee -ia ${STDOUT}) 31 | exec 2> >(tee -ia ${STDERR} >&2) 32 | 33 | echo "$(date)" 34 | echo "Running startup script" 35 | 36 | echo "Initializing variables" 37 | readonly WORKFLOW_FILE=$(curl "${METADATA_URL}/attributes/workflow-file" -H "${METADATA_HEADERS}") 38 | readonly SETTINGS_FILE=$(curl "${METADATA_URL}/attributes/settings-file" -H "${METADATA_HEADERS}") 39 | readonly INPUT=$(curl "${METADATA_URL}/attributes/input" -H "${METADATA_HEADERS}") 40 | readonly INPUT_RECURSIVE=$(curl "${METADATA_URL}/attributes/input-recursive" -H "${METADATA_HEADERS}") 41 | readonly DISK_NAME=google-$(curl "${METADATA_URL}/disks/1/device-name" -H "${METADATA_HEADERS}") 42 | readonly RUNNER=$(curl "${METADATA_URL}/attributes/runner" -H "${METADATA_HEADERS}") 43 | 44 | echo "$(date)" 45 | echo "Mounting and formatting disk" 46 | readonly MOUNT_POINT="/mnt/data" 47 | sudo mkfs.ext4 -F -E lazy_itable_init=0,lazy_journal_init=0,discard 
/dev/disk/by-id/${DISK_NAME} 48 | sudo mkdir -p ${MOUNT_POINT} 49 | sudo mount -o discard,defaults /dev/disk/by-id/${DISK_NAME} ${MOUNT_POINT} 50 | sudo chmod 777 ${MOUNT_POINT} 51 | 52 | echo "$(date)" 53 | echo "Creating folders for workflow inputs and outputs" 54 | readonly INPUT_FOLDER="${MOUNT_POINT}/input" 55 | readonly OUTPUT_FOLDER="${MOUNT_POINT}/output" 56 | readonly TMP_FOLDER="${MOUNT_POINT}/tmp" 57 | sudo mkdir -m 777 -p "${INPUT_FOLDER}" 58 | sudo mkdir -m 777 -p "${OUTPUT_FOLDER}" 59 | sudo mkdir -m 777 -p "${TMP_FOLDER}" 60 | 61 | echo "$(date)" 62 | echo "Copying input files to local disk" 63 | while IFS=';' read -ra URL_LIST; do 64 | for URL in "${URL_LIST[@]}"; do 65 | URL=$(echo ${URL} | tr -d '"') # Remove quotes 66 | URL_LOCAL="${INPUT_FOLDER}/$(dirname ${URL//:\//})" 67 | CMD="mkdir -p ${URL_LOCAL}" 68 | echo "${CMD}" 69 | ${CMD} 70 | CMD="gsutil -m -o GSUtil:parallel_composite_upload_threshold=150M cp ${URL} ${URL_LOCAL}" 71 | echo "${CMD}" 72 | ${CMD} 73 | done 74 | done <<< "${INPUT}" 75 | 76 | echo "$(date)" 77 | echo "Recursively copying input folders to local disk" 78 | while IFS=';' read -ra URL_LIST; do 79 | for URL in "${URL_LIST[@]}"; do 80 | URL=$(echo ${URL} | tr -d '"') # Remove quotes 81 | URL_LOCAL="${INPUT_FOLDER}/${URL//:\//}" 82 | CMD="mkdir -p ${URL_LOCAL}" 83 | echo "${CMD}" 84 | ${CMD} 85 | CMD="gsutil -m -o GSUtil:parallel_composite_upload_threshold=150M rsync -r ${URL}/ ${URL_LOCAL}" 86 | echo "${CMD}" 87 | ${CMD} 88 | done 89 | done <<< "${INPUT_RECURSIVE}" 90 | 91 | echo "Copying workflow file to local disk" 92 | readonly WORKFLOW_LOCAL="${INPUT_FOLDER}/${WORKFLOW_FILE//:\//}" 93 | CMD="mkdir -p $(dirname ${WORKFLOW_LOCAL})" 94 | echo "${CMD}" 95 | ${CMD} 96 | CMD="gsutil -m cp ${WORKFLOW_FILE} ${WORKFLOW_LOCAL}" 97 | echo "${CMD}" 98 | ${CMD} 99 | 100 | echo "Copying settings file to local disk" 101 | readonly SETTINGS_LOCAL="${INPUT_FOLDER}/${SETTINGS_FILE//:\//}" 102 | CMD="mkdir -p $(dirname ${SETTINGS_LOCAL})" 103 | echo "${CMD}" 104 | ${CMD} 105 | CMD="gsutil -m cp ${SETTINGS_FILE} ${SETTINGS_LOCAL}" 106 | echo "${CMD}" 107 | ${CMD} 108 | 109 | echo "$(date)" 110 | echo "Installing Docker and CWL runner ${RUNNER}" 111 | 112 | if [[ ${RUNNER} == "cwltool" ]]; then 113 | sudo apt-get update 114 | sudo apt-get --yes install apt-utils docker.io gcc python-dev python-setuptools ca-certificates 115 | sudo easy_install -U virtualenv 116 | sudo systemctl start docker.service 117 | 118 | echo "$(date)" 119 | echo "Starting virtualenv" 120 | virtualenv cwl 121 | source cwl/bin/activate 122 | pip install cwlref-runner 123 | 124 | echo "$(date)" 125 | echo "Running the CWL workflow" 126 | export HOME="/root" # cwl runner needs it; startup scripts don't have it defined 127 | cd "${INPUT_FOLDER}" 128 | CMD="cwl-runner --outdir ${OUTPUT_FOLDER} --tmpdir-prefix ${TMP_FOLDER} --tmp-outdir-prefix ${TMP_FOLDER} ${WORKFLOW_LOCAL} ${SETTINGS_LOCAL}" 129 | echo "${CMD}" 130 | ${CMD} && STATUS="COMPLETED" || STATUS="FAILED" 131 | 132 | deactivate 133 | 134 | elif [[ ${RUNNER} == "rabix" ]] 135 | then 136 | sudo apt-get --yes install openjdk-8-jre 137 | sudo apt-get update 138 | sudo apt-get --yes install apt-utils docker.io gcc ca-certificates 139 | sudo systemctl start docker.service 140 | 141 | cd "${INPUT_FOLDER}" 142 | wget https://github.com/rabix/bunny/releases/download/v1.0.0-rc2/rabix-1.0.0-rc2.tar.gz && tar -xvf rabix-1.0.0-rc2.tar.gz 143 | RABIX="${INPUT_FOLDER}/rabix-1.0.0-rc2/rabix" 144 | 145 | echo "$(date)" 146 | echo "Running the CWL workflow" 
147 | export HOME="/root" # cwl runner needs it; startup scripts don't have it defined 148 | CMD="${RABIX} --basedir ${OUTPUT_FOLDER} --outdir ${OUTPUT_FOLDER} --tmpdir-prefix ${TMP_FOLDER} --tmp-outdir-prefix ${TMP_FOLDER} ${WORKFLOW_LOCAL} ${SETTINGS_LOCAL}" 149 | echo "${CMD}" 150 | ${CMD} && STATUS="COMPLETED" || STATUS="FAILED" 151 | 152 | else 153 | >&2 echo "Error. Unknown CWL runner: ${RUNNER}" 154 | fi 155 | 156 | echo "$(date)" 157 | echo "Finished running CWL" 158 | echo "Copying output files to Cloud Storage" 159 | CMD="gsutil -m -o GSUtil:parallel_composite_upload_threshold=150M rsync -r ${OUTPUT_FOLDER}/ ${OUTPUT}/" 160 | echo "${CMD}" 161 | ${CMD} 162 | 163 | echo "$(date)" 164 | echo "Status ${STATUS}" 165 | echo ${STATUS} > ${STATUS_LOCAL} 166 | CMD="gsutil cp ${STATUS_LOCAL} ${STATUS_FILE}" 167 | echo "${CMD}" 168 | ${CMD} 169 | 170 | KEEP_ALIVE=$(curl "${METADATA_URL}/attributes/keep-alive" -H "${METADATA_HEADERS}") 171 | if [[ "${KEEP_ALIVE}" = "true" ]]; then 172 | echo "$(date)" 173 | echo "Leaving VM running because keep-alive == true" 174 | echo "Copying stdout and stderr to Cloud Storage" 175 | CMD="gsutil -m cp ${STDOUT} ${STDERR} ${OUTPUT}/" 176 | echo "${CMD}" 177 | ${CMD} 178 | else 179 | echo "Shutting down and deleting the VM" 180 | ZONE=$(curl "${METADATA_URL}/zone" -H "${METADATA_HEADERS}") 181 | INSTANCE_NAME=$(curl "${METADATA_URL}/name" -H "${METADATA_HEADERS}") 182 | CMD="sudo gcloud --quiet compute instances delete --zone ${ZONE} ${INSTANCE_NAME}" 183 | echo "${CMD}" 184 | ${CMD} 185 | fi 186 | -------------------------------------------------------------------------------- /compress/run_compress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | """Python sample demonstrating use of the Google Genomics Pipelines API. 10 | 11 | This sample demonstrates running a pipeline to compress or decompress 12 | a file that is in Google Cloud Storage. 13 | 14 | This sample demonstrates running the pipeline in an "ephemeral" manner; 15 | no call to pipelines.create() is necessary. No pipeline is persisted 16 | in the pipelines list. 17 | 18 | Usage: 19 | * python run_compress.py \ 20 | --project \ 21 | --zones \ 22 | --disk-size \ 23 | --operation \ 24 | --input \ 25 | --output \ 26 | --logging \ 27 | --poll-interval 28 | 29 | Where the poll-interval is optional (default is no polling). 30 | 31 | Users will typically want to restrict the Compute Engine zones to avoid Cloud 32 | Storage egress charges. 
This script supports a short-hand pattern-matching 33 | for specifying zones, such as: 34 | 35 | --zones "*" # All zones 36 | --zones "us-*" # All US zones 37 | --zones "us-central1-*" # All us-central1 zones 38 | 39 | an explicit list may be specified, space-separated: 40 | --zones us-central1-a us-central1-b 41 | """ 42 | 43 | import argparse 44 | import pprint 45 | 46 | from oauth2client.client import GoogleCredentials 47 | from apiclient.discovery import build 48 | 49 | from pipelines_pylib import defaults 50 | from pipelines_pylib import poller 51 | 52 | # Parse input args 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--project", required=True, 55 | help="Cloud project id to run the pipeline in") 56 | parser.add_argument("--disk-size", required=True, type=int, 57 | help="Size (in GB) of disk for both input and output") 58 | parser.add_argument("--zones", required=True, nargs="+", 59 | help="List of Google Compute Engine zones (supports wildcards)") 60 | parser.add_argument("--operation", required=False, default="gzip", 61 | choices=[ "gzip", "gunzip", "bzip2", "bunzip2" ], 62 | help="Choice of compression/decompression command") 63 | parser.add_argument("--input", required=True, nargs="+", 64 | help="Cloud Storage path to input file(s)") 65 | parser.add_argument("--output", required=True, 66 | help="Cloud Storage path to output file (with the .gz extension)") 67 | parser.add_argument("--logging", required=True, 68 | help="Cloud Storage path to send logging output") 69 | parser.add_argument("--poll-interval", default=0, type=int, 70 | help="Frequency (in seconds) to poll for completion (default: no polling)") 71 | args = parser.parse_args() 72 | 73 | # Create the genomics service 74 | credentials = GoogleCredentials.get_application_default() 75 | service = build('genomics', 'v1alpha2', credentials=credentials) 76 | 77 | # Run the pipeline 78 | operation = service.pipelines().run(body={ 79 | # The ephemeralPipeline provides the template for the pipeline 80 | # The pipelineArgs provide the inputs specific to this run 81 | 82 | # There are some nuances in the API that are still being ironed out 83 | # to make this more compact. 84 | 85 | 'ephemeralPipeline': { 86 | 'projectId': args.project, 87 | 'name': 'compress', 88 | 'description': 'Compress or decompress a file', 89 | 90 | # Define the resources needed for this pipeline. 91 | 'resources': { 92 | # Create a data disk that is attached to the VM and destroyed when the 93 | # pipeline terminates. 94 | 'disks': [ { 95 | 'name': 'datadisk', 96 | 'autoDelete': True, 97 | 98 | # Within the Docker container, specify a mount point for the disk. 99 | # The pipeline input argument below will specify that inputs should be 100 | # written to this disk. 101 | 'mountPoint': '/mnt/data', 102 | } ], 103 | }, 104 | 105 | # Specify the Docker image to use along with the command 106 | 'docker': { 107 | 'imageName': 'ubuntu', # Stock ubuntu contains the gzip, bzip2 commands 108 | 109 | 'cmd': ('cd /mnt/data/workspace && ' 110 | 'for file in $(/bin/ls); do ' 111 | '%s ${file}; ' 112 | 'done' % args.operation), 113 | }, 114 | 115 | # The Pipelines API currently supports full GCS paths, along with patterns (globs), 116 | # but it doesn't directly support a list of files being passed as a single input 117 | # parameter ("gs://bucket/foo.bam gs://bucket/bar.bam"). 118 | # 119 | # We can simply generate a series of inputs (input0, input1, etc.) to support this here. 
120 | # 121 | # 'inputParameters': [ { 122 | # 'name': 'inputFile0', 123 | # 'description': 'Cloud Storage path to an input file', 124 | # 'localCopy': { 125 | # 'path': 'workspace/', 126 | # 'disk': 'datadisk' 127 | # } 128 | # }, { 129 | # 'name': 'inputFile1', 130 | # 'description': 'Cloud Storage path to an input file', 131 | # 'localCopy': { 132 | # 'path': 'workspace/', 133 | # 'disk': 'datadisk' 134 | # } 135 | # 136 | # } ], 137 | 138 | # The inputFile specified in the pipelineArgs (see below) will specify the 139 | # Cloud Storage path to copy to /mnt/data/workspace/. 140 | 141 | 'inputParameters': [ { 142 | 'name': 'inputFile%d' % idx, 143 | 'description': 'Cloud Storage path to an input file', 144 | 'localCopy': { 145 | 'path': 'workspace/', 146 | 'disk': 'datadisk' 147 | } 148 | } for idx in range(len(args.input)) ], 149 | 150 | # By specifying an outputParameter, we instruct the pipelines API to 151 | # copy /mnt/data/workspace/* to the Cloud Storage location specified in 152 | # the pipelineArgs (see below). 153 | 'outputParameters': [ { 154 | 'name': 'outputPath', 155 | 'description': 'Cloud Storage path for where to FastQC output', 156 | 'localCopy': { 157 | 'path': 'workspace/*', 158 | 'disk': 'datadisk' 159 | } 160 | } ] 161 | }, 162 | 163 | 'pipelineArgs': { 164 | 'projectId': args.project, 165 | 166 | # Override the resources needed for this pipeline 167 | 'resources': { 168 | # Expand any zone short-hand patterns 169 | 'zones': defaults.get_zones(args.zones), 170 | 171 | # For the data disk, specify the size 172 | 'disks': [ { 173 | 'name': 'datadisk', 174 | 175 | 'sizeGb': args.disk_size, 176 | } ] 177 | }, 178 | 179 | # Pass the user-specified Cloud Storage paths as a map of input files 180 | # 'inputs': { 181 | # 'inputFile0': 'gs://bucket/foo.bam', 182 | # 'inputFile1': 'gs://bucket/bar.bam', 183 | # 184 | # } 185 | 'inputs': { 186 | 'inputFile%d' % idx : value for idx, value in enumerate(args.input) 187 | }, 188 | 189 | # Pass the user-specified Cloud Storage destination path of output 190 | 'outputs': { 191 | 'outputPath': args.output 192 | }, 193 | 194 | # Pass the user-specified Cloud Storage destination for pipeline logging 195 | 'logging': { 196 | 'gcsPath': args.logging 197 | }, 198 | } 199 | }).execute() 200 | 201 | # Emit the result of the pipeline run submission 202 | pp = pprint.PrettyPrinter(indent=2) 203 | pp.pprint(operation) 204 | 205 | # If requested - poll until the operation reaches completion state ("done: true") 206 | if args.poll_interval > 0: 207 | completed_op = poller.poll(service, operation, args.poll_interval) 208 | pp.pprint(completed_op) 209 | -------------------------------------------------------------------------------- /samtools/README.md: -------------------------------------------------------------------------------- 1 | # Use samtools to create a BAM index file 2 | 3 | This example will enable you to create an index file (BAI) for a BAM, using [samtools](http://www.htslib.org/). 4 | Execution of the `samtools index` command will be on a [Google Compute Engine](https://cloud.google.com/compute/docs/) 5 | virtual machine. 6 | 7 | Instructions provided here demonstrate: 8 | 9 | 1. Building a Docker image containing `samtools` 10 | 1. Testing the Docker image by running it on your local workstation/laptop 11 | 1. Pushing the Docker image to the Google Container Registry 12 | 1. Launching and monitoring the pipeline using command-line tools (`gcloud`) 13 | 1. 
Launching and monitoring the pipeline calling the Genomics API from Python 14 | 15 | The `gcloud` command supports defining your pipeline in a JSON or YAML file and then setting per-run parameters from the command line. 16 | 17 | The Python example demonstrates full control over the construction of `pipeline.run()` API call. 18 | 19 | ## (0) Complete the prerequisites 20 | 21 | Be sure you have completed the [Prerequisites](../README.md#prerequisites) 22 | listed at the top of this github repository. 23 | 24 | ## (1) Create the Docker image. 25 | 26 | ``` 27 | git clone https://github.com/googlegenomics/pipelines-api-examples.git 28 | cd pipelines-api-examples/samtools/ 29 | docker build -t ${USER}/samtools . 30 | ``` 31 | 32 | ## (2) Test locally the Docker image used by the pipeline. 33 | 34 | ``` 35 | ./local/test_index.sh 36 | ``` 37 | 38 | The result should be the newly created .bam.bai file in a subdirectory on your local machine: 39 | ``` 40 | Running samtools index via Docker 41 | 42 | Execution completed 43 | 44 | Scratch directory: 45 | . 46 | ./output 47 | ./output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam.bai 48 | ./input 49 | ./input/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 50 | ``` 51 | 52 | ## (3) Push the Docker image to a repository. 53 | 54 | In this example, we push the container to [Google Container Registry](https://cloud.google.com/container-registry/) via the following commands: 55 | ``` 56 | docker tag ${USER}/samtools gcr.io/YOUR-PROJECT-ID/samtools 57 | gcloud docker -- push gcr.io/YOUR-PROJECT-ID/samtools 58 | ``` 59 | 60 | ## (4) Run the Docker image in the cloud, using gcloud 61 | 62 | The `gcloud` tool that comes with the Google Cloud SDK includes a command 63 | to run pipelines. You can get details of the command with: 64 | 65 | ``` 66 | gcloud alpha genomics pipelines run --help 67 | ``` 68 | 69 | ### (4a) Run the Docker image in the cloud, using gcloud 70 | 71 | To run this example, first edit the included [./cloud/samtools.yaml](./cloud/samtools.yaml) file: 72 | 73 | * Replace `YOUR-PROJECT-ID` with your project ID. 74 | 75 | ### (4b) Execute the `pipelines run` command: 76 | 77 | ``` 78 | gcloud alpha genomics pipelines run \ 79 | --pipeline-file cloud/samtools.yaml \ 80 | --inputs inputPath=gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam \ 81 | --outputs outputPath=gs://YOUR-BUCKET/pipelines-api-examples/samtools/gcloud/output/ \ 82 | --logging gs://YOUR-BUCKET/pipelines-api-examples/samtools/gcloud/logging/ \ 83 | --disk-size datadisk:100 84 | Running: [operations/YOUR-NEW-OPERATION-ID] 85 | ``` 86 | 87 | * Replace `YOUR-BUCKET` with a bucket in your project. 88 | 89 | ### (4c) Monitor the pipeline operation 90 | 91 | This github repo includes a shell script, [../tools/poll.sh](../tools/poll.sh), for monitoring the completion status of an operation. 92 | 93 | ``` 94 | $ ../tools/poll.sh YOUR-NEW-OPERATION-ID 20 95 | Operation not complete. Sleeping 20 seconds 96 | Operation not complete. Sleeping 20 seconds 97 | ... 98 | Operation not complete. 
Sleeping 20 seconds 99 | 100 | Operation complete 101 | done: true 102 | metadata: 103 | events: 104 | - description: start 105 | startTime: '2016-05-04T17:22:16.258279445Z' 106 | - description: pulling-image 107 | startTime: '2016-05-04T17:22:16.258324967Z' 108 | - description: localizing-files 109 | startTime: '2016-05-04T17:22:27.650908389Z' 110 | - description: running-docker 111 | startTime: '2016-05-04T17:22:30.615818360Z' 112 | - description: delocalizing-files 113 | startTime: '2016-05-04T17:22:31.100643739Z' 114 | - description: ok 115 | startTime: '2016-05-04T17:22:34.669517713Z' 116 | name: operations/YOUR-NEW-OPERATION-ID 117 | ``` 118 | 119 | ### (4d) Check the results 120 | 121 | Check the operation output for a top-level `errors` field. 122 | If none, then the operation should have finished successfully. 123 | 124 | ``` 125 | $ gsutil ls gs://YOUR-BUCKET/pipelines-api-examples/samtools/gcloud/output 126 | gs://YOUR-BUCKET/pipelines-api-examples/samtools/gcloud/output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam.bai 127 | ``` 128 | 129 | ## (5) Run the Docker image in the cloud, using the Python client libraries 130 | 131 | The `run_samtools.py` script demonstrates having full programmatic control 132 | over the pipelines.run() API call. 133 | 134 | ## (5a) Run the Docker image in the cloud 135 | 136 | ``` 137 | PYTHONPATH=.. python cloud/run_samtools.py \ 138 | --project YOUR-PROJECT-ID \ 139 | --zones "us-*" \ 140 | --disk-size 100 \ 141 | --input \ 142 | gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam \ 143 | gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA18628/alignment/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311.bam \ 144 | --output gs://YOUR-BUCKET/pipelines-api-examples/samtools/python/output/ \ 145 | --logging gs://YOUR-BUCKET/pipelines-api-examples/samtools/python/logging \ 146 | --poll-interval 20 147 | ``` 148 | 149 | * Replace `YOUR-PROJECT-ID` with your project ID. 150 | * Replace `YOUR-BUCKET` with a bucket in your project. 151 | 152 | The `PYTHONPATH` must include the top-level directory of the 153 | `pipelines-api-examples` in order to pick up modules in the 154 | [pipelines_pylib](../pipelines_pylib) directory. 155 | 156 | The output will be the JSON description of the operation, followed by periodic 157 | messages for polling. When the operation completes, the full operation will 158 | be emitted. 159 | ``` 160 | { u'done': False, 161 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 162 | u'clientId': u'', 163 | u'createTime': u'2016-03-31T04:23:17.000Z', 164 | u'events': [], 165 | u'projectId': u'YOUR-PROJECT-ID'}, 166 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 167 | 168 | Polling for completion of operation 169 | Operation not complete. Sleeping 20 seconds 170 | Operation not complete. Sleeping 20 seconds 171 | ... 172 | Operation not complete. Sleeping 20 seconds 173 | 174 | Operation complete 175 | 176 | { u'done': True, 177 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 178 | u'clientId': u'', 179 | u'createTime': u'2016-03-31T04:23:17.000Z', 180 | u'endTime': u'2016-03-31T04:25:08.000Z', 181 | ... 
182 | u'startTime': u'2016-03-31T04:23:46.000Z'}, 183 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 184 | ``` 185 | 186 | ## (5b) Check the results 187 | 188 | Check the operation output for a top-level `errors` field. 189 | If none, then the operation should have finished successfully. 190 | 191 | ``` 192 | $ gsutil ls gs://YOUR-BUCKET/pipelines-api-examples/samtools/python/output/ 193 | gs://YOUR-BUCKET/pipelines-api-examples/samtools/python/output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam.bai 194 | gs://YOUR-BUCKET/pipelines-api-examples/samtools/python/output/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311.bam.bai 195 | ``` 196 | 197 | -------------------------------------------------------------------------------- /fastqc/cloud/run_fastqc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | """Python sample demonstrating use of the Google Genomics Pipelines API. 10 | 11 | This sample demonstrates running FASTQC 12 | (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) over one 13 | or more files in Google Cloud Storage. 14 | 15 | This sample demonstrates running the pipeline in an "ephemeral" manner; 16 | no call to pipelines.create() is necessary. No pipeline is persisted 17 | in the pipelines list. 18 | 19 | For large input files, it will typically make sense to have a single 20 | call to this script (which makes a single call to the Pipelines API). 21 | 22 | For small input files, it may make sense to batch them together into a single call. 23 | Google Compute Engine instance billing is for a minimum of 10 minutes, and then 24 | per-minute billing after that. If you are running FastQC over a BAM file for 25 | mitochondrial DNA, it may take less than 10 minutes. 26 | 27 | So if you have a series of such files, batch them together: 28 | 29 | --input "gs://bucket/sample1/chrMT.bam gs://bucket/sample1/chrY.bam gs://" 30 | 31 | Usage: 32 | * python run_fastqc.py \ 33 | --project \ 34 | --zones \ 35 | --disk-size \ 36 | --input \ 37 | --output \ 38 | --logging \ 39 | --poll-interval 40 | 41 | Where the poll-interval is optional (default is no polling). 42 | 43 | Users will typically want to restrict the Compute Engine zones to avoid Cloud 44 | Storage egress charges. 
This script supports a short-hand pattern-matching 45 | for specifying zones, such as: 46 | 47 | --zones "*" # All zones 48 | --zones "us-*" # All US zones 49 | --zones "us-central1-*" # All us-central1 zones 50 | 51 | an explicit list may be specified, space-separated: 52 | --zones us-central1-a us-central1-b 53 | """ 54 | 55 | import argparse 56 | import pprint 57 | 58 | from oauth2client.client import GoogleCredentials 59 | from apiclient.discovery import build 60 | 61 | from pipelines_pylib import defaults 62 | from pipelines_pylib import poller 63 | 64 | # Parse input args 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--project", required=True, 67 | help="Cloud project id to run the pipeline in") 68 | parser.add_argument("--disk-size", required=True, type=int, 69 | help="Size (in GB) of disk for both input and output") 70 | parser.add_argument("--zones", required=True, nargs="+", 71 | help="List of Google Compute Engine zones (supports wildcards)") 72 | parser.add_argument("--input", required=True, nargs="+", 73 | help="Cloud Storage path to input file(s)") 74 | parser.add_argument("--output", required=True, 75 | help="Cloud Storage path to write output files") 76 | parser.add_argument("--logging", required=True, 77 | help="Cloud Storage path to send logging output") 78 | parser.add_argument("--poll-interval", default=0, type=int, 79 | help="Frequency (in seconds) to poll for completion (default: no polling)") 80 | args = parser.parse_args() 81 | 82 | # Create the genomics service 83 | credentials = GoogleCredentials.get_application_default() 84 | service = build('genomics', 'v1alpha2', credentials=credentials) 85 | 86 | # Run the pipeline 87 | operation = service.pipelines().run(body={ 88 | # The ephemeralPipeline provides the template for the pipeline 89 | # The pipelineArgs provide the inputs specific to this run 90 | 91 | # There are some nuances in the API that are still being ironed out 92 | # to make this more compact. 93 | 94 | 'ephemeralPipeline': { 95 | 'projectId': args.project, 96 | 'name': 'fastqc', 97 | 'description': 'Run "FastQC" on one or more files', 98 | 99 | # Define the resources needed for this pipeline. 100 | 'resources': { 101 | # Create a data disk that is attached to the VM and destroyed when the 102 | # pipeline terminates. 103 | 'disks': [ { 104 | 'name': 'datadisk', 105 | 'autoDelete': True, 106 | 107 | # Within the Docker container, specify a mount point for the disk. 108 | # The pipeline input argument below will specify that inputs should be 109 | # written to this disk. 110 | 'mountPoint': '/mnt/data', 111 | } ], 112 | }, 113 | 114 | # Specify the Docker image to use along with the command. Projects IDs with a 115 | # colon (:) must swap it for a forward slash when specifying image names. 116 | 'docker': { 117 | 'imageName': 'gcr.io/%s/fastqc' % args.project.replace(':', '/'), 118 | 119 | # The Pipelines API will create the input directory when localizing files, 120 | # but does not create the output directory. 121 | 'cmd': ('mkdir /mnt/data/output && ' 122 | 'fastqc /mnt/data/input/* --outdir=/mnt/data/output/'), 123 | }, 124 | 125 | # The Pipelines API currently supports full GCS paths, along with patterns (globs), 126 | # but it doesn't directly support a list of files being passed as a single input 127 | # parameter ("gs://bucket/foo.bam gs://bucket/bar.bam"). 128 | # 129 | # We can simply generate a series of inputs (input0, input1, etc.) to support this here. 
130 | # 131 | # 'inputParameters': [ { 132 | # 'name': 'inputFile0', 133 | # 'description': 'Cloud Storage path to an input file', 134 | # 'localCopy': { 135 | # 'path': 'input/', 136 | # 'disk': 'datadisk' 137 | # } 138 | # }, { 139 | # 'name': 'inputFile1', 140 | # 'description': 'Cloud Storage path to an input file', 141 | # 'localCopy': { 142 | # 'path': 'input/', 143 | # 'disk': 'datadisk' 144 | # } 145 | # 146 | # } ], 147 | 148 | # The inputFile specified in the pipelineArgs (see below) will specify the 149 | # Cloud Storage path to copy to /mnt/data/input/. 150 | 151 | 'inputParameters': [ { 152 | 'name': 'inputFile%d' % idx, 153 | 'description': 'Cloud Storage path to an input file', 154 | 'localCopy': { 155 | 'path': 'input/', 156 | 'disk': 'datadisk' 157 | } 158 | } for idx in range(len(args.input)) ], 159 | 160 | # By specifying an outputParameter, we instruct the pipelines API to 161 | # copy /mnt/data/output/* to the Cloud Storage location specified in 162 | # the pipelineArgs (see below). 163 | 'outputParameters': [ { 164 | 'name': 'outputPath', 165 | 'description': 'Cloud Storage path for where to FastQC output', 166 | 'localCopy': { 167 | 'path': 'output/*', 168 | 'disk': 'datadisk' 169 | } 170 | } ] 171 | }, 172 | 173 | 'pipelineArgs': { 174 | 'projectId': args.project, 175 | 176 | # Override the resources needed for this pipeline 177 | 'resources': { 178 | 'minimumRamGb': 1, # For this example, override the 3.75 GB default 179 | 180 | # Expand any zone short-hand patterns 181 | 'zones': defaults.get_zones(args.zones), 182 | 183 | # For the data disk, specify the size 184 | 'disks': [ { 185 | 'name': 'datadisk', 186 | 187 | 'sizeGb': args.disk_size, 188 | } ] 189 | }, 190 | 191 | # Pass the user-specified Cloud Storage paths as a map of input files 192 | # 'inputs': { 193 | # 'inputFile0': 'gs://bucket/foo.bam', 194 | # 'inputFile1': 'gs://bucket/bar.bam', 195 | # 196 | # } 197 | 'inputs': { 198 | 'inputFile%d' % idx : value for idx, value in enumerate(args.input) 199 | }, 200 | 201 | # Pass the user-specified Cloud Storage destination path of the FastQC output 202 | 'outputs': { 203 | 'outputPath': args.output 204 | }, 205 | 206 | # Pass the user-specified Cloud Storage destination for pipeline logging 207 | 'logging': { 208 | 'gcsPath': args.logging 209 | }, 210 | } 211 | }).execute() 212 | 213 | # Emit the result of the pipeline run submission 214 | pp = pprint.PrettyPrinter(indent=2) 215 | pp.pprint(operation) 216 | 217 | # If requested - poll until the operation reaches completion state ("done: true") 218 | if args.poll_interval > 0: 219 | completed_op = poller.poll(service, operation, args.poll_interval) 220 | pp.pprint(completed_op) 221 | -------------------------------------------------------------------------------- /samtools/cloud/run_samtools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | """Python sample demonstrating use of the Google Genomics Pipelines API. 10 | 11 | This sample demonstrates running samtools (http://www.htslib.org/) over one 12 | or more files in Google Cloud Storage. 13 | 14 | This sample demonstrates running the pipeline in an "ephemeral" manner; 15 | no call to pipelines.create() is necessary. No pipeline is persisted 16 | in the pipelines list. 
17 | 18 | For large input files, it will typically make sense to have a single 19 | call to this script (which makes a single call to the Pipelines API). 20 | 21 | For small input files, it may make sense to batch them together into a single call. 22 | Google Compute Engine instance billing is for a minimum of 10 minutes, and then 23 | per-minute billing after that. If you are running samtools over a BAM file for 24 | mitochondrial DNA, it may take less than 10 minutes. 25 | 26 | So if you have a series of such files, batch them together: 27 | 28 | --input "gs://bucket/sample1/chrMT.bam gs://bucket/sample1/chrY.bam gs://" 29 | 30 | Usage: 31 | * python run_samtools.py \ 32 | --project \ 33 | --zones \ 34 | --disk-size \ 35 | --input \ 36 | --output \ 37 | --logging \ 38 | --poll-interval 39 | 40 | Where the poll-interval is optional (default is no polling). 41 | 42 | Users will typically want to restrict the Compute Engine zones to avoid Cloud 43 | Storage egress charges. This script supports a short-hand pattern-matching 44 | for specifying zones, such as: 45 | 46 | --zones "*" # All zones 47 | --zones "us-*" # All US zones 48 | --zones "us-central1-*" # All us-central1 zones 49 | 50 | an explicit list may be specified, space-separated: 51 | --zones us-central1-a us-central1-b 52 | """ 53 | 54 | import argparse 55 | import pprint 56 | 57 | from oauth2client.client import GoogleCredentials 58 | from apiclient.discovery import build 59 | 60 | from pipelines_pylib import defaults 61 | from pipelines_pylib import poller 62 | 63 | # Parse input args 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("--project", required=True, 66 | help="Cloud project id to run the pipeline in") 67 | parser.add_argument("--disk-size", required=True, type=int, 68 | help="Size (in GB) of disk for both input and output") 69 | parser.add_argument("--zones", required=True, nargs="+", 70 | help="List of Google Compute Engine zones (supports wildcards)") 71 | parser.add_argument("--input", required=True, nargs="+", 72 | help="Cloud Storage path to input file(s)") 73 | parser.add_argument("--output", required=True, 74 | help="Cloud Storage path to output file (with the .gz extension)") 75 | parser.add_argument("--logging", required=True, 76 | help="Cloud Storage path to send logging output") 77 | parser.add_argument("--poll-interval", default=0, type=int, 78 | help="Frequency (in seconds) to poll for completion (default: no polling)") 79 | args = parser.parse_args() 80 | 81 | # Create the genomics service 82 | credentials = GoogleCredentials.get_application_default() 83 | service = build('genomics', 'v1alpha2', credentials=credentials) 84 | 85 | # Run the pipeline 86 | operation = service.pipelines().run(body={ 87 | # The ephemeralPipeline provides the template for the pipeline 88 | # The pipelineArgs provide the inputs specific to this run 89 | 90 | # There are some nuances in the API that are still being ironed out 91 | # to make this more compact. 92 | 93 | 'ephemeralPipeline': { 94 | 'projectId': args.project, 95 | 'name': 'samtools', 96 | 'description': 'Run samtools on one or more files', 97 | 98 | # Define the resources needed for this pipeline. 99 | 'resources': { 100 | # Create a data disk that is attached to the VM and destroyed when the 101 | # pipeline terminates. 102 | 'disks': [ { 103 | 'name': 'datadisk', 104 | 'autoDelete': True, 105 | 106 | # Within the Docker container, specify a mount point for the disk. 
107 | # The pipeline input argument below will specify that inputs should be 108 | # written to this disk. 109 | 'mountPoint': '/mnt/data', 110 | } ], 111 | }, 112 | 113 | # Specify the Docker image to use along with the command 114 | 'docker': { 115 | 'imageName': 'gcr.io/%s/samtools' % args.project, 116 | 117 | # The Pipelines API will create the input directory when localizing files, 118 | # but does not create the output directory. 119 | 'cmd': ('mkdir /mnt/data/output && ' 120 | 'find /mnt/data/input && ' 121 | 'for file in $(/bin/ls /mnt/data/input); do ' 122 | 'samtools index ' 123 | '/mnt/data/input/${file} /mnt/data/output/${file}.bai; ' 124 | 'done'), 125 | }, 126 | 127 | # The Pipelines API currently supports full GCS paths, along with patterns (globs), 128 | # but it doesn't directly support a list of files being passed as a single input 129 | # parameter ("gs://bucket/foo.bam gs://bucket/bar.bam"). 130 | # 131 | # We can simply generate a series of inputs (input0, input1, etc.) to support this here. 132 | # 133 | # 'inputParameters': [ { 134 | # 'name': 'inputFile0', 135 | # 'description': 'Cloud Storage path to an input file', 136 | # 'localCopy': { 137 | # 'path': 'input/', 138 | # 'disk': 'datadisk' 139 | # } 140 | # }, { 141 | # 'name': 'inputFile1', 142 | # 'description': 'Cloud Storage path to an input file', 143 | # 'localCopy': { 144 | # 'path': 'input/', 145 | # 'disk': 'datadisk' 146 | # } 147 | # 148 | # } ], 149 | 150 | # The inputFile specified in the pipelineArgs (see below) will specify the 151 | # Cloud Storage path to copy to /mnt/data/input/. 152 | 153 | 'inputParameters': [ { 154 | 'name': 'inputFile%d' % idx, 155 | 'description': 'Cloud Storage path to an input file', 156 | 'localCopy': { 157 | 'path': 'input/', 158 | 'disk': 'datadisk' 159 | } 160 | } for idx in range(len(args.input)) ], 161 | 162 | # By specifying an outputParameter, we instruct the pipelines API to 163 | # copy /mnt/data/output/* to the Cloud Storage location specified in 164 | # the pipelineArgs (see below). 
165 | 'outputParameters': [ { 166 | 'name': 'outputPath', 167 | 'description': 'Cloud Storage path for where to samtools output', 168 | 'localCopy': { 169 | 'path': 'output/*', 170 | 'disk': 'datadisk' 171 | } 172 | } ] 173 | }, 174 | 175 | 'pipelineArgs': { 176 | 'projectId': args.project, 177 | 178 | # Override the resources needed for this pipeline 179 | 'resources': { 180 | 'minimumRamGb': 1, # For this example, override the 3.75 GB default 181 | 182 | # Expand any zone short-hand patterns 183 | 'zones': defaults.get_zones(args.zones), 184 | 185 | # For the data disk, specify the size 186 | 'disks': [ { 187 | 'name': 'datadisk', 188 | 189 | 'sizeGb': args.disk_size, 190 | } ] 191 | }, 192 | 193 | # Pass the user-specified Cloud Storage paths as a map of input files 194 | # 'inputs': { 195 | # 'inputFile0': 'gs://bucket/foo.bam', 196 | # 'inputFile1': 'gs://bucket/bar.bam', 197 | # 198 | # } 199 | 'inputs': { 200 | 'inputFile%d' % idx : value for idx, value in enumerate(args.input) 201 | }, 202 | 203 | # Pass the user-specified Cloud Storage destination path of the samtools output 204 | 'outputs': { 205 | 'outputPath': args.output 206 | }, 207 | 208 | # Pass the user-specified Cloud Storage destination for pipeline logging 209 | 'logging': { 210 | 'gcsPath': args.logging 211 | }, 212 | } 213 | }).execute() 214 | 215 | # Emit the result of the pipeline run submission 216 | pp = pprint.PrettyPrinter(indent=2) 217 | pp.pprint(operation) 218 | 219 | # If requested - poll until the operation reaches completion state ("done: true") 220 | if args.poll_interval > 0: 221 | completed_op = poller.poll(service, operation, args.poll_interval) 222 | pp.pprint(completed_op) 223 | -------------------------------------------------------------------------------- /compress/README.md: -------------------------------------------------------------------------------- 1 | # Compress or Decompress files from Cloud Storage 2 | 3 | This pipeline provides the use-case of downloading one or more files 4 | from Cloud Storage, compressing or decompressing it (via gzip, gunzip, 5 | bzip2, or bunzip2) and pushing the result to Cloud Storage. 6 | 7 | This pipeline does not involve packaging a custom Docker image, and 8 | thus there is no requirement to install Docker on your local machine. 9 | The gzip and bzip2 commands are provided as part of the default `ubuntu` image. 10 | 11 | ## (1) Run the pipeline in the cloud 12 | 13 | When the Prerequisites from this repository's [README.md](../README.md) 14 | are satisfied, then you can run this pipeline as: 15 | 16 | ``` 17 | PYTHONPATH=.. python ./run_compress.py \ 18 | --project YOUR-PROJECT-ID \ 19 | --zones "us-*" \ 20 | --disk-size 200 \ 21 | --operation "gunzip" \ 22 | --input gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20140123_NA12878_Illumina_Platinum/**.vcf.gz \ 23 | --output gs://YOUR-BUCKET/pipelines-api-examples/compress/output \ 24 | --logging gs://YOUR-BUCKET/pipelines-api-examples/compress/logging \ 25 | --poll-interval 20 26 | ``` 27 | 28 | * Replace `YOUR-PROJECT-ID` with your project ID. 29 | * Replace `YOUR-BUCKET` with a bucket in your project. 30 | 31 | The `PYTHONPATH` must include the top-level directory of the 32 | `pipelines-api-examples` in order to pick up modules in the 33 | [pipelines_pylib](../pipelines_pylib) directory. 34 | 35 | The output will be the JSON description of the operation, followed by periodic 36 | messages for polling. When the operation completes, the full operation will 37 | be emitted. 
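If you omit `--poll-interval`, the script returns immediately after submitting the operation. In that case you can monitor the operation separately with the polling helper included in this repository (a sketch, assuming you run it from the `compress` directory and substitute the operation ID emitted in the `name` field):

```
../tools/poll.sh YOUR-NEW-OPERATION-ID 20
```

With `--poll-interval 20` as above, the polled output looks like the following.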
38 | ``` 39 | { u'done': False, 40 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 41 | u'clientId': u'', 42 | u'createTime': u'2016-03-30T17:34:08.000Z', 43 | u'events': [], 44 | u'projectId': u'YOUR-PROJECT-ID'}, 45 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 46 | 47 | Polling for completion of operation 48 | Operation not complete. Sleeping 20 seconds 49 | Operation not complete. Sleeping 20 seconds 50 | ... 51 | Operation not complete. Sleeping 20 seconds 52 | 53 | Operation complete 54 | 55 | { u'done': True, 56 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 57 | u'clientId': u'', 58 | u'createTime': u'2016-03-30T17:34:08.000Z', 59 | u'endTime': u'2016-03-30T17:36:11.000Z', 60 | u'events': [ { u'description': u'start', 61 | u'startTime': u'2016-03-30T17:35:34.244369759Z'}, 62 | { u'description': u'pulling-image', 63 | u'startTime': u'2016-03-30T17:35:34.244435642Z'}, 64 | { u'description': u'localizing-files', 65 | u'startTime': u'2016-03-30T17:35:44.884961352Z'}, 66 | { u'description': u'running-docker', 67 | u'startTime': u'2016-03-30T17:35:50.872211301Z'}, 68 | { u'description': u'delocalizing-files', 69 | u'startTime': u'2016-03-30T17:35:58.234466119Z'}, 70 | { u'description': u'ok', 71 | u'startTime': u'2016-03-30T17:36:11.404718158Z'}], 72 | u'projectId': u'YOUR-PROJECT-ID', 73 | u'request': { u'@type': u'type.googleapis.com/google.genomics.v1alpha2.RunPipelineRequest', 74 | u'ephemeralPipeline': { u'description': u'Compress or decompress a file', 75 | u'docker': { u'cmd': u'cd /mnt/data/workspace && for file in $(/bin/ls); do gunzip ${file}; done', 76 | u'imageName': u'ubuntu'}, 77 | u'name': u'compress', 78 | u'parameters': [ { u'description': u'Cloud Storage path to an input file', 79 | u'name': u'inputFile0'}, 80 | { u'description': u'Cloud Storage path for where to FastQC output', 81 | u'name': u'outputPath'}], 82 | u'projectId': u'YOUR-PROJECT-ID', 83 | u'resources': { u'disks': [ { u'autoDelete': True, 84 | u'name': u'datadisk'}]}}, 85 | u'pipelineArgs': { u'clientId': u'', 86 | u'inputs': { u'inputFile0': u'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20140123_NA12878_Illumina_Platinum/**.vcf.gz'}, 87 | u'logging': { u'gcsPath': u'gs://YOUR-BUCKET/pipelines-api-examples/compress/logging'}, 88 | u'outputs': { u'outputPath': u'gs://YOUR-BUCKET/pipelines-api-examples/compress/output'}, 89 | u'projectId': u'YOUR-PROJECT-ID', 90 | u'resources': { u'bootDiskSizeGb': 0, 91 | u'disks': [ { u'autoDelete': False, 92 | u'mountPoint': u'', 93 | u'name': u'datadisk', 94 | u'readOnly': False, 95 | u'sizeGb': 200, 96 | u'source': u'', 97 | u'type': u'TYPE_UNSPECIFIED'}], 98 | u'minimumCpuCores': 0, 99 | u'minimumRamGb': 0, 100 | u'preemptible': False, 101 | u'zones': [ u'us-central1-a', 102 | u'us-central1-b', 103 | u'us-central1-c', 104 | u'us-central1-f', 105 | u'us-east1-b', 106 | u'us-east1-c', 107 | u'us-east1-d']}, 108 | u'serviceAccount': { u'email': u'default', 109 | u'scopes': [ u'https://www.googleapis.com/auth/compute', 110 | u'https://www.googleapis.com/auth/devstorage.full_control', 111 | u'https://www.googleapis.com/auth/genomics']}}}, 112 | u'startTime': u'2016-03-30T17:34:36.000Z'}, 113 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 114 | ``` 115 | 116 | ## (2) Check the results 117 | 118 | Check the operation output for a top-level `errors` field. 119 | If none, then the operation should have finished successfully. 
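If the operation did report an error, the log files written under the `--logging` path passed above can help with debugging (a sketch; adjust the bucket and path to match what you used):

```
gsutil ls gs://YOUR-BUCKET/pipelines-api-examples/compress/logging
```

If the operation succeeded, the decompressed VCFs should appear under the output path, for example: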
120 | 121 | ``` 122 | $ gsutil ls gs://YOUR-BUCKET/pipelines-api-examples/compress/output 123 | gs://YOUR-BUCKET/pipelines-api-examples/compress/output/NA12878.wgs.illumina_platinum.20140122.indel.genotypes.vcf 124 | gs://YOUR-BUCKET/pipelines-api-examples/compress/output/NA12878.wgs.illumina_platinum.20140122.snp.genotypes.vcf 125 | gs://YOUR-BUCKET/pipelines-api-examples/compress/output/NA12878.wgs.illumina_platinum.20140404.indels_v2.vcf 126 | gs://YOUR-BUCKET/pipelines-api-examples/compress/output/NA12878.wgs.illumina_platinum.20140404.snps_v2.vcf 127 | gs://YOUR-BUCKET/pipelines-api-examples/compress/output/NA12878.wgs.illumina_platinum.20140404.svs_v2.vcf 128 | ``` 129 | -------------------------------------------------------------------------------- /set_vcf_sample_id/cloud/run_set_vcf_sample_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Use of this source code is governed by a BSD-style 6 | # license that can be found in the LICENSE file or at 7 | # https://developers.google.com/open-source/licenses/bsd 8 | 9 | """ 10 | Usage: 11 | * python run_set_vcf_sample_id.py \ 12 | --project \ 13 | --zones \ 14 | --disk-size \ 15 | --original-sample-id \ 16 | --new-sample-id \ 17 | --script-path \ 18 | --input \ 19 | --output \ 20 | --logging \ 21 | --poll-interval 22 | 23 | Where the poll-interval is optional (default is no polling). 24 | 25 | Users will typically want to restrict the Compute Engine zones to avoid Cloud 26 | Storage egress charges. This script supports a short-hand pattern-matching 27 | for specifying zones, such as: 28 | 29 | --zones "*" # All zones 30 | --zones "us-*" # All US zones 31 | --zones "us-central1-*" # All us-central1 zones 32 | 33 | an explicit list may be specified, space-separated: 34 | --zones us-central1-a us-central1-b 35 | 36 | Passing the --original-sample-id is optional. If set, then the pipeline 37 | script will verify the value in the input VCF, and if not equal, the 38 | pipeline will fail. 39 | 40 | Note that the pipeline API does not allow for input arguments with no 41 | value. Thus if the --original-sample-id is not specified (or is empty), 42 | the ORIGINAL_SAMPLE_ID input parameter is left out of the pipeline definition. 
43 | """ 44 | 45 | import argparse 46 | import pprint 47 | 48 | from oauth2client.client import GoogleCredentials 49 | from apiclient.discovery import build 50 | 51 | from pipelines_pylib import defaults 52 | from pipelines_pylib import poller 53 | 54 | # Parse input args 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("--project", required=True, 57 | help="Cloud project id to run the pipeline in") 58 | parser.add_argument("--disk-size", required=True, type=int, 59 | help="Size (in GB) of disk for both input and output") 60 | parser.add_argument("--zones", required=True, nargs="+", 61 | help="List of Google Compute Engine zones (supports wildcards)") 62 | parser.add_argument("--original-sample-id", required=False, 63 | help="The original sample ID to be validated in the input") 64 | parser.add_argument("--new-sample-id", required=True, 65 | help="The new sample ID") 66 | parser.add_argument("--script-path", required=True, 67 | help="Cloud Storage path to script file(s)") 68 | parser.add_argument("--input", required=True, nargs="+", 69 | help="Cloud Storage path to input file(s)") 70 | parser.add_argument("--output", required=True, 71 | help="Cloud Storage path to output file (with the .gz extension)") 72 | parser.add_argument("--logging", required=True, 73 | help="Cloud Storage path to send logging output") 74 | parser.add_argument("--poll-interval", default=0, type=int, 75 | help="Frequency (in seconds) to poll for completion (default: no polling)") 76 | args = parser.parse_args() 77 | args.script_path.rstrip('/') 78 | 79 | # Create the genomics service 80 | credentials = GoogleCredentials.get_application_default() 81 | service = build('genomics', 'v1alpha2', credentials=credentials) 82 | 83 | # Run the pipeline 84 | operation = service.pipelines().run(body={ 85 | # The ephemeralPipeline provides the template for the pipeline 86 | # The pipelineArgs provide the inputs specific to this run 87 | 88 | # There are some nuances in the API that are still being ironed out 89 | # to make this more compact. 90 | 91 | 'ephemeralPipeline': { 92 | 'projectId': args.project, 93 | 'name': 'set_vcf_sample_id', 94 | 'description': 'Set the sample ID in a VCF header', 95 | 96 | # Define the resources needed for this pipeline. 97 | 'resources': { 98 | # Create a data disk that is attached to the VM and destroyed when the 99 | # pipeline terminates. 100 | 'disks': [ { 101 | 'name': 'datadisk', 102 | 'autoDelete': True, 103 | 104 | # Within the Docker container, specify a mount point for the disk. 105 | # The pipeline input argument below will specify that inputs should be 106 | # written to this disk. 107 | 'mountPoint': '/mnt/data', 108 | } ], 109 | }, 110 | 111 | # Specify the Docker image to use along with the command 112 | 'docker': { 113 | 'imageName': 'python:2.7', 114 | 115 | # The Pipelines API will create the input directory when localizing files, 116 | # but does not create the output directory. 117 | 118 | 'cmd': ('mkdir /mnt/data/output && ' 119 | 120 | 'export SCRIPT_DIR=/mnt/data/scripts && ' 121 | 'chmod u+x ${SCRIPT_DIR}/* && ' 122 | 123 | '${SCRIPT_DIR}/process_vcfs.sh ' 124 | '"${ORIGINAL_SAMPLE_ID:-}" ' 125 | '"${NEW_SAMPLE_ID}" ' 126 | '"/mnt/data/input/*" ' 127 | '"/mnt/data/output"'), 128 | }, 129 | 130 | # The inputFile specified in the pipelineArgs (see below) will 131 | # specify the Cloud Storage path to copy to /mnt/data/input/. 
132 | 133 | 'inputParameters': [ { 134 | 'name': 'inputFile%d' % idx, 135 | 'description': 'Cloud Storage path to input file(s)', 136 | 'localCopy': { 137 | 'path': 'input/', 138 | 'disk': 'datadisk' 139 | } 140 | } for idx in range(len(args.input)) ] + [ { 141 | 'name': 'setVcfSampleId_Script', 142 | 'description': 'Cloud Storage path to process_vcfs.sh script', 143 | 'defaultValue': '%s/process_vcfs.sh' % args.script_path, 144 | 'localCopy': { 145 | 'path': 'scripts/', 146 | 'disk': 'datadisk' 147 | } 148 | }, { 149 | 'name': 'setVcfSampleId_Python', 150 | 'description': 'Cloud Storage path to set_vcf_sample_id.py script', 151 | 'defaultValue': '%s/set_vcf_sample_id.py' % args.script_path, 152 | 'localCopy': { 153 | 'path': 'scripts/', 154 | 'disk': 'datadisk' 155 | } 156 | }] + ([{ 157 | 'name': 'ORIGINAL_SAMPLE_ID', 158 | 'description': 'Sample ID which must already appear in the VCF header', 159 | }] if args.original_sample_id else []) + [ { 160 | 'name': 'NEW_SAMPLE_ID', 161 | 'description': 'New sample ID to set in the VCF header', 162 | } ], 163 | 164 | # By specifying an outputParameter, we instruct the pipelines API to 165 | # copy /mnt/data/output/* to the Cloud Storage location specified in 166 | # the pipelineArgs (see below). 167 | 'outputParameters': [ { 168 | 'name': 'outputPath', 169 | 'description': 'Cloud Storage path for where to copy the output', 170 | 'localCopy': { 171 | 'path': 'output/*', 172 | 'disk': 'datadisk' 173 | } 174 | } ] 175 | }, 176 | 177 | 'pipelineArgs': { 178 | 'projectId': args.project, 179 | 180 | # Override the resources needed for this pipeline 181 | 'resources': { 182 | 'minimumRamGb': 1, # Shouldn't need the default 3.75 GB 183 | 184 | # Expand any zone short-hand patterns 185 | 'zones': defaults.get_zones(args.zones), 186 | 187 | # For the data disk, specify the size 188 | 'disks': [ { 189 | 'name': 'datadisk', 190 | 191 | 'sizeGb': args.disk_size, 192 | } ] 193 | }, 194 | 195 | # We can set a series of individual files, but typically usage will 196 | # just be: 197 | # 'inputs': { 198 | # 'inputFile0': 'gs://bucket//*.vcf', 199 | # } 200 | 'inputs': dict( { 201 | 'inputFile%d' % idx: value for idx, value in enumerate(args.input) 202 | }.items() + ({ 203 | 'ORIGINAL_SAMPLE_ID': args.original_sample_id, 204 | }.items() if args.original_sample_id else []) + { 205 | 'NEW_SAMPLE_ID': args.new_sample_id, 206 | }.items()), 207 | 208 | # Pass the user-specified Cloud Storage destination path output 209 | 'outputs': { 210 | 'outputPath': args.output 211 | }, 212 | 213 | # Pass the user-specified Cloud Storage destination for pipeline logging 214 | 'logging': { 215 | 'gcsPath': args.logging 216 | }, 217 | } 218 | }).execute() 219 | 220 | # Emit the result of the pipeline run submission 221 | pp = pprint.PrettyPrinter(indent=2) 222 | pp.pprint(operation) 223 | 224 | # If requested - poll until the operation reaches completion state ("done: true") 225 | if args.poll_interval > 0: 226 | completed_op = poller.poll(service, operation, args.poll_interval) 227 | pp.pprint(completed_op) 228 | -------------------------------------------------------------------------------- /fastqc/README.md: -------------------------------------------------------------------------------- 1 | # Run FASTQC on a list of BAM or FASTQ files 2 | 3 | ## (1) Create the Docker image. 4 | ``` 5 | git clone https://github.com/googlegenomics/pipelines-api-examples.git 6 | cd pipelines-api-examples/fastqc/ 7 | docker build -t ${USER}/fastqc . 
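# (Optional extra check, not part of the original walkthrough) confirm the
# image was built before moving on to the local test:
docker images | grep fastqc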
8 | ``` 9 | ## (2) Test locally the Docker image used by the pipeline. 10 | ``` 11 | ./local/test_fastqc.sh 12 | ``` 13 | 14 | The result should be the newly created .html and .zip file in a subdirectory 15 | on your local machine: 16 | ``` 17 | Copying test file NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam to /pipelines-api-examples/src/test_mnt 18 | % Total % Received % Xferd Average Speed Time Time Time Current 19 | Dload Upload Total Spent Left Speed 20 | 100 26534 100 26534 0 0 6983 0 0:00:03 0:00:03 --:--:-- 6982 21 | 22 | Running fastqc index via docker 23 | Started analysis of NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 24 | Analysis complete for NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 25 | 26 | Execution completed 27 | 28 | Scratch directory: 29 | . 30 | ./output 31 | ./output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311_fastqc.zip 32 | ./output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311_fastqc.html 33 | ./input 34 | ./input/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam 35 | ``` 36 | 37 | ## (3) Push the Docker image to a repository. 38 | In this example, we push the container to [Google Container Registry](https://cloud.google.com/container-registry/) via the following commands: 39 | ``` 40 | docker tag ${USER}/fastqc gcr.io/YOUR-PROJECT-ID/fastqc 41 | gcloud docker -- push gcr.io/YOUR-PROJECT-ID/fastqc 42 | ``` 43 | 44 | ## (4) Run the Docker image in the cloud 45 | ``` 46 | PYTHONPATH=.. python cloud/run_fastqc.py \ 47 | --project YOUR-PROJECT-ID \ 48 | --zones "us-*" \ 49 | --disk-size 100 \ 50 | --input \ 51 | gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam \ 52 | gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA18628/alignment/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311.bam \ 53 | --output gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output \ 54 | --logging gs://YOUR-BUCKET/pipelines-api-examples/fastqc/logging \ 55 | --poll-interval 20 56 | ``` 57 | 58 | * Replace `YOUR-PROJECT-ID` with your project ID. 59 | * Replace `YOUR-BUCKET` with a bucket in your project. 60 | 61 | The `PYTHONPATH` must include the top-level directory of the 62 | `pipelines-api-examples` in order to pick up modules in the 63 | [pipelines_pylib](../pipelines_pylib) directory. 64 | 65 | The output will be the JSON description of the operation, followed by periodic 66 | messages for polling. When the operation completes, the full operation will 67 | be emitted. 68 | ``` 69 | { u'done': False, 70 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 71 | u'clientId': u'', 72 | u'createTime': u'2016-03-10T02:01:42.000Z', 73 | u'events': [], 74 | u'projectId': u'YOUR-PROJECT-ID'}, 75 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 76 | 77 | Polling for completion of operation 78 | Operation not complete. Sleeping 20 seconds 79 | Operation not complete. Sleeping 20 seconds 80 | ... 81 | Operation not complete. 
Sleeping 20 seconds 82 | 83 | Operation complete 84 | 85 | { u'done': True, 86 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 87 | u'clientId': u'', 88 | u'createTime': u'2016-03-10T02:01:42.000Z', 89 | u'endTime': u'2016-03-10T02:04:52.000Z', 90 | u'events': [ { u'description': u'start(g1-small)', 91 | u'startTime': u'2016-03-10T02:03:01.000Z'}, 92 | { u'description': u'pulling-image', 93 | u'startTime': u'2016-03-10T02:03:11.000Z'}, 94 | { u'description': u'localizing-files', 95 | u'startTime': u'2016-03-10T02:03:50.000Z'}, 96 | { u'description': u'running-docker', 97 | u'startTime': u'2016-03-10T02:04:07.000Z'}, 98 | { u'description': u'delocalizing-files', 99 | u'startTime': u'2016-03-10T02:04:29.000Z'}, 100 | { u'description': u'ok', 101 | u'startTime': u'2016-03-10T02:04:43.000Z'}], 102 | u'projectId': u'YOUR-PROJECT-ID', 103 | u'request': { u'@type': u'type.googleapis.com/google.genomics.v1alpha2.RunPipelineRequest', 104 | u'ephemeralPipeline': { u'description': u'Run "FastQC" on one or more files', 105 | u'docker': { u'cmd': u'mkdir /mnt/data/output && fastqc /mnt/data/input/* --outdir=/mnt/data/output/', 106 | u'imageName': u'gcr.io/YOUR-PROJECT-ID/fastqc'}, 107 | u'name': u'fastqc', 108 | u'parameters': [ { u'description': u'Cloud Storage path to an input file', 109 | u'name': u'inputFile0'}, 110 | { u'description': u'Cloud Storage path to an input file', 111 | u'name': u'inputFile1'}, 112 | { u'description': u'Cloud Storage path for where to FastQC output', 113 | u'name': u'outputPath'}], 114 | u'projectId': u'YOUR-PROJECT-ID', 115 | u'resources': { u'disks': [ { u'autoDelete': True, 116 | u'name': u'datadisk', 117 | u'sizeGb': 500, 118 | u'type': u'PERSISTENT_HDD'}], 119 | u'minimumCpuCores': 1, 120 | u'minimumRamGb': 3.75}}, 121 | u'pipelineArgs': { u'clientId': u'', 122 | u'inputs': { u'inputFile0': u'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam', 123 | u'inputFile1': u'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA18628/alignment/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311.bam'}, 124 | u'logging': { u'gcsPath': u'gs://YOUR-BUCKET/pipelines-api-examples/fastqc/logging'}, 125 | u'outputs': { u'outputPath': u'gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output'}, 126 | u'projectId': u'YOUR-PROJECT-ID', 127 | u'resources': { u'disks': [ { u'autoDelete': False, 128 | u'mountPoint': u'', 129 | u'name': u'datadisk', 130 | u'readOnly': False, 131 | u'sizeGb': 100, 132 | u'source': u'', 133 | u'type': u'PERSISTENT_HDD'}], 134 | u'minimumCpuCores': 0, 135 | u'minimumRamGb': 1, 136 | u'preemptible': False, 137 | u'zones': [ u'us-central1-a', 138 | u'us-central1-b', 139 | u'us-central1-c', 140 | u'us-central1-f']}, 141 | u'serviceAccount': { u'email': u'default', 142 | u'scopes': [ u'https://www.googleapis.com/auth/compute', 143 | u'https://www.googleapis.com/auth/devstorage.full_control', 144 | u'https://www.googleapis.com/auth/genomics']}}}, 145 | u'startTime': u'2016-03-10T02:02:09.000Z'}, 146 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 147 | ``` 148 | 149 | ## (5) Check the results 150 | 151 | Check the operation output for a top-level `errors` field. 152 | If none, then the operation should have finished successfully. 
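If you prefer to check programmatically rather than scanning the pretty-printed dictionary, a minimal sketch along these lines could be added to the polling branch of `cloud/run_fastqc.py` (assuming it follows the same `poller.poll()` pattern used by the other runner scripts in this repository; the exact name of a top-level error field is an assumption here, so both common spellings are checked):

```
# Hypothetical post-run check; 'completed_op' is the operation dict
# returned once polling reports the operation as done.
completed_op = poller.poll(service, operation, args.poll_interval)

# The key name for failures ('error' vs. 'errors') is assumed, not
# taken from this example's output.
failure = completed_op.get('error') or completed_op.get('errors')
if failure:
    print "Pipeline failed: %s" % failure
else:
    print "No top-level error field; the FastQC reports should be in the output path"
```

Listing the output path should then show the generated reports: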
153 | 154 | ``` 155 | $ gsutil ls gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output 156 | gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311_fastqc.html 157 | gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output/NA06986.chromMT.ILLUMINA.bwa.CEU.exon_targetted.20100311_fastqc.zip 158 | gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311_fastqc.html 159 | gs://YOUR-BUCKET/pipelines-api-examples/fastqc/output/NA18628.chromY.LS454.ssaha2.CHB.exon_targetted.20100311_fastqc.zip 160 | ``` 161 | -------------------------------------------------------------------------------- /set_vcf_sample_id/README.md: -------------------------------------------------------------------------------- 1 | # Update a VCF header line sample ID 2 | 3 | This pipeline is useful when you have a set of single-sample VCFs 4 | in Cloud Storage in which the sample ID in the header needs to be changed. 5 | 6 | For example, suppose you have a set of 24 VCFs for a single individual 7 | where each VCF contains variants for a single chromosome (1-22, X, Y). 8 | Suppose further that the header line of each VCF looks like: 9 | 10 | ``` 11 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE-001 12 | ``` 13 | 14 | and you want it to look like: 15 | 16 | ``` 17 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE-001-TEST-01 18 | ``` 19 | 20 | The scripts in this example can be used to make this change to a set of VCFs. 21 | The input VCFs can be compressed with gzip or bzip2, or they can be uncompressed. 22 | Each output VCF is written with the same compression state as its input. 23 | 24 | 25 | #### API Notes 26 | 27 | * This example demonstrates using the Pipelines API to run custom code *without building a Docker image*. In this example, the scripts are copied into Cloud Storage and then downloaded at pipeline run time. The Docker image used is the stock [Python 2.7 image from Docker Hub](https://hub.docker.com/_/python/). 28 | 29 | * This example demonstrates the use of non-file input parameters. These parameters (`ORIGINAL_SAMPLE_ID` and `NEW_SAMPLE_ID`) get set in the environment and are then available to the code running in the Docker container. A short sketch of reading them appears after the launch command in step (2) below. 30 | 31 | * If you have already imported your VCFs into Google Genomics, you can update the call set name using: 32 | 33 | ``` 34 | gcloud alpha genomics callsets update --name 35 | ``` 36 | 37 | ## (1) Copy scripts to Cloud Storage 38 | 39 | ``` 40 | gsutil cp process_vcfs.sh set_vcf_sample_id.py \ 41 | gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/ 42 | ``` 43 | 44 | * Replace `YOUR-BUCKET` with a bucket in your project. 45 | 46 | ## (2) Launch the Pipeline 47 | 48 | ``` 49 | PYTHONPATH=.. python cloud/run_set_vcf_sample_id.py \ 50 | --project YOUR-PROJECT-ID \ 51 | --zones "us-*" \ 52 | --disk-size 100 \ 53 | --script-path gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id \ 54 | --input gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20140123_NA12878_Illumina_Platinum/**.vcf.gz \ 55 | --original-sample-id NA12878 \ 56 | --new-sample-id NA12878-NEW \ 57 | --output gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output \ 58 | --logging gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/logging \ 59 | --poll-interval 20 60 | ``` 61 | 62 | * Replace `YOUR-PROJECT-ID` with your project ID. 63 | * Replace `YOUR-BUCKET` with a bucket in your project.
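As a concrete illustration of the non-file input parameters described in the API Notes above: inside the container, `ORIGINAL_SAMPLE_ID` and `NEW_SAMPLE_ID` are ordinary environment variables (the pipeline's `cmd` hands them to `process_vcfs.sh` as `"${ORIGINAL_SAMPLE_ID:-}"` and `"${NEW_SAMPLE_ID}"`). The snippet below is hypothetical, not part of `set_vcf_sample_id.py`; it is only a minimal sketch of how container-side code could read these values directly:

```
import os

# Non-file inputs are exported as environment variables by the Pipelines API.
original_id = os.environ.get('ORIGINAL_SAMPLE_ID', '')  # optional input
new_id = os.environ['NEW_SAMPLE_ID']                     # required input

print "Setting VCF header sample ID to %s" % new_id
if original_id:
    print "Validating that existing headers contain sample ID %s" % original_id
```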
64 | 65 | The `PYTHONPATH` must include the top-level directory of the 66 | `pipelines-api-examples` in order to pick up modules in the 67 | [pipelines_pylib](../pipelines_pylib) directory. 68 | 69 | The output will be the JSON description of the operation, followed by periodic 70 | messages for polling. When the operation completes, the full operation will 71 | be emitted. 72 | 73 | ``` 74 | { u'done': False, 75 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 76 | u'clientId': u'', 77 | u'createTime': u'2016-03-26T20:23:16.000Z', 78 | u'events': [], 79 | u'projectId': u'YOUR-PROJECT-ID'}, 80 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 81 | 82 | Polling for completion of operation 83 | Operation not complete. Sleeping 20 seconds 84 | Operation not complete. Sleeping 20 seconds 85 | ... 86 | Operation not complete. Sleeping 20 seconds 87 | 88 | Operation complete 89 | 90 | { u'done': True, 91 | u'metadata': { u'@type': u'type.googleapis.com/google.genomics.v1.OperationMetadata', 92 | u'clientId': u'', 93 | u'createTime': u'2016-03-26T20:23:16.000Z', 94 | u'endTime': u'2016-03-26T20:26:34.000Z', 95 | u'events': [ { u'description': u'start', 96 | u'startTime': u'2016-03-26T20:24:23.037434420Z'}, 97 | { u'description': u'pulling-image', 98 | u'startTime': u'2016-03-26T20:24:23.037517871Z'}, 99 | { u'description': u'localizing-files', 100 | u'startTime': u'2016-03-26T20:24:53.813765964Z'}, 101 | { u'description': u'running-docker', 102 | u'startTime': u'2016-03-26T20:25:01.034948524Z'}, 103 | { u'description': u'delocalizing-files', 104 | u'startTime': u'2016-03-26T20:26:05.086924619Z'}, 105 | { u'description': u'ok', 106 | u'startTime': u'2016-03-26T20:26:34.544887148Z'}], 107 | u'projectId': u'YOUR-PROJECT-ID', 108 | u'request': { u'@type': u'type.googleapis.com/google.genomics.v1alpha2.RunPipelineRequest', 109 | u'ephemeralPipeline': { u'description': u'Set the sample ID in a VCF header', 110 | u'docker': { u'cmd': u'mkdir /mnt/data/output && export SCRIPT_DIR=/mnt/data/scripts && chmod u+x ${SCRIPT_DIR}/* && ${SCRIPT_DIR}/process_vcfs.sh "${ORIGINAL_SAMPLE_ID:-}" "${NEW_SAMPLE_ID}" "/mnt/data/input/*" "/mnt/data/output"', 111 | u'imageName': u'python:2.7'}, 112 | u'name': u'set_vcf_sample_id', 113 | u'parameters': [ { u'description': u'Cloud Storage path to input file(s)', 114 | u'name': u'inputFile0'}, 115 | { u'description': u'Cloud Storage path to process_vcfs.sh script', 116 | u'name': u'setVcfSampleId_Script'}, 117 | { u'description': u'Cloud Storage path to set_vcf_sample_id.py script', 118 | u'name': u'setVcfSampleId_Python'}, 119 | { u'description': u'Sample ID which must already appear in the VCF header', 120 | u'name': u'ORIGINAL_SAMPLE_ID'}, 121 | { u'description': u'New sample ID to set in the VCF header', 122 | u'name': u'NEW_SAMPLE_ID'}, 123 | { u'description': u'Cloud Storage path for where to copy the output', 124 | u'name': u'outputPath'}], 125 | u'projectId': u'YOUR-PROJECT-ID', 126 | u'resources': { u'disks': [ { u'autoDelete': True, 127 | u'name': u'datadisk'}]}}, 128 | u'pipelineArgs': { u'clientId': u'', 129 | u'inputs': { u'NEW_SAMPLE_ID': u'NA12878-NEW', 130 | u'ORIGINAL_SAMPLE_ID': u'NA12878', 131 | u'inputFile0': u'gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20140123_NA12878_Illumina_Platinum/**.vcf.gz'}, 132 | u'logging': { u'gcsPath': u'gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/logging'}, 133 | u'outputs': { u'outputPath': 
u'gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output'}, 134 | u'projectId': u'YOUR-PROJECT-ID', 135 | u'resources': { u'bootDiskSizeGb': 0, 136 | u'disks': [ { u'autoDelete': False, 137 | u'mountPoint': u'', 138 | u'name': u'datadisk', 139 | u'readOnly': False, 140 | u'sizeGb': 100, 141 | u'source': u'', 142 | u'type': u'TYPE_UNSPECIFIED'}], 143 | u'minimumCpuCores': 0, 144 | u'minimumRamGb': 1, 145 | u'preemptible': False, 146 | u'zones': [ u'us-central1-a', 147 | u'us-central1-b', 148 | u'us-central1-c', 149 | u'us-central1-f', 150 | u'us-east1-b', 151 | u'us-east1-c', 152 | u'us-east1-d']}, 153 | u'serviceAccount': { u'email': u'default', 154 | u'scopes': [ u'https://www.googleapis.com/auth/compute', 155 | u'https://www.googleapis.com/auth/devstorage.full_control', 156 | u'https://www.googleapis.com/auth/genomics']}}}, 157 | u'startTime': u'2016-03-26T20:23:45.000Z'}, 158 | u'name': u'operations/YOUR-NEW-OPERATION-ID'} 159 | ``` 160 | 161 | ## (3) Check the results 162 | 163 | Check the operation output for a top-level `errors` field. 164 | If none, then the operation should have finished successfully. 165 | 166 | ## (4) Check that the output exists 167 | 168 | ``` 169 | $ gsutil ls -l gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output 170 | 171 | 5055692 2016-03-26T20:26:08Z gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/NA12878.wgs.illumina_platinum.20140122.indel.genotypes.vcf.gz 172 | 29162279 2016-03-26T20:26:12Z gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/NA12878.wgs.illumina_platinum.20140122.snp.genotypes.vcf.gz 173 | 5243803 2016-03-26T20:26:07Z gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/NA12878.wgs.illumina_platinum.20140404.indels_v2.vcf.gz 174 | 29014774 2016-03-26T20:26:09Z gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/NA12878.wgs.illumina_platinum.20140404.snps_v2.vcf.gz 175 | 31103 2016-03-26T20:26:07Z gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/NA12878.wgs.illumina_platinum.20140404.svs_v2.vcf.gz 176 | TOTAL: 5 objects, 68507651 bytes (65.33 MiB) 177 | ``` 178 | 179 | ## (5) Check the header in the output 180 | 181 | ``` 182 | $ gsutil cat gs://YOUR-BUCKET/pipelines-api-examples/set_vcf_sample_id/output/* \ 183 | | zcat \ 184 | | grep ^#CHROM 185 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878-NEW 186 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878-NEW 187 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878-NEW 188 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878-NEW 189 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878-NEW 190 | ``` 191 | --------------------------------------------------------------------------------