├── .travis.yml ├── project └── assembly.sbt ├── assembly.sbt ├── .gitignore ├── src └── main │ ├── scala │ └── com │ │ └── google │ │ └── cloud │ │ └── genomics │ │ ├── Client.scala │ │ └── spark │ │ └── examples │ │ ├── VariantsCommon.scala │ │ ├── GenomicsConf.scala │ │ ├── SearchVariantsExample.scala │ │ ├── rdd │ │ ├── ReadsRDD.scala │ │ └── VariantsRDD.scala │ │ ├── VariantsPca.scala │ │ └── SearchReadsExample.scala │ └── python │ └── variants_pca.py ├── CONTRIBUTING.rst ├── README.md └── LICENSE /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | -------------------------------------------------------------------------------- /assembly.sbt: -------------------------------------------------------------------------------- 1 | run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/Client.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics 17 | 18 | import java.io.File 19 | import java.io.FileReader 20 | import java.io.StringReader 21 | import java.util.Scanner 22 | import scala.util.{Try, Success, Failure} 23 | import com.google.api.services.genomics.Genomics 24 | import com.google.cloud.genomics.utils.CredentialFactory 25 | import com.google.cloud.genomics.utils.GenomicsFactory 26 | import com.google.cloud.genomics.utils.OfflineAuth 27 | 28 | object Authentication { 29 | def getAccessToken(clientSecretsFile: Option[String], 30 | applicationName: String = "spark-examples") = { 31 | if(clientSecretsFile.isDefined) { 32 | System.out.println("\nThis pipeline will make your user credential available to all" 33 | + " Spark worker processes. 
Your credentials may be visible to others with access to the" 34 | + " machines on which this pipeline is running."); 35 | System.out.println("Do you want to continue (Y/n)?"); 36 | val kbd = new Scanner(System.in) 37 | val decision = kbd.nextLine() 38 | decision match { 39 | case "yes" | "Yes" | "YES" | "y" | "Y" => "proceed" 40 | case _ => System.exit(0) 41 | } 42 | new OfflineAuth(CredentialFactory.getCredentialFromClientSecrets(clientSecretsFile.get, applicationName)) 43 | } else { 44 | new OfflineAuth() 45 | } 46 | } 47 | } 48 | 49 | object Client { 50 | 51 | def apply(auth: OfflineAuth, applicationName: String = "spark-examples"): Client = { 52 | val factory = GenomicsFactory.builder().build() 53 | new Client(factory.fromOfflineAuth(auth), factory) 54 | } 55 | } 56 | 57 | class Client(val genomics: Genomics, private val factory: GenomicsFactory) { 58 | def initializedRequestsCount = factory.initializedRequestsCount() 59 | def unsuccessfulResponsesCount = factory.unsuccessfulResponsesCount() 60 | def ioExceptionsCount = factory.ioExceptionsCount() 61 | } 62 | 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | How to contribute 2 | =================================== 3 | 4 | First of all, thank you for contributing! 5 | 6 | The mailing list 7 | ---------------- 8 | 9 | For general questions or if you are having trouble getting started, try the 10 | `Google Genomics Discuss mailing list `_. 11 | It's a good way to sync up with other people who use googlegenomics including the core developers. You can subscribe 12 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 13 | the `web forum page `_. 14 | 15 | 16 | Submitting issues 17 | ----------------- 18 | 19 | If you are encountering a bug in the code or have a feature request in mind - file away! 20 | 21 | 22 | Submitting a pull request 23 | ------------------------- 24 | 25 | If you are ready to contribute code, Github provides a nice `overview on how to create a pull request 26 | `_. 27 | 28 | Some general rules to follow: 29 | 30 | * Do your work in `a fork `_ of this repo. 31 | * Create a branch for each update that you're working on. 32 | These branches are often called "feature" or "topic" branches. Any changes 33 | that you push to your feature branch will automatically be shown in the pull request. 34 | * Keep your pull requests as small as possible. Large pull requests are hard to review. 35 | Try to break up your changes into self-contained and incremental pull requests. 36 | * The first line of commit messages should be a short (<80 character) summary, 37 | followed by an empty line and then any details that you want to share about the commit. 38 | * Please try to follow the existing syntax style 39 | 40 | When you submit or change your pull request, the Travis build system will automatically run tests. 41 | If your pull request fails to pass tests, review the test log, make changes and 42 | then push them to your feature branch to be tested again. 43 | 44 | 45 | Contributor License Agreements 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | All pull requests are welcome. Before we can submit them though, there is a legal hurdle we have to jump. 49 | You'll need to fill out either the individual or corporate Contributor License Agreement 50 | (CLA). 
51 | 52 | * If you are an individual writing original source code and you're sure you 53 | own the intellectual property, then you'll need to sign an `individual CLA 54 | `_. 55 | * If you work for a company that wants to allow you to contribute your work, 56 | then you'll need to sign a `corporate CLA 57 | `_. 58 | 59 | Follow either of the two links above to access the appropriate CLA and 60 | instructions for how to sign and return it. Once we receive it, we'll be able to 61 | accept your pull requests. 62 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/VariantsCommon.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | import org.apache.spark.SparkContext 20 | import com.google.api.services.genomics.model.SearchCallSetsRequest 21 | import com.google.genomics.v1.{Variant => VariantModel} 22 | import com.google.cloud.genomics.Authentication 23 | import com.google.cloud.genomics.Client 24 | import com.google.cloud.genomics.spark.examples.rdd.Variant 25 | import com.google.cloud.genomics.spark.examples.rdd.VariantKey 26 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 27 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRDD 28 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRddStats 29 | import com.google.cloud.genomics.utils.Paginator 30 | 31 | import org.apache.spark.rdd.RDD 32 | 33 | class VariantsCommon(conf: PcaConf, sc: SparkContext) { 34 | 35 | private val auth = Authentication.getAccessToken(conf.clientSecrets.get) 36 | private val ioStats = createIoStats 37 | 38 | val (indexes, names) = { 39 | val client = Client(auth).genomics 40 | val searchCallsets = Paginator.Callsets.create(client) 41 | val req = new SearchCallSetsRequest() 42 | .setVariantSetIds(conf.variantSetId()) 43 | val callsets = searchCallsets.search(req).iterator().toSeq 44 | val indexes = callsets.map( 45 | callset => callset.getId()).toSeq.zipWithIndex.toMap 46 | val names = callsets.map( 47 | callset => (callset.getId(), callset.getName())).toMap 48 | println(s"Matrix size: ${indexes.size}.") 49 | (indexes, names) 50 | } 51 | 52 | val data = { 53 | if (conf.inputPath.isDefined) { 54 | List(sc.objectFile[(VariantKey, Variant)](conf.inputPath()).map(_._2)) 55 | } else { 56 | val variantSets = conf.variantSetId() 57 | println(s"Running PCA on ${variantSets.length} datasets.") 58 | conf.variantSetId().zipWithIndex.map { 59 | case (variantSetId, variantSetIndex) => 60 | new VariantsRDD(sc, this.getClass.getName, auth, 61 | variantSetId, 62 | conf.getPartitioner(auth, variantSetId, variantSetIndex), 63 | stats=ioStats).map(_._2) 64 | } 65 | } 66 | } 67 | 68 | def reportIoStats = { 69 | this.ioStats match { 70 | 
case Some(stats) => println(stats.toString) 71 | case _ => {} 72 | } 73 | } 74 | 75 | // For now assume a single dataset when invoking from python. 76 | def getJavaData: RDD[VariantModel] = data.head.map(_.toJavaVariant) 77 | 78 | def createIoStats = if (conf.inputPath.isDefined) None 79 | else Option(new VariantsRddStats(sc)) 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/GenomicsConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.genomics.spark.examples 18 | 19 | import scala.collection.JavaConversions._ 20 | 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.SparkContext 23 | import org.rogach.scallop.ScallopConf 24 | 25 | import com.google.cloud.genomics.spark.examples.rdd.AllReferencesVariantsPartitioner 26 | import com.google.cloud.genomics.spark.examples.rdd.ReferencesVariantsPartitioner 27 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 28 | import com.google.cloud.genomics.utils.OfflineAuth 29 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 30 | 31 | class GenomicsConf(arguments: Seq[String]) extends ScallopConf(arguments) { 32 | val DEFAULT_NUMBER_OF_BASES_PER_SHARD = 1000000 33 | val PLATINUM_GENOMES_BRCA1_REFERENCES = "chr17:41196311:41277499" 34 | 35 | val basesPerPartition = opt[Long](default = 36 | Some(DEFAULT_NUMBER_OF_BASES_PER_SHARD), 37 | descr = "Partition each reference using a fixed number of bases") 38 | val clientSecrets = opt[String]( 39 | descr = "Provide the file path to client_secrets.json to use a user " 40 | + "credential instead of the Application Default Credential.") 41 | val inputPath = opt[String]() 42 | val numReducePartitions = opt[Int](default = Some(10), 43 | descr = "Set it to a " + 44 | "number greater than the number of cores, to achieve maximum " + 45 | "throughput.") 46 | val outputPath = opt[String]() 47 | val references = opt[List[String]](default= 48 | Some(List(PLATINUM_GENOMES_BRCA1_REFERENCES)), 49 | descr = "Comma separated tuples of reference:start:end,... " + 50 | "one list of tuples should be specified per variantset " + 51 | "in the corresponding order.") 52 | val sparkMaster = opt[String]( 53 | descr = "A spark master URL. 
Leave empty if using spark-submit.") 54 | val variantSetId = opt[List[String]]( 55 | default = Some(List(GoogleGenomicsPublicData.Platinum_Genomes)), 56 | descr = "List of VariantSetId to use in the analysis.") 57 | 58 | def newSparkContext(className: String) = { 59 | val conf = new SparkConf() 60 | .setAppName(className) 61 | .set("spark.shuffle.consolidateFiles", "true") 62 | if (this.sparkMaster.isDefined) 63 | conf.setMaster(this.sparkMaster()) 64 | new SparkContext(conf) 65 | } 66 | 67 | def getPartitioner(references: String) = { 68 | new ReferencesVariantsPartitioner(references, this.basesPerPartition()) 69 | } 70 | } 71 | 72 | object PcaConf { 73 | val ExcludeXY = SexChromosomeFilter.EXCLUDE_XY 74 | } 75 | 76 | class PcaConf(arguments: Seq[String]) extends GenomicsConf(arguments) { 77 | val allReferences = opt[Boolean]( 78 | descr = "Use all references (except X and Y) to compute PCA " + 79 | "(overrides --references).") 80 | val debugDatasets = opt[Boolean]() 81 | val minAlleleFrequency = opt[Float]( 82 | descr = "For 2-way PCA, omit variants from the left variant set (typically 1,000 Genomes)" + 83 | " by including only variants with allelic frequency (field AF) greater than" + 84 | " or equal to this value.") 85 | val numPc = opt[Int](default = Some(2)) 86 | 87 | /** 88 | * Returns either the parsed references for all datasets and their 89 | * corresponding --references or all references 90 | * except X and Y if --all-references is specified. 91 | */ 92 | def getPartitioner(auth: OfflineAuth, variantSetId: String, 93 | variantSetIndex: Int = 0) = { 94 | if (this.allReferences()) { 95 | new AllReferencesVariantsPartitioner(this.basesPerPartition(), auth) 96 | } else { 97 | new ReferencesVariantsPartitioner(this.references().get(variantSetIndex), 98 | this.basesPerPartition()) 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/SearchVariantsExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import org.apache.log4j.Level 19 | import org.apache.log4j.Logger 20 | import com.google.cloud.genomics.spark.examples.rdd.Variant 21 | import com.google.cloud.genomics.spark.examples.rdd.VariantKey 22 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 23 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRDD 24 | import com.google.cloud.genomics.Authentication 25 | import com.google.cloud.genomics.utils.Contig 26 | 27 | object GoogleGenomicsPublicData { 28 | final val Platinum_Genomes = "3049512673186936334" 29 | final val Thousand_Genomes_Phase_1 = "10473108253681171589" 30 | final val Thousand_Genomes_Phase_3 = "4252737135923902652" 31 | } 32 | 33 | /** 34 | * The variant in this example corresponds to dbSNP ID rs9536314, 35 | * causing an amino acid substitution in the Klotho gene (KL 36 | * F327V). About 30% of people carry the variant. In build 37, this is 37 | * an A to G substition at chromosome 13, position 33628138. 38 | */ 39 | object SearchVariantsExampleKlotho { 40 | val PLATINUM_GENOMES_KLOTHO_REFERENCES = "chr13:33628137:33628138" 41 | 42 | def main(args: Array[String]) = { 43 | val conf = new GenomicsConf(args) 44 | val applicationName = this.getClass.getName 45 | val sc = conf.newSparkContext(applicationName) 46 | Logger.getLogger("org").setLevel(Level.WARN) 47 | val references = "chr13:33628137:33628138" 48 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 49 | val data = new VariantsRDD(sc, 50 | applicationName, 51 | accessToken, 52 | GoogleGenomicsPublicData.Platinum_Genomes, 53 | conf.getPartitioner(references)) 54 | data.cache() // The amount of data is small since its just for one SNP. 55 | println("We have " + data.count() + " records that overlap Klotho.") 56 | println("But only " + data.filter { kv => 57 | val (key, variant) = kv 58 | variant.alternateBases != None 59 | }.count() + " records are of a variant.") 60 | println("The other " + data.filter { kv => 61 | val (key, variant) = kv 62 | variant.alternateBases == None 63 | }.count() + " records are reference-matching blocks.") 64 | val variants = data.filter { kv => 65 | val(key, variant) = kv 66 | variant.referenceBases != "N" 67 | } 68 | variants.collect.foreach { kv => 69 | val (key, variant) = kv 70 | println(s"Reference: ${variant.contig} @ ${variant.start}") 71 | } 72 | 73 | // Exercise conversion from scala objects back to java objects. This 74 | // is needed for a forthcoming example which writes modified 75 | // variants back to the variant store. 76 | // 77 | // TODO: this really belongs in an integration test or a unit test 78 | // with a mocked-out Genomics client; not in this sample. 79 | data.collect.foreach { kv => 80 | val (key, variant) = kv 81 | variant.toJavaVariant() } 82 | sc.stop 83 | } 84 | } 85 | 86 | /** 87 | * This example pulls all variants that overlap BRCA1. 
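 * The region queried is chr17:41196311-41277499 (build 37 coordinates), the same
 * interval used as the default --references value in GenomicsConf.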
88 | */ 89 | object SearchVariantsExampleBRCA1 { 90 | def main(args: Array[String]) = { 91 | val conf = new GenomicsConf(args) 92 | val applicationName = this.getClass.getName 93 | val sc = conf.newSparkContext(applicationName) 94 | Logger.getLogger("org").setLevel(Level.WARN) 95 | val brca1 = "chr17:41196311:41277499" 96 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 97 | val data = new VariantsRDD(sc, 98 | this.getClass.getName, 99 | accessToken, 100 | GoogleGenomicsPublicData.Platinum_Genomes, 101 | conf.getPartitioner(brca1)) 102 | data.cache() // The amount of data is small since its just for one gene 103 | println("We have " + data.count() + " records that overlap BRCA1.") 104 | println("But only " + data.filter { kv => 105 | val(key, variant) = kv 106 | variant.referenceBases != "N" 107 | }.count() + " records are of a variant.") 108 | println("The other " + data.filter { kv => 109 | val(key, variant) = kv 110 | variant.referenceBases == "N" 111 | }.count() + " records are reference-matching blocks.") 112 | sc.stop 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spark-examples [![Build Status](https://img.shields.io/travis/googlegenomics/spark-examples.svg?style=flat)](https://travis-ci.org/googlegenomics/spark-examples) 2 | ============== 3 | 4 | The projects in this repository demonstrate working with genomic data accessible via the [Google Genomics API](https://cloud.google.com/genomics/) using [Apache Spark](http://spark.apache.org/). 5 | 6 | > If you are ready to start coding, take a look at the information below. But if you are 7 | > looking for a task-oriented list (e.g., [How do I compute principal coordinate analysis 8 | > with Google Genomics?](http://googlegenomics.readthedocs.org/en/latest/use_cases/compute_principal_coordinate_analysis/index.html)), 9 | > a better place to start is the [Google Genomics Cookbook](http://googlegenomics.readthedocs.org/en/latest/index.html). 10 | 11 | Getting Started 12 | --------------- 13 | 14 | 1. git clone this repository. 15 | 16 | 1. If you have not already done so, follow the Google Genomics [getting started instructions](https://cloud.google.com/genomics/install-genomics-tools) to set up your environment 17 | including [installing gcloud](https://cloud.google.com/sdk/) and running `gcloud init`. 18 | 19 | 1. Download and install [Apache Spark](https://spark.apache.org/downloads.html). 20 | 21 | 1. Install [SBT](http://www.scala-sbt.org/release/docs/Getting-Started/Setup.html). 22 | 23 | 1. This project now includes code for calling the Genomics API using [gRPC](http://www.grpc.io). To use gRPC, you'll need a version of ALPN that matches your JRE version. 24 | 25 | 2. See the [ALPN documentation](http://www.eclipse.org/jetty/documentation/9.2.10.v20150310/alpn-chapter.html) for a table of which ALPN jar to use for your JRE version. 26 | 2. Then download the correct version from [here](http://mvnrepository.com/artifact/org.mortbay.jetty.alpn/alpn-boot). 27 | 28 | Local Run 29 | --------- 30 | From the `spark-examples` directory run `sbt run` 31 | 32 | Use the following flags to match your runtime configuration: 33 | 34 | ``` 35 | $ export SBT_OPTS='-Xbootclasspath/p:/YOUR/PATH/TO/alpn-boot-YOUR-VERSION.jar' 36 | $ sbt "run --help" 37 | -o, --output-path 38 | -s, --spark-master A spark master URL. Leave empty if using spark-submit. 39 | ... 
40 | --help Show help message 41 | ``` 42 | 43 | For example: 44 | 45 | ``` 46 | $ sbt "run --spark-master local[4]" 47 | ``` 48 | 49 | A menu should appear asking you to pick the sample to run: 50 | ``` 51 | Multiple main classes detected, select one to run: 52 | 53 | [1] com.google.cloud.genomics.spark.examples.SearchVariantsExampleKlotho 54 | [2] com.google.cloud.genomics.spark.examples.SearchVariantsExampleBRCA1 55 | [3] com.google.cloud.genomics.spark.examples.SearchReadsExample1 56 | [4] com.google.cloud.genomics.spark.examples.SearchReadsExample2 57 | [5] com.google.cloud.genomics.spark.examples.SearchReadsExample3 58 | [6] com.google.cloud.genomics.spark.examples.SearchReadsExample4 59 | [7] com.google.cloud.genomics.spark.examples.VariantsPcaDriver 60 | 61 | Enter number: 62 | ``` 63 | 64 | ### Troubleshooting: 65 | 66 | If you are seeing `java.lang.OutOfMemoryError: PermGen space` errors, set the following SBT_OPTS flag: 67 | ``` 68 | export SBT_OPTS='-XX:MaxPermSize=256m' 69 | ``` 70 | 71 | Run on Google Compute Engine 72 | ----------------------------- 73 | 74 | (1) Build the assembly. 75 | ``` 76 | sbt assembly 77 | ``` 78 | (2) Deploy your Spark cluster using [Google Cloud Dataproc](https://cloud.google.com/dataproc/). 79 | ``` 80 | gcloud beta dataproc clusters create example-cluster --scopes cloud-platform 81 | ``` 82 | (3) Copy the assembly jar to the master node. 83 | ``` 84 | gcloud compute copy-files \ 85 | target/scala-2.10/googlegenomics-spark-examples-assembly-1.0.jar example-cluster-m:~/ 86 | ``` 87 | (4) ssh to the master. 88 | ``` 89 | gcloud compute ssh example-cluster-m 90 | ``` 91 | (5) Run one of the examples. 92 | ``` 93 | spark-submit --class com.google.cloud.genomics.spark.examples.SearchReadsExample1 \ 94 | googlegenomics-spark-examples-assembly-1.0.jar 95 | ``` 96 | 97 | ### Running PCA variant analysis on GCE 98 | To run the [variant PCA analysis](https://github.com/googlegenomics/spark-examples/blob/master/src/main/scala/com/google/cloud/genomics/spark/examples/VariantsPca.scala) on GCE make sure you have followed all the steps on the previous section and that you are able to run at least one of the examples. 99 | 100 | Run the example PCA analysis for BRCA1 on the [1000 Genomes Project dataset](https://cloud.google.com/genomics/data/1000-genomes). 101 | ``` 102 | spark-submit --class com.google.cloud.genomics.spark.examples.VariantsPcaDriver \ 103 | googlegenomics-spark-examples-assembly-1.0.jar 104 | ``` 105 | 106 | The analysis will output the two principal components for each sample to the console. Here is an example of the last few lines. 107 | ``` 108 | ... 
109 | NA20811 0.0286308791579312 -0.008456233951873527 110 | NA20812 0.030970386921818943 -0.006755469223823698 111 | NA20813 0.03080348019961635 -0.007475822860939408 112 | NA20814 0.02865238920148145 -0.008084003476919057 113 | NA20815 0.028798695736608034 -0.003755789964021788 114 | NA20816 0.026104805529612096 -0.010430718823329282 115 | NA20818 -0.033609576645005836 -0.026655905606186293 116 | NA20819 0.032019557126552155 -0.00775750983842731 117 | NA20826 0.03026607917284046 -0.009102704080927001 118 | NA20828 -0.03412964005321165 -0.025991697661590686 119 | NA21313 -0.03401702847363714 -0.024555217139987182 120 | ``` 121 | 122 | This pipeline is described in greater detail on [How do I compute principal coordinate analysis with Google Genomics?](http://googlegenomics.readthedocs.org/en/latest/use_cases/compute_principal_coordinate_analysis/index.html) 123 | 124 | ### Debugging 125 | 126 | For more information, see https://cloud.google.com/dataproc/faq 127 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/rdd/ReadsRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples.rdd 17 | 18 | import java.util.{List => JList} 19 | 20 | import com.google.cloud.genomics.Client 21 | import com.google.cloud.genomics.utils.OfflineAuth 22 | import com.google.cloud.genomics.utils.ShardBoundary 23 | import com.google.cloud.genomics.utils.ShardUtils 24 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 25 | import com.google.cloud.genomics.utils.grpc.ReadStreamIterator 26 | import com.google.genomics.v1.StreamReadsRequest 27 | import com.google.genomics.v1.{Read => ReadModel} 28 | import com.google.protobuf.ByteString 29 | import com.google.protobuf.ListValue 30 | import com.google.protobuf.Value 31 | 32 | import org.apache.spark.Partition 33 | import org.apache.spark.SparkContext 34 | import org.apache.spark.SparkContext._ 35 | import org.apache.spark.TaskContext 36 | import org.apache.spark.rdd.RDD 37 | import scala.collection.JavaConversions._ 38 | import scala.collection.JavaConverters._ 39 | 40 | /** 41 | * A serializable version of the Read. 
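 * Fields from the gRPC Read message are copied into plain JVM types here; for
 * background on why a copy is kept instead of the protobuf class itself, see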
42 | * https://github.com/googlegenomics/spark-examples/issues/84 43 | */ 44 | case class Read(alignedQuality: JList[Integer], cigar: String, 45 | id: String, mappingQuality: Int, matePosition: Option[Long], 46 | mateReferenceName: Option[String], fragmentName: String, alignedSequence: String, 47 | position: Long, readGroupSetId: String, referenceName: String, 48 | info: Map[String, JList[String]], fragmentLength: Int) extends Serializable 49 | 50 | object ReadBuilder { 51 | 52 | val CIGAR_MATCH = Map( 53 | "ALIGNMENT_MATCH" -> "M", 54 | "CLIP_HARD" -> "H", 55 | "CLIP_SOFT" -> "S", 56 | "DELETE" -> "D", 57 | "INSERT" -> "I", 58 | "PAD" -> "P", 59 | "SEQUENCE_MATCH" -> "=", 60 | "SEQUENCE_MISMATCH" -> "X", 61 | "SKIP" -> "N") 62 | 63 | def fromJavaRead(r: ReadModel) = { 64 | val readKey = ReadKey(r.getAlignment.getPosition.getReferenceName, 65 | r.getAlignment.getPosition.getPosition) 66 | 67 | val cigar = r.getAlignment.getCigarList.map(cigarUnit => 68 | cigarUnit.getOperationLength() + 69 | CIGAR_MATCH(cigarUnit.getOperation().name())).mkString("") 70 | 71 | val read = Read( 72 | r.getAlignedQualityList, 73 | cigar, 74 | r.getId, 75 | r.getAlignment.getMappingQuality, 76 | Some(r.getNextMatePosition.getPosition), 77 | Some(r.getNextMatePosition.getReferenceName), 78 | r.getFragmentName, 79 | r.getAlignedSequence, 80 | r.getAlignment.getPosition.getPosition, 81 | r.getReadGroupSetId, 82 | r.getAlignment.getPosition.getReferenceName, 83 | r.getInfo.mapValues(_.getValuesList.map(_.getStringValue()).toList.asJava).toMap, 84 | r.getFragmentLength) 85 | (readKey, read) 86 | } 87 | } 88 | 89 | /** 90 | * A simple Spark RDD backed by Google Genomics Readstore and populated 91 | * via the StreamReads API call (https://cloud.google.com/genomics/reference/rpc/google.genomics.v1#streamingreadservice). 92 | */ 93 | class ReadsRDD(sc: SparkContext, 94 | applicationName: String, 95 | auth: OfflineAuth, 96 | readGroupSetId: String, 97 | readsPartitioner: ReadsPartitioner) 98 | extends RDD[(ReadKey, Read)](sc, Nil) { 99 | 100 | override def getPartitions: Array[Partition] = { 101 | readsPartitioner.getPartitions(readGroupSetId) 102 | } 103 | 104 | override def compute(part: Partition, ctx: TaskContext): 105 | Iterator[(ReadKey, Read)] = { 106 | val client = Client(auth).genomics 107 | val partition = part.asInstanceOf[ReadsPartition] 108 | val request = partition.getReadsRequest 109 | val responses = ReadStreamIterator.enforceShardBoundary( 110 | auth, request, ShardBoundary.Requirement.STRICT, null); 111 | val iterator = responses.flatMap(readResponse => { 112 | readResponse.getAlignmentsList().map(read => { 113 | ReadBuilder.fromJavaRead(read) 114 | }) 115 | }) 116 | // Wrap the iterator to read the number of initialized requests once 117 | // it is fully traversed. 118 | new Iterator[(ReadKey, Read)]() { 119 | def hasNext = { 120 | val hasNext = iterator.hasNext 121 | hasNext 122 | } 123 | 124 | def next = iterator.next 125 | } 126 | } 127 | } 128 | 129 | /** 130 | * Defines a search range over a contig. 131 | */ 132 | case class ReadsPartition( 133 | override val index: Int, serializedRequest: ByteString) 134 | extends Partition { 135 | 136 | def getReadsRequest = StreamReadsRequest.parseFrom(serializedRequest) 137 | 138 | def range = { 139 | val request = getReadsRequest 140 | request.getEnd() - request.getStart() 141 | } 142 | } 143 | 144 | 145 | /** 146 | * Indexes a Read to its partition. 
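 * The key is the (referenceName, alignment start position) pair populated by
 * ReadBuilder.fromJavaRead.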
147 | */ 148 | case class ReadKey(contig: String, position: Long) 149 | 150 | trait ReadsPartitioner extends Serializable { 151 | def getPartitions(readGroupSetId: String): Array[Partition] 152 | } 153 | 154 | 155 | /** 156 | * Describes partitions for a set of contigs and their ranges. 157 | */ 158 | class AllReferencesReadsPartitioner(numberOfBasesPerShard: Long, 159 | auth: OfflineAuth) extends ReadsPartitioner { 160 | 161 | // Generates all partitions for all mapped Reads in the contig space. 162 | def getPartitions(readGroupSetId: String): Array[Partition] = { 163 | println(s"ReadGroupSetId: ${readGroupSetId}; All refs, exclude XY") 164 | ShardUtils.getReadRequests( 165 | readGroupSetId, SexChromosomeFilter.INCLUDE_XY, 166 | numberOfBasesPerShard, auth).zipWithIndex.map { 167 | case(request, index) => ReadsPartition(index, request.toByteString()) 168 | }.toArray 169 | } 170 | } 171 | 172 | class ReferencesReadsPartitioner(references: String, 173 | numberOfBasesPerShard: Long) extends ReadsPartitioner { 174 | // Generates all partitions for all mapped Reads in the contig space. 175 | def getPartitions(readGroupSetId: String): Array[Partition] = { 176 | println(s"ReadGroupSetId: ${readGroupSetId}; Refs: ${references}") 177 | ShardUtils.getReadRequests( 178 | List(readGroupSetId), references, numberOfBasesPerShard).zipWithIndex.map { 179 | case(request, index) => ReadsPartition(index, request.toByteString) 180 | }.toArray 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/main/python/variants_pca.py: -------------------------------------------------------------------------------- 1 | # spark-submit --jars googlegenomics-spark-examples-assembly-1.0.jar \ 2 | # --driver-class-path googlegenomics-spark-examples-assembly-1.0.jar \ 3 | # src/main/python/variants_pca.py --client-secrets client_secrets.json 4 | import json 5 | import numpy 6 | import operator 7 | import sys 8 | 9 | import pyspark 10 | from pyspark import serializers 11 | import pyspark.conf 12 | from pyspark.mllib import common as mllib_common 13 | from pyspark.mllib import linalg 14 | import pyspark.rdd 15 | 16 | conf = pyspark.conf.SparkConf() 17 | sc = pyspark.SparkContext(conf=conf) 18 | 19 | def prepare_call_data(py_rdd, py_id_to_index): 20 | """Return an RDD[Seq[int]] from the RDD[(VariantKey, Variant)]. 21 | 22 | Args: 23 | py_rdd: An RDD of (VariantKey, Variant) of all Variants matching the 24 | search criteria. 25 | py_id_to_index: A dictionary of string to int, giving the indices of 26 | callset names in ``py_rdd``. 27 | 28 | Returns: 29 | An RDD[Seq[int]] in the same order of the input RDD, each entry is a 30 | list of indices of variant calls. 31 | """ 32 | 33 | # Obtain all samples that have at least one matching call. 34 | samples_with_variant = (py_rdd. 35 | map(lambda v: v.get('calls', [])). 36 | map(lambda calls: [c for c in calls if any(c['genotype'])]). 37 | filter(lambda calls: len(calls) > 0) 38 | ) 39 | 40 | # Obtain the callset name from the samples. 41 | callset_names = (samples_with_variant. 42 | map(lambda callset: [c['callSetId'] for c in callset]) 43 | ) 44 | 45 | # Convert all names (strings) to indices (ints). 
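    # Broadcasting the id-to-index map ships it to each executor once, rather
    # than serializing it into every task closure that uses it.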
46 | sc = pyspark.SparkContext._active_spark_context 47 | broadcast_index_map = sc.broadcast(py_id_to_index) 48 | call_rdd = callset_names.map( 49 | lambda callset: [broadcast_index_map.value[c] for c in callset] 50 | ) 51 | 52 | return call_rdd 53 | 54 | def calculate_similarity_matrix(call_rdd, matrix_size): 55 | """Return an RDD[(int, int), int] where each entry is similarity value of 56 | call ``x``, with respect to call ``y``. 57 | 58 | Args: 59 | call_rdd: An RDD[Seq[int]] as returned by ``prepare_call_data``. 60 | matrix_size: The size (N) of the N x N matrix. 61 | 62 | Returns: 63 | An RDD[(x, y), sim_value] where each entry is similarity value of call 64 | ``x`` with respect to call ``y``. 65 | """ 66 | 67 | def sum_similarity(callsets): 68 | matrix = numpy.zeros((matrix_size, matrix_size), numpy.int) 69 | for callset in callsets: 70 | for x in callset: 71 | for y in callset: 72 | matrix[y][x] += 1 73 | for x in xrange(matrix_size): 74 | for y in xrange(matrix_size): 75 | yield (y, x), matrix[y][x] 76 | 77 | sim_matrix = (call_rdd. 78 | mapPartitions(sum_similarity). 79 | reduceByKey(operator.add) 80 | ) 81 | 82 | return sim_matrix 83 | 84 | def center_matrix(sim_matrix, row_count): 85 | """Center the rows and columns of a similarity matrix. 86 | 87 | Args: 88 | sim_matrix: A similarity matrix as returned by 89 | ``calculate_similarity_matrix``. 90 | row_count: The size (N) of the N x N matrix. 91 | 92 | Returns: 93 | An RDD[int, (int, float)] representing centered rows. The first int is 94 | the row index, the (int, float) tuple is the column index, and the 95 | centered value. 96 | """ 97 | 98 | # Row-by-row (row major) RDD. Each row is a list of (column, value). 99 | entries = (sim_matrix. 100 | map(lambda ((y, x), v): (y, (x, float(v)))). 101 | groupByKey(). 102 | sortByKey(True). 103 | cache() 104 | ) 105 | row_sums = entries.map(lambda (y, xvs): sum(v for (x, v) in xvs)).collect() 106 | matrix_sum = sum(row_sums) 107 | matrix_mean = float(matrix_sum) / row_count / row_count 108 | 109 | sc = pyspark.SparkContext._active_spark_context 110 | broadcast_row_sums = sc.broadcast(row_sums) 111 | 112 | def center_rows((row, col_vals)): 113 | row_mean = broadcast_row_sums.value[row] / float(row_count) 114 | 115 | def center_cols(col, val): 116 | col_mean = broadcast_row_sums.value[col] / float(row_count) 117 | return (col, val - row_mean - col_mean + matrix_mean) 118 | 119 | return [center_cols(col, val) for col, val in col_vals] 120 | 121 | return entries.map(center_rows) 122 | 123 | def perform_pca(matrix, row_count, nr_principal_components=2): 124 | """Return principal components of the input matrix. 125 | 126 | This function uses MLlib's ``RowMatrix`` to compute principal components. 127 | 128 | Args: 129 | matrix: An RDD[int, (int, float)] representing a sparse matrix. This 130 | is returned by ``center_matrix`` but it is not required to center 131 | the matrix first. 132 | row_count: The size (N) of the N x N ``matrix``. 133 | nr_principal_components: Number of components we want to obtain. This 134 | value must be less than or equal to the number of rows in the input 135 | square matrix. 136 | 137 | Returns: 138 | An array of ``nr_principal_components`` columns, and same number of rows 139 | as the input ``matrix``. This array is a ``numpy`` array. 
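        Columns hold the principal components in decreasing order of explained
        variance, as computed by MLlib's RowMatrix.computePrincipalComponents.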
140 | """ 141 | 142 | py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row)) 143 | sc = pyspark.SparkContext._active_spark_context 144 | java_rdd = mllib_common._py2java(sc, py_rdd) 145 | scala_rdd = java_rdd.rdd() 146 | sc = pyspark.SparkContext._active_spark_context 147 | row_matrix = (sc._jvm.org.apache.spark.mllib.linalg.distributed. 148 | RowMatrix(scala_rdd) 149 | ) 150 | pca = row_matrix.computePrincipalComponents(nr_principal_components) 151 | pca = mllib_common._java2py(sc, pca) 152 | return pca.toArray() 153 | 154 | def pca(argv): 155 | sc = pyspark.SparkContext._active_spark_context 156 | args = sc._jvm.java.util.ArrayList() 157 | for arg in argv: 158 | args.append(arg) 159 | args = sc._jvm.scala.collection.JavaConversions.asScalaBuffer(args) 160 | jsc = sc._jsc.sc() 161 | 162 | pca_conf = (sc._jvm.com.google.cloud.genomics.spark.examples. 163 | PcaConf(args)) 164 | variants_common = (sc._jvm.com.google.cloud.genomics.spark.examples. 165 | VariantsCommon(pca_conf, jsc)) 166 | 167 | # Map of sample ID to an index in the list of all sample IDs. 168 | # e.g. the list ['NA20818', 'NA20819', 'NA20826'] produces the map 169 | # {'NA20818': 0, 'NA20819': 1, 'NA20826', 2} 170 | java_id_to_index = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap( 171 | variants_common.indexes()) 172 | py_id_to_index = {} 173 | for k in java_id_to_index: 174 | py_id_to_index[k] = java_id_to_index[k] 175 | # This is the reverse map of the previous one. 176 | index_to_id = dict((v, k) for (k, v) in py_id_to_index.iteritems()) 177 | 178 | # Obtain an RDD of all Variants matching PcaConf. 179 | scala_rdd = variants_common.getJavaData() 180 | java_rdd = scala_rdd.toJavaRDD() 181 | # Convert it to Python RDD. 182 | py_rdd = mllib_common._java2py(sc, java_rdd) 183 | 184 | call_rdd = prepare_call_data(py_rdd, py_id_to_index) 185 | 186 | row_count = len(py_id_to_index) 187 | sim_matrix = calculate_similarity_matrix(call_rdd, row_count) 188 | 189 | centered_rows = center_matrix(sim_matrix, row_count) 190 | 191 | result = perform_pca(centered_rows, row_count, 192 | pca_conf.numPc().get().get()) 193 | assert(len(result) == len(index_to_id)) 194 | result = [(index_to_id[i], result[i]) for i in range(len(index_to_id))] 195 | result.sort() 196 | for name, components in result: 197 | print '%s\t%s' % (name, '\t'.join(str(c) for c in components)) 198 | 199 | sc.stop() 200 | 201 | pca(sys.argv[1:]) 202 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/rdd/VariantsRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.spark.examples.rdd 17 | 18 | import java.lang.{Double => JDouble} 19 | import java.util.{List => JList} 20 | 21 | import com.google.cloud.genomics.Client 22 | import com.google.cloud.genomics.utils.OfflineAuth 23 | import com.google.cloud.genomics.utils.ShardBoundary 24 | import com.google.cloud.genomics.utils.ShardUtils 25 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 26 | import com.google.cloud.genomics.utils.grpc.VariantStreamIterator 27 | import com.google.genomics.v1.StreamVariantsRequest 28 | import com.google.genomics.v1.{Variant => VariantModel} 29 | import com.google.genomics.v1.{VariantCall => CallModel} 30 | import com.google.protobuf.ByteString 31 | import com.google.protobuf.ListValue 32 | import com.google.protobuf.Value 33 | 34 | import org.apache.spark.Accumulator 35 | import org.apache.spark.Partition 36 | import org.apache.spark.SparkContext 37 | import org.apache.spark.SparkContext._ 38 | import org.apache.spark.TaskContext 39 | import org.apache.spark.rdd.RDD 40 | import scala.collection.JavaConversions._ 41 | 42 | /** 43 | * A serializable version of the Variant. 44 | * https://github.com/googlegenomics/spark-examples/issues/84 45 | */ 46 | case class Call(callsetId: String, callsetName: String, genotype: List[Integer], 47 | genotypeLikelihood: Option[List[JDouble]], phaseset: String, 48 | info: Map[String, List[String]]) extends Serializable 49 | 50 | 51 | case class Variant(contig: String, id: String, names: Option[List[String]], 52 | start: Long, end: Long, referenceBases: String, 53 | alternateBases: Option[List[String]], info: Map[String, List[String]], 54 | created: Long, variantSetId: String, calls: Option[Seq[Call]]) extends Serializable { 55 | 56 | def toListValue(values: List[String]) = { 57 | val listValue = ListValue.newBuilder() 58 | listValue.addAllValues( 59 | values.map(Value.newBuilder().setStringValue(_).build)) 60 | listValue.build 61 | } 62 | 63 | def toJavaVariant() = { 64 | val variant = VariantModel.newBuilder() 65 | .setReferenceName(this.contig) 66 | .setCreated(this.created) 67 | .setVariantSetId(this.variantSetId) 68 | .setId(this.id) 69 | .setStart(this.start) 70 | .setEnd(this.end) 71 | .setReferenceBases(this.referenceBases) 72 | 73 | variant.putAllInfo(this.info.mapValues(toListValue)) 74 | 75 | if (this.alternateBases isDefined) 76 | variant.addAllAlternateBases(this.alternateBases.get) 77 | if (this.names isDefined) 78 | variant.addAllNames(this.names.get) 79 | if (this.calls isDefined) { 80 | val calls = this.calls.get.map 81 | { c => 82 | val call = CallModel.newBuilder() 83 | .setCallSetId(c.callsetId) 84 | .setCallSetName(c.callsetName) 85 | 86 | call.addAllGenotype(c.genotype) 87 | call.setPhaseset(c.phaseset) 88 | 89 | call.putAllInfo(c.info.mapValues(toListValue)) 90 | if (c.genotypeLikelihood isDefined) 91 | call.addAllGenotypeLikelihood(c.genotypeLikelihood.get) 92 | call.build 93 | } 94 | variant.addAllCalls(calls) 95 | } 96 | variant.build 97 | } 98 | } 99 | 100 | 101 | object VariantsBuilder { 102 | 103 | val refNameRegex = """([a-z]*)?([0-9]*)""".r 104 | 105 | def normalize(referenceName: String) = { 106 | referenceName match { 107 | case refNameRegex(ref, id) => Some(id) 108 | case _ => None 109 | } 110 | } 111 | 112 | def toStringList(values: ListValue) = 113 | values.getValuesList.map(_.getStringValue()).toList 114 | 115 | def build(r: VariantModel) = { 116 | val variantKey = VariantKey(r.getReferenceName, r.getStart) 117 | val calls = if 
(r.getCallsCount > 0) 118 | Some(r.getCallsList.map( 119 | c => Call( 120 | c.getCallSetId, 121 | c.getCallSetName, 122 | c.getGenotypeList.toList, 123 | if (c.getGenotypeLikelihoodCount > 0) 124 | Some(c.getGenotypeLikelihoodList.toList) 125 | else 126 | None, 127 | c.getPhaseset, 128 | c.getInfo.mapValues(toStringList).toMap))) 129 | else 130 | None 131 | 132 | val referenceName = normalize(r.getReferenceName) 133 | 134 | if (referenceName.isEmpty) { 135 | None; 136 | } else { 137 | val variant = Variant( 138 | referenceName.get, 139 | r.getId, 140 | if (r.getNamesCount() > 0) 141 | Some(r.getNamesList.toList) 142 | else 143 | None, 144 | r.getStart, 145 | r.getEnd, 146 | r.getReferenceBases, 147 | if (r.getAlternateBasesCount() > 0) 148 | Some(r.getAlternateBasesList.toList) 149 | else 150 | None, 151 | r.getInfo.mapValues(toStringList).toMap, 152 | r.getCreated, 153 | r.getVariantSetId, 154 | calls) 155 | Some((variantKey, variant)) 156 | } 157 | } 158 | } 159 | 160 | class VariantsRddStats(sc: SparkContext) extends Serializable { 161 | val partitionsAccum = sc.accumulator(0, "Partitions count") 162 | val referenceBasesAccum = sc.accumulator(0L, "Reference bases count") 163 | val requestsAccum = sc.accumulator(0, "Request count") 164 | val unsuccessfulResponsesAccum = sc.accumulator(0, "Unsuccessful count") 165 | val ioExceptionsAccum = sc.accumulator(0, "IO exceptions count") 166 | val variantsAccum = sc.accumulator(0, "Variant count") 167 | 168 | override def toString ={ 169 | val buf = new StringBuilder 170 | buf ++= "Variants API stats:\n" 171 | buf ++= "-------------------------------\n" 172 | buf ++= s"# of partitions: ${this.partitionsAccum}\n" 173 | buf ++= s"# of bases requested: ${this.referenceBasesAccum}\n" 174 | buf ++= s"# of variants read: ${this.variantsAccum}\n" 175 | buf ++= s"# of API requests: ${this.requestsAccum}\n" 176 | buf ++= s"# of unsuccessful responses: ${this.unsuccessfulResponsesAccum}\n" 177 | buf ++= s"# of IO exceptions: ${this.ioExceptionsAccum}\n" 178 | buf.toString 179 | } 180 | } 181 | 182 | /** 183 | * A simple Spark RDD backed by Google Genomics VariantStore and 184 | * populated via the StreamVariants API call 185 | * (https://cloud.google.com/genomics/reference/rpc/google.genomics.v1#streamingvariantservice). 
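 * Each partition carries one serialized StreamVariantsRequest shard; compute()
 * streams that shard and converts each protobuf record into the Variant case
 * class defined above.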
186 | */ 187 | class VariantsRDD(sc: SparkContext, 188 | applicationName: String, 189 | auth: OfflineAuth, 190 | variantSetId: String, 191 | variantsPartitioner: VariantsPartitioner, 192 | stats:Option[VariantsRddStats] = None) 193 | extends RDD[(VariantKey, Variant)](sc, Nil) { 194 | 195 | override def getPartitions: Array[Partition] = { 196 | variantsPartitioner.getPartitions(variantSetId) 197 | } 198 | 199 | def reportStats(client: Client) = stats map { stat => 200 | stat.requestsAccum += client.initializedRequestsCount 201 | stat.unsuccessfulResponsesAccum += client.unsuccessfulResponsesCount 202 | stat.ioExceptionsAccum += client.ioExceptionsCount 203 | } 204 | 205 | override def compute(part: Partition, ctx: TaskContext): 206 | Iterator[(VariantKey, Variant)] = { 207 | val client = Client(auth) 208 | val partition = part.asInstanceOf[VariantsPartition] 209 | val request = partition.getVariantsRequest 210 | val responses = VariantStreamIterator.enforceShardBoundary( 211 | auth, request, ShardBoundary.Requirement.STRICT, null); 212 | val iterator = responses.flatMap(variantResponse => { 213 | variantResponse.getVariantsList().map(variant => { 214 | stats map { _.variantsAccum += 1 } 215 | VariantsBuilder.build(variant) 216 | }) 217 | }).filter(_.isDefined).map(_.get) 218 | stats map { stat => 219 | stat.partitionsAccum += 1 220 | stat.referenceBasesAccum += (partition.range) 221 | } 222 | // Wrap the iterator to read the number of initialized requests once 223 | // it is fully traversed. 224 | new Iterator[(VariantKey, Variant)]() { 225 | def hasNext = { 226 | val hasNext = iterator.hasNext 227 | if (!hasNext) { 228 | reportStats(client) 229 | } 230 | hasNext 231 | } 232 | 233 | def next = iterator.next 234 | } 235 | } 236 | } 237 | 238 | 239 | /** 240 | * Defines a search range over a contig. 241 | */ 242 | case class VariantsPartition( 243 | override val index: Int, serializedRequest: ByteString) 244 | extends Partition { 245 | 246 | def getVariantsRequest = StreamVariantsRequest.parseFrom(serializedRequest) 247 | 248 | def range = { 249 | val request = getVariantsRequest 250 | request.getEnd() - request.getStart() 251 | } 252 | } 253 | 254 | 255 | /** 256 | * Indexes a variant to its partition. 257 | */ 258 | case class VariantKey(contig: String, position: Long) 259 | 260 | trait VariantsPartitioner extends Serializable { 261 | def getPartitions(variantSetId: String): Array[Partition] 262 | } 263 | 264 | 265 | /** 266 | * Describes partitions for a set of contigs and their ranges. 267 | */ 268 | class AllReferencesVariantsPartitioner(numberOfBasesPerShard: Long, 269 | auth: OfflineAuth) extends VariantsPartitioner { 270 | 271 | // Generates all partitions for all mapped variants in the contig space. 272 | def getPartitions(variantSetId: String): Array[Partition] = { 273 | println(s"Variantset: ${variantSetId}; All refs, exclude XY") 274 | ShardUtils.getVariantRequests( 275 | variantSetId, SexChromosomeFilter.EXCLUDE_XY, 276 | numberOfBasesPerShard, auth).zipWithIndex.map { 277 | case(request, index) => VariantsPartition(index, request.toByteString()) 278 | }.toArray 279 | } 280 | } 281 | 282 | class ReferencesVariantsPartitioner(references: String, 283 | numberOfBasesPerShard: Long) extends VariantsPartitioner { 284 | // Generates all partitions for all mapped variants in the contig space. 
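  // Unlike the partitioner above, the contig space here is limited to the
  // references string supplied on the command line (e.g.
  // "chr17:41196311:41277499"), sharded by numberOfBasesPerShard.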
285 | def getPartitions(variantSetId: String): Array[Partition] = { 286 | println(s"Variantset: ${variantSetId}; Refs: ${references}") 287 | ShardUtils.getVariantRequests( 288 | variantSetId, references, numberOfBasesPerShard).zipWithIndex.map { 289 | case(request, index) => VariantsPartition(index, request.toByteString) 290 | }.toArray 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/VariantsPca.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | 20 | import org.apache.log4j.Level 21 | import org.apache.log4j.Logger 22 | import org.apache.spark.SparkContext 23 | import org.apache.spark.broadcast.Broadcast 24 | import org.apache.spark.mllib.linalg.Vectors 25 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.rdd.RDD._ 28 | import org.apache.spark.rdd.UnionRDD 29 | 30 | import com.google.cloud.genomics.spark.examples.rdd.Variant 31 | import com.google.common.base.Charsets 32 | import com.google.common.hash.Hashing 33 | 34 | import breeze.linalg.DenseMatrix 35 | 36 | object VariantsPcaDriver { 37 | 38 | def main(args: Array[String]) = { 39 | Logger.getLogger("org").setLevel(Level.WARN) 40 | val conf = new PcaConf(args) 41 | val driver = VariantsPcaDriver(conf) 42 | val data = driver.getData 43 | val filtered = data.map(driver.filterDataset) 44 | val callsRdd = driver.getCallsRdd(filtered) 45 | val simMatrix = driver.getSimilarityMatrix(callsRdd) 46 | val result = driver.computePca(simMatrix) 47 | driver.emitResult(result) 48 | driver.reportIoStats 49 | driver.stop 50 | } 51 | 52 | def apply(conf: PcaConf) = new VariantsPcaDriver(conf) 53 | 54 | // The following functions are defined on the companion object so they can be 55 | // serialized and used on the RDD functions. 
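  // Keeping them off the VariantsPcaDriver instance avoids capturing the
  // SparkContext and configuration in the task closures Spark ships to executors.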
56 | def extractCallInfo(variant: Variant, mapping: Map[String, Int]) = { 57 | variant.calls.getOrElse(Seq()).map( 58 | call => CallData(call.genotype.foldLeft(false)(_ || _ > 0), 59 | mapping(call.callsetId))) 60 | } 61 | 62 | def getVariantKey(variant: Variant, debug:Boolean = false) = { 63 | val alternateBases = variant.alternateBases.map( 64 | altBases => altBases.mkString("")).getOrElse("") 65 | val referenceBases = Option(variant.referenceBases).getOrElse("") 66 | if (debug) { 67 | println(s"${variant.contig}: (${variant.start}, ${variant.end}) ref=${referenceBases} alt=${alternateBases}") 68 | } 69 | Hashing.murmur3_128().newHasher() 70 | .putString(variant.contig, Charsets.UTF_8) 71 | .putLong(variant.start) 72 | .putLong(variant.end) 73 | .putString(referenceBases, Charsets.UTF_8) 74 | .putString( 75 | alternateBases, 76 | Charsets.UTF_8) 77 | .hash().toString() 78 | } 79 | } 80 | 81 | class VariantsPcaDriver(conf: PcaConf, ctx: SparkContext = null) { 82 | private val applicationName = this.getClass.getName 83 | private val sc = if (ctx != null) ctx 84 | else conf.newSparkContext(applicationName) 85 | private val common = new VariantsCommon(conf, sc) 86 | 87 | def getData = common.data 88 | 89 | 90 | /** 91 | * Filter datasets according to the specified flags. 92 | * 93 | * Possible flags include: 94 | * --min-allele-frequency 95 | */ 96 | def filterDataset(data: RDD[Variant]) = { 97 | if (conf.minAlleleFrequency.isDefined) { 98 | val minAlleleFrequency = conf.minAlleleFrequency() 99 | println(s"Min allele frequency ${minAlleleFrequency}.") 100 | data.filter(variant => { 101 | val alleleFrequency = variant.info.get("AF") 102 | alleleFrequency.map(_.get(0).toFloat >= minAlleleFrequency) 103 | .getOrElse(false) 104 | }) 105 | } else { 106 | data 107 | } 108 | } 109 | /** 110 | * Returns an RDD of calls joined by their variant matching key. 111 | * 112 | * The key is composed by the reference name its start and end positions, 113 | * as well as the reference and alternate bases. 114 | */ 115 | def joinDatasets(datasets: List[RDD[Variant]], 116 | broadcastIndexes: Broadcast[Map[String, Int]]): RDD[Seq[CallData]] = { 117 | val broadcastIndexes = sc.broadcast(common.indexes) 118 | val debugDatasets = conf.debugDatasets() 119 | val callsets = datasets.map(_.map(variant => 120 | (VariantsPcaDriver.getVariantKey(variant, debugDatasets), variant)) 121 | .mapValues( 122 | VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value))) 123 | val callset1 = callsets(0) 124 | val callset2 = callsets(1) 125 | callset1.join(callset2, conf.numReducePartitions()) 126 | .values 127 | .map(related => related._1 ++ related._2) 128 | } 129 | 130 | /** 131 | * Returns an RDD of calls merged by their variant matching key. 132 | * 133 | * The key is composed by the reference name its start and end positions, 134 | * as well as the reference and alternate bases. 
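   * Only keys present in every input variant set are kept: grouped values whose
   * size differs from variantSetCount are filtered out before the calls are
   * concatenated.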
135 | */ 136 | def mergeDatasets(data: List[RDD[Variant]], variantSetCount: Int, 137 | broadcastIndexes: Broadcast[Map[String, Int]]): RDD[Seq[CallData]]= { 138 | val callsets = new UnionRDD(sc, data) 139 | val broadcastIndexes = sc.broadcast(common.indexes) 140 | callsets.map(variant => 141 | (VariantsPcaDriver.getVariantKey(variant), variant)) 142 | .mapValues( 143 | VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value)) 144 | .groupByKey(conf.numReducePartitions()) 145 | .values 146 | .filter(_.size() == variantSetCount) 147 | .map(related => related.flatMap(calls => calls).toSeq) 148 | } 149 | 150 | /** 151 | * Returns an RDD of variant callsets with each call mapped to a position. 152 | */ 153 | def getCallsRdd(data: List[RDD[Variant]]): RDD[Seq[Int]] = { 154 | val variantSetCount = conf.variantSetId().size 155 | val broadcastIndexes = sc.broadcast(common.indexes) 156 | val callsets = if (variantSetCount == 1) { 157 | data.head.map(VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value)) 158 | } else if (variantSetCount == 2) { 159 | joinDatasets(data, broadcastIndexes) 160 | } else { 161 | mergeDatasets(data, variantSetCount, broadcastIndexes) 162 | } 163 | return callsets 164 | .map(calls => calls.filter(_.hasVariation)) 165 | // Return only those sets that have at least one call with variation. 166 | .filter(_.size > 0) 167 | .map(_.map(_.callsetId)) 168 | } 169 | 170 | /** 171 | * Computes a similarity matrix from the variant information. 172 | * 173 | * This method computes the similarity in place, this means it updates the 174 | * the counts in place on a pre-allocated dense matrix. 175 | * 176 | * Use this method if the partial matrix will fit in memory, roughly 177 | * a data set with 50K samples would fit on ~20GB of memory. 178 | * 179 | * @param calls an RDD of call ids, one variant per record. 180 | * @return an RDD of tuples with the matrix entry indexes and its similarity. 181 | */ 182 | def getSimilarityMatrix(callsets: RDD[Seq[Int]]) = { 183 | val size = common.indexes.size 184 | callsets.mapPartitions(callsInPartition => { 185 | val matrix = DenseMatrix.zeros[Int](size, size) 186 | callsInPartition.foreach(callset => 187 | for (c1 <- callset; c2 <- callset) 188 | matrix.update(c1, c2, matrix(c1, c2) + 1)) 189 | matrix.iterator 190 | }).reduceByKey(_ + _, conf.numReducePartitions()) 191 | } 192 | 193 | /** 194 | * Computes the PCA from the similarity matrix entries. 195 | * 196 | * @param matrixEntries an RDD of tuples representing the matrix entries. 
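   * @return one (callset id, first principal component, second principal
   *         component) tuple per sample, in callset index order.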
197 | */ 198 | def computePca(matrixEntries: RDD[((Int, Int), Int)]) = { 199 | val rowCount = common.indexes.size 200 | val entries = 201 | matrixEntries.map(item => (item._1._1, item._1._2, item._2.toDouble)) 202 | .map(item => (item._1, (item._2, item._3))) 203 | .groupByKey() 204 | .sortByKey(true) 205 | .cache 206 | val rowSums = entries.map(_._2.foldLeft(0D)(_ + _._2)).collect 207 | val nonZeroRows = rowSums.filter(_ > 0).size 208 | println(s"Non zero rows in matrix: ${nonZeroRows} / ${common.indexes.size}.") 209 | val broadcastRowSums = sc.broadcast(rowSums) 210 | val matrixSum = rowSums.reduce(_ + _) 211 | val matrixMean = matrixSum / rowCount / rowCount; 212 | val centeredRows = entries.map(indexedRow => { 213 | val localRowSums = broadcastRowSums.value 214 | val i = indexedRow._1 215 | val row = indexedRow._2 216 | val rowMean = localRowSums(i) / rowCount; 217 | row.map(entry => { 218 | val j = entry._1 219 | val data = entry._2 220 | val colMean = localRowSums(j) / rowCount; 221 | (j, data - rowMean - colMean + matrixMean) 222 | }).toSeq 223 | }) 224 | val rows = centeredRows.map(row => Vectors.sparse(rowCount, row)) 225 | val matrix = new RowMatrix(rows) 226 | val pca = matrix.computePrincipalComponents(conf.numPc()) 227 | val array = pca.toArray 228 | val reverse = common.indexes.map(_.swap) 229 | for (i <- 0 until pca.numRows) 230 | yield (reverse(i), array(i), array(i + pca.numRows)) 231 | } 232 | 233 | def emitResult(result: Seq[(String, Double, Double)]) { 234 | val resultWithNames = result.map { tuple => 235 | val dataset = tuple._1.split("-").head 236 | (common.names(tuple._1), tuple._2, tuple._3, dataset) 237 | } 238 | resultWithNames.sortBy(_._1).foreach(tuple => 239 | println(s"${tuple._1}\t${tuple._4}\t${tuple._2}\t${tuple._3}")) 240 | 241 | if(conf.outputPath.isDefined) { 242 | val resultRdd = sc.parallelize(resultWithNames) 243 | resultRdd.map(tuple => s"${tuple._1}\t${tuple._2}\t${tuple._3}\t${tuple._4}") 244 | .saveAsTextFile(conf.outputPath() + "-pca.tsv") 245 | } 246 | } 247 | 248 | /** 249 | * Computes a similarity matrix from the variant information. 250 | * 251 | * This method computes the similarity in a streaming fashion, this means 252 | * it never stores the partial similarity matrix in memory. The drawback 253 | * from this approach is that it generates large shuffles as it emits a 254 | * pair for each co-occurring call on a variant. 255 | * 256 | * Use this method only if the partial matrix won't fit in memory, roughly 257 | * a data set with 50K samples would fit on ~20GB of memory. 258 | * 259 | * @param calls an RDD of call ids, one variant per record. 260 | * @return an RDD of tuples with the matrix entry indexes and its similarity. 261 | */ 262 | def getSimilarityMatrixStream(calls: RDD[Seq[Int]]): 263 | RDD[((Int, Int), Int)] = { 264 | // Keep track of how many calls share the same variant 265 | calls.flatMap(callset => 266 | // Emit only half of the counts 267 | for (c1 <- callset.iterator; c2 <- callset.iterator if c1 <= c2) 268 | yield ((c1, c2), 1)) 269 | // Aggregate the similar pairs, partially done in memory. 270 | .reduceByKey(_ + _, conf.numReducePartitions()) 271 | // Rebuild the symmetric matrix from the aggregated pairs. 
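      // For illustration: an aggregated upper-triangle entry such as
      // ((0, 1), 5) is re-emitted as both ((0, 1), 5) and ((1, 0), 5),
      // while a diagonal entry such as ((2, 2), 7) is kept as a single element.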
272 | .flatMap(item => { 273 | if (item._1._1 < item._1._2) { 274 | Seq(item, ((item._1._2, item._1._1), item._2)) 275 | } else { 276 | Seq(item) 277 | } 278 | }) 279 | } 280 | 281 | def reportIoStats = common.reportIoStats 282 | 283 | def stop { 284 | sc.stop 285 | } 286 | } 287 | 288 | case class CallData(hasVariation: Boolean, callsetId: Int) extends Serializable 289 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/SearchReadsExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | 20 | import scala.collection.mutable.{Map => MutableMap} 21 | import org.apache.log4j.Level 22 | import org.apache.log4j.Logger 23 | import org.apache.spark.SparkContext._ 24 | import com.google.cloud.genomics.spark.examples.rdd.ReadsRDD 25 | import com.google.cloud.genomics.spark.examples.rdd.ReadsPartitioner 26 | import com.google.cloud.genomics.spark.examples.rdd.ReferencesReadsPartitioner 27 | import com.google.cloud.genomics.Authentication 28 | 29 | object Examples { 30 | 31 | final val Google_1KG_HG00096_Readset = "CMvnhpKTFhCwvIWYw9eikzQ" 32 | // From http://googlegenomics.readthedocs.org/en/latest/constants.html 33 | final val Google_Example_Readset = "CMvnhpKTFhD04eLE-q2yxnU" 34 | // Sage Bio DREAM Contest - Synthetic Set #3 35 | val Google_DREAM_Set3_Normal = "CPHG3MzoCRDRkqXzk7b6l_kB" 36 | val Google_DREAM_Set3_Tumor = "CPHG3MzoCRCO1rDx8pOY6yo" 37 | 38 | // SNP @ 6889648 - cilantro/soap variant near OR10A2 39 | final val Cilantro = 6889648L 40 | 41 | final val HumanChromosomes = Map[String, Long]( 42 | ("1", 249250621), 43 | ("2", 243199373), 44 | ("3", 198022430), 45 | ("4", 191154276), 46 | ("5", 180915260), 47 | ("6", 171115067), 48 | ("7", 159138663), 49 | ("8", 146364022), 50 | ("9", 141213431), 51 | ("10", 135534747), 52 | ("11", 135006516), 53 | ("12", 133851895), 54 | ("13", 115169878), 55 | ("14", 107349540), 56 | ("15", 102531392), 57 | ("16", 90354753), 58 | ("17", 81195210), 59 | ("18", 78077248), 60 | ("19", 59128983), 61 | ("20", 63025520), 62 | ("21", 48129895), 63 | ("22", 51304566), 64 | ("X", 155270560), 65 | ("Y", 59373566)) 66 | } 67 | 68 | /** 69 | * This example searches for all reads covering the cilantro/soap SNP near OR10A2 70 | * on chromosome 11 and prints out a pileup. The quality score of each read at 71 | * the SNP location is also printed inline. This can be visualized in the Genomics API Browser: 72 | * http://gabrowse.appspot.com/#backend=GOOGLE&readsetId=CJDmkYn8ChCh4IH4hOf4gacB&location=11%3A6889648 73 | * Note that the reads may be displayed in different order. 
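 *
 * A read is kept for the pileup when it spans the SNP position (the CIGAR
 * string is not yet taken into account), roughly:
 * {{{
 *   read.position <= Examples.Cilantro &&
 *     read.position + read.alignedSequence.length >= Examples.Cilantro
 * }}}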
74 | */ 75 | object SearchReadsExample1 { 76 | def main(args: Array[String]) = { 77 | val conf = new GenomicsConf(args) 78 | val applicationName = this.getClass.getName 79 | val sc = conf.newSparkContext(applicationName) 80 | Logger.getLogger("org").setLevel(Level.WARN) 81 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 82 | val references = s"11:${Examples.Cilantro - 1000}:${Examples.Cilantro + 1000}" 83 | val data = new ReadsRDD(sc, applicationName, accessToken, 84 | Examples.Google_Example_Readset, 85 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 86 | .filter { rk => 87 | val (_, read) = rk 88 | // TODO: Take the cigar into account 89 | read.position <= Examples.Cilantro && read.position + read.alignedSequence.length >= Examples.Cilantro 90 | }.cache() 91 | val first = data.collect.foldLeft(999999999L) { (a, b) => 92 | val (_, read) = b 93 | val p = read.position 94 | if (p < a) { p.toLong } else { a } 95 | } 96 | println(List.fill((Examples.Cilantro - first).toInt)(" ").mkString("") + "v") 97 | val out = data.map { rk => 98 | val (_, read) = rk 99 | val i = (Examples.Cilantro - read.position).toInt 100 | val bases = read.alignedSequence.splitAt(i + 1) 101 | val q = "%02d".format(read.alignedQuality(i)) 102 | List.fill((read.position - first).toInt)(" ").mkString("") + bases._1 + "(" + q + ") " + bases._2 103 | } 104 | // Collect the results so they are printed on the local console. 105 | out.collect.foreach(println(_)) 106 | 107 | println(List.fill((Examples.Cilantro - first).toInt)(" ").mkString("") + "^") 108 | sc.stop 109 | } 110 | } 111 | 112 | /** 113 | * This example computes the average read coverage for a genomic range. 114 | */ 115 | object SearchReadsExample2 { 116 | def main(args: Array[String]) = { 117 | val conf = new GenomicsConf(args) 118 | val applicationName = this.getClass.getName 119 | val sc = conf.newSparkContext(applicationName) 120 | val chr = "21" 121 | val len = Examples.HumanChromosomes(chr) 122 | val references = s"${chr}:1:${len}" 123 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 124 | val data = new ReadsRDD(sc, applicationName, accessToken, 125 | Examples.Google_Example_Readset, 126 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 127 | // TODO: Take the cigar into account 128 | val coverage = data.map(_._2.alignedSequence.length.toLong) 129 | .reduce(_ + _).toDouble / len.toDouble 130 | println("Coverage of chromosome " + chr + " = " + coverage) 131 | sc.stop 132 | } 133 | } 134 | 135 | /** 136 | * This example computes the per-base read depth for a genomic range. 
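 *
 * Depth is computed by emitting (position, 1) for every aligned base of every
 * read and summing with reduceByKey. For two hypothetical reads of length 3
 * starting at positions 10 and 11, the saved output would contain:
 * {{{
 *   (10,1)  (11,2)  (12,2)  (13,1)
 * }}}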
137 | */ 138 | object SearchReadsExample3 { 139 | def main(args: Array[String]) = { 140 | val conf = new GenomicsConf(args) 141 | val outPath = conf.outputPath.orElse(Option("."))() 142 | val applicationName = this.getClass.getName 143 | val sc = conf.newSparkContext(applicationName) 144 | val chr = "21" 145 | val references = s"${chr}:1:${Examples.HumanChromosomes(chr)}" 146 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 147 | val data = new ReadsRDD(sc, applicationName, accessToken, 148 | Examples.Google_Example_Readset, 149 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 150 | data.flatMap { rk => 151 | val (_, read) = rk 152 | val cover = MutableMap[Long, Int]() 153 | // TODO: Take the cigar into account 154 | for (i <- 0 until read.alignedSequence.length) { 155 | cover(read.position + i) = 1 156 | } 157 | cover 158 | } 159 | .reduceByKey(_ + _) 160 | .sortByKey(true) // optional, obviously 161 | .saveAsTextFile(outPath + "/coverage_" + chr) 162 | sc.stop 163 | } 164 | } 165 | 166 | /** 167 | * This example illustrates one way to work with multiple RDDs by aggregating and 168 | * comparing bases at the same position in different readsets. It uses synthetic 169 | * tumor-normal data from the ICGC-TCGA DREAM Contest (https://www.synapse.org/#!Synapse:syn312572). 170 | */ 171 | object SearchReadsExample4 { 172 | def main(args: Array[String]) = { 173 | val conf = new GenomicsConf(args) 174 | val outPath = conf.outputPath.orElse(Option("."))() 175 | val applicationName = this.getClass.getName 176 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 177 | val sc = conf.newSparkContext(applicationName) 178 | val chr = "1" 179 | val references = s"${chr}:100000000:101000000" 180 | val minMappingQual = 30 181 | val minBaseQual = 30 182 | val minFreq = 0.25 183 | 184 | // Generates an RDD that maps genomic position to a base read frequencies. 185 | // Reads with a mapping quality less than minMappingQual are discarded 186 | // as are individual bases with base quality scores less than minBaseQual. 
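    // (At a position where the surviving bases are, say, A, A and G, the
    // resulting entry would be roughly Map(A -> 0.667, G -> 0.333).)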
187 | // 188 | // For example, a snippet of the text dump of the RDD for chromosome 1 189 | // (synthetic set #3 normal) looks like: 190 | // (100091811,Map(A -> 1.0)) 191 | // (100091812,Map(G -> 0.5428571428571428, A -> 0.45714285714285713)) 192 | // (100091813,Map(G -> 0.08333333333333333, A -> 0.3611111111111111, C -> 0.5555555555555556)) 193 | // (100091814,Map(G -> 0.30303030303030304, A -> 0.6060606060606061, C -> 0.09090909090909091)) 194 | // (100091815,Map(G -> 0.03125, A -> 0.6875, C -> 0.28125)) 195 | // (100091816,Map(A -> 0.375, C -> 0.03125, T -> 0.59375)) 196 | // (100091817,Map(A -> 0.90625, T -> 0.09375)) 197 | // (100091818,Map(A -> 0.125, T -> 0.875)) 198 | // (100091819,Map(A -> 0.28125, T -> 0.71875)) 199 | // (100091820,Map(G -> 0.6176470588235294, A -> 0.029411764705882353, T -> 0.35294117647058826)) 200 | // (100091821,Map(G -> 0.08823529411764706, C -> 0.6470588235294118, T -> 0.2647058823529412)) 201 | // (100091822,Map(G -> 0.23529411764705882, C -> 0.7352941176470589, T -> 0.029411764705882353)) 202 | // (100091823,Map(G -> 0.029411764705882353, A -> 0.6470588235294118, C -> 0.3235294117647059)) 203 | // (100091824,Map(A -> 0.08823529411764706, C -> 0.2647058823529412, T -> 0.6470588235294118)) 204 | // (100091825,Map(A -> 0.23529411764705882, C -> 0.6764705882352942, T -> 0.08823529411764706)) 205 | // (100091826,Map(A -> 0.6764705882352942, C -> 0.08823529411764706, T -> 0.23529411764705882)) 206 | // (100091827,Map(A -> 0.75, C -> 0.2222222222222222, T -> 0.027777777777777776)) 207 | // (100091828,Map(G -> 0.6571428571428571, A -> 0.3142857142857143, C -> 0.02857142857142857)) 208 | // (100091829,Map(G -> 0.11428571428571428, A -> 0.2571428571428571, T -> 0.6285714285714286)) 209 | // (100091830,Map(G -> 0.9142857142857143, A -> 0.02857142857142857, T -> 0.05714285714285714)) 210 | // (100091831,Map(G -> 0.1111111111111111, A -> 0.6666666666666666, T -> 0.2222222222222222)) 211 | // (100091832,Map(G -> 0.8888888888888888, A -> 0.08333333333333333, T -> 0.027777777777777776)) 212 | // (100091833,Map(G -> 0.11428571428571428, A -> 0.8571428571428571, T -> 0.02857142857142857)) 213 | // (100091834,Map(G -> 0.2, A -> 0.11428571428571428, T -> 0.6857142857142857)) 214 | // (100091835,Map(G -> 0.7142857142857143, A -> 0.2, T -> 0.08571428571428572)) 215 | // (100091836,Map(G -> 0.08333333333333333, A -> 0.7222222222222222, T -> 0.19444444444444445)) 216 | def freqRDD(readGroupSetId: String, partitioner: ReadsPartitioner) = { 217 | new ReadsRDD(sc, applicationName, accessToken, 218 | readGroupSetId, partitioner) 219 | .filter(rk => rk._2.mappingQuality >= minMappingQual) 220 | .flatMap { rk => 221 | val (_, read) = rk 222 | var bases = List[(Long, Char)]() 223 | // TODO: Take the cigar into account 224 | for (i <- 0 until read.alignedSequence.length) { 225 | if (i < read.alignedQuality.length && read.alignedQuality(i) >= minBaseQual) { 226 | bases ::= (read.position + i, read.alignedSequence(i)) 227 | } 228 | } 229 | bases 230 | } 231 | .groupByKey() 232 | .mapValues { v => 233 | val vSeq = v.toSeq 234 | val total = vSeq.length.toDouble 235 | vSeq.groupBy(c => c) 236 | .map(p => (p._1, p._2.length)) 237 | .map(p => (p._1, p._2.toDouble / total)) 238 | } 239 | .groupByKey() 240 | .map(p => (p._1, p._2.head)) 241 | } 242 | 243 | val readsPartitioner = new ReferencesReadsPartitioner(references, conf.basesPerPartition()) 244 | val normal = freqRDD(Examples.Google_DREAM_Set3_Normal, readsPartitioner) 245 | val tumor = freqRDD(Examples.Google_DREAM_Set3_Tumor, 
readsPartitioner) 246 | 247 | // Generate a new RDD that maps position to a pair of sorted base strings where 248 | // the first item is the normal and the second is the tumor. 249 | // Any base occurring with frequency less than minFreq is filtered out. 250 | // Example: 251 | // (100091811,(A,A)) 252 | // (100091812,(AG,AG)) 253 | // (100091813,(AC,AC)) 254 | // (100091814,(AG,AG)) 255 | // (100091815,(AC,AC)) 256 | // (100091816,(AT,AT)) 257 | // (100091817,(A,A)) 258 | // (100091818,(T,T)) 259 | // (100091819,(AT,AT)) 260 | // (100091820,(GT,GT)) 261 | // (100091821,(CT,CT)) 262 | // (100091822,(C,CG)) 263 | // (100091823,(AC,AC)) 264 | // (100091824,(CT,CT)) 265 | // (100091825,(C,AC)) 266 | // (100091826,(A,AT)) 267 | // (100091827,(A,AC)) 268 | // (100091828,(AG,AG)) 269 | // (100091829,(AT,AT)) 270 | // (100091830,(G,G)) 271 | // (100091831,(A,AT)) 272 | // (100091832,(G,G)) 273 | // (100091833,(A,A)) 274 | // (100091834,(T,GT)) 275 | // (100091835,(G,AG)) 276 | // (100091836,(A,AT)) 277 | val paired = normal.join(tumor).groupByKey() 278 | .map(p => (p._1, p._2.head)) 279 | .map { p => 280 | def f(m: Map[Char, Double]): String = { 281 | var s = "" 282 | m.foreach { kv => 283 | if (kv._2 >= minFreq) { s += kv._1 } 284 | } 285 | s.sorted 286 | } 287 | (p._1, (f(p._2._1), f(p._2._2))) 288 | } 289 | 290 | // This RDD can be further filtered to eliminate any positions with matching bases. 291 | // Example: 292 | // (100091822,(C,CG)) 293 | // (100091825,(C,AC)) 294 | // (100091826,(A,AT)) 295 | // (100091827,(A,AC)) 296 | // (100091831,(A,AT)) 297 | // (100091834,(T,GT)) 298 | // (100091835,(G,AG)) 299 | // (100091836,(A,AT)) 300 | val diff = paired.filter(p => p._2._1 != p._2._2) 301 | diff.sortByKey().saveAsTextFile(outPath + "/diff_" + chr) 302 | sc.stop 303 | } 304 | } 305 | --------------------------------------------------------------------------------
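A minimal, self-contained sketch (not part of the repository) of the base-string construction used in SearchReadsExample4: for one position, bases whose frequency is at least minFreq are kept and the survivors are sorted, which is what the local function f above does for the normal and tumor frequency maps. The object and method names below are hypothetical.

object BaseStringSketch {
  // Keep bases at or above the frequency threshold and return them sorted,
  // e.g. Map('A' -> 0.6875, 'C' -> 0.28125, 'G' -> 0.03125) => "AC".
  def baseString(freqs: Map[Char, Double], minFreq: Double = 0.25): String =
    freqs.filter(_._2 >= minFreq).keys.toSeq.sorted.mkString

  def main(args: Array[String]): Unit = {
    println(baseString(Map('A' -> 0.6875, 'C' -> 0.28125, 'G' -> 0.03125))) // AC
    println(baseString(Map('A' -> 0.125, 'T' -> 0.875)))                    // T
  }
}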