├── .travis.yml ├── project └── assembly.sbt ├── assembly.sbt ├── .gitignore ├── src └── main │ ├── scala │ └── com │ │ └── google │ │ └── cloud │ │ └── genomics │ │ ├── Client.scala │ │ └── spark │ │ └── examples │ │ ├── VariantsCommon.scala │ │ ├── GenomicsConf.scala │ │ ├── SearchVariantsExample.scala │ │ ├── rdd │ │ ├── ReadsRDD.scala │ │ └── VariantsRDD.scala │ │ ├── VariantsPca.scala │ │ └── SearchReadsExample.scala │ └── python │ └── variants_pca.py ├── CONTRIBUTING.rst ├── README.md └── LICENSE /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | -------------------------------------------------------------------------------- /assembly.sbt: -------------------------------------------------------------------------------- 1 | run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/Client.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics 17 | 18 | import java.io.File 19 | import java.io.FileReader 20 | import java.io.StringReader 21 | import java.util.Scanner 22 | import scala.util.{Try, Success, Failure} 23 | import com.google.api.services.genomics.Genomics 24 | import com.google.cloud.genomics.utils.CredentialFactory 25 | import com.google.cloud.genomics.utils.GenomicsFactory 26 | import com.google.cloud.genomics.utils.OfflineAuth 27 | 28 | object Authentication { 29 | def getAccessToken(clientSecretsFile: Option[String], 30 | applicationName: String = "spark-examples") = { 31 | if(clientSecretsFile.isDefined) { 32 | System.out.println("\nThis pipeline will make your user credential available to all" 33 | + " Spark worker processes. 
Your credentials may be visible to others with access to the" 34 | + " machines on which this pipeline is running."); 35 | System.out.println("Do you want to continue (Y/n)?"); 36 | val kbd = new Scanner(System.in) 37 | val decision = kbd.nextLine() 38 | decision match { 39 | case "yes" | "Yes" | "YES" | "y" | "Y" => "proceed" 40 | case _ => System.exit(0) 41 | } 42 | new OfflineAuth(CredentialFactory.getCredentialFromClientSecrets(clientSecretsFile.get, applicationName)) 43 | } else { 44 | new OfflineAuth() 45 | } 46 | } 47 | } 48 | 49 | object Client { 50 | 51 | def apply(auth: OfflineAuth, applicationName: String = "spark-examples"): Client = { 52 | val factory = GenomicsFactory.builder().build() 53 | new Client(factory.fromOfflineAuth(auth), factory) 54 | } 55 | } 56 | 57 | class Client(val genomics: Genomics, private val factory: GenomicsFactory) { 58 | def initializedRequestsCount = factory.initializedRequestsCount() 59 | def unsuccessfulResponsesCount = factory.unsuccessfulResponsesCount() 60 | def ioExceptionsCount = factory.ioExceptionsCount() 61 | } 62 | 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | How to contribute 2 | =================================== 3 | 4 | First of all, thank you for contributing! 5 | 6 | The mailing list 7 | ---------------- 8 | 9 | For general questions or if you are having trouble getting started, try the 10 | `Google Genomics Discuss mailing list `_. 11 | It's a good way to sync up with other people who use googlegenomics including the core developers. You can subscribe 12 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 13 | the `web forum page `_. 14 | 15 | 16 | Submitting issues 17 | ----------------- 18 | 19 | If you are encountering a bug in the code or have a feature request in mind - file away! 20 | 21 | 22 | Submitting a pull request 23 | ------------------------- 24 | 25 | If you are ready to contribute code, Github provides a nice `overview on how to create a pull request 26 | `_. 27 | 28 | Some general rules to follow: 29 | 30 | * Do your work in `a fork `_ of this repo. 31 | * Create a branch for each update that you're working on. 32 | These branches are often called "feature" or "topic" branches. Any changes 33 | that you push to your feature branch will automatically be shown in the pull request. 34 | * Keep your pull requests as small as possible. Large pull requests are hard to review. 35 | Try to break up your changes into self-contained and incremental pull requests. 36 | * The first line of commit messages should be a short (<80 character) summary, 37 | followed by an empty line and then any details that you want to share about the commit. 38 | * Please try to follow the existing syntax style 39 | 40 | When you submit or change your pull request, the Travis build system will automatically run tests. 41 | If your pull request fails to pass tests, review the test log, make changes and 42 | then push them to your feature branch to be tested again. 43 | 44 | 45 | Contributor License Agreements 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | All pull requests are welcome. Before we can submit them though, there is a legal hurdle we have to jump. 49 | You'll need to fill out either the individual or corporate Contributor License Agreement 50 | (CLA). 
51 | 52 | * If you are an individual writing original source code and you're sure you 53 | own the intellectual property, then you'll need to sign an `individual CLA 54 | `_. 55 | * If you work for a company that wants to allow you to contribute your work, 56 | then you'll need to sign a `corporate CLA 57 | `_. 58 | 59 | Follow either of the two links above to access the appropriate CLA and 60 | instructions for how to sign and return it. Once we receive it, we'll be able to 61 | accept your pull requests. 62 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/VariantsCommon.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | import org.apache.spark.SparkContext 20 | import com.google.api.services.genomics.model.SearchCallSetsRequest 21 | import com.google.genomics.v1.{Variant => VariantModel} 22 | import com.google.cloud.genomics.Authentication 23 | import com.google.cloud.genomics.Client 24 | import com.google.cloud.genomics.spark.examples.rdd.Variant 25 | import com.google.cloud.genomics.spark.examples.rdd.VariantKey 26 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 27 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRDD 28 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRddStats 29 | import com.google.cloud.genomics.utils.Paginator 30 | 31 | import org.apache.spark.rdd.RDD 32 | 33 | class VariantsCommon(conf: PcaConf, sc: SparkContext) { 34 | 35 | private val auth = Authentication.getAccessToken(conf.clientSecrets.get) 36 | private val ioStats = createIoStats 37 | 38 | val (indexes, names) = { 39 | val client = Client(auth).genomics 40 | val searchCallsets = Paginator.Callsets.create(client) 41 | val req = new SearchCallSetsRequest() 42 | .setVariantSetIds(conf.variantSetId()) 43 | val callsets = searchCallsets.search(req).iterator().toSeq 44 | val indexes = callsets.map( 45 | callset => callset.getId()).toSeq.zipWithIndex.toMap 46 | val names = callsets.map( 47 | callset => (callset.getId(), callset.getName())).toMap 48 | println(s"Matrix size: ${indexes.size}.") 49 | (indexes, names) 50 | } 51 | 52 | val data = { 53 | if (conf.inputPath.isDefined) { 54 | List(sc.objectFile[(VariantKey, Variant)](conf.inputPath()).map(_._2)) 55 | } else { 56 | val variantSets = conf.variantSetId() 57 | println(s"Running PCA on ${variantSets.length} datasets.") 58 | conf.variantSetId().zipWithIndex.map { 59 | case (variantSetId, variantSetIndex) => 60 | new VariantsRDD(sc, this.getClass.getName, auth, 61 | variantSetId, 62 | conf.getPartitioner(auth, variantSetId, variantSetIndex), 63 | stats=ioStats).map(_._2) 64 | } 65 | } 66 | } 67 | 68 | def reportIoStats = { 69 | this.ioStats match { 70 | 
case Some(stats) => println(stats.toString) 71 | case _ => {} 72 | } 73 | } 74 | 75 | // For now assume a single dataset when invoking from python. 76 | def getJavaData: RDD[VariantModel] = data.head.map(_.toJavaVariant) 77 | 78 | def createIoStats = if (conf.inputPath.isDefined) None 79 | else Option(new VariantsRddStats(sc)) 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/GenomicsConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.genomics.spark.examples 18 | 19 | import scala.collection.JavaConversions._ 20 | 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.SparkContext 23 | import org.rogach.scallop.ScallopConf 24 | 25 | import com.google.cloud.genomics.spark.examples.rdd.AllReferencesVariantsPartitioner 26 | import com.google.cloud.genomics.spark.examples.rdd.ReferencesVariantsPartitioner 27 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 28 | import com.google.cloud.genomics.utils.OfflineAuth 29 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 30 | 31 | class GenomicsConf(arguments: Seq[String]) extends ScallopConf(arguments) { 32 | val DEFAULT_NUMBER_OF_BASES_PER_SHARD = 1000000 33 | val PLATINUM_GENOMES_BRCA1_REFERENCES = "chr17:41196311:41277499" 34 | 35 | val basesPerPartition = opt[Long](default = 36 | Some(DEFAULT_NUMBER_OF_BASES_PER_SHARD), 37 | descr = "Partition each reference using a fixed number of bases") 38 | val clientSecrets = opt[String]( 39 | descr = "Provide the file path to client_secrets.json to use a user " 40 | + "credential instead of the Application Default Credential.") 41 | val inputPath = opt[String]() 42 | val numReducePartitions = opt[Int](default = Some(10), 43 | descr = "Set it to a " + 44 | "number greater than the number of cores, to achieve maximum " + 45 | "throughput.") 46 | val outputPath = opt[String]() 47 | val references = opt[List[String]](default= 48 | Some(List(PLATINUM_GENOMES_BRCA1_REFERENCES)), 49 | descr = "Comma separated tuples of reference:start:end,... " + 50 | "one list of tuples should be specified per variantset " + 51 | "in the corresponding order.") 52 | val sparkMaster = opt[String]( 53 | descr = "A spark master URL. 
Leave empty if using spark-submit.") 54 | val variantSetId = opt[List[String]]( 55 | default = Some(List(GoogleGenomicsPublicData.Platinum_Genomes)), 56 | descr = "List of VariantSetId to use in the analysis.") 57 | 58 | def newSparkContext(className: String) = { 59 | val conf = new SparkConf() 60 | .setAppName(className) 61 | .set("spark.shuffle.consolidateFiles", "true") 62 | if (this.sparkMaster.isDefined) 63 | conf.setMaster(this.sparkMaster()) 64 | new SparkContext(conf) 65 | } 66 | 67 | def getPartitioner(references: String) = { 68 | new ReferencesVariantsPartitioner(references, this.basesPerPartition()) 69 | } 70 | } 71 | 72 | object PcaConf { 73 | val ExcludeXY = SexChromosomeFilter.EXCLUDE_XY 74 | } 75 | 76 | class PcaConf(arguments: Seq[String]) extends GenomicsConf(arguments) { 77 | val allReferences = opt[Boolean]( 78 | descr = "Use all references (except X and Y) to compute PCA " + 79 | "(overrides --references).") 80 | val debugDatasets = opt[Boolean]() 81 | val minAlleleFrequency = opt[Float]( 82 | descr = "For 2-way PCA, omit variants from the left variant set (typically 1,000 Genomes)" + 83 | " by including only variants with allelic frequency (field AF) greater than" + 84 | " or equal to this value.") 85 | val numPc = opt[Int](default = Some(2)) 86 | 87 | /** 88 | * Returns either the parsed references for all datasets and their 89 | * corresponding --references or all references 90 | * except X and Y if --all-references is specified. 91 | */ 92 | def getPartitioner(auth: OfflineAuth, variantSetId: String, 93 | variantSetIndex: Int = 0) = { 94 | if (this.allReferences()) { 95 | new AllReferencesVariantsPartitioner(this.basesPerPartition(), auth) 96 | } else { 97 | new ReferencesVariantsPartitioner(this.references().get(variantSetIndex), 98 | this.basesPerPartition()) 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/SearchVariantsExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import org.apache.log4j.Level 19 | import org.apache.log4j.Logger 20 | import com.google.cloud.genomics.spark.examples.rdd.Variant 21 | import com.google.cloud.genomics.spark.examples.rdd.VariantKey 22 | import com.google.cloud.genomics.spark.examples.rdd.VariantsPartitioner 23 | import com.google.cloud.genomics.spark.examples.rdd.VariantsRDD 24 | import com.google.cloud.genomics.Authentication 25 | import com.google.cloud.genomics.utils.Contig 26 | 27 | object GoogleGenomicsPublicData { 28 | final val Platinum_Genomes = "3049512673186936334" 29 | final val Thousand_Genomes_Phase_1 = "10473108253681171589" 30 | final val Thousand_Genomes_Phase_3 = "4252737135923902652" 31 | } 32 | 33 | /** 34 | * The variant in this example corresponds to dbSNP ID rs9536314, 35 | * causing an amino acid substitution in the Klotho gene (KL 36 | * F327V). About 30% of people carry the variant. In build 37, this is 37 | * an A to G substition at chromosome 13, position 33628138. 38 | */ 39 | object SearchVariantsExampleKlotho { 40 | val PLATINUM_GENOMES_KLOTHO_REFERENCES = "chr13:33628137:33628138" 41 | 42 | def main(args: Array[String]) = { 43 | val conf = new GenomicsConf(args) 44 | val applicationName = this.getClass.getName 45 | val sc = conf.newSparkContext(applicationName) 46 | Logger.getLogger("org").setLevel(Level.WARN) 47 | val references = "chr13:33628137:33628138" 48 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 49 | val data = new VariantsRDD(sc, 50 | applicationName, 51 | accessToken, 52 | GoogleGenomicsPublicData.Platinum_Genomes, 53 | conf.getPartitioner(references)) 54 | data.cache() // The amount of data is small since its just for one SNP. 55 | println("We have " + data.count() + " records that overlap Klotho.") 56 | println("But only " + data.filter { kv => 57 | val (key, variant) = kv 58 | variant.alternateBases != None 59 | }.count() + " records are of a variant.") 60 | println("The other " + data.filter { kv => 61 | val (key, variant) = kv 62 | variant.alternateBases == None 63 | }.count() + " records are reference-matching blocks.") 64 | val variants = data.filter { kv => 65 | val(key, variant) = kv 66 | variant.referenceBases != "N" 67 | } 68 | variants.collect.foreach { kv => 69 | val (key, variant) = kv 70 | println(s"Reference: ${variant.contig} @ ${variant.start}") 71 | } 72 | 73 | // Exercise conversion from scala objects back to java objects. This 74 | // is needed for a forthcoming example which writes modified 75 | // variants back to the variant store. 76 | // 77 | // TODO: this really belongs in an integration test or a unit test 78 | // with a mocked-out Genomics client; not in this sample. 79 | data.collect.foreach { kv => 80 | val (key, variant) = kv 81 | variant.toJavaVariant() } 82 | sc.stop 83 | } 84 | } 85 | 86 | /** 87 | * This example pulls all variants that overlap BRCA1. 
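 * The region queried is chr17:41196311-41277499 (build 37 coordinates), the same
 * interval used as the default --references value in GenomicsConf.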
88 | */ 89 | object SearchVariantsExampleBRCA1 { 90 | def main(args: Array[String]) = { 91 | val conf = new GenomicsConf(args) 92 | val applicationName = this.getClass.getName 93 | val sc = conf.newSparkContext(applicationName) 94 | Logger.getLogger("org").setLevel(Level.WARN) 95 | val brca1 = "chr17:41196311:41277499" 96 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 97 | val data = new VariantsRDD(sc, 98 | this.getClass.getName, 99 | accessToken, 100 | GoogleGenomicsPublicData.Platinum_Genomes, 101 | conf.getPartitioner(brca1)) 102 | data.cache() // The amount of data is small since its just for one gene 103 | println("We have " + data.count() + " records that overlap BRCA1.") 104 | println("But only " + data.filter { kv => 105 | val(key, variant) = kv 106 | variant.referenceBases != "N" 107 | }.count() + " records are of a variant.") 108 | println("The other " + data.filter { kv => 109 | val(key, variant) = kv 110 | variant.referenceBases == "N" 111 | }.count() + " records are reference-matching blocks.") 112 | sc.stop 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spark-examples [![Build Status](https://img.shields.io/travis/googlegenomics/spark-examples.svg?style=flat)](https://travis-ci.org/googlegenomics/spark-examples) 2 | ============== 3 | 4 | The projects in this repository demonstrate working with genomic data accessible via the [Google Genomics API](https://cloud.google.com/genomics/) using [Apache Spark](http://spark.apache.org/). 5 | 6 | > If you are ready to start coding, take a look at the information below. But if you are 7 | > looking for a task-oriented list (e.g., [How do I compute principal coordinate analysis 8 | > with Google Genomics?](http://googlegenomics.readthedocs.org/en/latest/use_cases/compute_principal_coordinate_analysis/index.html)), 9 | > a better place to start is the [Google Genomics Cookbook](http://googlegenomics.readthedocs.org/en/latest/index.html). 10 | 11 | Getting Started 12 | --------------- 13 | 14 | 1. git clone this repository. 15 | 16 | 1. If you have not already done so, follow the Google Genomics [getting started instructions](https://cloud.google.com/genomics/install-genomics-tools) to set up your environment 17 | including [installing gcloud](https://cloud.google.com/sdk/) and running `gcloud init`. 18 | 19 | 1. Download and install [Apache Spark](https://spark.apache.org/downloads.html). 20 | 21 | 1. Install [SBT](http://www.scala-sbt.org/release/docs/Getting-Started/Setup.html). 22 | 23 | 1. This project now includes code for calling the Genomics API using [gRPC](http://www.grpc.io). To use gRPC, you'll need a version of ALPN that matches your JRE version. 24 | 25 | 2. See the [ALPN documentation](http://www.eclipse.org/jetty/documentation/9.2.10.v20150310/alpn-chapter.html) for a table of which ALPN jar to use for your JRE version. 26 | 2. Then download the correct version from [here](http://mvnrepository.com/artifact/org.mortbay.jetty.alpn/alpn-boot). 27 | 28 | Local Run 29 | --------- 30 | From the `spark-examples` directory run `sbt run` 31 | 32 | Use the following flags to match your runtime configuration: 33 | 34 | ``` 35 | $ export SBT_OPTS='-Xbootclasspath/p:/YOUR/PATH/TO/alpn-boot-YOUR-VERSION.jar' 36 | $ sbt "run --help" 37 | -o, --output-path 38 | -s, --spark-master A spark master URL. Leave empty if using spark-submit. 39 | ... 
40 | --help Show help message 41 | ``` 42 | 43 | For example: 44 | 45 | ``` 46 | $ sbt "run --spark-master local[4]" 47 | ``` 48 | 49 | A menu should appear asking you to pick the sample to run: 50 | ``` 51 | Multiple main classes detected, select one to run: 52 | 53 | [1] com.google.cloud.genomics.spark.examples.SearchVariantsExampleKlotho 54 | [2] com.google.cloud.genomics.spark.examples.SearchVariantsExampleBRCA1 55 | [3] com.google.cloud.genomics.spark.examples.SearchReadsExample1 56 | [4] com.google.cloud.genomics.spark.examples.SearchReadsExample2 57 | [5] com.google.cloud.genomics.spark.examples.SearchReadsExample3 58 | [6] com.google.cloud.genomics.spark.examples.SearchReadsExample4 59 | [7] com.google.cloud.genomics.spark.examples.VariantsPcaDriver 60 | 61 | Enter number: 62 | ``` 63 | 64 | ### Troubleshooting: 65 | 66 | If you are seeing `java.lang.OutOfMemoryError: PermGen space` errors, set the following SBT_OPTS flag: 67 | ``` 68 | export SBT_OPTS='-XX:MaxPermSize=256m' 69 | ``` 70 | 71 | Run on Google Compute Engine 72 | ----------------------------- 73 | 74 | (1) Build the assembly. 75 | ``` 76 | sbt assembly 77 | ``` 78 | (2) Deploy your Spark cluster using [Google Cloud Dataproc](https://cloud.google.com/dataproc/). 79 | ``` 80 | gcloud beta dataproc clusters create example-cluster --scopes cloud-platform 81 | ``` 82 | (3) Copy the assembly jar to the master node. 83 | ``` 84 | gcloud compute copy-files \ 85 | target/scala-2.10/googlegenomics-spark-examples-assembly-1.0.jar example-cluster-m:~/ 86 | ``` 87 | (4) ssh to the master. 88 | ``` 89 | gcloud compute ssh example-cluster-m 90 | ``` 91 | (5) Run one of the examples. 92 | ``` 93 | spark-submit --class com.google.cloud.genomics.spark.examples.SearchReadsExample1 \ 94 | googlegenomics-spark-examples-assembly-1.0.jar 95 | ``` 96 | 97 | ### Running PCA variant analysis on GCE 98 | To run the [variant PCA analysis](https://github.com/googlegenomics/spark-examples/blob/master/src/main/scala/com/google/cloud/genomics/spark/examples/VariantsPca.scala) on GCE make sure you have followed all the steps on the previous section and that you are able to run at least one of the examples. 99 | 100 | Run the example PCA analysis for BRCA1 on the [1000 Genomes Project dataset](https://cloud.google.com/genomics/data/1000-genomes). 101 | ``` 102 | spark-submit --class com.google.cloud.genomics.spark.examples.VariantsPcaDriver \ 103 | googlegenomics-spark-examples-assembly-1.0.jar 104 | ``` 105 | 106 | The analysis will output the two principal components for each sample to the console. Here is an example of the last few lines. 107 | ``` 108 | ... 
109 | NA20811 0.0286308791579312 -0.008456233951873527 110 | NA20812 0.030970386921818943 -0.006755469223823698 111 | NA20813 0.03080348019961635 -0.007475822860939408 112 | NA20814 0.02865238920148145 -0.008084003476919057 113 | NA20815 0.028798695736608034 -0.003755789964021788 114 | NA20816 0.026104805529612096 -0.010430718823329282 115 | NA20818 -0.033609576645005836 -0.026655905606186293 116 | NA20819 0.032019557126552155 -0.00775750983842731 117 | NA20826 0.03026607917284046 -0.009102704080927001 118 | NA20828 -0.03412964005321165 -0.025991697661590686 119 | NA21313 -0.03401702847363714 -0.024555217139987182 120 | ``` 121 | 122 | This pipeline is described in greater detail on [How do I compute principal coordinate analysis with Google Genomics?](http://googlegenomics.readthedocs.org/en/latest/use_cases/compute_principal_coordinate_analysis/index.html) 123 | 124 | ### Debugging 125 | 126 | For more information, see https://cloud.google.com/dataproc/faq 127 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/rdd/ReadsRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples.rdd 17 | 18 | import java.util.{List => JList} 19 | 20 | import com.google.cloud.genomics.Client 21 | import com.google.cloud.genomics.utils.OfflineAuth 22 | import com.google.cloud.genomics.utils.ShardBoundary 23 | import com.google.cloud.genomics.utils.ShardUtils 24 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 25 | import com.google.cloud.genomics.utils.grpc.ReadStreamIterator 26 | import com.google.genomics.v1.StreamReadsRequest 27 | import com.google.genomics.v1.{Read => ReadModel} 28 | import com.google.protobuf.ByteString 29 | import com.google.protobuf.ListValue 30 | import com.google.protobuf.Value 31 | 32 | import org.apache.spark.Partition 33 | import org.apache.spark.SparkContext 34 | import org.apache.spark.SparkContext._ 35 | import org.apache.spark.TaskContext 36 | import org.apache.spark.rdd.RDD 37 | import scala.collection.JavaConversions._ 38 | import scala.collection.JavaConverters._ 39 | 40 | /** 41 | * A serializable version of the Read. 
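 * Fields from the gRPC Read message are copied into plain JVM types here; for
 * background on why a copy is kept instead of the protobuf class itself, see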
42 | * https://github.com/googlegenomics/spark-examples/issues/84 43 | */ 44 | case class Read(alignedQuality: JList[Integer], cigar: String, 45 | id: String, mappingQuality: Int, matePosition: Option[Long], 46 | mateReferenceName: Option[String], fragmentName: String, alignedSequence: String, 47 | position: Long, readGroupSetId: String, referenceName: String, 48 | info: Map[String, JList[String]], fragmentLength: Int) extends Serializable 49 | 50 | object ReadBuilder { 51 | 52 | val CIGAR_MATCH = Map( 53 | "ALIGNMENT_MATCH" -> "M", 54 | "CLIP_HARD" -> "H", 55 | "CLIP_SOFT" -> "S", 56 | "DELETE" -> "D", 57 | "INSERT" -> "I", 58 | "PAD" -> "P", 59 | "SEQUENCE_MATCH" -> "=", 60 | "SEQUENCE_MISMATCH" -> "X", 61 | "SKIP" -> "N") 62 | 63 | def fromJavaRead(r: ReadModel) = { 64 | val readKey = ReadKey(r.getAlignment.getPosition.getReferenceName, 65 | r.getAlignment.getPosition.getPosition) 66 | 67 | val cigar = r.getAlignment.getCigarList.map(cigarUnit => 68 | cigarUnit.getOperationLength() + 69 | CIGAR_MATCH(cigarUnit.getOperation().name())).mkString("") 70 | 71 | val read = Read( 72 | r.getAlignedQualityList, 73 | cigar, 74 | r.getId, 75 | r.getAlignment.getMappingQuality, 76 | Some(r.getNextMatePosition.getPosition), 77 | Some(r.getNextMatePosition.getReferenceName), 78 | r.getFragmentName, 79 | r.getAlignedSequence, 80 | r.getAlignment.getPosition.getPosition, 81 | r.getReadGroupSetId, 82 | r.getAlignment.getPosition.getReferenceName, 83 | r.getInfo.mapValues(_.getValuesList.map(_.getStringValue()).toList.asJava).toMap, 84 | r.getFragmentLength) 85 | (readKey, read) 86 | } 87 | } 88 | 89 | /** 90 | * A simple Spark RDD backed by Google Genomics Readstore and populated 91 | * via the StreamReads API call (https://cloud.google.com/genomics/reference/rpc/google.genomics.v1#streamingreadservice). 92 | */ 93 | class ReadsRDD(sc: SparkContext, 94 | applicationName: String, 95 | auth: OfflineAuth, 96 | readGroupSetId: String, 97 | readsPartitioner: ReadsPartitioner) 98 | extends RDD[(ReadKey, Read)](sc, Nil) { 99 | 100 | override def getPartitions: Array[Partition] = { 101 | readsPartitioner.getPartitions(readGroupSetId) 102 | } 103 | 104 | override def compute(part: Partition, ctx: TaskContext): 105 | Iterator[(ReadKey, Read)] = { 106 | val client = Client(auth).genomics 107 | val partition = part.asInstanceOf[ReadsPartition] 108 | val request = partition.getReadsRequest 109 | val responses = ReadStreamIterator.enforceShardBoundary( 110 | auth, request, ShardBoundary.Requirement.STRICT, null); 111 | val iterator = responses.flatMap(readResponse => { 112 | readResponse.getAlignmentsList().map(read => { 113 | ReadBuilder.fromJavaRead(read) 114 | }) 115 | }) 116 | // Wrap the iterator to read the number of initialized requests once 117 | // it is fully traversed. 118 | new Iterator[(ReadKey, Read)]() { 119 | def hasNext = { 120 | val hasNext = iterator.hasNext 121 | hasNext 122 | } 123 | 124 | def next = iterator.next 125 | } 126 | } 127 | } 128 | 129 | /** 130 | * Defines a search range over a contig. 131 | */ 132 | case class ReadsPartition( 133 | override val index: Int, serializedRequest: ByteString) 134 | extends Partition { 135 | 136 | def getReadsRequest = StreamReadsRequest.parseFrom(serializedRequest) 137 | 138 | def range = { 139 | val request = getReadsRequest 140 | request.getEnd() - request.getStart() 141 | } 142 | } 143 | 144 | 145 | /** 146 | * Indexes a Read to its partition. 
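 * The key is the (referenceName, alignment start position) pair populated by
 * ReadBuilder.fromJavaRead.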
147 | */ 148 | case class ReadKey(contig: String, position: Long) 149 | 150 | trait ReadsPartitioner extends Serializable { 151 | def getPartitions(readGroupSetId: String): Array[Partition] 152 | } 153 | 154 | 155 | /** 156 | * Describes partitions for a set of contigs and their ranges. 157 | */ 158 | class AllReferencesReadsPartitioner(numberOfBasesPerShard: Long, 159 | auth: OfflineAuth) extends ReadsPartitioner { 160 | 161 | // Generates all partitions for all mapped Reads in the contig space. 162 | def getPartitions(readGroupSetId: String): Array[Partition] = { 163 | println(s"ReadGroupSetId: ${readGroupSetId}; All refs, exclude XY") 164 | ShardUtils.getReadRequests( 165 | readGroupSetId, SexChromosomeFilter.INCLUDE_XY, 166 | numberOfBasesPerShard, auth).zipWithIndex.map { 167 | case(request, index) => ReadsPartition(index, request.toByteString()) 168 | }.toArray 169 | } 170 | } 171 | 172 | class ReferencesReadsPartitioner(references: String, 173 | numberOfBasesPerShard: Long) extends ReadsPartitioner { 174 | // Generates all partitions for all mapped Reads in the contig space. 175 | def getPartitions(readGroupSetId: String): Array[Partition] = { 176 | println(s"ReadGroupSetId: ${readGroupSetId}; Refs: ${references}") 177 | ShardUtils.getReadRequests( 178 | List(readGroupSetId), references, numberOfBasesPerShard).zipWithIndex.map { 179 | case(request, index) => ReadsPartition(index, request.toByteString) 180 | }.toArray 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/main/python/variants_pca.py: -------------------------------------------------------------------------------- 1 | # spark-submit --jars googlegenomics-spark-examples-assembly-1.0.jar \ 2 | # --driver-class-path googlegenomics-spark-examples-assembly-1.0.jar \ 3 | # src/main/python/variants_pca.py --client-secrets client_secrets.json 4 | import json 5 | import numpy 6 | import operator 7 | import sys 8 | 9 | import pyspark 10 | from pyspark import serializers 11 | import pyspark.conf 12 | from pyspark.mllib import common as mllib_common 13 | from pyspark.mllib import linalg 14 | import pyspark.rdd 15 | 16 | conf = pyspark.conf.SparkConf() 17 | sc = pyspark.SparkContext(conf=conf) 18 | 19 | def prepare_call_data(py_rdd, py_id_to_index): 20 | """Return an RDD[Seq[int]] from the RDD[(VariantKey, Variant)]. 21 | 22 | Args: 23 | py_rdd: An RDD of (VariantKey, Variant) of all Variants matching the 24 | search criteria. 25 | py_id_to_index: A dictionary of string to int, giving the indices of 26 | callset names in ``py_rdd``. 27 | 28 | Returns: 29 | An RDD[Seq[int]] in the same order of the input RDD, each entry is a 30 | list of indices of variant calls. 31 | """ 32 | 33 | # Obtain all samples that have at least one matching call. 34 | samples_with_variant = (py_rdd. 35 | map(lambda v: v.get('calls', [])). 36 | map(lambda calls: [c for c in calls if any(c['genotype'])]). 37 | filter(lambda calls: len(calls) > 0) 38 | ) 39 | 40 | # Obtain the callset name from the samples. 41 | callset_names = (samples_with_variant. 42 | map(lambda callset: [c['callSetId'] for c in callset]) 43 | ) 44 | 45 | # Convert all names (strings) to indices (ints). 
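    # Broadcasting the id-to-index map ships it to each executor once, rather
    # than serializing it into every task closure that uses it.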
46 | sc = pyspark.SparkContext._active_spark_context 47 | broadcast_index_map = sc.broadcast(py_id_to_index) 48 | call_rdd = callset_names.map( 49 | lambda callset: [broadcast_index_map.value[c] for c in callset] 50 | ) 51 | 52 | return call_rdd 53 | 54 | def calculate_similarity_matrix(call_rdd, matrix_size): 55 | """Return an RDD[(int, int), int] where each entry is similarity value of 56 | call ``x``, with respect to call ``y``. 57 | 58 | Args: 59 | call_rdd: An RDD[Seq[int]] as returned by ``prepare_call_data``. 60 | matrix_size: The size (N) of the N x N matrix. 61 | 62 | Returns: 63 | An RDD[(x, y), sim_value] where each entry is similarity value of call 64 | ``x`` with respect to call ``y``. 65 | """ 66 | 67 | def sum_similarity(callsets): 68 | matrix = numpy.zeros((matrix_size, matrix_size), numpy.int) 69 | for callset in callsets: 70 | for x in callset: 71 | for y in callset: 72 | matrix[y][x] += 1 73 | for x in xrange(matrix_size): 74 | for y in xrange(matrix_size): 75 | yield (y, x), matrix[y][x] 76 | 77 | sim_matrix = (call_rdd. 78 | mapPartitions(sum_similarity). 79 | reduceByKey(operator.add) 80 | ) 81 | 82 | return sim_matrix 83 | 84 | def center_matrix(sim_matrix, row_count): 85 | """Center the rows and columns of a similarity matrix. 86 | 87 | Args: 88 | sim_matrix: A similarity matrix as returned by 89 | ``calculate_similarity_matrix``. 90 | row_count: The size (N) of the N x N matrix. 91 | 92 | Returns: 93 | An RDD[int, (int, float)] representing centered rows. The first int is 94 | the row index, the (int, float) tuple is the column index, and the 95 | centered value. 96 | """ 97 | 98 | # Row-by-row (row major) RDD. Each row is a list of (column, value). 99 | entries = (sim_matrix. 100 | map(lambda ((y, x), v): (y, (x, float(v)))). 101 | groupByKey(). 102 | sortByKey(True). 103 | cache() 104 | ) 105 | row_sums = entries.map(lambda (y, xvs): sum(v for (x, v) in xvs)).collect() 106 | matrix_sum = sum(row_sums) 107 | matrix_mean = float(matrix_sum) / row_count / row_count 108 | 109 | sc = pyspark.SparkContext._active_spark_context 110 | broadcast_row_sums = sc.broadcast(row_sums) 111 | 112 | def center_rows((row, col_vals)): 113 | row_mean = broadcast_row_sums.value[row] / float(row_count) 114 | 115 | def center_cols(col, val): 116 | col_mean = broadcast_row_sums.value[col] / float(row_count) 117 | return (col, val - row_mean - col_mean + matrix_mean) 118 | 119 | return [center_cols(col, val) for col, val in col_vals] 120 | 121 | return entries.map(center_rows) 122 | 123 | def perform_pca(matrix, row_count, nr_principal_components=2): 124 | """Return principal components of the input matrix. 125 | 126 | This function uses MLlib's ``RowMatrix`` to compute principal components. 127 | 128 | Args: 129 | matrix: An RDD[int, (int, float)] representing a sparse matrix. This 130 | is returned by ``center_matrix`` but it is not required to center 131 | the matrix first. 132 | row_count: The size (N) of the N x N ``matrix``. 133 | nr_principal_components: Number of components we want to obtain. This 134 | value must be less than or equal to the number of rows in the input 135 | square matrix. 136 | 137 | Returns: 138 | An array of ``nr_principal_components`` columns, and same number of rows 139 | as the input ``matrix``. This array is a ``numpy`` array. 
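        Columns hold the principal components in decreasing order of explained
        variance, as computed by MLlib's RowMatrix.computePrincipalComponents.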
140 | """ 141 | 142 | py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row)) 143 | sc = pyspark.SparkContext._active_spark_context 144 | java_rdd = mllib_common._py2java(sc, py_rdd) 145 | scala_rdd = java_rdd.rdd() 146 | sc = pyspark.SparkContext._active_spark_context 147 | row_matrix = (sc._jvm.org.apache.spark.mllib.linalg.distributed. 148 | RowMatrix(scala_rdd) 149 | ) 150 | pca = row_matrix.computePrincipalComponents(nr_principal_components) 151 | pca = mllib_common._java2py(sc, pca) 152 | return pca.toArray() 153 | 154 | def pca(argv): 155 | sc = pyspark.SparkContext._active_spark_context 156 | args = sc._jvm.java.util.ArrayList() 157 | for arg in argv: 158 | args.append(arg) 159 | args = sc._jvm.scala.collection.JavaConversions.asScalaBuffer(args) 160 | jsc = sc._jsc.sc() 161 | 162 | pca_conf = (sc._jvm.com.google.cloud.genomics.spark.examples. 163 | PcaConf(args)) 164 | variants_common = (sc._jvm.com.google.cloud.genomics.spark.examples. 165 | VariantsCommon(pca_conf, jsc)) 166 | 167 | # Map of sample ID to an index in the list of all sample IDs. 168 | # e.g. the list ['NA20818', 'NA20819', 'NA20826'] produces the map 169 | # {'NA20818': 0, 'NA20819': 1, 'NA20826', 2} 170 | java_id_to_index = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap( 171 | variants_common.indexes()) 172 | py_id_to_index = {} 173 | for k in java_id_to_index: 174 | py_id_to_index[k] = java_id_to_index[k] 175 | # This is the reverse map of the previous one. 176 | index_to_id = dict((v, k) for (k, v) in py_id_to_index.iteritems()) 177 | 178 | # Obtain an RDD of all Variants matching PcaConf. 179 | scala_rdd = variants_common.getJavaData() 180 | java_rdd = scala_rdd.toJavaRDD() 181 | # Convert it to Python RDD. 182 | py_rdd = mllib_common._java2py(sc, java_rdd) 183 | 184 | call_rdd = prepare_call_data(py_rdd, py_id_to_index) 185 | 186 | row_count = len(py_id_to_index) 187 | sim_matrix = calculate_similarity_matrix(call_rdd, row_count) 188 | 189 | centered_rows = center_matrix(sim_matrix, row_count) 190 | 191 | result = perform_pca(centered_rows, row_count, 192 | pca_conf.numPc().get().get()) 193 | assert(len(result) == len(index_to_id)) 194 | result = [(index_to_id[i], result[i]) for i in range(len(index_to_id))] 195 | result.sort() 196 | for name, components in result: 197 | print '%s\t%s' % (name, '\t'.join(str(c) for c in components)) 198 | 199 | sc.stop() 200 | 201 | pca(sys.argv[1:]) 202 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/rdd/VariantsRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.spark.examples.rdd 17 | 18 | import java.lang.{Double => JDouble} 19 | import java.util.{List => JList} 20 | 21 | import com.google.cloud.genomics.Client 22 | import com.google.cloud.genomics.utils.OfflineAuth 23 | import com.google.cloud.genomics.utils.ShardBoundary 24 | import com.google.cloud.genomics.utils.ShardUtils 25 | import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter 26 | import com.google.cloud.genomics.utils.grpc.VariantStreamIterator 27 | import com.google.genomics.v1.StreamVariantsRequest 28 | import com.google.genomics.v1.{Variant => VariantModel} 29 | import com.google.genomics.v1.{VariantCall => CallModel} 30 | import com.google.protobuf.ByteString 31 | import com.google.protobuf.ListValue 32 | import com.google.protobuf.Value 33 | 34 | import org.apache.spark.Accumulator 35 | import org.apache.spark.Partition 36 | import org.apache.spark.SparkContext 37 | import org.apache.spark.SparkContext._ 38 | import org.apache.spark.TaskContext 39 | import org.apache.spark.rdd.RDD 40 | import scala.collection.JavaConversions._ 41 | 42 | /** 43 | * A serializable version of the Variant. 44 | * https://github.com/googlegenomics/spark-examples/issues/84 45 | */ 46 | case class Call(callsetId: String, callsetName: String, genotype: List[Integer], 47 | genotypeLikelihood: Option[List[JDouble]], phaseset: String, 48 | info: Map[String, List[String]]) extends Serializable 49 | 50 | 51 | case class Variant(contig: String, id: String, names: Option[List[String]], 52 | start: Long, end: Long, referenceBases: String, 53 | alternateBases: Option[List[String]], info: Map[String, List[String]], 54 | created: Long, variantSetId: String, calls: Option[Seq[Call]]) extends Serializable { 55 | 56 | def toListValue(values: List[String]) = { 57 | val listValue = ListValue.newBuilder() 58 | listValue.addAllValues( 59 | values.map(Value.newBuilder().setStringValue(_).build)) 60 | listValue.build 61 | } 62 | 63 | def toJavaVariant() = { 64 | val variant = VariantModel.newBuilder() 65 | .setReferenceName(this.contig) 66 | .setCreated(this.created) 67 | .setVariantSetId(this.variantSetId) 68 | .setId(this.id) 69 | .setStart(this.start) 70 | .setEnd(this.end) 71 | .setReferenceBases(this.referenceBases) 72 | 73 | variant.putAllInfo(this.info.mapValues(toListValue)) 74 | 75 | if (this.alternateBases isDefined) 76 | variant.addAllAlternateBases(this.alternateBases.get) 77 | if (this.names isDefined) 78 | variant.addAllNames(this.names.get) 79 | if (this.calls isDefined) { 80 | val calls = this.calls.get.map 81 | { c => 82 | val call = CallModel.newBuilder() 83 | .setCallSetId(c.callsetId) 84 | .setCallSetName(c.callsetName) 85 | 86 | call.addAllGenotype(c.genotype) 87 | call.setPhaseset(c.phaseset) 88 | 89 | call.putAllInfo(c.info.mapValues(toListValue)) 90 | if (c.genotypeLikelihood isDefined) 91 | call.addAllGenotypeLikelihood(c.genotypeLikelihood.get) 92 | call.build 93 | } 94 | variant.addAllCalls(calls) 95 | } 96 | variant.build 97 | } 98 | } 99 | 100 | 101 | object VariantsBuilder { 102 | 103 | val refNameRegex = """([a-z]*)?([0-9]*)""".r 104 | 105 | def normalize(referenceName: String) = { 106 | referenceName match { 107 | case refNameRegex(ref, id) => Some(id) 108 | case _ => None 109 | } 110 | } 111 | 112 | def toStringList(values: ListValue) = 113 | values.getValuesList.map(_.getStringValue()).toList 114 | 115 | def build(r: VariantModel) = { 116 | val variantKey = VariantKey(r.getReferenceName, r.getStart) 117 | val calls = if 
(r.getCallsCount > 0) 118 | Some(r.getCallsList.map( 119 | c => Call( 120 | c.getCallSetId, 121 | c.getCallSetName, 122 | c.getGenotypeList.toList, 123 | if (c.getGenotypeLikelihoodCount > 0) 124 | Some(c.getGenotypeLikelihoodList.toList) 125 | else 126 | None, 127 | c.getPhaseset, 128 | c.getInfo.mapValues(toStringList).toMap))) 129 | else 130 | None 131 | 132 | val referenceName = normalize(r.getReferenceName) 133 | 134 | if (referenceName.isEmpty) { 135 | None; 136 | } else { 137 | val variant = Variant( 138 | referenceName.get, 139 | r.getId, 140 | if (r.getNamesCount() > 0) 141 | Some(r.getNamesList.toList) 142 | else 143 | None, 144 | r.getStart, 145 | r.getEnd, 146 | r.getReferenceBases, 147 | if (r.getAlternateBasesCount() > 0) 148 | Some(r.getAlternateBasesList.toList) 149 | else 150 | None, 151 | r.getInfo.mapValues(toStringList).toMap, 152 | r.getCreated, 153 | r.getVariantSetId, 154 | calls) 155 | Some((variantKey, variant)) 156 | } 157 | } 158 | } 159 | 160 | class VariantsRddStats(sc: SparkContext) extends Serializable { 161 | val partitionsAccum = sc.accumulator(0, "Partitions count") 162 | val referenceBasesAccum = sc.accumulator(0L, "Reference bases count") 163 | val requestsAccum = sc.accumulator(0, "Request count") 164 | val unsuccessfulResponsesAccum = sc.accumulator(0, "Unsuccessful count") 165 | val ioExceptionsAccum = sc.accumulator(0, "IO exceptions count") 166 | val variantsAccum = sc.accumulator(0, "Variant count") 167 | 168 | override def toString ={ 169 | val buf = new StringBuilder 170 | buf ++= "Variants API stats:\n" 171 | buf ++= "-------------------------------\n" 172 | buf ++= s"# of partitions: ${this.partitionsAccum}\n" 173 | buf ++= s"# of bases requested: ${this.referenceBasesAccum}\n" 174 | buf ++= s"# of variants read: ${this.variantsAccum}\n" 175 | buf ++= s"# of API requests: ${this.requestsAccum}\n" 176 | buf ++= s"# of unsuccessful responses: ${this.unsuccessfulResponsesAccum}\n" 177 | buf ++= s"# of IO exceptions: ${this.ioExceptionsAccum}\n" 178 | buf.toString 179 | } 180 | } 181 | 182 | /** 183 | * A simple Spark RDD backed by Google Genomics VariantStore and 184 | * populated via the StreamVariants API call 185 | * (https://cloud.google.com/genomics/reference/rpc/google.genomics.v1#streamingvariantservice). 
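 * Each partition carries one serialized StreamVariantsRequest shard; compute()
 * streams that shard and converts each protobuf record into the Variant case
 * class defined above.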
186 | */ 187 | class VariantsRDD(sc: SparkContext, 188 | applicationName: String, 189 | auth: OfflineAuth, 190 | variantSetId: String, 191 | variantsPartitioner: VariantsPartitioner, 192 | stats:Option[VariantsRddStats] = None) 193 | extends RDD[(VariantKey, Variant)](sc, Nil) { 194 | 195 | override def getPartitions: Array[Partition] = { 196 | variantsPartitioner.getPartitions(variantSetId) 197 | } 198 | 199 | def reportStats(client: Client) = stats map { stat => 200 | stat.requestsAccum += client.initializedRequestsCount 201 | stat.unsuccessfulResponsesAccum += client.unsuccessfulResponsesCount 202 | stat.ioExceptionsAccum += client.ioExceptionsCount 203 | } 204 | 205 | override def compute(part: Partition, ctx: TaskContext): 206 | Iterator[(VariantKey, Variant)] = { 207 | val client = Client(auth) 208 | val partition = part.asInstanceOf[VariantsPartition] 209 | val request = partition.getVariantsRequest 210 | val responses = VariantStreamIterator.enforceShardBoundary( 211 | auth, request, ShardBoundary.Requirement.STRICT, null); 212 | val iterator = responses.flatMap(variantResponse => { 213 | variantResponse.getVariantsList().map(variant => { 214 | stats map { _.variantsAccum += 1 } 215 | VariantsBuilder.build(variant) 216 | }) 217 | }).filter(_.isDefined).map(_.get) 218 | stats map { stat => 219 | stat.partitionsAccum += 1 220 | stat.referenceBasesAccum += (partition.range) 221 | } 222 | // Wrap the iterator to read the number of initialized requests once 223 | // it is fully traversed. 224 | new Iterator[(VariantKey, Variant)]() { 225 | def hasNext = { 226 | val hasNext = iterator.hasNext 227 | if (!hasNext) { 228 | reportStats(client) 229 | } 230 | hasNext 231 | } 232 | 233 | def next = iterator.next 234 | } 235 | } 236 | } 237 | 238 | 239 | /** 240 | * Defines a search range over a contig. 241 | */ 242 | case class VariantsPartition( 243 | override val index: Int, serializedRequest: ByteString) 244 | extends Partition { 245 | 246 | def getVariantsRequest = StreamVariantsRequest.parseFrom(serializedRequest) 247 | 248 | def range = { 249 | val request = getVariantsRequest 250 | request.getEnd() - request.getStart() 251 | } 252 | } 253 | 254 | 255 | /** 256 | * Indexes a variant to its partition. 257 | */ 258 | case class VariantKey(contig: String, position: Long) 259 | 260 | trait VariantsPartitioner extends Serializable { 261 | def getPartitions(variantSetId: String): Array[Partition] 262 | } 263 | 264 | 265 | /** 266 | * Describes partitions for a set of contigs and their ranges. 267 | */ 268 | class AllReferencesVariantsPartitioner(numberOfBasesPerShard: Long, 269 | auth: OfflineAuth) extends VariantsPartitioner { 270 | 271 | // Generates all partitions for all mapped variants in the contig space. 272 | def getPartitions(variantSetId: String): Array[Partition] = { 273 | println(s"Variantset: ${variantSetId}; All refs, exclude XY") 274 | ShardUtils.getVariantRequests( 275 | variantSetId, SexChromosomeFilter.EXCLUDE_XY, 276 | numberOfBasesPerShard, auth).zipWithIndex.map { 277 | case(request, index) => VariantsPartition(index, request.toByteString()) 278 | }.toArray 279 | } 280 | } 281 | 282 | class ReferencesVariantsPartitioner(references: String, 283 | numberOfBasesPerShard: Long) extends VariantsPartitioner { 284 | // Generates all partitions for all mapped variants in the contig space. 
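  // Unlike the partitioner above, the contig space here is limited to the
  // references string supplied on the command line (e.g.
  // "chr17:41196311:41277499"), sharded by numberOfBasesPerShard.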
285 | def getPartitions(variantSetId: String): Array[Partition] = { 286 | println(s"Variantset: ${variantSetId}; Refs: ${references}") 287 | ShardUtils.getVariantRequests( 288 | variantSetId, references, numberOfBasesPerShard).zipWithIndex.map { 289 | case(request, index) => VariantsPartition(index, request.toByteString) 290 | }.toArray 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/VariantsPca.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | 20 | import org.apache.log4j.Level 21 | import org.apache.log4j.Logger 22 | import org.apache.spark.SparkContext 23 | import org.apache.spark.broadcast.Broadcast 24 | import org.apache.spark.mllib.linalg.Vectors 25 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.rdd.RDD._ 28 | import org.apache.spark.rdd.UnionRDD 29 | 30 | import com.google.cloud.genomics.spark.examples.rdd.Variant 31 | import com.google.common.base.Charsets 32 | import com.google.common.hash.Hashing 33 | 34 | import breeze.linalg.DenseMatrix 35 | 36 | object VariantsPcaDriver { 37 | 38 | def main(args: Array[String]) = { 39 | Logger.getLogger("org").setLevel(Level.WARN) 40 | val conf = new PcaConf(args) 41 | val driver = VariantsPcaDriver(conf) 42 | val data = driver.getData 43 | val filtered = data.map(driver.filterDataset) 44 | val callsRdd = driver.getCallsRdd(filtered) 45 | val simMatrix = driver.getSimilarityMatrix(callsRdd) 46 | val result = driver.computePca(simMatrix) 47 | driver.emitResult(result) 48 | driver.reportIoStats 49 | driver.stop 50 | } 51 | 52 | def apply(conf: PcaConf) = new VariantsPcaDriver(conf) 53 | 54 | // The following functions are defined on the companion object so they can be 55 | // serialized and used on the RDD functions. 
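  // Keeping them off the VariantsPcaDriver instance avoids capturing the
  // SparkContext and configuration in the task closures Spark ships to executors.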
56 | def extractCallInfo(variant: Variant, mapping: Map[String, Int]) = { 57 | variant.calls.getOrElse(Seq()).map( 58 | call => CallData(call.genotype.foldLeft(false)(_ || _ > 0), 59 | mapping(call.callsetId))) 60 | } 61 | 62 | def getVariantKey(variant: Variant, debug:Boolean = false) = { 63 | val alternateBases = variant.alternateBases.map( 64 | altBases => altBases.mkString("")).getOrElse("") 65 | val referenceBases = Option(variant.referenceBases).getOrElse("") 66 | if (debug) { 67 | println(s"${variant.contig}: (${variant.start}, ${variant.end}) ref=${referenceBases} alt=${alternateBases}") 68 | } 69 | Hashing.murmur3_128().newHasher() 70 | .putString(variant.contig, Charsets.UTF_8) 71 | .putLong(variant.start) 72 | .putLong(variant.end) 73 | .putString(referenceBases, Charsets.UTF_8) 74 | .putString( 75 | alternateBases, 76 | Charsets.UTF_8) 77 | .hash().toString() 78 | } 79 | } 80 | 81 | class VariantsPcaDriver(conf: PcaConf, ctx: SparkContext = null) { 82 | private val applicationName = this.getClass.getName 83 | private val sc = if (ctx != null) ctx 84 | else conf.newSparkContext(applicationName) 85 | private val common = new VariantsCommon(conf, sc) 86 | 87 | def getData = common.data 88 | 89 | 90 | /** 91 | * Filter datasets according to the specified flags. 92 | * 93 | * Possible flags include: 94 | * --min-allele-frequency 95 | */ 96 | def filterDataset(data: RDD[Variant]) = { 97 | if (conf.minAlleleFrequency.isDefined) { 98 | val minAlleleFrequency = conf.minAlleleFrequency() 99 | println(s"Min allele frequency ${minAlleleFrequency}.") 100 | data.filter(variant => { 101 | val alleleFrequency = variant.info.get("AF") 102 | alleleFrequency.map(_.get(0).toFloat >= minAlleleFrequency) 103 | .getOrElse(false) 104 | }) 105 | } else { 106 | data 107 | } 108 | } 109 | /** 110 | * Returns an RDD of calls joined by their variant matching key. 111 | * 112 | * The key is composed by the reference name its start and end positions, 113 | * as well as the reference and alternate bases. 114 | */ 115 | def joinDatasets(datasets: List[RDD[Variant]], 116 | broadcastIndexes: Broadcast[Map[String, Int]]): RDD[Seq[CallData]] = { 117 | val broadcastIndexes = sc.broadcast(common.indexes) 118 | val debugDatasets = conf.debugDatasets() 119 | val callsets = datasets.map(_.map(variant => 120 | (VariantsPcaDriver.getVariantKey(variant, debugDatasets), variant)) 121 | .mapValues( 122 | VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value))) 123 | val callset1 = callsets(0) 124 | val callset2 = callsets(1) 125 | callset1.join(callset2, conf.numReducePartitions()) 126 | .values 127 | .map(related => related._1 ++ related._2) 128 | } 129 | 130 | /** 131 | * Returns an RDD of calls merged by their variant matching key. 132 | * 133 | * The key is composed by the reference name its start and end positions, 134 | * as well as the reference and alternate bases. 
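   * Only keys present in every input variant set are kept: grouped values whose
   * size differs from variantSetCount are filtered out before the calls are
   * concatenated.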
135 | */ 136 | def mergeDatasets(data: List[RDD[Variant]], variantSetCount: Int, 137 | broadcastIndexes: Broadcast[Map[String, Int]]): RDD[Seq[CallData]]= { 138 | val callsets = new UnionRDD(sc, data) 139 | val broadcastIndexes = sc.broadcast(common.indexes) 140 | callsets.map(variant => 141 | (VariantsPcaDriver.getVariantKey(variant), variant)) 142 | .mapValues( 143 | VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value)) 144 | .groupByKey(conf.numReducePartitions()) 145 | .values 146 | .filter(_.size() == variantSetCount) 147 | .map(related => related.flatMap(calls => calls).toSeq) 148 | } 149 | 150 | /** 151 | * Returns an RDD of variant callsets with each call mapped to a position. 152 | */ 153 | def getCallsRdd(data: List[RDD[Variant]]): RDD[Seq[Int]] = { 154 | val variantSetCount = conf.variantSetId().size 155 | val broadcastIndexes = sc.broadcast(common.indexes) 156 | val callsets = if (variantSetCount == 1) { 157 | data.head.map(VariantsPcaDriver.extractCallInfo(_, broadcastIndexes.value)) 158 | } else if (variantSetCount == 2) { 159 | joinDatasets(data, broadcastIndexes) 160 | } else { 161 | mergeDatasets(data, variantSetCount, broadcastIndexes) 162 | } 163 | return callsets 164 | .map(calls => calls.filter(_.hasVariation)) 165 | // Return only those sets that have at least one call with variation. 166 | .filter(_.size > 0) 167 | .map(_.map(_.callsetId)) 168 | } 169 | 170 | /** 171 | * Computes a similarity matrix from the variant information. 172 | * 173 | * This method computes the similarity in place, this means it updates the 174 | * the counts in place on a pre-allocated dense matrix. 175 | * 176 | * Use this method if the partial matrix will fit in memory, roughly 177 | * a data set with 50K samples would fit on ~20GB of memory. 178 | * 179 | * @param calls an RDD of call ids, one variant per record. 180 | * @return an RDD of tuples with the matrix entry indexes and its similarity. 181 | */ 182 | def getSimilarityMatrix(callsets: RDD[Seq[Int]]) = { 183 | val size = common.indexes.size 184 | callsets.mapPartitions(callsInPartition => { 185 | val matrix = DenseMatrix.zeros[Int](size, size) 186 | callsInPartition.foreach(callset => 187 | for (c1 <- callset; c2 <- callset) 188 | matrix.update(c1, c2, matrix(c1, c2) + 1)) 189 | matrix.iterator 190 | }).reduceByKey(_ + _, conf.numReducePartitions()) 191 | } 192 | 193 | /** 194 | * Computes the PCA from the similarity matrix entries. 195 | * 196 | * @param matrixEntries an RDD of tuples representing the matrix entries. 
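   * @return one (callset id, first principal component, second principal
   *         component) tuple per sample, in callset index order.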
197 | */ 198 | def computePca(matrixEntries: RDD[((Int, Int), Int)]) = { 199 | val rowCount = common.indexes.size 200 | val entries = 201 | matrixEntries.map(item => (item._1._1, item._1._2, item._2.toDouble)) 202 | .map(item => (item._1, (item._2, item._3))) 203 | .groupByKey() 204 | .sortByKey(true) 205 | .cache 206 | val rowSums = entries.map(_._2.foldLeft(0D)(_ + _._2)).collect 207 | val nonZeroRows = rowSums.filter(_ > 0).size 208 | println(s"Non zero rows in matrix: ${nonZeroRows} / ${common.indexes.size}.") 209 | val broadcastRowSums = sc.broadcast(rowSums) 210 | val matrixSum = rowSums.reduce(_ + _) 211 | val matrixMean = matrixSum / rowCount / rowCount; 212 | val centeredRows = entries.map(indexedRow => { 213 | val localRowSums = broadcastRowSums.value 214 | val i = indexedRow._1 215 | val row = indexedRow._2 216 | val rowMean = localRowSums(i) / rowCount; 217 | row.map(entry => { 218 | val j = entry._1 219 | val data = entry._2 220 | val colMean = localRowSums(j) / rowCount; 221 | (j, data - rowMean - colMean + matrixMean) 222 | }).toSeq 223 | }) 224 | val rows = centeredRows.map(row => Vectors.sparse(rowCount, row)) 225 | val matrix = new RowMatrix(rows) 226 | val pca = matrix.computePrincipalComponents(conf.numPc()) 227 | val array = pca.toArray 228 | val reverse = common.indexes.map(_.swap) 229 | for (i <- 0 until pca.numRows) 230 | yield (reverse(i), array(i), array(i + pca.numRows)) 231 | } 232 | 233 | def emitResult(result: Seq[(String, Double, Double)]) { 234 | val resultWithNames = result.map { tuple => 235 | val dataset = tuple._1.split("-").head 236 | (common.names(tuple._1), tuple._2, tuple._3, dataset) 237 | } 238 | resultWithNames.sortBy(_._1).foreach(tuple => 239 | println(s"${tuple._1}\t${tuple._4}\t${tuple._2}\t${tuple._3}")) 240 | 241 | if(conf.outputPath.isDefined) { 242 | val resultRdd = sc.parallelize(resultWithNames) 243 | resultRdd.map(tuple => s"${tuple._1}\t${tuple._2}\t${tuple._3}\t${tuple._4}") 244 | .saveAsTextFile(conf.outputPath() + "-pca.tsv") 245 | } 246 | } 247 | 248 | /** 249 | * Computes a similarity matrix from the variant information. 250 | * 251 | * This method computes the similarity in a streaming fashion, this means 252 | * it never stores the partial similarity matrix in memory. The drawback 253 | * from this approach is that it generates large shuffles as it emits a 254 | * pair for each co-occurring call on a variant. 255 | * 256 | * Use this method only if the partial matrix won't fit in memory, roughly 257 | * a data set with 50K samples would fit on ~20GB of memory. 258 | * 259 | * @param calls an RDD of call ids, one variant per record. 260 | * @return an RDD of tuples with the matrix entry indexes and its similarity. 261 | */ 262 | def getSimilarityMatrixStream(calls: RDD[Seq[Int]]): 263 | RDD[((Int, Int), Int)] = { 264 | // Keep track of how many calls share the same variant 265 | calls.flatMap(callset => 266 | // Emit only half of the counts 267 | for (c1 <- callset.iterator; c2 <- callset.iterator if c1 <= c2) 268 | yield ((c1, c2), 1)) 269 | // Aggregate the similar pairs, partially done in memory. 270 | .reduceByKey(_ + _, conf.numReducePartitions()) 271 | // Rebuild the symmetric matrix from the aggregated pairs. 
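      // For illustration: an aggregated upper-triangle entry such as
      // ((0, 1), 5) is re-emitted as both ((0, 1), 5) and ((1, 0), 5),
      // while a diagonal entry such as ((2, 2), 7) is kept as a single element.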
272 | .flatMap(item => { 273 | if (item._1._1 < item._1._2) { 274 | Seq(item, ((item._1._2, item._1._1), item._2)) 275 | } else { 276 | Seq(item) 277 | } 278 | }) 279 | } 280 | 281 | def reportIoStats = common.reportIoStats 282 | 283 | def stop { 284 | sc.stop 285 | } 286 | } 287 | 288 | case class CallData(hasVariation: Boolean, callsetId: Int) extends Serializable 289 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /src/main/scala/com/google/cloud/genomics/spark/examples/SearchReadsExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.spark.examples 17 | 18 | import scala.collection.JavaConversions._ 19 | 20 | import scala.collection.mutable.{Map => MutableMap} 21 | import org.apache.log4j.Level 22 | import org.apache.log4j.Logger 23 | import org.apache.spark.SparkContext._ 24 | import com.google.cloud.genomics.spark.examples.rdd.ReadsRDD 25 | import com.google.cloud.genomics.spark.examples.rdd.ReadsPartitioner 26 | import com.google.cloud.genomics.spark.examples.rdd.ReferencesReadsPartitioner 27 | import com.google.cloud.genomics.Authentication 28 | 29 | object Examples { 30 | 31 | final val Google_1KG_HG00096_Readset = "CMvnhpKTFhCwvIWYw9eikzQ" 32 | // From http://googlegenomics.readthedocs.org/en/latest/constants.html 33 | final val Google_Example_Readset = "CMvnhpKTFhD04eLE-q2yxnU" 34 | // Sage Bio DREAM Contest - Synthetic Set #3 35 | val Google_DREAM_Set3_Normal = "CPHG3MzoCRDRkqXzk7b6l_kB" 36 | val Google_DREAM_Set3_Tumor = "CPHG3MzoCRCO1rDx8pOY6yo" 37 | 38 | // SNP @ 6889648 - cilantro/soap variant near OR10A2 39 | final val Cilantro = 6889648L 40 | 41 | final val HumanChromosomes = Map[String, Long]( 42 | ("1", 249250621), 43 | ("2", 243199373), 44 | ("3", 198022430), 45 | ("4", 191154276), 46 | ("5", 180915260), 47 | ("6", 171115067), 48 | ("7", 159138663), 49 | ("8", 146364022), 50 | ("9", 141213431), 51 | ("10", 135534747), 52 | ("11", 135006516), 53 | ("12", 133851895), 54 | ("13", 115169878), 55 | ("14", 107349540), 56 | ("15", 102531392), 57 | ("16", 90354753), 58 | ("17", 81195210), 59 | ("18", 78077248), 60 | ("19", 59128983), 61 | ("20", 63025520), 62 | ("21", 48129895), 63 | ("22", 51304566), 64 | ("X", 155270560), 65 | ("Y", 59373566)) 66 | } 67 | 68 | /** 69 | * This example searches for all reads covering the cilantro/soap SNP near OR10A2 70 | * on chromosome 11 and prints out a pileup. The quality score of each read at 71 | * the SNP location is also printed inline. This can be visualized in the Genomics API Browser: 72 | * http://gabrowse.appspot.com/#backend=GOOGLE&readsetId=CJDmkYn8ChCh4IH4hOf4gacB&location=11%3A6889648 73 | * Note that the reads may be displayed in different order. 
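 *
 * A read is kept for the pileup when it spans the SNP position (the CIGAR
 * string is not yet taken into account), roughly:
 * {{{
 *   read.position <= Examples.Cilantro &&
 *     read.position + read.alignedSequence.length >= Examples.Cilantro
 * }}}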
74 | */ 75 | object SearchReadsExample1 { 76 | def main(args: Array[String]) = { 77 | val conf = new GenomicsConf(args) 78 | val applicationName = this.getClass.getName 79 | val sc = conf.newSparkContext(applicationName) 80 | Logger.getLogger("org").setLevel(Level.WARN) 81 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 82 | val references = s"11:${Examples.Cilantro - 1000}:${Examples.Cilantro + 1000}" 83 | val data = new ReadsRDD(sc, applicationName, accessToken, 84 | Examples.Google_Example_Readset, 85 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 86 | .filter { rk => 87 | val (_, read) = rk 88 | // TODO: Take the cigar into account 89 | read.position <= Examples.Cilantro && read.position + read.alignedSequence.length >= Examples.Cilantro 90 | }.cache() 91 | val first = data.collect.foldLeft(999999999L) { (a, b) => 92 | val (_, read) = b 93 | val p = read.position 94 | if (p < a) { p.toLong } else { a } 95 | } 96 | println(List.fill((Examples.Cilantro - first).toInt)(" ").mkString("") + "v") 97 | val out = data.map { rk => 98 | val (_, read) = rk 99 | val i = (Examples.Cilantro - read.position).toInt 100 | val bases = read.alignedSequence.splitAt(i + 1) 101 | val q = "%02d".format(read.alignedQuality(i)) 102 | List.fill((read.position - first).toInt)(" ").mkString("") + bases._1 + "(" + q + ") " + bases._2 103 | } 104 | // Collect the results so they are printed on the local console. 105 | out.collect.foreach(println(_)) 106 | 107 | println(List.fill((Examples.Cilantro - first).toInt)(" ").mkString("") + "^") 108 | sc.stop 109 | } 110 | } 111 | 112 | /** 113 | * This example computes the average read coverage for a genomic range. 114 | */ 115 | object SearchReadsExample2 { 116 | def main(args: Array[String]) = { 117 | val conf = new GenomicsConf(args) 118 | val applicationName = this.getClass.getName 119 | val sc = conf.newSparkContext(applicationName) 120 | val chr = "21" 121 | val len = Examples.HumanChromosomes(chr) 122 | val references = s"${chr}:1:${len}" 123 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 124 | val data = new ReadsRDD(sc, applicationName, accessToken, 125 | Examples.Google_Example_Readset, 126 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 127 | // TODO: Take the cigar into account 128 | val coverage = data.map(_._2.alignedSequence.length.toLong) 129 | .reduce(_ + _).toDouble / len.toDouble 130 | println("Coverage of chromosome " + chr + " = " + coverage) 131 | sc.stop 132 | } 133 | } 134 | 135 | /** 136 | * This example computes the per-base read depth for a genomic range. 
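 *
 * Depth is computed by emitting (position, 1) for every aligned base of every
 * read and summing with reduceByKey. For two hypothetical reads of length 3
 * starting at positions 10 and 11, the saved output would contain:
 * {{{
 *   (10,1)  (11,2)  (12,2)  (13,1)
 * }}}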
137 | */ 138 | object SearchReadsExample3 { 139 | def main(args: Array[String]) = { 140 | val conf = new GenomicsConf(args) 141 | val outPath = conf.outputPath.orElse(Option("."))() 142 | val applicationName = this.getClass.getName 143 | val sc = conf.newSparkContext(applicationName) 144 | val chr = "21" 145 | val references = s"${chr}:1:${Examples.HumanChromosomes(chr)}" 146 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 147 | val data = new ReadsRDD(sc, applicationName, accessToken, 148 | Examples.Google_Example_Readset, 149 | new ReferencesReadsPartitioner(references, conf.basesPerPartition())) 150 | data.flatMap { rk => 151 | val (_, read) = rk 152 | val cover = MutableMap[Long, Int]() 153 | // TODO: Take the cigar into account 154 | for (i <- 0 until read.alignedSequence.length) { 155 | cover(read.position + i) = 1 156 | } 157 | cover 158 | } 159 | .reduceByKey(_ + _) 160 | .sortByKey(true) // optional, obviously 161 | .saveAsTextFile(outPath + "/coverage_" + chr) 162 | sc.stop 163 | } 164 | } 165 | 166 | /** 167 | * This example illustrates one way to work with multiple RDDs by aggregating and 168 | * comparing bases at the same position in different readsets. It uses synthetic 169 | * tumor-normal data from the ICGC-TCGA DREAM Contest (https://www.synapse.org/#!Synapse:syn312572). 170 | */ 171 | object SearchReadsExample4 { 172 | def main(args: Array[String]) = { 173 | val conf = new GenomicsConf(args) 174 | val outPath = conf.outputPath.orElse(Option("."))() 175 | val applicationName = this.getClass.getName 176 | val accessToken = Authentication.getAccessToken(conf.clientSecrets.get) 177 | val sc = conf.newSparkContext(applicationName) 178 | val chr = "1" 179 | val references = s"${chr}:100000000:101000000" 180 | val minMappingQual = 30 181 | val minBaseQual = 30 182 | val minFreq = 0.25 183 | 184 | // Generates an RDD that maps genomic position to a base read frequencies. 185 | // Reads with a mapping quality less than minMappingQual are discarded 186 | // as are individual bases with base quality scores less than minBaseQual. 
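    // (At a position where the surviving bases are, say, A, A and G, the
    // resulting entry would be roughly Map(A -> 0.667, G -> 0.333).)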
187 | // 188 | // For example, a snippet of the text dump of the RDD for chromosome 1 189 | // (synthetic set #3 normal) looks like: 190 | // (100091811,Map(A -> 1.0)) 191 | // (100091812,Map(G -> 0.5428571428571428, A -> 0.45714285714285713)) 192 | // (100091813,Map(G -> 0.08333333333333333, A -> 0.3611111111111111, C -> 0.5555555555555556)) 193 | // (100091814,Map(G -> 0.30303030303030304, A -> 0.6060606060606061, C -> 0.09090909090909091)) 194 | // (100091815,Map(G -> 0.03125, A -> 0.6875, C -> 0.28125)) 195 | // (100091816,Map(A -> 0.375, C -> 0.03125, T -> 0.59375)) 196 | // (100091817,Map(A -> 0.90625, T -> 0.09375)) 197 | // (100091818,Map(A -> 0.125, T -> 0.875)) 198 | // (100091819,Map(A -> 0.28125, T -> 0.71875)) 199 | // (100091820,Map(G -> 0.6176470588235294, A -> 0.029411764705882353, T -> 0.35294117647058826)) 200 | // (100091821,Map(G -> 0.08823529411764706, C -> 0.6470588235294118, T -> 0.2647058823529412)) 201 | // (100091822,Map(G -> 0.23529411764705882, C -> 0.7352941176470589, T -> 0.029411764705882353)) 202 | // (100091823,Map(G -> 0.029411764705882353, A -> 0.6470588235294118, C -> 0.3235294117647059)) 203 | // (100091824,Map(A -> 0.08823529411764706, C -> 0.2647058823529412, T -> 0.6470588235294118)) 204 | // (100091825,Map(A -> 0.23529411764705882, C -> 0.6764705882352942, T -> 0.08823529411764706)) 205 | // (100091826,Map(A -> 0.6764705882352942, C -> 0.08823529411764706, T -> 0.23529411764705882)) 206 | // (100091827,Map(A -> 0.75, C -> 0.2222222222222222, T -> 0.027777777777777776)) 207 | // (100091828,Map(G -> 0.6571428571428571, A -> 0.3142857142857143, C -> 0.02857142857142857)) 208 | // (100091829,Map(G -> 0.11428571428571428, A -> 0.2571428571428571, T -> 0.6285714285714286)) 209 | // (100091830,Map(G -> 0.9142857142857143, A -> 0.02857142857142857, T -> 0.05714285714285714)) 210 | // (100091831,Map(G -> 0.1111111111111111, A -> 0.6666666666666666, T -> 0.2222222222222222)) 211 | // (100091832,Map(G -> 0.8888888888888888, A -> 0.08333333333333333, T -> 0.027777777777777776)) 212 | // (100091833,Map(G -> 0.11428571428571428, A -> 0.8571428571428571, T -> 0.02857142857142857)) 213 | // (100091834,Map(G -> 0.2, A -> 0.11428571428571428, T -> 0.6857142857142857)) 214 | // (100091835,Map(G -> 0.7142857142857143, A -> 0.2, T -> 0.08571428571428572)) 215 | // (100091836,Map(G -> 0.08333333333333333, A -> 0.7222222222222222, T -> 0.19444444444444445)) 216 | def freqRDD(readGroupSetId: String, partitioner: ReadsPartitioner) = { 217 | new ReadsRDD(sc, applicationName, accessToken, 218 | readGroupSetId, partitioner) 219 | .filter(rk => rk._2.mappingQuality >= minMappingQual) 220 | .flatMap { rk => 221 | val (_, read) = rk 222 | var bases = List[(Long, Char)]() 223 | // TODO: Take the cigar into account 224 | for (i <- 0 until read.alignedSequence.length) { 225 | if (i < read.alignedQuality.length && read.alignedQuality(i) >= minBaseQual) { 226 | bases ::= (read.position + i, read.alignedSequence(i)) 227 | } 228 | } 229 | bases 230 | } 231 | .groupByKey() 232 | .mapValues { v => 233 | val vSeq = v.toSeq 234 | val total = vSeq.length.toDouble 235 | vSeq.groupBy(c => c) 236 | .map(p => (p._1, p._2.length)) 237 | .map(p => (p._1, p._2.toDouble / total)) 238 | } 239 | .groupByKey() 240 | .map(p => (p._1, p._2.head)) 241 | } 242 | 243 | val readsPartitioner = new ReferencesReadsPartitioner(references, conf.basesPerPartition()) 244 | val normal = freqRDD(Examples.Google_DREAM_Set3_Normal, readsPartitioner) 245 | val tumor = freqRDD(Examples.Google_DREAM_Set3_Tumor, 
readsPartitioner) 246 | 247 | // Generate a new RDD that maps position to a pair of sorted base strings where 248 | // the first item is the normal and the second is the tumor. 249 | // Any base occurring with frequency less than minFreq is filtered out. 250 | // Example: 251 | // (100091811,(A,A)) 252 | // (100091812,(AG,AG)) 253 | // (100091813,(AC,AC)) 254 | // (100091814,(AG,AG)) 255 | // (100091815,(AC,AC)) 256 | // (100091816,(AT,AT)) 257 | // (100091817,(A,A)) 258 | // (100091818,(T,T)) 259 | // (100091819,(AT,AT)) 260 | // (100091820,(GT,GT)) 261 | // (100091821,(CT,CT)) 262 | // (100091822,(C,CG)) 263 | // (100091823,(AC,AC)) 264 | // (100091824,(CT,CT)) 265 | // (100091825,(C,AC)) 266 | // (100091826,(A,AT)) 267 | // (100091827,(A,AC)) 268 | // (100091828,(AG,AG)) 269 | // (100091829,(AT,AT)) 270 | // (100091830,(G,G)) 271 | // (100091831,(A,AT)) 272 | // (100091832,(G,G)) 273 | // (100091833,(A,A)) 274 | // (100091834,(T,GT)) 275 | // (100091835,(G,AG)) 276 | // (100091836,(A,AT)) 277 | val paired = normal.join(tumor).groupByKey() 278 | .map(p => (p._1, p._2.head)) 279 | .map { p => 280 | def f(m: Map[Char, Double]): String = { 281 | var s = "" 282 | m.foreach { kv => 283 | if (kv._2 >= minFreq) { s += kv._1 } 284 | } 285 | s.sorted 286 | } 287 | (p._1, (f(p._2._1), f(p._2._2))) 288 | } 289 | 290 | // This RDD can be further filtered to eliminate any positions with matching bases. 291 | // Example: 292 | // (100091822,(C,CG)) 293 | // (100091825,(C,AC)) 294 | // (100091826,(A,AT)) 295 | // (100091827,(A,AC)) 296 | // (100091831,(A,AT)) 297 | // (100091834,(T,GT)) 298 | // (100091835,(G,AG)) 299 | // (100091836,(A,AT)) 300 | val diff = paired.filter(p => p._2._1 != p._2._2) 301 | diff.sortByKey().saveAsTextFile(outPath + "/diff_" + chr) 302 | sc.stop 303 | } 304 | } 305 | --------------------------------------------------------------------------------
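A minimal, self-contained sketch (not part of the repository) of the base-string construction used in SearchReadsExample4: for one position, bases whose frequency is at least minFreq are kept and the survivors are sorted, which is what the local function f above does for the normal and tumor frequency maps. The object and method names below are hypothetical.

object BaseStringSketch {
  // Keep bases at or above the frequency threshold and return them sorted,
  // e.g. Map('A' -> 0.6875, 'C' -> 0.28125, 'G' -> 0.03125) => "AC".
  def baseString(freqs: Map[Char, Double], minFreq: Double = 0.25): String =
    freqs.filter(_._2 >= minFreq).keys.toSeq.sorted.mkString

  def main(args: Array[String]): Unit = {
    println(baseString(Map('A' -> 0.6875, 'C' -> 0.28125, 'G' -> 0.03125))) // AC
    println(baseString(Map('A' -> 0.125, 'T' -> 0.875)))                    // T
  }
}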