├── .gitignore
├── README.md
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── cloudera
    │               └── dataflow
    │                   └── dsl
    │                       └── Job.scala
    └── test
        └── scala
            └── com
                └── cloudera
                    └── dataflow
                        └── dsl
                            └── TestJob.scala

/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
.idea/
*.iml

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scala-dataflow-dsl
==================

A Scala interface for Google Cloud Dataflow.

This library lets you write Dataflow jobs in a style that mirrors native Scala collections by
implementing a Job class. For example, we can count words in a list of strings with the
following job class:

    class LiterallyCountStuff extends Job {
      override def createPipeline() = {
        val inputdata: RichPCollection[String] = Create.of(List("stuff", "more stuff"))
        val splitLowerCase: RichPCollection[String] = inputdata.flatMap(_.split("\\s+")).map(_.toLowerCase)
        val allCounts = splitLowerCase.countAll()
        val perElemCounts = splitLowerCase.countPerElement()
      }
    }
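
To execute a job, construct it and call `run()`. A minimal sketch (the `Main` object below is not
part of this repository; `PipelineOptionsFactory.create()` supplies default options, which should
use the SDK's local DirectPipelineRunner):

    object Main {
      def main(args: Array[String]): Unit = {
        // run() calls createPipeline() to build the transforms, then executes the pipeline
        new LiterallyCountStuff().run()
      }
    }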
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <name>Cloud Dataflow Examples</name>
  <groupId>com.cloudera.dataflow</groupId>
  <artifactId>scala-dsl</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <scala.version>2.10.4</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>dataflow-snapshot</id>
      <url>file://${project.basedir}/jars</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>true</enabled>
        <updatePolicy>always</updatePolicy>
      </snapshots>
    </repository>
  </repositories>

  <build>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-deprecation</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.felix</groupId>
        <artifactId>maven-bundle-plugin</artifactId>
        <version>2.4.0</version>
        <extensions>true</extensions>
        <configuration>
          <instructions>
            <Embed-Dependency>*;scope=compile|runtime;inline=true</Embed-Dependency>
          </instructions>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <version>2.8</version>
        <executions>
          <execution>
            <id>copy-dependencies</id>
            <phase>package</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.7</version>
        <configuration>
          <skipTests>true</skipTests>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest-maven-plugin</artifactId>
        <version>1.0</version>
        <configuration>
          <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
          <junitxml>.</junitxml>
          <filereports>WDF TestSuite.txt</filereports>
        </configuration>
        <executions>
          <execution>
            <id>test</id>
            <goals>
              <goal>test</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>com.google.cloud.dataflow</groupId>
      <artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
      <version>0.3.141216</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-compiler</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>org.hamcrest</groupId>
      <artifactId>hamcrest-all</artifactId>
      <version>1.3</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.9</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_2.10</artifactId>
      <version>2.2.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/dataflow/dsl/Job.scala:
--------------------------------------------------------------------------------
package com.cloudera.dataflow.dsl

import com.google.cloud.dataflow.sdk.{PipelineResult, Pipeline}
import com.google.cloud.dataflow.sdk.coders.{Coder, CoderRegistry}
import com.google.cloud.dataflow.sdk.io.TextIO
import com.google.cloud.dataflow.sdk.options.{PipelineOptions, PipelineOptionsFactory}
import com.google.cloud.dataflow.sdk.transforms.{Create => DataflowCreate, Count, DoFn,
  PTransform, ParDo}
import com.google.cloud.dataflow.sdk.values.{KV, PCollection}
import com.google.common.reflect.TypeToken

import scala.collection.JavaConversions
import scala.reflect.ClassTag

/**
 * Wrapper class for defining extra methods on PCollections. This allows us to call map, flatMap,
 * etc. on PCollections and have the corresponding PTransforms added to the Dataflow pipeline
 * under the hood.
 */
class RichPCollection[S](val pc: PCollection[S]) {

  // Initialize a coder registry; we'll likely want some Scala types in here at some point
  lazy val coders = {
    val registry = new CoderRegistry
    registry.registerStandardCoders()
    registry
  }

  // Look up the default coder for the runtime class captured by the ClassTag
  def getCoder[T](ct: ClassTag[T]): Coder[T] = {
    coders.getDefaultCoder(TypeToken.of(ct.runtimeClass)).asInstanceOf[Coder[T]]
  }

  /**
   * Map over a PCollection, returning a new PCollection of results.
   */
  def map[T: ClassTag](f: S => T): PCollection[T] = {
    // Wrap the Scala function in a DoFn that emits one output element per input element
    val mapFunction: DoFn[S, T] = new DoFn[S, T] {
      override def processElement(context: DoFn[S, T]#ProcessContext): Unit = {
        context.output(f(context.element()))
      }
    }
    val mapTransform = new PTransform[PCollection[S], PCollection[T]]() {
      override def apply(input: PCollection[S]) = {
        input.apply(ParDo.of(mapFunction)).setCoder(getCoder(implicitly[ClassTag[T]]))
      }
    }
    pc.apply(mapTransform)
  }

  /**
   * FlatMap over a PCollection, and return a new PCollection.
   */
  def flatMap[T: ClassTag](f: S => TraversableOnce[T]): PCollection[T] = {
    // Wrap the Scala function in a DoFn that emits zero or more output elements per input element
    val flatMapFunction: DoFn[S, T] = new DoFn[S, T] {
      override def processElement(context: DoFn[S, T]#ProcessContext): Unit = {
        for (x <- f(context.element())) {
          context.output(x)
        }
      }
    }
    val flatMapTransform = new PTransform[PCollection[S], PCollection[T]]() {
      override def apply(input: PCollection[S]) = {
        input.apply(ParDo.of(flatMapFunction)).setCoder(getCoder(implicitly[ClassTag[T]]))
      }
    }
    pc.apply(flatMapTransform)
  }
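
  // Illustrative usage (not part of the original source): with the implicit
  // pCollectionToRichPCollection conversion defined on Job in scope, map and flatMap chain on
  // plain PCollections much like Scala collections, e.g. for some lines: PCollection[String]:
  //   val words: PCollection[String] = lines.flatMap(_.split("\\s+"))
  //   val upper: PCollection[String] = words.map(_.toUpperCase)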

  /** Count all elements that appear in this PCollection. */
  def countAll() = {
    val countTransform = Count.globally[S]()
    pc.apply(countTransform)
  }

  /** Count the number of times each unique element appears in this PCollection. Returns a
    * PCollection of KVs. */
  def countPerElement() = {
    val countTransform = Count.perElement[S]()
    pc.apply(countTransform)
  }
}

object Create {
  /** Create a PCollection from a Scala Iterable. */
  def of[T](iter: Iterable[T])(implicit p: Pipeline): PCollection[T] = {
    p.apply(DataflowCreate.of(JavaConversions.asJavaIterable(iter)))
  }

  /**
   * Returns a PCollection created by applying a TextIO.Read transform for the given file pattern.
   */
  def text(filePattern: String)(implicit p: Pipeline): PCollection[String] = {
    p.apply(TextIO.Read.from(filePattern))
  }
}

abstract class Job() {
  /** Default pipeline options. Override this value for alternate options. */
  val pipelineOptions: PipelineOptions = PipelineOptionsFactory.create()
  /** This pipeline is implicitly passed to the first transform in the job. It is then the
    * object we call run() on. This should probably be handled in a more transparent way. */
  implicit val pipeline: Pipeline = Pipeline.create(pipelineOptions)

  /** Override this method to define the job's pipeline. */
  def createPipeline(): AnyRef = { pipeline }

  def run(): PipelineResult = {
    createPipeline()
    pipeline.run()
  }

  /** Treat a PCollection as a RichPCollection on demand. */
  implicit def pCollectionToRichPCollection[S](pc: PCollection[S]) = new RichPCollection[S](pc)
  /** Treat a RichPCollection as a PCollection on demand. */
  implicit def richPCollectionToPCollection[S](rpc: RichPCollection[S]) = rpc.pc

  implicit def tuple2kv[K, V](kv: KV[K, V]) = (kv.getKey, kv.getValue)

  implicit def kv2tuple2[K, V](kv: (K, V)) = KV.of(kv._1, kv._2)
}
--------------------------------------------------------------------------------
/src/test/scala/com/cloudera/dataflow/dsl/TestJob.scala:
--------------------------------------------------------------------------------
package com.cloudera.dataflow.dsl

import com.google.cloud.dataflow.sdk.PipelineResult
import com.google.cloud.dataflow.sdk.testing.DataflowAssert
import com.google.cloud.dataflow.sdk.values.KV
import org.scalatest.FlatSpec

class LiterallyCountStuff extends Job {
  override def createPipeline() = {
    val inputdata: RichPCollection[String] = Create.of(List("stuff", "more stuff"))
    val splitLowerCase: RichPCollection[String] = inputdata.flatMap(_.split("\\s+")).map(_.toLowerCase)
    val allCounts = splitLowerCase.countAll()
    val perElemCounts = splitLowerCase.countPerElement()
    DataflowAssert.that(splitLowerCase).containsInAnyOrder("stuff", "stuff", "more")
    DataflowAssert.that(allCounts).containsInAnyOrder(3L)
    DataflowAssert.that(perElemCounts).containsInAnyOrder(KV.of("stuff", 2L), KV.of("more", 1L))
  }
}

class TestJob extends FlatSpec {
  // Running the job executes the pipeline; the DataflowAssert checks in createPipeline() fail
  // the run if the pipeline does not produce the expected contents.
  val myjob = new LiterallyCountStuff()
  val result: PipelineResult = myjob.run()
}
--------------------------------------------------------------------------------