├── project ├── build.properties └── plugins.sbt ├── .gitignore ├── core ├── src │ ├── main │ │ ├── scala │ │ │ └── com │ │ │ │ └── zendesk │ │ │ │ └── scalaflow │ │ │ │ ├── package.scala │ │ │ │ ├── sugar │ │ │ │ ├── DurationOps.scala │ │ │ │ ├── WrapperOps.scala │ │ │ │ ├── PipelineOps.scala │ │ │ │ ├── KVCollectionOps.scala │ │ │ │ ├── CollectionOps.scala │ │ │ │ └── CoderOps.scala │ │ │ │ └── coders │ │ │ │ ├── OptionCoder.scala │ │ │ │ ├── TryCoder.scala │ │ │ │ └── EitherCoder.scala │ │ └── boilerplate │ │ │ └── com │ │ │ └── zendesk │ │ │ └── scalaflow │ │ │ ├── sugar │ │ │ ├── TupleOps.scala.template │ │ │ ├── JoinOps.scala.template │ │ │ └── CaseClassOps.scala.template │ │ │ └── coders │ │ │ └── TupleCoders.scala.template │ └── test │ │ ├── resources │ │ └── logback.xml │ │ ├── boilerplate │ │ └── com │ │ │ └── zendesk │ │ │ └── scalaflow │ │ │ └── coders │ │ │ └── TupleCodersSpec.scala.template │ │ └── scala │ │ └── com │ │ └── zendesk │ │ └── scalaflow │ │ └── sugar │ │ ├── JoinOpsSpec.scala │ │ ├── RichPipelineSpec.scala │ │ ├── RichKVCollectionSpec.scala │ │ ├── CaseClassOpsSpec.scala │ │ └── RichCollectionSpec.scala └── build.sbt ├── .github └── workflows │ └── actions.yml ├── README.md └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.13 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("io.spray" % "sbt-boilerplate" % "0.6.0") 2 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ 2 | 3 | .idea/ 4 | out/ 5 | *.iws 6 | *.iml 7 | 8 | # SBT 9 | 10 | target/ 11 | lib_managed/ 12 | src_managed/ 13 | project/boot/ 14 | .history 15 | .cache -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/package.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk 2 | 3 | import com.zendesk.scalaflow.sugar._ 4 | 5 | package object scalaflow extends CoderOps 6 | with TupleOps 7 | with CaseClassOps 8 | with CollectionOps 9 | with DurationOps 10 | with KVCollectionOps 11 | with JoinOps 12 | with PipelineOps 13 | with WrapperOps 14 | -------------------------------------------------------------------------------- /core/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /core/src/main/boilerplate/com/zendesk/scalaflow/sugar/TupleOps.scala.template: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.coders.Coder 4 | 5 | import com.zendesk.scalaflow.coders._ 6 | 7 | 8 | trait TupleOps { 9 | 10 | [2..22#implicit def tuple1Coder[[#V1#]](implicit [#coder1: Coder[V1]#]): Coder[Tuple1[[#V1#]]] = { 11 | new Tuple1Coder[[#V1#]]() 12 | } 13 | # 14 | ] 15 | } 16 | 17 | object TupleOps extends TupleOps 18 | -------------------------------------------------------------------------------- /core/build.sbt: 
-------------------------------------------------------------------------------- 1 | name := "scala-flow-core" 2 | 3 | enablePlugins(BoilerplatePlugin) 4 | 5 | libraryDependencies ++= Seq( 6 | "org.scala-lang" % "scala-reflect" % scalaVersion.value, 7 | 8 | "com.google.cloud.dataflow" % "google-cloud-dataflow-java-sdk-all" % "1.9.0", 9 | 10 | "org.scalatest" %% "scalatest" % "3.0.1" % Test, 11 | "ch.qos.logback" % "logback-classic" % "1.2.1" % Test, 12 | "org.hamcrest" % "hamcrest-library" % "1.3" % Test, 13 | "junit" % "junit" % "4.12" % Test 14 | ) 15 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/DurationOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import java.time.{Duration => JavaDuration} 4 | import org.joda.time.{Duration => JodaDuration} 5 | 6 | import scala.concurrent.duration.{Duration => ScalaDuration} 7 | import scala.language.implicitConversions 8 | 9 | trait DurationOps { 10 | implicit def java2joda(javaDuration: JavaDuration) = JodaDuration.millis(javaDuration.toMillis) 11 | implicit def scala2joda(scalaDuration: ScalaDuration) = JodaDuration.millis(scalaDuration.toMillis) 12 | } 13 | 14 | object DurationOps extends DurationOps 15 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/WrapperOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.transforms.{DoFn, PTransform, ParDo, SimpleFunction} 4 | import com.google.cloud.dataflow.sdk.values.{PInput, POutput} 5 | 6 | trait WrapperOps { 7 | 8 | def asSimpleFn[A, B](f: A => B): SimpleFunction[A, B] = { 9 | new SimpleFunction[A, B] { 10 | override def apply(input: A): B = f(input) 11 | } 12 | } 13 | 14 | def asParDo[A, B](f: DoFn[A, B]#ProcessContext => Unit): ParDo.Bound[A, B] = { 15 | val doFn = new DoFn[A, B] { 16 | override def processElement(c: DoFn[A, B]#ProcessContext): Unit = f(c) 17 | } 18 | 19 | ParDo.of(doFn) 20 | } 21 | 22 | def asPTransform[A <: PInput, B <: POutput](f: A => B): PTransform[A, B] = { 23 | new PTransform[A, B] { 24 | override def apply(input: A): B = f(input) 25 | } 26 | } 27 | } 28 | 29 | object WrapperOps extends WrapperOps 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/PipelineOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.coders.Coder 4 | import com.google.cloud.dataflow.sdk.transforms.Create 5 | import com.google.cloud.dataflow.sdk.values.{PBegin, PCollection, POutput} 6 | import com.zendesk.scalaflow.sugar.CollectionOps.RichCollection 7 | import com.zendesk.scalaflow.sugar.WrapperOps._ 8 | 9 | trait PipelineOps { 10 | 11 | implicit class RichBegin(begin: PBegin) { 12 | 13 | def transform[T](values: Create.Values[T])(implicit coder: Coder[T]): PCollection[T] = { 14 | begin.apply(values.withCoder(coder)) 15 | } 16 | 17 | def transformWith[A <: POutput](name: String)(f: PBegin => A) = { 18 | begin.apply(name, asPTransform(f)) 19 | } 20 | 21 | def flatten[A : Coder](first: PCollection[A], second: PCollection[A], others: PCollection[A]*): PCollection[A] = { 22 | first.flattenWith(second, others: _*) 23 | } 24 | 
} 25 | 26 | implicit class RichOutput(output: POutput) { 27 | def run() = output.getPipeline.run() 28 | } 29 | } 30 | 31 | object PipelineOps extends PipelineOps 32 | -------------------------------------------------------------------------------- /.github/workflows/actions.yml: -------------------------------------------------------------------------------- 1 | name: repo-checks 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | main: 9 | name: scala-simple 10 | runs-on: ubuntu-latest 11 | env: 12 | TRAVIS_SCALA_VERSION: 2.12.1 13 | SCALA_ENV: travis 14 | steps: 15 | - uses: zendesk/checkout@v2 16 | - uses: zendesk/setup-java@v1 17 | with: 18 | java-version: "8" 19 | - name: Cache SBT ivy cache 20 | uses: zendesk/cache@v2 21 | with: 22 | path: "~/.ivy2/cache" 23 | key: zendesk-stable-sbt-ivy-cache-${{ hashFiles('**/build.sbt') }} 24 | - name: Cache SBT m2 cache 25 | uses: zendesk/cache@v2 26 | with: 27 | path: "~/.m2/repository" 28 | key: zendesk-stable-sbt-ivy-cache-${{ hashFiles('**/build.sbt') }} 29 | - name: Cache SBT 30 | uses: zendesk/cache@v2 31 | with: 32 | path: "~/.sbt" 33 | key: zendesk-stable-sbt-${{ hashFiles('**/build.sbt') }} 34 | - name: clean test 35 | run: sbt clean test 36 | - name: publish 37 | if: github.ref == 'refs/heads/master' 38 | run: sbt core/publish 39 | -------------------------------------------------------------------------------- /core/src/test/boilerplate/com/zendesk/scalaflow/coders/TupleCodersSpec.scala.template: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.coders 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | import com.zendesk.scalaflow._ 9 | 10 | case object Data { 11 | val data = Array([#"1"#]) 12 | val roll = (limit: Int) => (index: Int) => data(index % limit) 13 | } 14 | 15 | class TupleCodersSpec extends FlatSpec with Matchers { 16 | import Data._ 17 | 18 | private def testPipeline() = { 19 | val pipelineOptions = TestPipeline.testingPipelineOptions 20 | pipelineOptions.setStableUniqueNames(OFF) 21 | 22 | TestPipeline.fromOptions(pipelineOptions) 23 | } 24 | 25 | [2..22#"TupleCoder1" should "encode and decode values" in { 26 | val x = roll(1) 27 | val pipeline = testPipeline() 28 | val output = pipeline.begin 29 | .transform(Create.of(([#x(1)#]))) 30 | .map(t => ([#x(2)#])) 31 | 32 | DataflowAssert.that(output).containsInAnyOrder(([#x(2)#])) 33 | pipeline.run() 34 | }# 35 | 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /core/src/test/scala/com/zendesk/scalaflow/sugar/JoinOpsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | import com.google.cloud.dataflow.sdk.values.KV 7 | import org.scalatest.{FlatSpec, Matchers} 8 | import com.zendesk.scalaflow._ 9 | 10 | class JoinOpsSpec extends FlatSpec with Matchers { 11 | 12 | "coGroupByKey" should "join two collections" in { 13 | val pipeline = testPipeline() 14 | val input1 = pipeline.apply(Create.of(KV.of("x", 1), KV.of("y", 
2), KV.of("x", 3))) 15 | val input2 = pipeline.apply(Create.of(KV.of("y", "yo"), KV.of("x", "lo"))) 16 | 17 | val output = input1 18 | .coGroupByKey(input2) 19 | .mapValue { case (x, y) => (x.toSet, y.toSet) } // avoid ordering problems 20 | 21 | DataflowAssert.that(output).containsInAnyOrder( 22 | KV.of("x", (Set(1, 3), Set("lo"))), 23 | KV.of("y", (Set(2), Set("yo"))) 24 | ) 25 | 26 | pipeline.run() 27 | } 28 | 29 | private def testPipeline() = { 30 | val pipelineOptions = TestPipeline.testingPipelineOptions 31 | pipelineOptions.setStableUniqueNames(OFF) 32 | 33 | val pipeline = TestPipeline.fromOptions(pipelineOptions) 34 | 35 | pipeline 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /core/src/test/scala/com/zendesk/scalaflow/sugar/RichPipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | import com.zendesk.scalaflow._ 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | import scala.util.Try 10 | 11 | class RichPipelineSpec extends FlatSpec with Matchers { 12 | 13 | behavior of "RichPipeline" 14 | 15 | it should "register coders for Scala primitives and core types" in { 16 | val pipeline = testPipeline() 17 | 18 | val output = pipeline 19 | .apply(Create.of("x")) 20 | // Primitives 21 | .map(x => 1) 22 | .map(x => 1L) 23 | .map(x => 1.0) 24 | // Tuples 25 | .map(x => (2, 2L)) 26 | .map(x => (3, 3L, 3.0)) 27 | // Option 28 | .map(x => Some("some").asInstanceOf[Option[String]]) 29 | .map(x => Option.empty[Int]) 30 | // Try 31 | .map(x => Try("yay")) 32 | .map(x => Try[String](throw new RuntimeException("boo"))) 33 | // Either 34 | .map(x => Left("left").asInstanceOf[Either[String, String]]) 35 | .map(x => Right("right").asInstanceOf[Either[String, String]]) 36 | 37 | DataflowAssert.that(output).containsInAnyOrder(Right("right")) 38 | pipeline.run() 39 | } 40 | 41 | 42 | private def testPipeline() = { 43 | val pipelineOptions = TestPipeline.testingPipelineOptions 44 | pipelineOptions.setStableUniqueNames(OFF) 45 | TestPipeline.fromOptions(pipelineOptions) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /core/src/main/boilerplate/com/zendesk/scalaflow/coders/TupleCoders.scala.template: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.coders 2 | 3 | import java.io.{InputStream, OutputStream} 4 | import java.util.{List => JList} 5 | 6 | import com.google.cloud.dataflow.sdk.coders.Coder.Context 7 | import com.google.cloud.dataflow.sdk.coders.{Coder, CustomCoder} 8 | 9 | import scala.collection.JavaConverters._ 10 | 11 | 12 | 13 | [2..22#class Tuple1Coder[[#V1#]]()(implicit [#coder1: Coder[V1]#]) extends CustomCoder[Tuple1[[#V1#]]] { 14 | 15 | private val coders = Array([#coder1#]) 16 | 17 | override def getEncodingId = "Tuple1Coder" + coders.mkString("[", ", ", "]") 18 | 19 | override def consistentWithEquals(): Boolean = coders.forall(_.consistentWithEquals) 20 | 21 | override def getCoderArguments: JList[Coder[_]] = coders.map(_.asInstanceOf[Coder[_]]).toList.asJava 22 | 23 | override def verifyDeterministic(): Unit = { 24 | coders.zipWithIndex.foreach { case (coder, index) => 25 | 
verifyDeterministic(s"Coder$index must be deterministic", coder) 26 | } 27 | } 28 | 29 | override def structuralValue(value: Tuple1[[#V1#]]): Object = { 30 | if (consistentWithEquals) { 31 | value 32 | } else { 33 | Tuple1([#coder1.structuralValue(value._1)#]) 34 | } 35 | } 36 | 37 | override def encode(value: Tuple1[[#V1#]], out: OutputStream, context: Context): Unit = { 38 | [#coder1.encode(value._1, out, context.nested)# 39 | ] 40 | } 41 | 42 | override def decode(in: InputStream, context: Context): Tuple1[[#V1#]] = { 43 | [#val v1 = coder1.decode(in, context.nested)# 44 | ] 45 | Tuple1([#v1#]) 46 | } 47 | }# 48 | 49 | 50 | 51 | ] 52 | -------------------------------------------------------------------------------- /core/src/main/boilerplate/com/zendesk/scalaflow/sugar/JoinOps.scala.template: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.coders.Coder 4 | import com.google.cloud.dataflow.sdk.transforms.join.{CoGbkResult, CoGroupByKey, KeyedPCollectionTuple} 5 | import com.google.cloud.dataflow.sdk.transforms.{DoFn, ParDo} 6 | import com.google.cloud.dataflow.sdk.values.{KV, PCollection, TupleTag} 7 | 8 | import scala.collection.JavaConverters._ 9 | 10 | import CoderOps._ 11 | import TupleOps._ 12 | 13 | trait JoinOps { 14 | 15 | implicit class JoinPCollection[K : Coder, V1 : Coder](coll1: PCollection[KV[K, V1]]) { 16 | 17 | [2..21#def coGroupByKey[[2..#V1 : Coder#]]([2..#coll1: PCollection[KV[K, V1]]#]): PCollection[KV[K, ([#Iterable[V1]#])]] = { 18 | [#val tag1 = new TupleTag[V1]()# 19 | ] 20 | 21 | val tuple = KeyedPCollectionTuple 22 | .of(tag##1, coll##1) 23 | [2..#.and(tag1, coll1)# 24 | ] 25 | 26 | val doFn = new DoFn[KV[K, CoGbkResult], KV[K, ([#Iterable[V1]#])]] { 27 | override def processElement(c: DoFn[KV[K, CoGbkResult], KV[K, ([#Iterable[V1]#])]]\#ProcessContext) = { 28 | val key = c.element().getKey 29 | val coGbkResult = c.element().getValue 30 | 31 | [#val values1 = coGbkResult.getAll(tag1).asScala# 32 | ] 33 | 34 | val result = KV.of(key, ([#values1#])) 35 | 36 | c.output(result) 37 | } 38 | } 39 | 40 | val coder = implicitly[Coder[KV[K, ([#Iterable[V1]#])]]] 41 | 42 | tuple 43 | .apply(CoGroupByKey.create[K]) 44 | .apply(ParDo.of(doFn)) 45 | .setCoder(coder) 46 | }# 47 | 48 | ] 49 | 50 | } 51 | } 52 | 53 | object JoinOps extends JoinOps 54 | 55 | -------------------------------------------------------------------------------- /core/src/test/scala/com/zendesk/scalaflow/sugar/RichKVCollectionSpec.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | import com.google.cloud.dataflow.sdk.values.{KV, PCollection} 7 | import com.zendesk.scalaflow._ 8 | import org.scalatest.{FlatSpec, Matchers} 9 | 10 | import scala.collection.JavaConverters._ 11 | import scala.util.Try 12 | 13 | class RichKVCollectionSpec extends FlatSpec with Matchers { 14 | 15 | behavior of "Rich Collection" 16 | 17 | "flatMap" should "work with Option" in { 18 | val pipeline = testPipeline() 19 | val input = List("42", "yo", "13") 20 | val output: PCollection[KV[String, Int]] = pipeline 21 | .apply(Create.of(input.asJava)) 22 | .map { value => KV.of(value, value) } 23 | .flatMapValue { x => 
Try(x.toInt).toOption } 24 | 25 | DataflowAssert.that(output).containsInAnyOrder(KV.of("42", 42), KV.of("13", 13)) 26 | 27 | pipeline.run() 28 | } 29 | 30 | "groupByKey" should "group by key" in { 31 | val pipeline = testPipeline() 32 | val input = List("john" -> 42, "maggie" -> 39, "john" -> 25).map { case (k, v) => KV.of(k, v) } 33 | 34 | val output = pipeline 35 | .apply(Create.of(input.asJava)) 36 | .groupByKey 37 | .mapValue(_.toSet) 38 | 39 | DataflowAssert 40 | .that(output) 41 | .containsInAnyOrder( 42 | KV.of("john", Set(42, 25)), 43 | KV.of("maggie", Set(39)) 44 | ) 45 | 46 | pipeline.run() 47 | } 48 | 49 | private def testPipeline() = { 50 | val pipelineOptions = TestPipeline.testingPipelineOptions 51 | pipelineOptions.setStableUniqueNames(OFF) 52 | 53 | TestPipeline.fromOptions(pipelineOptions) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /core/src/main/boilerplate/com/zendesk/scalaflow/sugar/CaseClassOps.scala.template: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.Pipeline 4 | import com.google.cloud.dataflow.sdk.coders.{Coder, DelegateCoder, VoidCoder} 5 | import com.google.cloud.dataflow.sdk.coders.DelegateCoder.CodingFunction 6 | 7 | import scala.reflect.ClassTag 8 | import scala.reflect.runtime.universe._ 9 | 10 | import com.zendesk.scalaflow.coders._ 11 | 12 | trait CaseClassOps { 13 | 14 | def caseClassCoder[P <: Product](construct: () => P): Coder[P] = { 15 | val intermediate = VoidCoder.of() 16 | val to = new CodingFunction[P, Void] { 17 | override def apply(p: P) = null 18 | } 19 | val from = new CodingFunction[Void, P] { 20 | override def apply(v: Void) = construct() 21 | } 22 | 23 | DelegateCoder.of(intermediate, to, from) 24 | } 25 | 26 | def caseClassCoder[V1: Coder, P <: Product](construct: (V1) => P): Coder[P] = { 27 | val intermediate = implicitly[Coder[V1]] 28 | val to = new CodingFunction[P, V1] { 29 | override def apply(p: P) = p.productElement(0).asInstanceOf[V1] 30 | } 31 | val from = new CodingFunction[V1, P] { 32 | override def apply(v: V1) = construct(v) 33 | } 34 | 35 | DelegateCoder.of(intermediate, to, from) 36 | } 37 | 38 | [2..22#def caseClassCoder[[#V1: Coder#], P <: Product](construct: ([#V1#]) => P): Coder[P] = { 39 | val intermediate = new Tuple1Coder[[#V1#]]() 40 | val to = new CodingFunction[P, Tuple1[[#V1#]]] { 41 | override def apply(p: P) = { 42 | [#val v1 = p.productElement(0).asInstanceOf[V1]# 43 | ] 44 | Tuple1([#v1#]) 45 | } 46 | } 47 | val from = new CodingFunction[Tuple1[[#V1#]], P] { 48 | override def apply(t: Tuple1[[#V1#]]) = { 49 | construct([#t._1#]) 50 | } 51 | } 52 | 53 | DelegateCoder.of(intermediate, to, from) 54 | }# 55 | 56 | ] 57 | } 58 | 59 | object CaseClassOps extends CaseClassOps 60 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/KVCollectionOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import java.lang.{Iterable => JIterable} 4 | 5 | import com.google.cloud.dataflow.sdk.coders.{Coder, IterableCoder} 6 | import com.google.cloud.dataflow.sdk.transforms.{Combine, DoFn, GroupByKey, Top} 7 | import com.google.cloud.dataflow.sdk.values.{KV, PCollection} 8 | import com.zendesk.scalaflow._ 9 | import org.joda.time.Instant 10 | 11 | import scala.collection.JavaConverters._ 12 | 
13 | trait KVCollectionOps { 14 | 15 | implicit class RichKVCollection[K : Coder, A: Coder](val collection: PCollection[KV[K, A]]) { 16 | 17 | def parDo[B](f: DoFn[KV[K, A], KV[K, B]]#ProcessContext => Unit)(implicit coder: Coder[KV[K, B]]): PCollection[KV[K, B]] = { 18 | collection.apply(asParDo(f)).setCoder(coder) 19 | } 20 | 21 | def mapValue[B : Coder](f: A => B): PCollection[KV[K, B]] = parDo { 22 | c => c.output(KV.of(c.element.getKey, f(c.element.getValue))) 23 | } 24 | 25 | def flatMapValue[B : Coder](f: A => Iterable[B]): PCollection[KV[K, B]] = parDo { 26 | c => f(c.element.getValue).foreach { value => c.output(KV.of(c.element.getKey, value)) } 27 | } 28 | 29 | def extractTimestamp: PCollection[KV[K, (A, Instant)]] = parDo { 30 | c => c.output(KV.of(c.element.getKey, (c.element.getValue, c.timestamp))) 31 | } 32 | 33 | def combinePerKey(zero: A)(f: (A, A) => A): PCollection[KV[K, A]] = { 34 | val g = (input: JIterable[A]) => input.asScala.fold(zero)(f) 35 | collection.apply(Combine.perKey[K, A](asSimpleFn(g))) 36 | } 37 | 38 | def groupByKey: PCollection[KV[K, Iterable[A]]] = { 39 | collection.apply(GroupByKey.create[K, A]).mapValue(_.asScala) 40 | } 41 | 42 | def topPerKey(count: Int)(implicit ordered: Ordering[A]): PCollection[KV[K, List[A]]] = { 43 | collection.apply(Top.perKey(count, ordered)).mapValue(_.asScala.toList) 44 | } 45 | } 46 | } 47 | 48 | object KVCollectionOps extends KVCollectionOps 49 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/CollectionOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.coders.Coder 4 | import com.google.cloud.dataflow.sdk.transforms._ 5 | import com.google.cloud.dataflow.sdk.values.{KV, PCollection, PCollectionList, POutput} 6 | import com.zendesk.scalaflow.sugar.WrapperOps._ 7 | import org.joda.time.Instant 8 | 9 | import scala.collection.JavaConverters._ 10 | 11 | trait CollectionOps { 12 | 13 | implicit class RichCollection[A: Coder](val collection: PCollection[A]) { 14 | 15 | def parDo[B](f: DoFn[A, B]#ProcessContext => Unit)(implicit coder: Coder[B]): PCollection[B] = { 16 | collection.apply(asParDo(f)).setCoder(coder) 17 | } 18 | 19 | def map[B : Coder](f: A => B): PCollection[B] = parDo { 20 | c => c.output(f(c.element)) 21 | } 22 | 23 | def filter(f: A => Boolean): PCollection[A] = parDo { 24 | c => if (f(c.element)) c.output(c.element) 25 | } 26 | 27 | def collect[B : Coder](pf: PartialFunction[A, B]): PCollection[B] = parDo { 28 | c => if (pf.isDefinedAt(c.element)) c.output(pf(c.element)) 29 | } 30 | 31 | def extractTimestamp(implicit c: Coder[(A, Instant)]): PCollection[(A, Instant)] = parDo { 32 | c => c.output((c.element, c.timestamp)) 33 | } 34 | 35 | def flatMap[B : Coder](f: A => Iterable[B]): PCollection[B] = parDo { 36 | c => f(c.element).foreach(c.output) 37 | } 38 | 39 | def foreach(f: A => Unit): PCollection[A] = parDo { 40 | c => { f(c.element); c.output(c.element) } 41 | } 42 | 43 | def withKey[B : Coder](f: A => B)(implicit c: Coder[KV[B, A]]): PCollection[KV[B, A]] = parDo { 44 | c => c.output(KV.of(f(c.element), c.element)) 45 | } 46 | 47 | def flattenWith(first: PCollection[A], others: PCollection[A]*): PCollection[A] = { 48 | val all = collection :: first :: others.toList 49 | PCollectionList.of(all.asJava).apply(Flatten.pCollections[A]) 50 | } 51 | 52 | def transformWith[B <: POutput](name: String)(f: 
PCollection[A] => B): B = { 53 | collection.apply(name, asPTransform(f)) 54 | } 55 | } 56 | } 57 | 58 | object CollectionOps extends CollectionOps 59 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/coders/OptionCoder.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.coders 2 | 3 | import java.io.{IOException, InputStream, OutputStream} 4 | import java.util.{Arrays, List => JList} 5 | 6 | import com.google.cloud.dataflow.sdk.coders.Coder.Context 7 | import com.google.cloud.dataflow.sdk.coders.{ByteCoder, Coder, CustomCoder} 8 | import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver 9 | 10 | class OptionCoder[T](valueCoder: Coder[T]) extends CustomCoder[Option[T]] { 11 | private val byteCoder = ByteCoder.of 12 | 13 | override def encode(value: Option[T], outStream: OutputStream, context: Context): Unit = { 14 | value match { 15 | case Some(left) => 16 | outStream.write(1) 17 | valueCoder.encode(left, outStream, context.nested) 18 | case None => 19 | outStream.write(0) 20 | } 21 | } 22 | 23 | override def decode(inStream: InputStream, context: Context): Option[T] = { 24 | val tag = inStream.read() 25 | 26 | if (tag == 1) Some(valueCoder.decode(inStream, context.nested)) 27 | else if (tag == 0) None 28 | else throw new IOException(s"Unexpected value $tag encountered decoding 1 byte from input stream") 29 | } 30 | 31 | override def consistentWithEquals(): Boolean = { 32 | valueCoder.consistentWithEquals 33 | } 34 | 35 | override def getCoderArguments: JList[Coder[_]] = { 36 | Arrays.asList(valueCoder) 37 | } 38 | 39 | override def verifyDeterministic(): Unit = { 40 | verifyDeterministic("First coder must be deterministic", valueCoder) 41 | } 42 | 43 | override def registerByteSizeObserver(value: Option[T], observer: ElementByteSizeObserver, context: Context): Unit = { 44 | value.foreach(v => valueCoder.registerByteSizeObserver(v, observer, context.nested)) 45 | } 46 | 47 | override def structuralValue(value: Option[T]): AnyRef = { 48 | if (consistentWithEquals) 49 | value 50 | else 51 | value.map(v => valueCoder.structuralValue(v)) 52 | } 53 | 54 | override def isRegisterByteSizeObserverCheap(value: Option[T], context: Context): Boolean = { 55 | value 56 | .map(v => valueCoder.isRegisterByteSizeObserverCheap(v, context.nested)) 57 | .getOrElse(true) 58 | } 59 | 60 | override def getEncodingId = s"OptionCoder(${valueCoder.getEncodingId})" 61 | } 62 | -------------------------------------------------------------------------------- /core/src/test/scala/com/zendesk/scalaflow/sugar/CaseClassOpsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | import com.zendesk.scalaflow._ 10 | 11 | object CaseClassOpsSpec { 12 | case class Foo() 13 | case class Bar(name: String) 14 | case class Qux(name: String, age: Int) 15 | case class Wibble(foo: Foo, bar: Bar, qux: Qux) 16 | } 17 | 18 | class CaseClassOpsSpec extends FlatSpec with Matchers { 19 | import CaseClassOpsSpec._ 20 | 21 | behavior of "CaseClassCoders" 22 | 23 | it should "handle zero member case class" in { 24 | implicit val 
fooCoder = caseClassCoder(Foo) 25 | 26 | val pipeline = testPipeline() 27 | val output = pipeline.begin 28 | .transform(Create.of(Foo())) 29 | .map(identity) 30 | 31 | DataflowAssert.that(output).containsInAnyOrder(Foo()) 32 | pipeline.run() 33 | } 34 | 35 | it should "handle single member case class" in { 36 | implicit val barCoder = caseClassCoder(Bar) 37 | 38 | val pipeline = testPipeline() 39 | val output = pipeline.begin 40 | .transform(Create.of(Bar("Fred"))) 41 | .map(_.copy(name = "John")) 42 | 43 | DataflowAssert.that(output).containsInAnyOrder(Bar("John")) 44 | pipeline.run() 45 | } 46 | 47 | it should "handle double member case classes" in { 48 | implicit val quxCoder = caseClassCoder(Qux) 49 | 50 | val pipeline = testPipeline() 51 | val output = pipeline.begin 52 | .transform(Create.of(Qux("Fred", 27))) 53 | .map(_.copy(age = 35)) 54 | 55 | DataflowAssert.that(output).containsInAnyOrder(Qux("Fred", 35)) 56 | pipeline.run() 57 | } 58 | 59 | it should "handle nested case classes" in { 60 | implicit val fooCoder = caseClassCoder(Foo) 61 | implicit val barCoder = caseClassCoder(Bar) 62 | implicit val quxCoder = caseClassCoder(Qux) 63 | implicit val wibbleCoder = caseClassCoder(Wibble) 64 | 65 | val pipeline = testPipeline() 66 | val output = pipeline.begin 67 | .transform(Create.of(Wibble(Foo(), Bar("John"), Qux("Fred", 27)))) 68 | .map(_.copy(qux = Qux("Fred", 35))) 69 | 70 | DataflowAssert.that(output).containsInAnyOrder(Wibble(Foo(), Bar("John"), Qux("Fred", 35))) 71 | pipeline.run() 72 | } 73 | 74 | private def testPipeline() = { 75 | val pipelineOptions = TestPipeline.testingPipelineOptions 76 | pipelineOptions.setStableUniqueNames(OFF) 77 | TestPipeline.fromOptions(pipelineOptions) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/coders/TryCoder.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.coders 2 | 3 | import java.io.{IOException, InputStream, OutputStream} 4 | import java.util.{Arrays, List => JList} 5 | 6 | import com.google.cloud.dataflow.sdk.coders.Coder.Context 7 | import com.google.cloud.dataflow.sdk.coders._ 8 | import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver 9 | 10 | import scala.util.{Failure, Success, Try} 11 | 12 | class TryCoder[A](coder: Coder[A]) extends CustomCoder[Try[A]] { 13 | 14 | private val errorCoder = SerializableCoder.of(classOf[Throwable]) 15 | 16 | override def encode(value: Try[A], outStream: OutputStream, context: Context): Unit = { 17 | value match { 18 | case Failure(failure) => 19 | outStream.write(1) 20 | errorCoder.encode(failure, outStream, context.nested) 21 | case Success(success) => 22 | outStream.write(0) 23 | coder.encode(success, outStream, context.nested) 24 | } 25 | } 26 | 27 | override def decode(inStream: InputStream, context: Context): Try[A] = { 28 | val tag = inStream.read() 29 | 30 | if (tag == 1) Failure(errorCoder.decode(inStream, context.nested)) 31 | else if (tag == 0) Success(coder.decode(inStream, context.nested)) 32 | else throw new IOException(s"Unexpected value $tag encountered decoding 1 byte from input stream") 33 | } 34 | 35 | override def consistentWithEquals(): Boolean = false 36 | 37 | override def getCoderArguments: JList[Coder[_]] = Arrays.asList(coder) 38 | 39 | override def verifyDeterministic(): Unit = { 40 | throw new Coder.NonDeterministicException(this, "Java Serialization may be non-deterministic.") 41 | } 42 | 
43 | override def registerByteSizeObserver(value: Try[A], observer: ElementByteSizeObserver, context: Context): Unit = { 44 | value match { 45 | case Failure(failure) => errorCoder.registerByteSizeObserver(failure, observer, context.nested) 46 | case Success(success) => coder.registerByteSizeObserver(success, observer, context.nested) 47 | } 48 | } 49 | 50 | override def structuralValue(value: Try[A]): AnyRef = { 51 | if (consistentWithEquals) 52 | value 53 | else 54 | value match { 55 | case Failure(failure) => errorCoder.structuralValue(failure) 56 | case Success(success) => coder.structuralValue(success) 57 | } 58 | } 59 | 60 | override def isRegisterByteSizeObserverCheap(value: Try[A], context: Context): Boolean = { 61 | value match { 62 | case Failure(failure) => errorCoder.isRegisterByteSizeObserverCheap(failure, context.nested) 63 | case Success(success) => coder.isRegisterByteSizeObserverCheap(success, context.nested) 64 | } 65 | } 66 | 67 | override def getEncodingId = s"TryCoder(${coder.getEncodingId})" 68 | } 69 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/coders/EitherCoder.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.coders 2 | 3 | import java.io.{IOException, InputStream, OutputStream} 4 | import java.util.{Arrays, List => JList} 5 | 6 | import com.google.cloud.dataflow.sdk.coders.Coder.Context 7 | import com.google.cloud.dataflow.sdk.coders.{Coder, CustomCoder} 8 | import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver 9 | 10 | class EitherCoder[A, B](aCoder: Coder[A], bCoder: Coder[B]) extends CustomCoder[Either[A, B]] { 11 | 12 | override def encode(value: Either[A, B], outStream: OutputStream, context: Context): Unit = { 13 | value match { 14 | case Left(left) => 15 | outStream.write(1) 16 | aCoder.encode(left, outStream, context.nested) 17 | case Right(right) => 18 | outStream.write(0) 19 | bCoder.encode(right, outStream, context.nested) 20 | } 21 | } 22 | 23 | override def decode(inStream: InputStream, context: Context): Either[A, B] = { 24 | val tag = inStream.read 25 | 26 | if (tag == 1) Left(aCoder.decode(inStream, context.nested)) 27 | else if (tag == 0) Right(bCoder.decode(inStream, context.nested)) 28 | else throw new IOException(s"Unexpected value $tag encountered decoding 1 byte from input stream") 29 | } 30 | 31 | override def consistentWithEquals(): Boolean = { 32 | aCoder.consistentWithEquals && bCoder.consistentWithEquals 33 | } 34 | 35 | override def getCoderArguments: JList[Coder[_]] = { 36 | Arrays.asList(aCoder, bCoder) 37 | } 38 | 39 | override def verifyDeterministic(): Unit = { 40 | verifyDeterministic("First coder must be deterministic", aCoder) 41 | verifyDeterministic("Second coder must be deterministic", bCoder) 42 | } 43 | 44 | override def registerByteSizeObserver(value: Either[A, B], observer: ElementByteSizeObserver, context: Context): Unit = { 45 | value match { 46 | case Left(left) => aCoder.registerByteSizeObserver(left, observer, context.nested) 47 | case Right(right) => bCoder.registerByteSizeObserver(right, observer, context.nested) 48 | } 49 | } 50 | 51 | override def structuralValue(value: Either[A, B]): AnyRef = { 52 | if (consistentWithEquals) 53 | value 54 | else 55 | value match { 56 | case Left(left) => aCoder.structuralValue(left) 57 | case Right(right) => bCoder.structuralValue(right) 58 | } 59 | } 60 | 61 | override def 
isRegisterByteSizeObserverCheap(value: Either[A, B], context: Context): Boolean = { 62 | value match { 63 | case Left(left) => aCoder.isRegisterByteSizeObserverCheap(left, context.nested) 64 | case Right(right) => bCoder.isRegisterByteSizeObserverCheap(right, context.nested) 65 | } 66 | } 67 | 68 | override def getEncodingId = s"EitherCoder(${aCoder.getEncodingId},${bCoder.getEncodingId})" 69 | } 70 | -------------------------------------------------------------------------------- /core/src/main/scala/com/zendesk/scalaflow/sugar/CoderOps.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import java.lang.{Iterable => JIterable} 4 | import java.util.{List => JList, Map => JMap, Set => JSet} 5 | 6 | import com.google.cloud.dataflow.sdk.coders._ 7 | import com.google.cloud.dataflow.sdk.values.KV 8 | import com.zendesk.scalaflow.coders._ 9 | import org.joda.time.Instant 10 | 11 | import scala.collection.JavaConverters._ 12 | import scala.reflect.ClassTag 13 | import scala.util.Try 14 | 15 | trait CoderOps { 16 | 17 | // Built in coders 18 | implicit val stringCoder: Coder[String] = StringUtf8Coder.of() 19 | 20 | implicit val instantCoder: Coder[Instant] = InstantCoder.of() 21 | 22 | implicit def kvCoder[K, V](implicit k: Coder[K], v: Coder[V]): Coder[KV[K, V]] = KvCoder.of(k, v) 23 | 24 | implicit def javaIterableCoder[T](implicit c: Coder[T]) = IterableCoder.of(c) 25 | 26 | implicit def javaListCoder[T](implicit c: Coder[T]) = ListCoder.of(c) 27 | 28 | implicit def javaMapCoder[K, V](implicit k: Coder[K], v: Coder[V]) = MapCoder.of(k, v) 29 | 30 | implicit def javaSetCoder[T](implicit c: Coder[T]) = SetCoder.of(c) 31 | 32 | // Primitives 33 | implicit val unitCoder: Coder[Unit] = DelegateCoder.of[Unit, Void](VoidCoder.of(), null, _ => ()) 34 | 35 | implicit val intCoder: Coder[Int] = VarIntCoder.of().asInstanceOf[Coder[Int]] 36 | 37 | implicit val longCoder: Coder[Long] = VarLongCoder.of().asInstanceOf[Coder[Long]] 38 | 39 | implicit val doubleCoder: Coder[Double] = DoubleCoder.of().asInstanceOf[Coder[Double]] 40 | 41 | // Core Types 42 | implicit def optionCoder[T](implicit c: Coder[T]): Coder[Option[T]] = new OptionCoder(c) 43 | 44 | implicit def tryCoder[T](implicit c: Coder[T]): Coder[Try[T]] = new TryCoder(c) 45 | 46 | implicit def eitherCoder[A, B](implicit a: Coder[A], b: Coder[B]): Coder[Either[A, B]] = new EitherCoder(a, b) 47 | 48 | // Immutable Collections 49 | implicit def iterableCoder[T](implicit c: Coder[T]): Coder[Iterable[T]] = { 50 | DelegateCoder.of[Iterable[T], JIterable[T]](javaIterableCoder, _.asJava, _.asScala) 51 | } 52 | 53 | implicit def listCoder[T](implicit c: Coder[T]): Coder[List[T]] = { 54 | DelegateCoder.of[List[T], JList[T]](javaListCoder, _.asJava, _.asScala.toList) 55 | } 56 | 57 | implicit def mapCoder[K, V](implicit k: Coder[K], v: Coder[V]): Coder[Map[K, V]] = { 58 | DelegateCoder.of[Map[K, V], JMap[K, V]](javaMapCoder, _.asJava, _.asScala.toMap) 59 | } 60 | 61 | implicit def setCoder[T](implicit c: Coder[T]): Coder[Set[T]] = { 62 | DelegateCoder.of[Set[T], JSet[T]](javaSetCoder, _.asJava, _.asScala.toSet) 63 | } 64 | 65 | implicit def arrayCoder[T](implicit c: Coder[T], tag: ClassTag[T]): Coder[Array[T]] = { 66 | DelegateCoder.of[Array[T], JList[T]](javaListCoder, _.toList.asJava, _.asScala.toArray) 67 | } 68 | 69 | // Opt-in convenience catch-all coder for anything that doesn't fit in above and is not a case class 70 | def serializableCoder[T <: Serializable](implicit 
tag: ClassTag[T]): Coder[T] = { 71 | SerializableCoder.of(tag.runtimeClass.asInstanceOf[Class[T]]) 72 | } 73 | } 74 | 75 | object CoderOps extends CoderOps 76 | -------------------------------------------------------------------------------- /core/src/test/scala/com/zendesk/scalaflow/sugar/RichCollectionSpec.scala: -------------------------------------------------------------------------------- 1 | package com.zendesk.scalaflow.sugar 2 | 3 | import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled._ 4 | import com.google.cloud.dataflow.sdk.testing.{DataflowAssert, TestPipeline} 5 | import com.google.cloud.dataflow.sdk.transforms.Create 6 | import com.google.cloud.dataflow.sdk.values.{KV, TimestampedValue} 7 | import com.zendesk.scalaflow._ 8 | import org.joda.time.DateTime 9 | import org.scalatest.{FlatSpec, Matchers} 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | class RichCollectionSpec extends FlatSpec with Matchers { 14 | 15 | implicit val rangeCoder = serializableCoder[Range.Inclusive] 16 | 17 | behavior of "Rich Collection" 18 | 19 | it should "map values" in { 20 | val pipeline = testPipeline() 21 | val input = 10 to 12 22 | val output = pipeline.begin 23 | .transform(Create.of(input.asJava)) 24 | .map(_.toHexString) 25 | 26 | DataflowAssert.that(output).containsInAnyOrder("a", "b", "c") 27 | pipeline.run() 28 | } 29 | 30 | it should "flat map iterables" in { 31 | val pipeline = testPipeline() 32 | val input = List(1 to 2, 3 to 5, 6 to 9) 33 | val output = pipeline.begin 34 | .transform(Create.of(input.asJava)) 35 | .flatMap(identity) 36 | 37 | DataflowAssert.that(output).containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9) 38 | pipeline.run() 39 | } 40 | 41 | it should "filter values" in { 42 | val pipeline = testPipeline() 43 | val input = 0 to 9 44 | val output = pipeline.begin 45 | .transform(Create.of(input.asJava)) 46 | .filter(_ % 2 == 0) 47 | 48 | DataflowAssert.that(output).containsInAnyOrder(0, 2, 4, 6, 8) 49 | pipeline.run() 50 | } 51 | 52 | // FizzBuzz as a massively scalable streaming functional paradigm ;-) 53 | it should "collect values" in { 54 | val pipeline = testPipeline() 55 | val input = 1 to 15 56 | val output = pipeline.begin 57 | .transform(Create.of(input.asJava)) 58 | .collect { 59 | case x if x % 3 == 0 => if (x % 5 == 0) "FizzBuzz" else "Fizz" 60 | case x if x % 5 == 0 => "Buzz" 61 | } 62 | 63 | DataflowAssert.that(output).containsInAnyOrder("Fizz", "Buzz", "Fizz", "Fizz", "Buzz", "Fizz", "FizzBuzz") 64 | pipeline.run() 65 | } 66 | 67 | it should "flatten PCollections together" in { 68 | val pipeline = testPipeline() 69 | val first = pipeline.begin.transform(Create.of((1 to 2).asJava)) 70 | val second = pipeline.begin.transform(Create.of((3 to 5).asJava)) 71 | val third = pipeline.begin.transform(Create.of((6 to 9).asJava)) 72 | 73 | val output = first.flattenWith(second, third) 74 | 75 | DataflowAssert.that(output).containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9) 76 | pipeline.run() 77 | } 78 | 79 | it should "use withKey to facilitate conversion to KV" in { 80 | val pipeline = testPipeline() 81 | val input = (1 to 3).map(_.toString) 82 | val output = pipeline.begin 83 | .transform(Create.of(input.asJava)) 84 | .withKey("k" + _) 85 | 86 | DataflowAssert.that(output).containsInAnyOrder(KV.of("k1", "1"), KV.of("k2", "2"), KV.of("k3", "3")) 87 | pipeline.run() 88 | } 89 | 90 | it should "extract timestamps" in { 91 | val pipeline = testPipeline() 92 | 93 | val now = DateTime.now() 94 | val yesterday = now.minusDays(1).toInstant 95 | val 
today = now.toInstant()
96 | val tomorrow = now.plusDays(1).toInstant
97 |
98 | val input = List(TimestampedValue.of("yesterday", yesterday), TimestampedValue.of("today", today), TimestampedValue.of("tomorrow", tomorrow))
99 | val output = pipeline.begin
100 | .transform(Create.timestamped(input.asJava))
101 | .extractTimestamp
102 |
103 | DataflowAssert.that(output).containsInAnyOrder(("yesterday", yesterday), ("today", today), ("tomorrow", tomorrow))
104 | pipeline.run()
105 | }
106 |
107 | private def testPipeline() = {
108 | val pipelineOptions = TestPipeline.testingPipelineOptions
109 | pipelineOptions.setStableUniqueNames(OFF)
110 |
111 | TestPipeline.fromOptions(pipelineOptions)
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # scala-flow
2 | ![repo-checks](https://github.com/zendesk/scala-flow/workflows/repo-checks/badge.svg)
3 |
4 | _scala-flow_ is a lightweight library intended to make developing Google DataFlow jobs in Scala easier. The core dataflow classes are enriched to allow more idiomatic and concise Scala usage while preserving full access to the underlying Java SDK.
5 |
6 | Coders for Scala primitives and collection classes have been implemented so that you can conveniently return these types from your PTransforms. In addition you can easily create coders for your own case classes.
7 |
8 | **Caveat:** This library is still evolving rapidly as we improve our knowledge and understanding of Dataflow, so there will be some flux in the API as we discover and refine what works well and what doesn't.
9 |
10 | As a preview of what's possible, here's the canonical MinimalWordCount example:
11 |
12 | ```scala
13 | Pipeline.create(...)
14 | .apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/kinglear.txt"))
15 | .flatMap(_.split("\\W+").filter(_.nonEmpty).toIterable)
16 | .apply(Count.perElement[String])
17 | .map(kv => kv.getKey + ": " + kv.getValue)
18 | .apply(TextIO.Write.to("results.text"))
19 | .run()
20 | ```
21 |
22 | ## Usage
23 |
24 | #### Pipeline
25 |
26 | `Pipeline` has been enriched with a handful of methods.
27 |
28 | To create a `PCollection` from in-memory data use the `transform` method instead of `apply`. This method ensures that the coder is set correctly on the input data.
29 |
30 | In addition a `run` method has been added to the `POutput` type, so that you can fluently chain transforms and then run your pipeline. For example:
31 |
32 | ```scala
33 | val result = Pipeline.create(...)
34 | .transform(Create.of("foo", "bar"))
35 | .apply(...transforms...)
36 | .run()
37 | ```
38 |
39 | #### Basic PCollection Methods
40 |
41 | `PCollection` now has `map`, `flatMap`, `filter` and `collect` methods that each behave as you would expect.
42 |
43 | Simple example:
44 |
45 | ```scala
46 | val result = Pipeline.create(...)
47 | .transform(Create.of("123", "456", "789"))
48 | .flatMap(_.split(""))
49 | .map(_.toInt)
50 | .filter(_ < 5)
51 | .collect {
52 | case x if x % 3 == 0 => if (x % 5 == 0) "FizzBuzz" else "Fizz"
53 | case x if x % 5 == 0 => "Buzz"
54 | }
55 | .run()
56 | ```
57 |
58 | #### PCollection Extras
59 |
60 | ##### Logging Side Effect
61 |
62 | A side-effecting method `foreach` has been added in order to allow handy debug logging. This method supplies each element of the `PCollection` to its argument then passes on the element unchanged.
63 | For example:
64 |
65 | ```scala
66 | val result = Pipeline(...)
67 | .transform(Create.of("123", "456", "789"))
68 | .foreach(println)
69 | .apply(...continue as normal...)
70 | ```
71 |
72 | ##### Extracting Timestamps
73 |
74 | `extractTimestamp` converts each element in the PCollection to a tuple with its corresponding timestamp. For example:
75 | ```scala
76 | val collection: PCollection[(String, Instant)] = Pipeline.create(...)
77 | .transform(Create.of("foo", "bar"))
78 | .extractTimestamp
79 | ```
80 |
81 | ##### Converting to a `KV`
82 |
83 | The `withKey` method provides a drop-in replacement for the `WithKeys` transform.
84 |
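For example, here is a minimal sketch (the input values and the length-based key are purely illustrative):

```scala
val keyed: PCollection[KV[Int, String]] = Pipeline.create(...)
  .transform(Create.of("foo", "quux"))
  .withKey(_.length)

/* keyed contains KV(3, "foo"), KV(4, "quux") */
```
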
85 | ##### Merging PCollections of the same type
86 |
87 | The `flattenWith` method is the equivalent of the `Flatten` transform, allowing collections of the same type to be merged together. For example:
88 |
89 | ```scala
90 | val first: PCollection[String] = ...
91 | val second: PCollection[String] = ...
92 | val third: PCollection[String] = ...
93 |
94 | val combined: PCollection[String] = first.flattenWith(second, third)
95 | ```
96 |
97 | ##### Naming your transforms
98 |
99 | To provide better visualization of the Pipeline graph and to allow updating of running jobs, you can name blocks of transforms using the `transformWith` method. For example:
100 |
101 | ```scala
102 | val result = Pipeline.create(...)
103 | .transformWith("Load Resources") { _
104 | .apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/kinglear.txt"))
105 | }
106 | .transformWith("Split and count Words") { _
107 | .flatMap(_.split("\\W+").filter(_.nonEmpty).toIterable)
108 | .apply(Count.perElement[String])
109 | }
110 | .transformWith("Output Results") { _
111 | .map(kv => kv.getKey + ": " + kv.getValue)
112 | .apply(TextIO.Write.to("results.text"))
113 | }
114 | .run()
115 | ```
116 |
117 | Under the hood this method simply converts each nested block of methods into a `PTransform` class.
118 |
119 | ##### ParDo Escape Hatch
120 |
121 | The `parDo` method provides an escape hatch in case none of the existing methods do what you want. Pass any arbitrary function with a `DoFn` signature to this method and it will be converted to a `ParDo` transform. For example:
122 | ```scala
123 | Pipeline.create(...)
124 | .apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/kinglear.txt"))
125 | .parDo { (c: DoFn[String, String]#ProcessContext) =>
126 | /* Do anything whatsoever here */
127 | c.output(...)
128 | }
129 | ```
130 |
131 | #### KV Collection
132 |
133 | Several methods have been added specifically for KV collections:
134 |
135 | The `mapValue` and `flatMapValue` methods allow you to change the value of a `KV` pair without affecting the key. For example:
136 | ```scala
137 | val result = Pipeline.create(...)
138 | .transform(Create.of("123", "456", "789"))
139 | .withKey(_.toInt)
140 | .mapValue(_.split(""))
141 | .flatMapValue(digits => List(digits.mkString(".")))
142 |
143 | /* Result contains KV(123, "1.2.3"), KV(456, "4.5.6"), KV(789, "7.8.9") */
144 | ```
145 |
146 | In addition there are `combinePerKey`, `topPerKey` and `groupByKey` methods that mirror the equivalent Dataflow transforms.
147 |
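For example, here is a minimal sketch (the `scores` collection and its element type are assumed purely for illustration):

```scala
val scores: PCollection[KV[String, Int]] = ...

// Sum the values that share a key, starting from the supplied zero element
val totals: PCollection[KV[String, Int]] = scores.combinePerKey(0)(_ + _)

// Keep the three largest values per key (requires an Ordering for the value type)
val top3: PCollection[KV[String, List[Int]]] = scores.topPerKey(3)

// Collect every value per key
val grouped: PCollection[KV[String, Iterable[Int]]] = scores.groupByKey
```
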
148 | #### Joining KV PCollections
149 |
150 | In order to join two or more collections of `KV` values by key you can use `coGroupByKey`, a type-safe wrapper around Dataflow's [`CoGroupByKey`](https://cloud.google.com/dataflow/java-sdk/JavaDoc/com/google/cloud/dataflow/sdk/transforms/join/CoGroupByKey) transform.
151 |
152 | ```scala
153 | val buyOrders: PCollection[KV[CustomerId, BuyOrder]] = ...
154 | val sellOrders: PCollection[KV[CustomerId, SellOrder]] = ...
155 |
156 | val allOrders: PCollection[KV[CustomerId, (Iterable[BuyOrder], Iterable[SellOrder])]] = buyOrders.coGroupByKey(sellOrders)
157 | ```
158 |
159 | ### Coders
160 |
161 | Implicit coders for the following types have been added:
162 |
163 | * `Int`, `Long`, `Double`
164 | * `Option`, `Try`, `Either`
165 | * `Tuple2` to `Tuple22`
166 | * `Iterable`, `List`, `Set`, `Map`, `Array`
167 |
168 | Every method mentioned above requires a coder for its output type to be implicitly available. This happens by default for any of the types listed above (and also any arbitrary combination, e.g. `List[Option[(Either[String, Int], Array[Double])]]`).
169 | If you create coders for any other types then you'll need to ensure that they are available in the implicit scope somewhere.
170 |
171 | #### Case Class Coders
172 |
173 | You can create a custom coder for any case class containing up to 22 members using the `caseClassCoder` method. For example:
174 | ```scala
175 | case class Foo(name: String)
176 | case class Bar(name: String, age: Int)
177 | case class Qux[T](value : T)
178 |
179 | implicit val fooCoder = caseClassCoder(Foo)
180 | implicit val barCoder = caseClassCoder(Bar)
181 | implicit def quxCoder[T : Coder] = caseClassCoder(Qux.apply[T] _)
182 | ```
183 |
184 | The last line demonstrates how to create a coder for a generic type; this is essentially a much simpler replacement for a `CoderFactory`.
185 |
186 | #### Serializable Coder
187 |
188 | By default Dataflow will always try to create a `SerializableCoder` if no other suitable coder can be found. `scala-flow` provides an equivalent with the `serializableCoder` method. For example:
189 | ```scala
190 | class Foo(val name: String) extends Serializable
191 |
192 | implicit val fooCoder = serializableCoder[Foo]
193 | ```
194 |
195 | ## Why create a Scala Dataflow library?
196 |
197 | There are already some existing libraries for working with Dataflow:
198 | * [Apache Beam](https://beam.apache.org): Supports not only Dataflow, but also Spark, Apex and Flink.
199 | * [Scio](https://github.com/spotify/scio): Spotify has developed this excellent and extensive Scala library
200 |
201 | We initially used Beam directly but quickly found that the complex nature of the Java API (particularly around type erasure) made Scala interop tricky.
202 | We then evaluated Scio, but while we were learning the complex Dataflow concepts we wanted something very lightweight that kept us close to the API.
203 | Hence this library, which we feel fits into a niche between the two libraries above.
204 |
205 | ## Roadmap
206 |
207 | * Create a version of each method that accepts a name to better support updating pipelines
208 | * Switch the underlying support to Apache Beam
209 |
210 | ## Credits
211 |
212 | The case class coder approach was heavily inspired by [Spray Json](https://github.com/spray/spray-json), a really nice, lightweight JSON parser.
213 |
214 | ## Contributing
215 |
216 | Bug reports and pull requests are welcome on GitHub at
217 | https://github.com/zendesk/scala-flow/
218 |
219 | ## Copyright and license
220 |
221 | Copyright 2017 Zendesk, Inc.
222 |
223 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
224 | 225 | You may obtain a copy of the License at 226 | http://www.apache.org/licenses/LICENSE-2.0 227 | 228 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 229 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS --------------------------------------------------------------------------------