├── .scalafmt.conf ├── project ├── build.properties └── plugins.sbt ├── .gitignore ├── spark └── src │ └── main │ └── scala │ ├── Row.scala │ └── DataType.scala ├── src ├── main │ └── scala │ │ ├── package.scala │ │ ├── JsonPatch.scala │ │ ├── 3-validation.scala │ │ ├── solutions │ │ ├── 5-patches.scala │ │ ├── 0-prelude.scala │ │ ├── 3-validation.scala │ │ ├── 1-schema.scala │ │ ├── 2-avro.scala │ │ └── 4-spark-avro.scala │ │ ├── 0-prelude.scala │ │ ├── 1-schema.scala │ │ ├── 2-avro.scala │ │ └── 4-spark-avro.scala └── test │ └── scala │ ├── 2-avro │ └── LabellingSpec.scala │ ├── package.scala │ ├── 1-schema │ └── ParquetSpec.scala │ ├── 4-spark-avro │ ├── SparkConverterSpec.scala │ └── AvroConverterSpec.scala │ ├── 5-patches │ ├── ApplyPatchSpec.scala │ └── ArbitraryPatch.scala │ └── 3-validation │ └── SchemaRules.scala ├── 1-SCHEMA.md ├── 0-PRELUDE.md ├── 4-BIG_DATA.md ├── README.md ├── 2-AVRO.md ├── 3-VALIDATION.md └── 5-PATCHES.md /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align = most 2 | maxColumn = 120 -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.5 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1") 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.class 3 | .ensime* 4 | .idea/ 5 | *.ipr 6 | *.iws 7 | .DS_Store 8 | -------------------------------------------------------------------------------- /spark/src/main/scala/Row.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | final case class Row(values: Any*) 4 | -------------------------------------------------------------------------------- /src/main/scala/package.scala: -------------------------------------------------------------------------------- 1 | package object lc2018 { 2 | 3 | type TODO = Nothing 4 | 5 | def TODO[A]: A = throw new Exception("not implemented") 6 | } 7 | -------------------------------------------------------------------------------- /1-SCHEMA.md: -------------------------------------------------------------------------------- 1 | ## Schemas 2 | 3 | We want to build a data ingestion pipeline that's able to accept new data sources without writing any new code. That means that our pipeline will be completely configured by the schema of the data source. 4 | 5 | So we'll be manipulating schemas all day long. And schemas are inherently recursive, so you see where this is heading, right? :) 6 | 7 | When you're ready, head on to `src/main/scala/1-prelude.scala` you'll find explanatory comments and many more `TODO` to replace. 
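To give you a quick taste of where this is heading, here is roughly what a schema becomes once the `SchemaF` pattern-functor of `src/main/scala/1-schema.scala` is in place: nested layers of that functor, closed under a fix-point. This is only a sketch (the constructors are the ones defined in that file, the field names are made up):

```scala
import matryoshka.data.Fix
import scala.collection.immutable.ListMap

// { name: string, favouriteNumbers: [ integer ] }
val user: Fix[SchemaF] =
  Fix(StructF(ListMap(
    "name"             -> Fix(StringF[Fix[SchemaF]]()),
    "favouriteNumbers" -> Fix(ArrayF(Fix(IntegerF[Fix[SchemaF]]())))
  )))
```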
8 | -------------------------------------------------------------------------------- /src/main/scala/JsonPatch.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.jsonast.JValue 4 | 5 | sealed trait Operation 6 | case object Add extends Operation 7 | case object Remove extends Operation 8 | case object Replace extends Operation 9 | 10 | sealed trait Position 11 | final case class Field(name: String) extends Position 12 | final case class Index(value: Int) extends Position 13 | //final case class Last(pos: Position) extends Position 14 | case object End extends Position 15 | final case class JsonPatch(op: Operation, path: List[Position], value: JValue) 16 | -------------------------------------------------------------------------------- /0-PRELUDE.md: -------------------------------------------------------------------------------- 1 | ## Prelude 2 | 3 | Our HR department told us that we need to provide new hires with a full week of training, but we decided to shrink that to just one hour. 4 | 5 | We'll be using recursion schemes quite intensively for our project, and we've been told that it's a technique that hasn't yet been widely adopted by the industry. 6 | 7 | So let's begin our training by a recursion schemes crash course. There will first be a short presentation given by our ~sales manager~ technical lead (seriously, what kind of developer wears a suit and a tie nowadays?). After that, you'll be tasked to solve the exercises under `src/main/scala/0-prelude.scala`. 8 | -------------------------------------------------------------------------------- /src/test/scala/2-avro/LabellingSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop._ 5 | import matryoshka._, implicits._, data.Fix, Fix._ 6 | import matryoshka.scalacheck.arbitrary._ 7 | import scalaz._, Scalaz._ 8 | 9 | class AvroSpec extends Properties("Avro-related algebras") with SchemaToAvroAlgebras { 10 | 11 | import SchemaF._ 12 | 13 | property("labelling solution") = forAll { (schema: Fix[SchemaF]) => 14 | val avro = schemaFToAvro(schema) 15 | val back = avro.anaM[Fix[SchemaF]](avroToSchemaF) 16 | (Some(schema) == back) :| s"Some($schema) == $back" 17 | } 18 | 19 | property("registry solution") = forAll { (schema: Fix[SchemaF]) => 20 | val avro = toAvro(schema) 21 | val back = avro.anaM[Fix[SchemaF]](avroToSchemaF) 22 | (Some(schema) == back) :| s"Some($schema) == $back" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /4-BIG_DATA.md: -------------------------------------------------------------------------------- 1 | ## Data or Big Data ? 2 | 3 | We now know how to represent and validate incoming Data, but our clients are whining that our ADT 4 | serialized in Kryo is not exactly "usable" for them, and the "Data Management" is saying that this 5 | is not exactly a "serious" and durable way of storing data. 6 | 7 | So we're back at the drawing board ! 8 | 9 | We need to store data in a way that is both durable and usable. 10 | 11 | As we'll be using Apache Spark for our batch processing framework, we now need to be able to read our data 12 | as Apache Spark's Row data structure. Fortunately for your bandwith - we don't "really" need the whole Apache Spark 13 | project - so we replicated the *Row* data structure so you may work offline easily. 
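To make the target concrete, here is what the Spark conversion is expected to produce for a small nested record (these exact expectations appear in `src/test/scala/4-spark-avro/SparkConverterSpec.scala`):

```scala
import org.apache.spark.sql.Row

// {"a": "toto", "b": 12}         => a single Row
val flat: Row = Row("toto", 12)

// {"a": "toto", "b": {"c": 12}}  => nested structs become nested Rows
val nested: Row = Row("toto", Row(12))
```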
14 | 15 | But for our Stream Processing framework it makes more sense to use Apache Avro, so now let's finish the job ! 16 | 17 | Your mission if you accept it : 18 | * Create the Algebra necessary to project any GData into Apache Avro or Apache Spark's data structure. 19 | * you'll find more instructions in the `src/main/scala/4-spark-avro.scala` 20 | 21 | 22 | Good Hunting. -------------------------------------------------------------------------------- /spark/src/main/scala/DataType.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.types 2 | 3 | 4 | final case class Metadata(map: Map[String, Any]) 5 | object Metadata { 6 | def empty: Metadata = new Metadata(Map.empty[String, Any]) 7 | } 8 | 9 | sealed trait DataType 10 | 11 | 12 | case object BooleanType extends DataType 13 | case object DateType extends DataType 14 | case object DoubleType extends DataType 15 | case object FloatType extends DataType 16 | case object IntegerType extends DataType 17 | case object LongType extends DataType 18 | case object StringType extends DataType 19 | final case class StructField(name: String, dataType: DataType, nullable: Boolean = true, metadata: Metadata = Metadata.empty) 20 | final case class StructType(fields: Array[StructField]) extends DataType { 21 | override def toString: String = s"StructType${fields.map(_.toString).mkString("(", ", ", ")")}" 22 | 23 | override def equals(other: Any): Boolean = other match { 24 | case StructType(otherFields) => (fields zip otherFields).forall{ case (l, r) => l == r} 25 | case _ => false 26 | } 27 | } 28 | final case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType 29 | -------------------------------------------------------------------------------- /src/test/scala/package.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.Rule 4 | import jto.validation.jsonast._ 5 | import matryoshka._, implicits._ 6 | import org.joda.time.LocalDateTime 7 | 8 | import org.joda.time.format.ISODateTimeFormat 9 | import play.api.libs.json._ 10 | import scalaz.{Applicative, Functor} 11 | 12 | package object solutions { 13 | def toJson[D](value: D)(implicit D: Recursive.Aux[D, GData], F: Functor[GData]): JValue = { 14 | val alg: Algebra[GData, JValue] = { 15 | case GStruct(fields) => 16 | JObject(fields) 17 | 18 | case GArray(elems) => 19 | JArray(elems) 20 | 21 | case GBoolean(el) => JBoolean(el) 22 | case GFloat(el) => JNumber(el) 23 | case GInteger(el) => JNumber(el) 24 | case GDate(el) => JString(LocalDateTime.fromDateFields(el).toString(ISODateTimeFormat.dateTime())) 25 | case GLong(el) => JNumber(el) 26 | case GDouble(el) => JNumber(el) 27 | case GString(el) => JString(el) 28 | 29 | } 30 | value.cata(alg) 31 | } 32 | 33 | import SchemaRules.JRule 34 | implicit val ruleApplicativeForScalaz: Applicative[JRule] = new Applicative[JRule] { 35 | override def point[A](a: => A): JRule[A] = Rule.pure(a) 36 | 37 | override def ap[A, B](fa: => JRule[A])(f: => JRule[A => B]): JRule[B] = fa.ap(f) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/1-schema/ParquetSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.apache.spark.sql.types._ 4 | import org.scalacheck._ 5 | import org.scalacheck.Prop._ 6 | import matryoshka._, implicits._, data.Fix, Fix._ 7 | 
import matryoshka.scalacheck.arbitrary._ 8 | import scalaz._, Scalaz._ 9 | 10 | /** 11 | * Verifying that our (co)algebras that convert SchemaF from/to DataType are correct should be trivial using 12 | * property-based testing. 13 | * 14 | * But to do that, we first need to be able to generate arbitrary Fix[SchemaF]. So we ned a way to summon instances of 15 | * Arbitrary[T[SchemaF]] for any fix-point T. We'll again need help from our new friend Delay. 16 | * Now we only need to verify that, given an arbitrary Fix[SchemaF], converting it to a DataType using `schemaFToDataType` 17 | * and then convert that back to Fix[SchemaF] using `dataTypeToSchemaF` should produce the initial Fix[SchemaF]. 18 | */ 19 | object SchemaFToDataTypeAlgebrasSpec extends Properties("Parquet-related algebras") with SchemaFToDataTypeAlgebras { 20 | 21 | import SchemaF._ 22 | 23 | property("invertible") = forAll { (schema: Fix[SchemaF]) => 24 | // We want to convert `schema` to DataType and then back to Fix[SchemaF] using the (co)algebras we've just defined. 25 | val roundtrip: Fix[SchemaF] = schema.cata(schemaFToDataType).ana[Fix[SchemaF]](dataTypeToSchemaF) 26 | (roundtrip == schema) :| s"$roundtrip\n==\n$schema" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/4-spark-avro/SparkConverterSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import lc2018.GData._ 4 | import lc2018.SparkConverter._ 5 | import matryoshka._ 6 | import matryoshka.implicits._ 7 | import matryoshka.data.Fix._ 8 | import matryoshka.data._ 9 | import org.apache.spark.sql.Row 10 | import org.scalacheck.Prop._ 11 | import org.scalacheck._ 12 | import org.scalatest.{FlatSpec, Matchers} 13 | 14 | import scala.collection.immutable.ListMap 15 | 16 | class SparkConverterSpec extends Properties("Spark-rules algebras") with SchemaToAvroAlgebras { 17 | 18 | property("should generate valid and compatible data vs schema") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 19 | case (_, data) => fromGDataToSparkRow(data) != null // should not fail 20 | } 21 | } 22 | 23 | class SparkConverterTest extends FlatSpec with Matchers { 24 | 25 | "Spark conversions" should "work with sample schema and data" in { 26 | val body = """{"a": "toto", "b": 12}""" 27 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 28 | val row = fromGDataToSparkRow(data) 29 | row should be(Row("toto", 12)) 30 | } 31 | 32 | it should "work with nested data as well" in { 33 | val body = """{"a": "toto", "b": { "c": 12 }}""" 34 | val data = Fix[GData]( 35 | GStruct( 36 | ListMap( 37 | "a" -> Fix(GString("toto")), 38 | "b" -> Fix(GStruct(ListMap("c" -> Fix(GInteger(12))))) 39 | ))) 40 | val row = fromGDataToSparkRow(data) 41 | row should be(Row("toto", Row(12))) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/5-patches/ApplyPatchSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.jsonast._ 5 | import org.scalacheck.Properties 6 | import org.scalacheck.Prop._ 7 | 8 | import matryoshka._, implicits._ 9 | import matryoshka.data.Fix, Fix._ 10 | import org.scalatest.{FlatSpec, Matchers} 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | import scalaz.\/- 15 | 16 | class ApplyPatchSpec extends FlatSpec with Matchers with 
PatchAlgebras { 17 | 18 | val patch = JsonPatch(Replace, List(Field("foo"), Field("bar"), End), JNumber(42)) 19 | 20 | val schema: Fix[SchemaF] = StructF( 21 | ListMap( 22 | "foo" -> StructF( 23 | ListMap( 24 | "bar" -> IntegerF[Fix[SchemaF]]().embed, 25 | "baz" -> BooleanF[Fix[SchemaF]]().embed 26 | )).embed, 27 | "qux" -> StringF[Fix[SchemaF]]().embed 28 | )).embed 29 | 30 | val initialData: Fix[GData] = GStruct( 31 | ListMap( 32 | "foo" -> GStruct( 33 | ListMap( 34 | "bar" -> GInteger[Fix[GData]](1).embed, 35 | "baz" -> GBoolean[Fix[GData]](true).embed 36 | )).embed, 37 | "qux" -> GString[Fix[GData]]("hoay!").embed 38 | )).embed 39 | 40 | val expected = GStruct( 41 | ListMap( 42 | "foo" -> GStruct( 43 | ListMap( 44 | "bar" -> GInteger[Fix[GData]](42).embed, 45 | "baz" -> GBoolean[Fix[GData]](true).embed 46 | )).embed, 47 | "qux" -> GString[Fix[GData]]("hoay!").embed 48 | )).embed 49 | "Applying a patch" should "update the relevant fields" in { 50 | applyPatch(schema, patch, initialData) should be(\/-(expected)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/scala/4-spark-avro/AvroConverterSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import lc2018.GData._ 4 | import matryoshka._, implicits._ 5 | import matryoshka.data._, Fix._ 6 | import org.apache.avro.generic.GenericData 7 | import org.scalacheck.Prop._ 8 | import org.scalacheck._ 9 | import org.scalatest.{FlatSpec, Matchers} 10 | import AvroConverter._ 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | class AvroConverterSpec extends Properties("Avro-rules algebras") with SchemaToAvroAlgebras { 15 | 16 | property("should generate valid avro data") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 17 | case (schema, data) => 18 | val result = fromGDataToAvro(schema, data) 19 | val avroSchema = schemaFToAvro(schema) 20 | val genericData = GenericData.get() 21 | 22 | result.isRight :| s"Failed to generate avro data" && 23 | genericData.validate(avroSchema, result.toOption.get) :| "Datum generated is not valid according to Avro Schema" 24 | } 25 | } 26 | 27 | class AvroConverterTest extends FlatSpec with Matchers { 28 | 29 | "Avro conversion" should "work with sample schema and data" in { 30 | val body = """{"a": "toto", "b": 12}""" 31 | val schema = Fix[SchemaF](StructF(ListMap("a" -> Fix(StringF()), "b" -> Fix(IntegerF())))) 32 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 33 | val avro = fromGDataToAvro(schema, data) 34 | 35 | avro.isRight should be(true) 36 | val result = avro.toOption.get 37 | result.toString should be("""{"a": "toto", "b": 12}""") 38 | GenericData.get().validate(schemaFToAvro(schema), result) should be(true) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursion schemes without the barbed wire 2 | 3 | Welcome to INC Inc. We are so happy we just hired a whole team of motivated engineers. Here at INC Inc. (INC is a Neat Company), we are proud proponents of statically-typed functional programming on the JVM — well, basically we use Scala. 4 | 5 | On your first day at work, we have a good news and a bad news. 
The good news is: we have an exciting new mission. Our most important client, AcmeCorp, has tasked us with the construction of its "meta data lake", whatever that means. The bad news is: they want it live by tonight. 6 | 7 | But fear not, our architects have already designed the whole system and it works like a charm (on PowerPoint). All you need to do is follow the specs and write a few Scala lines. 8 | 9 | ## Before we begin 10 | 11 | You'll need to fulfill a few requirements in order to get everything working. You'll need to install 12 | 13 | * Java 8 JDK 14 | * sbt 15 | 16 | Everything else should be pretty much working out of the box. This project has a few external dependencies though, so in order to save everyone some network bandwidth, it would be cool if you managed to clone the repository and issue the `sbt update` command in advance of the workshop. 17 | 18 | ## Structure of the workshop 19 | 20 | This workshop is made of a series of practical exercises, 21 | interleaved with a bunch of useful explanations about specific recursion schemes, patterns and techniques. 22 | Each exercise lives in the main package of `src/main/scala` 23 | and a solution to each exercise is available in the `solutions` package. 24 | 25 | ## TOC 26 | 27 | * 0-PRELUDE 28 | * 1-SCHEMA 29 | * 2-AVRO 30 | * 3-VALIDATION 31 | * 4-SPARK-AVRO 32 | * 5-PATCHES 33 | -------------------------------------------------------------------------------- /2-AVRO.md: -------------------------------------------------------------------------------- 1 | ## Avro schemas 2 | 3 | Avro is a library (that has a Java version) and a data representation format. It is widely used in the data industry as it offers interesting features like schema versioning, automatic data upcast and downcast between schema versions, and things like that. 4 | 5 | Unfortunately, it has not been designed with functional programming and strong typing in mind, let alone recursion schemes... So our job will be a little harder this time. 6 | 7 | You should find relevant hints in the comments in `src/main/scala/2-avro.scala` that'll help you tame that Avro beast. 8 | 9 | 10 | ## Avro schema 101 11 | 12 | ### Inspecting schemas 13 | 14 | * a single type: `Schema` 15 | * a `getType` method that gives you the "kind" of schema: RECORD, ARRAY, INT, STRING, etc. 16 | * depending on the result of `getType`, it is safe to call certain methods: 17 | * in case of RECORD, you can call `getFields()` 18 | * in case of ARRAY, you can call `getElementType()` 19 | 20 | ### Building schemas 21 | 22 | * For simple types, you can do something like: `Schema.create(Schema.Type.INT)` 23 | * Some simple types (like Date) do not have a "natural" representation, but you can piggyback on existing schemas using so-called logical types: 24 | * To represent dates as longs: 25 | `LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))` 26 | * For arrays you can do: `SchemaBuilder.array().items(...)` <- you replace `...` with the right Schema 27 | * For structs (called records in the Avro realm) it's a bit more complicated: 28 | ``` 29 | SchemaBuilder 30 | .record("nameOfMyRecord") 31 | .fields 32 | .name("nameOfTheField").`type`(...).noDefault <- replace ...
with the right Schema, don't forget the noDefault 33 | // add more fields as needed 34 | .endRecord 35 | ``` 36 | -------------------------------------------------------------------------------- /src/test/scala/5-patches/ArbitraryPatch.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.jsonast.Ast 5 | import org.scalacheck.{Arbitrary, Gen} 6 | import matryoshka._, implicits._ 7 | 8 | trait ArbitraryPatch extends GDataInstances with SchemaToAvroAlgebras with DataWithSchemaGenerator { 9 | def patchForData[S, D](schema: S, data: D)(implicit S: Recursive.Aux[S, SchemaF], 10 | D: Birecursive.Aux[D, GData]): Gen[JsonPatch] = 11 | for { 12 | depth <- Gen.choose(1, 10) 13 | (path, sch) <- pathInData(depth, schema, data) 14 | patchValue <- sch cata schemaToDataGen 15 | } yield JsonPatch(Replace, path, toJson(patchValue)) 16 | 17 | def pathInData[S, D](depth: Int, schema: S, data: D)(implicit S: Recursive.Aux[S, SchemaF], 18 | D: Recursive.Aux[D, GData]): Gen[(List[Position], S)] = 19 | if (depth == 0) Gen.const((End :: Nil, schema)) 20 | else { 21 | (schema.project, data.project) match { 22 | case (StructF(fieldsS), GStruct(fieldsD)) if fieldsD.nonEmpty => 23 | for { 24 | head <- Gen.oneOf(fieldsS.keys.toSeq) 25 | sch = fieldsS(head) 26 | (tail, sub) <- pathInData(depth - 1, sch, fieldsD(head)) 27 | } yield (Field(head) :: tail, sub) 28 | case (ArrayF(elem), GArray(elems)) if elems.nonEmpty => 29 | for { 30 | head <- Gen.choose(0, elems.size - 1) 31 | (tail, sub) <- pathInData(depth - 1, elem, elems(head)) 32 | } yield (Index(head) :: tail, sub) 33 | case _ => (End :: Nil, schema) 34 | } 35 | } 36 | 37 | implicit def dataAndTwoPatches[S, D](implicit S: Birecursive.Aux[S, SchemaF], 38 | D: Birecursive.Aux[D, GData]): Arbitrary[(S, D, JsonPatch, JsonPatch)] = 39 | Arbitrary { 40 | for { 41 | (s, data) <- genSchemaAndData[S, D] 42 | patch1 <- patchForData(s, data) 43 | patch2 <- patchForData(s, data) 44 | } yield (s, data, patch1, patch2) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /3-VALIDATION.md: -------------------------------------------------------------------------------- 1 | ## It's a tough world out there 2 | 3 | We now have a Schema both in its Pattern-Functor form and usable as an Apache Avro Schema 4 | but it's not enough to have a working DataLake. 5 | 6 | It might be obvious, but we need Data ! 7 | 8 | We managed to negotiate for our MVP with the Product Owner that all incoming data will be in JSON. 9 | 10 | So now we need a way to make it work and ingest any kind of data into our Lake. 11 | But for it not to become, an absurd pile of junk data : a DataSwamp 12 | we can't trust the outside world with the data we'll receive. 13 | 14 | We need to design a system that will validate incoming data according to the expected Schema 15 | and output meaningful errors to our counterparts. 16 | 17 | Our main objective for this part III of our workshop will be to : 18 | 19 | > Leverage the power of JTO Validation (https://github.com/jto/validation) 20 | > and Matryoshka to generate `Rules` that will validate any incoming Data 21 | 22 | Being professionals we need this framework to be properly tested, of course a small sample unit test 23 | will be of great help, but can't possibly be enough to handle the variety of Schema and Data we will be handling. 
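To give you an idea of the kind of `Rule` we're talking about, one "layer" of that machinery, say for a single `StringF` leaf, could look roughly like this. It's only a sketch: it assumes the imports of `src/main/scala/3-validation.scala` plus `matryoshka.data.Fix`, and you should double-check the exact JTO helpers you end up using:

```scala
import SchemaRules.JRule

val stringRule: JRule[Fix[GData]] =
  Rule.fromMapping[JValue, Fix[GData]] {
    case JString(s) => Valid(Fix(GString(s)))
    case _          => Invalid(Seq(ValidationError("error.expected.jsstring")))
  }
```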
24 | 25 | So another of your objective will be to generate arbitrary Schema and Data with ScalaCheck and 26 | test that the validation `Rules` that you'll create *really do in fact* validate your data 27 | The funny thing being that your generated *random* data should of course be compatible with your generated *random* schema. 28 | 29 | The tests that you'll need are already coded in `src/test/scala/3-validation/SchemaRules.scala` 30 | But it relies on a Schema *and* Data Generator in `src/main/scala/3-validation.scala` that you'll need to code. 31 | 32 | You'll be provided with the Pattern-Functor needed to represent data (i.e. `GData`), 33 | so all you need to complete this part is to code in `src/main/scala/3-validation.scala` : 34 | * The `Rules` generation method `SchemaRules.fromSchemaToRules(schema: Fix[SchemaF]): JRule[Fix[GData]]` 35 | * The Schema and Data generator `DataWithSchemaGenerator.genSchemaAndData: Gen[(Fix[SchemaF], Fix[GData])]` 36 | 37 | More specific constraints are included in the comments of the source code. 38 | 39 | Good Hunting. -------------------------------------------------------------------------------- /src/test/scala/3-validation/SchemaRules.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.jsonast.Ast 4 | import lc2018.GData._ 5 | import lc2018.SchemaRules._ 6 | import matryoshka._ 7 | import matryoshka.data._, Fix._ 8 | import matryoshka.implicits._ 9 | import org.joda.time.LocalDateTime 10 | import org.joda.time.format.ISODateTimeFormat 11 | import org.scalacheck.Prop._ 12 | import org.scalacheck._ 13 | import org.scalatest.{FlatSpec, Matchers} 14 | import play.api.libs.json._ 15 | 16 | import scala.collection.immutable.ListMap 17 | 18 | class SchemaRulesSpec extends Properties("Schema-rules algebras") with SchemaToAvroAlgebras { 19 | 20 | property("should generate valid and compatible data vs schema") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 21 | case (schema, data) => 22 | val rules = fromSchemaToRules(schema) 23 | val jsonPayload = toJson(data) 24 | val result = Ast.from.validate(jsonPayload).map(rules.validate) 25 | result.isValid :| s"Rules did not validate arbitrary data and schema : $schema and $data" 26 | } 27 | 28 | def toJson(value: Fix[GData]): JsValue = { 29 | val alg: Algebra[GData, JsValue] = { 30 | case GStruct(fields) => 31 | JsObject(fields) 32 | 33 | case GArray(elems) => 34 | JsArray(elems) 35 | 36 | case GBoolean(el) => JsBoolean(el) 37 | case GFloat(el) => JsNumber(BigDecimal.decimal(el)) 38 | case GInteger(el) => JsNumber(el) 39 | case GDate(el) => JsString(LocalDateTime.fromDateFields(el).toString(ISODateTimeFormat.basicDateTime())) 40 | case GLong(el) => JsNumber(el) 41 | case GDouble(el) => JsNumber(el) 42 | case GString(el) => JsString(el) 43 | 44 | } 45 | value.cata(alg) 46 | } 47 | } 48 | 49 | class SchemaRulesTest extends FlatSpec with Matchers { 50 | 51 | "Rule generation" should "work with sample schema and data" in { 52 | val body = """{"a": "toto", "b": 12}""" 53 | val schema = Fix[SchemaF](StructF(ListMap("a" -> Fix(StringF()), "b" -> Fix(IntegerF())))) 54 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 55 | val rules = fromSchemaToRules(schema) 56 | val result = Ast.from.validate(Json.parse(body)).map(rules.validate) 57 | result.isValid should be(true) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /5-PATCHES.md: 
-------------------------------------------------------------------------------- 1 | ## Applying JSON patches to GenericData 2 | 3 | So far we assumed that each of our data sources has a known schema. Unfortunately, this is not the case for all of our client's sources. For each of their business entities, the client maintains a Kafka topic where they log each modification of the entities, formatted as a JSON patch (as specified by [RFC 6902](https://tools.ietf.org/html/rfc6902)). 4 | 5 | These patches have the following structure: 6 | 7 | ```json 8 | { 9 | "op": "add", 10 | "path": "/profile/phoneNumbers", 11 | "value": { 12 | "type": "landline", 13 | "number": "+33123456789" 14 | } 15 | } 16 | ``` 17 | 18 | They cannot provide us with a schema for such patches, because the *schema* of the `value` field depends on the *value* of the `path` field. Nevertheless, we want to be able to validate that incoming patches are correctly structured with respect to the target entity's schema. Moreover, since we also maintain a copy of the corresponding entities, we want to apply the patches to these entities. 19 | 20 | In other words, we want to write a function that, given a JSON patch, the schema of the target entity (as a `T[SchemaF]`) and the current state of the target entity (as a `T[GData]`): 21 | 1. verifies that the patch's `path` exists in the entity's schema (it points to a subschema) and that the patch's `value` complies with this subschema (producing a representation of `value` as a `T[GData]`), 22 | 2. uses that representation to perform the patch's operation on the current state of the entity. We'll only implement the `replace` operation. 23 | 24 | ### Before you start 25 | 26 | This last assignment is by far the most difficult, but (we hope) it's also the most interesting. After having mastered the various "tactics" (pattern-functors, algebras, etc.), the next hurdle on the path toward using recursion schemes in production is being able to find a "strategy" to combine them to solve the problem at hand. It is often difficult at first to find the pattern-functor that best matches the structure of the problem, the right scheme to use on it, or the carrier for the needed (co)algebras. 27 | 28 | From an educational point of view, it would be better if you tried to build your own solution from scratch and come up with your own strategy. By now you should know all the required tactics to solve this problem. We've encoded a simple representation for JSON patches in `src/main/scala/JsonPatch.scala`; you might also be interested in the definition of `matryoshka.patterns.ListF`. 29 | 30 | Nevertheless, if you find yourself stuck or feel you might lack time to finish, we've laid out a solution in `src/main/scala/solutions/5-patches.scala`. But don't cheat and jump straight to the solution before you've tried to come up with your own.
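For reference, the patch shown at the top of this page corresponds to the following value of the encoding found in `src/main/scala/JsonPatch.scala` (it has the same shape as the patches built in `ApplyPatchSpec.scala`):

```scala
import jto.validation.jsonast._

val addPhoneNumber: JsonPatch = JsonPatch(
  op    = Add,
  path  = List(Field("profile"), Field("phoneNumbers"), End),
  value = JObject(Map(
    "type"   -> JString("landline"),
    "number" -> JString("+33123456789")
  ))
)
```

We'll only implement `replace`, but the encoding covers `add` and `remove` as well.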
31 | 32 | -------------------------------------------------------------------------------- /src/main/scala/3-validation.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation._ 4 | import jto.validation.jsonast._ 5 | import matryoshka._ 6 | import matryoshka.data._ 7 | import org.scalacheck.Arbitrary 8 | import scalaz.Scalaz._ 9 | import scalaz._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import scala.language.higherKinds 13 | 14 | /** 15 | * Now that we have a Schema we will need to validate incoming data (JSON) 16 | * and output "validated" data or "errors" with what went wrong for the sources 17 | * to be able to fix their exports. 18 | * 19 | * For that we'll use the JTO Validation library but first we need to define what a "Data" is 20 | */ 21 | sealed trait GData[A] 22 | 23 | final case class GStruct[A](fields: ListMap[String, A]) extends GData[A] 24 | 25 | final case class GArray[A](element: Seq[A]) extends GData[A] 26 | 27 | final case class GBoolean[A](value: Boolean) extends GData[A] 28 | 29 | final case class GDate[A](value: java.util.Date) extends GData[A] 30 | 31 | final case class GDouble[A](value: Double) extends GData[A] 32 | 33 | final case class GFloat[A](value: Float) extends GData[A] 34 | 35 | final case class GInteger[A](value: Int) extends GData[A] 36 | 37 | final case class GLong[A](value: Long) extends GData[A] 38 | 39 | final case class GString[A](value: String) extends GData[A] 40 | 41 | object GData extends GDataInstances with DataWithSchemaGenerator 42 | 43 | /** 44 | * This is where you'll be working your magic. 45 | * This code will need to go through every part of the Schema tree 46 | * and create a `Rule` for each value, field of struct or array. 47 | */ 48 | object SchemaRules { 49 | 50 | /** 51 | * Here we only define a simple type alias to simplify the code later on. 52 | */ 53 | type JRule[A] = Rule[JValue, A] 54 | 55 | /** 56 | * One important thing is that going through a struct 57 | * means going through its fields one-by-one and generate `Rules` 58 | * that will be translated to a `Rule` for the whole struct. 59 | * 60 | * The best way will be to `traverse` the fields (there is an Applicative instance for JRule) 61 | */ 62 | def fromSchemaToRules[T](schema: T)(implicit T: Recursive.Aux[T, SchemaF]): JRule[Fix[GData]] = TODO 63 | 64 | } 65 | 66 | /** 67 | * We need to test that validation - of course specific unit tests can be done 68 | * but we're quite paranoid so let's "generate" abitrary schemas using ScalaCheck 69 | * 70 | * But then again - from a Schema we'll be able to generate Rules 71 | * But to validate those rules we'd need data. 72 | * So let's generate Data as well : 73 | * Data that will, of course, need to be compatible with the Schema itself. 
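 *
 * One possible shape (just a sketch, not the provided solution): generate an arbitrary schema first,
 * then fold it with an Algebra[SchemaF, Gen[D]] so that the data generator is entirely driven by the
 * schema. The leaf cases of such an algebra could look like:
 * {{{
 *   case BooleanF() => Gen.oneOf(true, false).map(b => GBoolean[D](b).embed)
 *   case StringF()  => Gen.alphaStr.map(s => GString[D](s).embed)
 * }}}
 * while the StructF and ArrayF cases combine the generators of their children.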
74 | */ 75 | trait DataWithSchemaGenerator { 76 | 77 | import org.scalacheck.Gen 78 | 79 | import scala.collection.JavaConverters._ 80 | 81 | // Goal : first generate a schema and then recurse on it to generate the appropriate data 82 | // Bonus : handle number of fields 83 | // Bonus : handle max depth to "finish somewhere" 84 | // And don't forget the master defining what to generate is the schema 85 | def genSchemaAndData[S, D](implicit S: Birecursive.Aux[S, SchemaF], D: Corecursive.Aux[D, GData]): Gen[(S, D)] = TODO 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/solutions/5-patches.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.ValidationError 5 | import jto.validation.jsonast.JValue 6 | import matryoshka._, implicits._ 7 | import matryoshka.patterns.{ListF, NilF, ConsF} 8 | import scalaz._, Scalaz._ 9 | 10 | sealed trait Step[S, D] 11 | final case class InnerStep[S, D](position: Position, schema: S, data: D) extends Step[S, D] 12 | final case class LastStep[S, D](data: D) extends Step[S, D] 13 | 14 | object Step { 15 | def inner[S, D](pos: Position, schema: S, data: D): Step[S, D] = InnerStep(pos, schema, data) 16 | def last[S, D](data: D): Step[S, D] = LastStep[S, D](data) 17 | } 18 | 19 | sealed trait EarlyResult extends Product with Serializable 20 | final case class InvalidPath(path: List[Position]) extends EarlyResult 21 | final case class InvalidPatch(value: JValue, errors: Seq[(jto.validation.Path, Seq[ValidationError])]) 22 | extends EarlyResult 23 | 24 | trait PatchAlgebras { 25 | 26 | type Traversal[Schema, Data] = (List[Position], Schema, Data) 27 | 28 | type ShortCircuitable[A] = EarlyResult \/ A 29 | 30 | def lookupS[S](position: Position, schema: S)(implicit S: Recursive.Aux[S, SchemaF]): Option[S] = 31 | (position, schema.project) match { 32 | case (Field(name), StructF(fields)) => fields.get(name) 33 | case (Index(idx), ArrayF(elem)) => Some(elem) 34 | case _ => None 35 | } 36 | 37 | def lookupD[D](position: Position, data: D)(implicit D: Recursive.Aux[D, GData]): Option[D] = 38 | (position, data.project) match { 39 | case (Field(name), GStruct(fields)) => fields.get(name) 40 | case (Index(idx), GArray(elements)) => if (idx >= 0 && idx < elements.size) elements(idx).some else None 41 | case _ => None 42 | } 43 | 44 | def validatePatch[S, D](patchValue: JValue)( 45 | implicit S: Recursive.Aux[S, SchemaF], 46 | D: Birecursive.Aux[D, GData]): CoalgebraM[ShortCircuitable, ListF[Step[S, D], ?], Traversal[S, D]] = { 47 | case (Nil, _, _) => NilF().right 48 | case (End :: Nil, schema, data) => 49 | val validator = SchemaRules.fromSchemaToRules(schema) 50 | \/.fromEither(validator.validate(patchValue).toEither) 51 | .bimap( 52 | InvalidPatch(patchValue, _), { subData => 53 | ConsF(Step.last[S, D](subData), (List.empty[Position], schema, data)) 54 | } 55 | ) 56 | 57 | case (path, schema, data) => 58 | (lookupS(path.head, schema) |@| lookupD(path.head, data)) { (subSchema, subData) => 59 | ConsF(Step.inner(path.head, schema, data), (path.tail, subSchema, subData)).right 60 | }.getOrElse(InvalidPath(path).left) 61 | } 62 | 63 | def updateValue[S, D](implicit S: Recursive.Aux[S, SchemaF], 64 | D: Birecursive.Aux[D, GData]): AlgebraM[ShortCircuitable, ListF[Step[S, D], ?], D] = { 65 | case NilF() => GBoolean[D](true).embed.right // hugly hack 66 | case ConsF(LastStep(data), _) => data.right 67 | case 
ConsF(InnerStep(position, schema, current), newData) => 68 | doUpdate(position, current, newData) 69 | case _ => InvalidPath(Nil).left 70 | } 71 | 72 | def doUpdate[D](position: Position, current: D, newData: D)( 73 | implicit D: Birecursive.Aux[D, GData]): ShortCircuitable[D] = 74 | (position, current.project) match { 75 | case (Field(n), GStruct(fields)) => 76 | GStruct(fields.map { 77 | case (name, field) => 78 | if (name == n) name -> newData else name -> field 79 | }).embed.right 80 | case (Index(i), GArray(elements)) => 81 | GArray(elements.take(i) ++ Seq(newData) ++ elements.drop(i + 1)).embed.right 82 | case _ => InvalidPath(position :: Nil).left 83 | } 84 | 85 | def applyPatch[S, D](schema: S, patch: JsonPatch, current: D)(implicit S: Recursive.Aux[S, SchemaF], 86 | D: Birecursive.Aux[D, GData]): EarlyResult \/ D = 87 | (patch.path, schema, current) 88 | .hyloM[ShortCircuitable, ListF[Step[S, D], ?], D](updateValue, validatePatch(patch.value)) 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/0-prelude.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import matryoshka._ 4 | import matryoshka.data._ 5 | import matryoshka.implicits._ 6 | import scalaz._ 7 | import Scalaz._ 8 | 9 | /** 10 | * Let's begin with what's probably the simplest possible recursive structure: natural numbers 11 | * 12 | * Natural numbers can be defined recursively: 13 | * A number is either 14 | * - zero, noted `Z` 15 | * - the successor of a number, noted `S(n) where n is the notation of some number 16 | * 17 | * This notation is often referred to as the Peano notation. 18 | */ 19 | object PeanoNumbers { 20 | 21 | /** 22 | * We want to encode Peano numbers as a recursive type. 23 | * This encoding will be a type constructor, out so-called "pattern-functor" 24 | * 25 | * Hint: there is a type in the standard library that has exactly the structure we want. 26 | */ 27 | type PeanoNumberF[A] = TODO 28 | 29 | /** 30 | * The problem with the PeanonumberF encoding is that now, different numbers 31 | * will have different types. 32 | * 33 | * We need a fix-point of PeanoNumberF to build a type that can represent all numbers. 34 | */ 35 | type PeanoNumber = TODO 36 | 37 | /** 38 | * Now let's write our very first Algebra! Yay! 39 | * 40 | * We want to transform our Peano representation to Int. It's as simple as counting 41 | * the "layers" of "successor". 42 | */ 43 | def countLayers: Algebra[PeanoNumberF, Int] = TODO 44 | 45 | /** 46 | * We now have all the ingredients needed to use our first recursion scheme. 47 | * 48 | * Hint: this will use the algebra defined above to *destroy* our recursive structure. 49 | */ 50 | def toInt(peano: PeanoNumber): Int = TODO 51 | 52 | /** 53 | * Now we just need a value to test our functions 54 | */ 55 | val three: PeanoNumber = TODO 56 | 57 | assert(toInt(three) == 3) 58 | } 59 | 60 | /** 61 | * We now move on to a more interesting recursive structure: the binary tree. 62 | */ 63 | object BinaryTrees { 64 | 65 | sealed trait Tree 66 | final case class Branch(label: Int, left: Tree, right: Tree) extends Tree 67 | final case class Leaf(label: Int) extends Tree 68 | final case class Empty() extends Tree 69 | 70 | /** 71 | * So the first thing to do is to "translate" our Tree to a pattern-functor. 72 | * This is done by adding a type parameter and replace each recursive occurrences 73 | * of Tree by this type parameter in the ADT. 
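 *
 * For instance (this is exactly what the solutions file does), the Branch case becomes:
 * {{{
 *   final case class BranchF[A](label: Int, left: A, right: A) extends TreeF[A]
 * }}}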
74 | */ 75 | sealed trait TreeF[A] 76 | // TODO 77 | 78 | /** 79 | * Of course, we need to have an instance of Functor[TreeF] for it to be a real pattern-functor. 80 | */ 81 | implicit val treeFFunctor: Functor[TreeF] = TODO 82 | 83 | /** 84 | * It's a good idea to have a pair of (co)algebras that go from Tree to TreeF (and vice versa). 85 | */ 86 | def treeAlg: Algebra[TreeF, Tree] = TODO 87 | def treeCoalg: Coalgebra[TreeF, Tree] = TODO 88 | 89 | /** 90 | * These two (co)algebras make it easy to provide a Birecursive instance for Tree/TreeF. 91 | * This allows to treat Tree as if it were a TreeF, and thus enables to use schemes directly 92 | * on a Tree (rather than having to wrap it in a fixpoint). 93 | */ 94 | implicit val treeBirecursive: Birecursive.Aux[Tree, TreeF] = Birecursive.fromAlgebraIso(treeAlg, treeCoalg) 95 | 96 | import Recursive.ops._ 97 | 98 | /** 99 | * A function TreeF[List[Int]] => List[Int] 100 | * 101 | * The produced list contains the labels of all the nodes in the tree 102 | * as enumerated by a depth-first, left-to-right traversal. 103 | */ 104 | def toList: Algebra[TreeF, List[Int]] = TODO 105 | 106 | val testTree: Recursive.AllOps[Tree, TreeF] = Branch(12, Branch(10, Leaf(1), Empty()), Leaf(15)) 107 | 108 | assert(testTree.cata(toList) == List(1, 10, 12, 15)) 109 | 110 | /** 111 | * A function List[Int] => TreeF[List[Int]] 112 | * 113 | * This function MUST produce a "sort tree", that is, a tree where each 114 | * node has a label that is greater than all the labels in its left subtree 115 | * and lesser than all the labels in its right subtree. 116 | */ 117 | def fromList: Coalgebra[TreeF, List[Int]] = TODO 118 | 119 | /** 120 | * I wonder what this mystery function does… 121 | */ 122 | def mystery(input: List[Int]): List[Int] = input.hylo(toList, fromList) 123 | 124 | } 125 | -------------------------------------------------------------------------------- /src/main/scala/1-schema.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.scalacheck.{Arbitrary, Gen} 4 | import scala.collection.immutable.ListMap 5 | import scalaz._, Scalaz._ 6 | import matryoshka._, implicits._ 7 | 8 | /** 9 | * Without further ado, let's define our main pattern-functor for the remaining of the session. 10 | */ 11 | sealed trait SchemaF[A] 12 | 13 | // we'll use a ListMap to keep the ordering of the fields 14 | final case class StructF[A](fields: ListMap[String, A]) extends SchemaF[A] 15 | final case class ArrayF[A](element: A) extends SchemaF[A] 16 | final case class BooleanF[A]() extends SchemaF[A] 17 | final case class DateF[A]() extends SchemaF[A] 18 | final case class DoubleF[A]() extends SchemaF[A] 19 | final case class FloatF[A]() extends SchemaF[A] 20 | final case class IntegerF[A]() extends SchemaF[A] 21 | final case class LongF[A]() extends SchemaF[A] 22 | final case class StringF[A]() extends SchemaF[A] 23 | 24 | object SchemaF extends SchemaFToDataTypeAlgebras with SchemaFArbitrary { 25 | 26 | /** 27 | * As usual, we need to define a Functor instance for our pattern. 28 | */ 29 | implicit val schemaFScalazFunctor: Functor[SchemaF] = TODO 30 | 31 | /** 32 | * It might be usefull to have a nice string representation of our schemas. 33 | * 34 | * Let say that we want a representation where: 35 | * - simple types like `BooleanF()` or `DateF()` would be represented as `boolean` and `date` respectively. 36 | * - arrays like `ArrayF(IntegerF())` would be represented as `[ integer ]`. 
37 | * - structs like `StructF(ListMap("foo" -> FloatF(), "bar" -> LongF())` would be represented as 38 | * `{ foo: float, bar: long }` 39 | * 40 | * Because of the recursive nature of SchemaF, we cannot eagerly write a Show instance for SchemaF. 41 | * Fortunately matryoshka defines the Delay typeclass that is useful in such cases. It allows to "break 42 | * the infinite loop" by delaying the instantiation of Show[SchemaF[A]]. 43 | * 44 | * matryoshka.implicits contains implicit functions that, given that Delay[Show, SchemaF] instance, 45 | * will provide a Show[T[SchemaF]] for any fix-point T. 46 | * 47 | */ 48 | implicit val schemaFDelayShow: Delay[Show, SchemaF] = new Delay[Show, SchemaF] { 49 | def apply[A](showA: Show[A]): Show[SchemaF[A]] = new Show[SchemaF[A]] { 50 | override def show(schema: SchemaF[A]): Cord = TODO 51 | } 52 | } 53 | 54 | } 55 | 56 | /** 57 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 58 | * our new and shiny SchemaF (and vice versa). 59 | * 60 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 61 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 62 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 63 | * SchemaF/DataType. 64 | * 65 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 66 | * the definition of the DataType trait and its subclasses in the current project under 67 | * `spark/src/main/scala/DataType.scala`. 68 | */ 69 | trait SchemaFToDataTypeAlgebras { 70 | 71 | import org.apache.spark.sql.types._ 72 | 73 | /** 74 | * As usual, simply a function from SchemaF[DataType] to DataType 75 | */ 76 | def schemaFToDataType: Algebra[SchemaF, DataType] = TODO 77 | 78 | /** 79 | * And the other way around, a function from DataType to SchemaF[DataType] 80 | */ 81 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = TODO 82 | 83 | /** 84 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 85 | * 86 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 87 | * For example, suppose that we have: 88 | * 89 | * {{{ 90 | * val parquet: DataType = ??? 91 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 92 | * }}} 93 | * 94 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 95 | * 96 | * {{{ 97 | * parquet.cata(toAvro) 98 | * }}} 99 | * 100 | * Instead of 101 | * 102 | * {{{ 103 | * parquet.hylo(dataTypeToSchemaf, toAvro) 104 | * }}} 105 | * 106 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 107 | */ 108 | implicit def dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 109 | Birecursive.fromAlgebraIso(schemaFToDataType, dataTypeToSchemaF) 110 | } 111 | 112 | /** 113 | * Everything looks nice, but don't you feel we are missing something? 114 | * 115 | * I mean, think about it for a minute and meet me 20 lines bellow. 116 | * 117 | * 118 | * 119 | * 120 | * 121 | * 122 | * 123 | * 124 | * 125 | * 126 | * 127 | * 128 | * 129 | * 130 | * 131 | * 132 | * 133 | * 134 | * 135 | * Did you guess? 136 | * 137 | * 138 | * 139 | * You're right of course! We still have to write tests! 140 | * 141 | * Let's meet again in `src/test/scala/1-schema/ParquetSpec.scala`. 
142 | */ 143 | trait SchemaFArbitrary { 144 | 145 | implicit def schemaFDelayArbitrary: Delay[Arbitrary, SchemaF] = new Delay[Arbitrary, SchemaF] { 146 | 147 | def apply[A](A: Arbitrary[A]): Arbitrary[SchemaF[A]] = TODO 148 | 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/scala/solutions/0-prelude.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import matryoshka._ 4 | import matryoshka.data._ 5 | import matryoshka.implicits._ 6 | import scalaz._ 7 | import Scalaz._ 8 | 9 | /** 10 | * Let's begin with what's probably the simplest possible recursive structure: natural numbers 11 | * 12 | * Natural numbers can be defined recursively: 13 | * A number is either 14 | * - zero, noted `Z` 15 | * - the successor of a number, noted `S(n) where n is the notation of some number 16 | * 17 | * This notation is often referred to as the Peano notation. 18 | */ 19 | object PeanoNumbers { 20 | 21 | /** 22 | * We want to encode Peano numbers as a recursive type. 23 | * This encoding will be a type constructor, out so-called "pattern-functor" 24 | * 25 | * Hint: there is a type in the standard library that has exactly the structure we want. 26 | */ 27 | type PeanoNumberF[A] = Option[A] 28 | 29 | /** 30 | * The problem with the PeanonumberF encoding is that now, different numbers 31 | * will have different types. 32 | * 33 | * We need a fix-point of PeanoNumberF to build a type that can represent all numbers. 34 | */ 35 | type PeanoNumber = Fix[PeanoNumberF] 36 | 37 | /** 38 | * Now let's write our very first Algebra! Yay! 39 | * 40 | * We want to transform our Peano representation to Int. It's as simple as counting 41 | * the "layers" of "successor". 42 | */ 43 | def countLayers: Algebra[PeanoNumberF, Int] = _.fold(0)(_ + 1) 44 | 45 | /** 46 | * We now have all the ingredients needed to use our first recursion scheme. 47 | * 48 | * Hint: this will use the algebra defined above to *destroy* our recursive structure. 49 | */ 50 | def toInt(peano: PeanoNumber): Int = peano cata countLayers 51 | 52 | /** 53 | * Now we just need a value to test our functions 54 | */ 55 | val three: PeanoNumber = Fix(Option(Fix(Option(Fix(Option(Fix(Option.empty[PeanoNumber]))))))) 56 | 57 | assert(toInt(three) == 3) 58 | } 59 | 60 | /** 61 | * We now move one to a more interesting recursive structure: the binary tree. 62 | */ 63 | object BinaryTrees { 64 | 65 | sealed trait Tree 66 | final case class Branch(label: Int, left: Tree, right: Tree) extends Tree 67 | final case class Leaf(label: Int) extends Tree 68 | final case class Empty() extends Tree 69 | 70 | /** 71 | * So the first thing to do is to "translate" our Tree to a pattern-functor. 72 | * This is done by adding a type parameter and replace each recursive occurrences 73 | * of Tree by this type parameter in the ADT. 74 | */ 75 | sealed trait TreeF[A] 76 | final case class BranchF[A](label: Int, left: A, right: A) extends TreeF[A] 77 | final case class LeafF[A](label: Int) extends TreeF[A] 78 | final case class EmptyF[A]() extends TreeF[A] 79 | 80 | /** 81 | * Of course, we need to have an instance of Functor[TreeF] for it to be a real pattern-functor. 
82 | */ 83 | implicit val treeFFunctor: Functor[TreeF] = new Functor[TreeF] { 84 | def map[A, B](fa: TreeF[A])(f: A => B): TreeF[B] = fa match { 85 | case BranchF(label, l, r) => BranchF(label, f(l), f(r)) 86 | case LeafF(label) => LeafF(label) 87 | case EmptyF() => EmptyF() 88 | } 89 | } 90 | 91 | /** 92 | * It's a good idea to have a pair of (co)algebras that go from Tree to TreeF (and vice versa). 93 | */ 94 | def treeAlg: Algebra[TreeF, Tree] = { 95 | case BranchF(label, l, r) => Branch(label, l, r) 96 | case LeafF(label) => Leaf(label) 97 | case EmptyF() => Empty() 98 | } 99 | def treeCoalg: Coalgebra[TreeF, Tree] = { 100 | case Branch(label, l, r) => BranchF(label, l, r) 101 | case Leaf(label) => LeafF(label) 102 | case Empty() => EmptyF() 103 | } 104 | 105 | /** 106 | * These two (co)algebras make it easy to provide a Birecursive instance for Tree/TreeF. 107 | * This allows to treat Tree as if it were a TreeF, and thus enables to use schemes directly 108 | * on a Tree (rather than having to wrap it in a fixpoint). 109 | */ 110 | implicit val treeBirecursive: Birecursive.Aux[Tree, TreeF] = Birecursive.fromAlgebraIso(treeAlg, treeCoalg) 111 | 112 | import Recursive.ops._ 113 | 114 | /** 115 | * A function TreeF[List[Int]] => List[Int] 116 | * 117 | * The produced list contains the labels of all the nodes in the tree 118 | * as enumerated by a depth-first, left-to-right traversal. 119 | */ 120 | def toList: Algebra[TreeF, List[Int]] = { 121 | case BranchF(label, l, r) => l ++ List(label) ++ r 122 | case LeafF(label) => List(label) 123 | case EmptyF() => Nil 124 | } 125 | 126 | val testTree: Recursive.AllOps[Tree, TreeF] = Branch(12, Branch(10, Leaf(1), Empty()), Leaf(15)) 127 | 128 | assert(testTree.cata(toList) == List(1, 10, 12, 15)) 129 | 130 | /** 131 | * A function List[Int] => TreeF[List[Int]] 132 | * 133 | * This function MUST produce a "sort tree", that is, a tree where each 134 | * node has a label that is greater than all the labels in its left subtree 135 | * and lesser than all the labels in its right subtree. 136 | */ 137 | def fromList: Coalgebra[TreeF, List[Int]] = { 138 | case Nil => EmptyF() 139 | case head :: Nil => LeafF(head) 140 | case head :: tail => 141 | val (lesser, greater) = tail.partition(_ < head) 142 | BranchF(head, lesser, greater) 143 | } 144 | 145 | /** 146 | * I wonder what this mystery function does… 147 | */ 148 | def mystery(input: List[Int]): List[Int] = input.hylo(toList, fromList) 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/main/scala/2-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.apache.avro.{LogicalTypes, _} 4 | import matryoshka._, implicits._, patterns.EnvT 5 | import scala.collection.immutable.ListMap 6 | import scalaz._, Scalaz._ 7 | 8 | import scala.language.higherKinds 9 | import scala.collection.JavaConverters._ 10 | 11 | /** 12 | * There is a problem that makes writing SchemaF <-> Avro (co)algebras more difficult. 13 | * 14 | * As a matter of fact Avro mandates that, when building a Schema, all records (the Avro 15 | * equivalent to our StructF) are registered using a unique name. 16 | * 17 | * This is problematic to our algebra-based method because with the algebras we've seen so 18 | * far we only care about one "layer" at a time, so there is no way to know the names we've 19 | * already used for ther records we've registered so far. 
20 | * 21 | * Fortunately, we have at least two solutions to that problem. But before going any further, 22 | * maybe you can take a few minutes to try and imagine how we can solve that problem in general, 23 | * even if you don't know how to implement your solution using recursion-schemes yet. 24 | */ 25 | trait SchemaToAvroAlgebras extends Labelling with UsingARegistry with AvroCoalgebra {} 26 | 27 | /** 28 | * The first solution comes from the observation that our schemas are in fact trees. And trees have 29 | * this nice property that each node have a unique path that goes from the root to it. If we can use 30 | * that unique path as the names of our records, we're good to go. So this solution boils down to 31 | * labelling each "node" of a schema with its path, and then use that path to form the names we 32 | * use to register our records. 33 | */ 34 | trait Labelling { 35 | 36 | /** 37 | * So lets define out Path as being simply a list of strings. These strings will be the field names 38 | * we need to traverse from the root to get to a specific element of our schema. 39 | */ 40 | type Path = List[String] 41 | 42 | /** 43 | * Here is the "special trick" of the current solution. 44 | * 45 | * EnvT is a kind of "glorified pair". Given a label type E and a (pattern)-functor F, it allows us 46 | * to label each "node" of a T[F] with a value of type E while retaining the original structure. In 47 | * other words, if F is a functor, then EnvT[E, F, ?] is a functor as well. 48 | */ 49 | type Labelled[A] = EnvT[Path, SchemaF, A] 50 | 51 | /** 52 | * If we are to label each "node" of a schema with its own path, we obviously need to go from the root 53 | * down to the leaves, so we definitely want to write a coalgebra. 54 | * This one might look a bit scarry though, but fear not, it's not as complcated as it looks. Lets just 55 | * follow the types together. 56 | * 57 | * A Coalgebra[F, A] is just a function A => F[A]. So the coalgebra bellow is just a function 58 | * (Path, T[SchemaF]) => Labelled[(Path, T[SchemaF]) 59 | * Expanding the Labelled alias it becomes 60 | * (Path, T[SchemaF]) => EnvT[Path, SchemaF, (Path, T[SchemaF])] 61 | * 62 | * Ok, maybe it still looks a bit scarry... 63 | * 64 | * Lets try to put it differently. Assume you will be given a "seed" consisting of a whole schema and an 65 | * initial path (that will start empty). Your job is to use that to produce an EnvT that will contain 66 | * the path of the node you just saw (the "root" of the schema that was in the seed), and the node itself 67 | * but modified such that its "content" is not just a "smaller schema" as it was initially, but a new "seed" 68 | * consisting of a (larger) path, and the said "smaller schema". 69 | */ 70 | def labelNodesWithPath[T](implicit T: Recursive.Aux[T, SchemaF]): Coalgebra[Labelled, (Path, T)] = TODO 71 | 72 | /** 73 | * Now the algebra (that we had no way to write before) becomes trivial. All we have to do is to use 74 | * the path labelling each "node" as the name we need when registering a new avro record. 75 | * 76 | * To extract the label (resp. node) of an EnvT you can use pattern-matching (EnvT contains only a pair 77 | * (label, node)), or you can use the `ask` and `lower` methods that return the label and node respectively. 
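 *
 * For instance, the simple cases can be matched like this (a sketch; they don't even need the path):
 * {{{
 *   case EnvT((_, BooleanF())) => Schema.create(Schema.Type.BOOLEAN)
 *   case EnvT((_, IntegerF())) => Schema.create(Schema.Type.INT)
 * }}}
 * Only the StructF case really needs to look at the path, to build a unique record name.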
78 | */ 79 | def labelledToSchema: Algebra[Labelled, Schema] = TODO 80 | 81 | def schemaFToAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 82 | (List.empty[String], schemaF).hylo(labelledToSchema, labelNodesWithPath) 83 | } 84 | 85 | /** 86 | * That first solution was (relatively) simple but it is not completely satisfying. 87 | * We needed both an algebra and a coalgebra to go from our SchemaF to Avro's Schema, which forced us to 88 | * use hylo. 89 | * 90 | * Fortunately, every scheme (and the related algebra) comes with a "monadic" version. In this version, we 91 | * have to "wrap" the result of our algebras inside our monad of choice. The scheme will then use this 92 | * monad's bind at each step. That has plenty of cool uses. 93 | * 94 | * We can for example "short-circuit" the traversal by using \/ or Option as our monad. Or, in this very case, 95 | * we can use the State monad to keep track of what records we've already created. 96 | * 97 | * A note though: in order to use monadic schemes, we need a Traverse instance for our pattern-functor. 98 | */ 99 | trait UsingARegistry { 100 | 101 | type Registry[A] = State[Map[Int, Schema], A] 102 | 103 | def fingerprint(fields: Map[String, Schema]): Int = fields.hashCode 104 | 105 | def useARegistry: AlgebraM[Registry, SchemaF, Schema] = TODO 106 | 107 | implicit def schemaFTraverse: Traverse[SchemaF] = TODO 108 | 109 | def toAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 110 | schemaF.cataM(useARegistry).run(Map.empty)._2 111 | } 112 | 113 | trait AvroCoalgebra { 114 | 115 | /** 116 | * Of course, we also need a coalgebra to go from Avro to SchemaF. 117 | * Since there are some Avro schemas that we do not handle here, 118 | * we need a CoalgebraM, but we're not really interested in providing meaningful errors 119 | * here, so we can use Option as our monad. 120 | */ 121 | def avroToSchemaF: CoalgebraM[Option, SchemaF, Schema] = TODO 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/4-spark-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import matryoshka._ 4 | import matryoshka.data.Fix 5 | import matryoshka.implicits._ 6 | import matryoshka.patterns.EnvT 7 | import org.apache.avro.Schema 8 | import scalaz._, Scalaz._ 9 | 10 | import scala.language.higherKinds 11 | import org.apache.avro.generic.{GenericContainer, GenericData, GenericRecordBuilder} 12 | import org.apache.spark.sql.Row 13 | 14 | import scala.collection.immutable.ListMap 15 | import scala.language.higherKinds 16 | 17 | /** 18 | * It's time to confront ourselves with the real world of manipulating data with Spark & Avro. 19 | * Two specific pain points we have to tackle are : 20 | * 21 | * - Spark's org.apache.spark.sql.Row is basically a wrapper of Array[Any], 22 | * but we need to handle two different behaviours according to the level of the data : 23 | * when we're handling Arrays and Structs, no worries, we need to output a Row, 24 | * but when we're handling "simple" types, then if it's a top-level value we need to output a Row, 25 | * but if it's not, then the value itself must be written.
26 | * 27 | * Example : 28 | * - Value("b") will be Row("b") 29 | * but 30 | * - Struct(a -> Value("b")) will be Row("b") as well (the Row now representing the outer struct) 31 | * 32 | * - For Apache Avro, it's a new kind of pain you'll need to overcome, Avro basically represents all of its data 33 | * as if it will, at one point or another, be generated into Java classes. 34 | * So every "record" or Struct needs to have a unique qualified name, otherwise the Avro engine will consider 35 | * two structs sharing a name as being the same class. 36 | * But as they will obviously have different fields - you'll most likely end up with an error. 37 | * 38 | * Happy hunting. 39 | */ 40 | object SparkConverter extends GDataInstances { 41 | 42 | def isOfSimpleType[D](data: GData[D]) = data match { 43 | case GStruct(_) | GArray(_) => true 44 | case _ => false 45 | } 46 | 47 | /** 48 | * We have a proper way to overcome this problem. There is a `para` scheme that works a little bit like cata. 49 | * Using para, our algebra will "see" not only the result of its application to the level below but also 50 | * the structure of that level we just processed. 51 | * 52 | * To use para, we need a special kind of algebra : a GAlgebra. Given a functor F and a comonad W, GAlgebra[W, F, A] 53 | * is simply a function F[W[A]] => A, so our carrier is simply wrapped in an additional layer. 54 | * 55 | * For para's GAlgebra we use (T[F], ?) as our comonad, in other words, our carrier will be paired with the "tree" we 56 | * processed during the previous step. 57 | * 58 | * We will use that to know when we need to "unwrap" the value we had wrapped in a Row at the previous step although we 59 | * shouldn't have. 60 | */ 61 | def gDataToRow[D](implicit D: Recursive.Aux[D, GData]): GAlgebra[(D, ?), GData, Row] = TODO 62 | 63 | def fromGDataToSparkRow(row: Fix[GData]): Row = 64 | row.para[Row](gDataToRow) 65 | 66 | } 67 | 68 | /** 69 | * We'll also need Avro to serialize streaming data into Kafka topics. 70 | * 71 | * This is just another kind of pain :). We will be using Avro's GenericContainer interface. 72 | * To build a GenericContainer you need an Avro schema, so we'll have to somehow "zip" the data 73 | * we want to serialize with its schema (this should remind you of something we already did). 74 | */ 75 | object AvroConverter extends SchemaToAvroAlgebras with GDataInstances { 76 | 77 | import scala.collection.JavaConverters._ 78 | 79 | /** 80 | * A generic data value (of type [[GData]]) with each element 81 | * labelled with the corresponding `avro.Schema`. 82 | */ 83 | type DataWithSchema[A] = EnvT[Schema, GData, A] 84 | 85 | /** 86 | * When we zip data and schema, there may be times when those two don't mix; 87 | * we need to handle that case - this is what an Incompatibility is. 88 | */ 89 | case class Incompatibility[D](schema: Schema, data: D) 90 | 91 | /** 92 | * Avro API is not very typesafe, all values inside GenericRecord are treated as mere Objects. 93 | * They didn't define a GenericContainer for storing simple values (like numbers, strings, etc). 94 | * So we need to define one, for there is no way *we* are going to work with non-types like Any or AnyRef. 95 | */ 96 | case class SimpleValue(value: Any) extends GenericContainer { 97 | override def getSchema: Schema = throw new NotImplementedError() // we won't use that anyway 98 | } 99 | 100 | /** 101 | * But this is for our convenience only, we still need to feed Avro API methods with unwrapped 102 | * simple values, so don't forget to use this method whenever needed.
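 * For instance, `unwrap(SimpleValue(42))` should simply give back `42`, while a real Avro record or
 * array should pass through unchanged.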
103 | */ 104 | def unwrap(container: GenericContainer): Any = { 105 | container match { 106 | case SimpleValue(value) => value 107 | case value => value 108 | } 109 | } 110 | 111 | def fromGDataToAvro[S, D](schema: S, data: D)( 112 | implicit S: Birecursive.Aux[S, SchemaF], 113 | D: Birecursive.Aux[D, GData]): \/[Incompatibility[D], GenericContainer] = { 114 | 115 | val zipWithSchemaAlg: CoalgebraM[\/[Incompatibility[D], ?], DataWithSchema, (S, D)] = TODO 116 | 117 | val alg: AlgebraM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer] = TODO 118 | 119 | (schema, data).hyloM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer](alg, zipWithSchemaAlg) 120 | } 121 | 122 | } 123 | 124 | trait GDataInstances { 125 | 126 | implicit val genericDataFTraverse: Traverse[GData] = new Traverse[GData] { 127 | 128 | override def traverseImpl[G[_], A, B](fa: GData[A])(f: A => G[B])( 129 | implicit evidence$1: Applicative[G]): G[GData[B]] = fa match { 130 | case GArray(elems) => 131 | Functor[G].map(elems.toList traverse f)(GArray.apply) 132 | 133 | case GStruct(fields) => 134 | val (keys, values) = fields.unzip 135 | Functor[G].map(values.toList traverse f)(v => GStruct(ListMap((keys zip v).toSeq: _*))) 136 | 137 | case GString(value) => Applicative[G].point(GString[B](value)) 138 | case GLong(value) => Applicative[G].point(GLong[B](value)) 139 | case GInteger(value) => Applicative[G].point(GInteger[B](value)) 140 | case GDouble(value) => Applicative[G].point(GDouble[B](value)) 141 | case GFloat(value) => Applicative[G].point(GFloat[B](value)) 142 | case GDate(value) => Applicative[G].point(GDate[B](value)) 143 | case GBoolean(value) => Applicative[G].point(GBoolean[B](value)) 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/main/scala/solutions/3-validation.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import jto.validation._ 4 | import jto.validation.jsonast._ 5 | import matryoshka._, implicits._ 6 | import matryoshka.data._ 7 | import org.scalacheck.Arbitrary 8 | import scalaz.Scalaz._ 9 | import scalaz._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import scala.language.higherKinds 13 | 14 | /** 15 | * Now that we have a Schema we will need to validate incoming data (JSON) 16 | * and output "validated" data or "errors" with what went wrong for the sources 17 | * to be able to fix their exports. 
18 | * 19 | * For that we'll use the JTO Validation library but first we need to define what a "Data" is 20 | */ 21 | sealed trait GData[A] 22 | final case class GStruct[A](fields: ListMap[String, A]) extends GData[A] 23 | final case class GArray[A](element: Seq[A]) extends GData[A] 24 | final case class GBoolean[A](value: Boolean) extends GData[A] 25 | final case class GDate[A](value: java.util.Date) extends GData[A] 26 | final case class GDouble[A](value: Double) extends GData[A] 27 | final case class GFloat[A](value: Float) extends GData[A] 28 | final case class GInteger[A](value: Int) extends GData[A] 29 | final case class GLong[A](value: Long) extends GData[A] 30 | final case class GString[A](value: String) extends GData[A] 31 | 32 | object GData extends GDataInstances with DataWithSchemaGenerator 33 | 34 | object SchemaRules { 35 | type JRule[A] = Rule[JValue, A] 36 | 37 | implicit val ruleApplicativeForScalaz: Applicative[JRule] = new Applicative[JRule] { 38 | override def point[A](a: => A): JRule[A] = Rule.pure(a) 39 | 40 | override def ap[A, B](fa: => JRule[A])(f: => JRule[A => B]): JRule[B] = fa.ap(f) 41 | } 42 | 43 | def fromSchemaToRules[S, D](schema: S)(implicit S: Recursive.Aux[S, SchemaF], 44 | D: Corecursive.Aux[D, GData]): JRule[D] = { 45 | val alg: Algebra[SchemaF, JRule[D]] = { 46 | case StructF(fields) => 47 | fields.toList 48 | .traverse[JRule, (String, D)] { 49 | case (name, validation) => 50 | (Path \ name).read(_ => validation.map(fx => (name, fx))) 51 | } 52 | .map(fs => GStruct(ListMap(fs: _*)).embed) 53 | 54 | case ArrayF(elem) => Rules.pickSeq(elem).map(elems => GArray(elems).embed) 55 | case BooleanF() => Rules.booleanR.map(x => GBoolean[D](x).embed) 56 | case DateF() => Rules.stringR.andThen(Rules.isoDateR).map(x => GDate[D](x).embed) 57 | case DoubleF() => Rules.doubleR.map(x => GDouble[D](x).embed) 58 | case FloatF() => Rules.floatR.map(x => GFloat[D](x).embed) 59 | case IntegerF() => Rules.intR.map(x => GInteger[D](x).embed) 60 | case LongF() => Rules.longR.map(x => GLong[D](x).embed) 61 | case StringF() => Rules.stringR.map(x => GString[D](x).embed) 62 | } 63 | 64 | schema cata alg 65 | } 66 | 67 | } 68 | 69 | /** 70 | * We need to test that validation - of course specific unit tests can be done 71 | * but we're quite paranoid so let's "generate" abitrary schemas using ScalaCheck 72 | * 73 | * But then again - from a Schema we'll be able to generate Rules 74 | * But to validate those rules we'd need data. 75 | * So let's generate Data as well : 76 | * Data that will, of course, need to be compatible with the Schema itself. 
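 *
 * To give an idea of how these pieces could fit together inside a ScalaCheck Properties, here is a
 * purely illustrative sketch of such a property (`toJson` is a hypothetical GData => JValue
 * conversion, it is not defined in this project):
 * {{{
 *   property("generated data passes the rules derived from its schema") =
 *     Prop.forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { case (schema, data) =>
 *       val rule = SchemaRules.fromSchemaToRules[Fix[SchemaF], Fix[GData]](schema)
 *       rule.validate(toJson(data)).isValid
 *     }
 * }}}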
77 | */ 78 | trait DataWithSchemaGenerator { 79 | import org.scalacheck.Gen 80 | 81 | import scala.collection.JavaConverters._ 82 | 83 | def genSchemaAndData[S, D](implicit S: Birecursive.Aux[S, SchemaF], D: Corecursive.Aux[D, GData]): Gen[(S, D)] = 84 | for { 85 | schemaF <- genSchemaF 86 | dataF <- schemaF cata schemaToDataGen 87 | } yield (schemaF, dataF) 88 | 89 | def schemaToDataGen[D](implicit D: Corecursive.Aux[D, GData]): Algebra[SchemaF, Gen[D]] = { 90 | case ArrayF(elems) => 91 | Gen.listOf(elems).map(lst => GArray(lst).embed) 92 | 93 | case StructF(fields) => 94 | val (names, values) = fields.unzip 95 | Gen.sequence(values).map(fields => GStruct(ListMap((names zip fields.asScala).toSeq: _*)).embed) 96 | 97 | case BooleanF() => 98 | Gen.oneOf(true, false).map(value => GBoolean[D](value).embed) 99 | 100 | case DateF() => 101 | Gen.choose(0, Long.MaxValue).map(value => GDate[D](new java.util.Date(value)).embed) 102 | 103 | case DoubleF() => 104 | Gen.choose(Double.MinValue, Double.MaxValue).map(value => GDouble[D](value).embed) 105 | 106 | case FloatF() => 107 | Gen.choose(Float.MinValue, Float.MaxValue).map(value => GFloat[D](value).embed) 108 | 109 | case IntegerF() => 110 | Gen.choose(Int.MinValue, Int.MaxValue).map(value => GInteger[D](value).embed) 111 | 112 | case LongF() => 113 | Gen.choose(Long.MinValue, Long.MaxValue).map(value => GLong[D](value).embed) 114 | 115 | case StringF() => 116 | Gen.alphaNumStr.map(value => GString[D](value).embed) 117 | } 118 | 119 | def genSchemaF[S](implicit S: Corecursive.Aux[S, SchemaF]): Gen[S] = 120 | for { 121 | depth <- Gen.choose(1, 1) 122 | nbTopLevelColumns <- Gen.choose(1, 1) 123 | columns <- Gen.listOfN(nbTopLevelColumns, genStructSchema(depth)) 124 | } yield StructF(ListMap(columns: _*)).embed 125 | 126 | def genValueSchema[S](implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 127 | for { 128 | name <- Gen.identifier 129 | valueF <- Gen.oneOf( 130 | BooleanF[S]().embed, 131 | DateF[S]().embed, 132 | DoubleF[S]().embed, 133 | FloatF[S]().embed, 134 | IntegerF[S]().embed, 135 | LongF[S]().embed, 136 | StringF[S]().embed, 137 | ) 138 | } yield (name, valueF) 139 | 140 | def genColumnSchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 141 | if (maxDepth > 0) 142 | Gen.oneOf[(String, S)](genValueSchema, genStructSchema(maxDepth)) 143 | else genValueSchema 144 | 145 | def genStructSchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 146 | for { 147 | name <- Gen.identifier 148 | depth <- Gen.choose(1, maxDepth) 149 | nbFields <- Gen.choose(0, 3) 150 | fields <- Gen.listOfN(nbFields, genColumnSchema(maxDepth - depth)) 151 | } yield (name, StructF(ListMap(fields: _*)).embed) 152 | 153 | def genArraySchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 154 | for { 155 | name <- Gen.identifier 156 | depth <- Gen.choose(1, maxDepth) 157 | (_, elems) <- genNonArraySchema(maxDepth - depth) 158 | } yield (name, ArrayF(elems).embed) 159 | 160 | def genNonArraySchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 161 | if (maxDepth > 0) 162 | Gen.oneOf[(String, S)](genValueSchema, genStructSchema(maxDepth)) 163 | else genValueSchema 164 | } 165 | -------------------------------------------------------------------------------- /src/main/scala/solutions/1-schema.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import 
org.scalacheck.{Arbitrary, Gen} 4 | import scala.collection.immutable.ListMap 5 | import scalaz._, Scalaz._ 6 | import matryoshka._, implicits._ 7 | 8 | /** 9 | * Without further ado, let's define our main pattern-functor for the remaining of the session. 10 | */ 11 | sealed trait SchemaF[A] 12 | 13 | // we'll use a ListMap to keep the ordering of the fields 14 | final case class StructF[A](fields: ListMap[String, A]) extends SchemaF[A] 15 | final case class ArrayF[A](element: A) extends SchemaF[A] 16 | final case class BooleanF[A]() extends SchemaF[A] 17 | final case class DateF[A]() extends SchemaF[A] 18 | final case class DoubleF[A]() extends SchemaF[A] 19 | final case class FloatF[A]() extends SchemaF[A] 20 | final case class IntegerF[A]() extends SchemaF[A] 21 | final case class LongF[A]() extends SchemaF[A] 22 | final case class StringF[A]() extends SchemaF[A] 23 | 24 | object SchemaF extends SchemaFToDataTypeAlgebras with SchemaFArbitrary { 25 | 26 | /** 27 | * As usual, we need to define a Functor instance for our pattern. 28 | */ 29 | implicit val schemaFScalazFunctor: Functor[SchemaF] = new Functor[SchemaF] { 30 | def map[A, B](fa: SchemaF[A])(f: A => B): SchemaF[B] = fa match { 31 | case StructF(fields) => StructF(fields.map { case (name, value) => name -> f(value) }) 32 | case ArrayF(elem) => ArrayF(f(elem)) 33 | case BooleanF() => BooleanF() 34 | case DateF() => DateF() 35 | case DoubleF() => DoubleF() 36 | case FloatF() => FloatF() 37 | case IntegerF() => IntegerF() 38 | case LongF() => LongF() 39 | case StringF() => StringF() 40 | } 41 | } 42 | 43 | /** 44 | * It might be usefull to have a nice string representation of our schemas. 45 | * 46 | * Let say that we want a representation where: 47 | * - simple types like `BooleanF()` or `DateF()` would be represented as `boolean` and `date` respectively. 48 | * - arrays like `ArrayF(IntegerF())` would be represented as `[ integer ]`. 49 | * - structs like `StructF(ListMap("foo" -> FloatF(), "bar" -> LongF())` would be represented as 50 | * `{ foo: float, bar: long }` 51 | * 52 | * Because of the recursive nature of SchemaF, we cannot eagerly write a Show instance for SchemaF. 53 | * Fortunately matryoshka defines the Delay typeclass that is useful in such cases. It allows to "break 54 | * the infinite loop" by delaying the instantiation of Show[SchemaF[A]]. 55 | * 56 | * matryoshka.implicits contains implicit functions that, given that Delay[Show, SchemaF] instance, 57 | * will provide a Show[T[SchemaF]] for any fix-point T. 58 | * 59 | */ 60 | implicit val schemaFDelayShow: Delay[Show, SchemaF] = new Delay[Show, SchemaF] { 61 | def apply[A](showA: Show[A]): Show[SchemaF[A]] = new Show[SchemaF[A]] { 62 | override def show(schema: SchemaF[A]): Cord = schema match { 63 | case StructF(fields) => 64 | val showFields = fields.map { case (k, v) => Cord(k) ++ Cord(": ") ++ showA.show(v) }.toSeq 65 | Cord("{ ") ++ Cord.mkCord(Cord(", "), showFields: _*) ++ Cord(" }") 66 | case ArrayF(element) => Cord("[ ") ++ showA.show(element) ++ Cord(" ]") 67 | case BooleanF() => Cord("boolean") 68 | case DateF() => Cord("date") 69 | case DoubleF() => Cord("double") 70 | case FloatF() => Cord("float") 71 | case IntegerF() => Cord("integer") 72 | case LongF() => Cord("long") 73 | case StringF() => Cord("string") 74 | } 75 | } 76 | } 77 | 78 | } 79 | 80 | /** 81 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 82 | * our new and shiny SchemaF (and vice versa). 
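 *
 * To make the goal concrete, here is a small, made-up example of the correspondence we are about to
 * encode with Spark's DataType (an illustrative sketch, assuming matryoshka.data.Fix is in scope):
 * {{{
 *   // the SchemaF view of { name: string, scores: [ double ] }
 *   val schema: Fix[SchemaF] =
 *     StructF(ListMap(
 *       "name"   -> StringF[Fix[SchemaF]]().embed,
 *       "scores" -> ArrayF(DoubleF[Fix[SchemaF]]().embed).embed
 *     )).embed
 *
 *   // and its DataType counterpart
 *   StructType(Array(
 *     StructField("name", StringType),
 *     StructField("scores", ArrayType(DoubleType, containsNull = false))
 *   ))
 * }}}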
83 | * 84 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 85 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 86 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 87 | * SchemaF/DataType. 88 | * 89 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 90 | * the definition of the DataType trait and its subclasses in the current project under 91 | * `spark/src/main/scala/DataType.scala`. 92 | */ 93 | trait SchemaFToDataTypeAlgebras { 94 | 95 | import org.apache.spark.sql.types._ 96 | 97 | /** 98 | * As usual, simply a function from SchemaF[DataType] to DataType 99 | */ 100 | def schemaFToDataType: Algebra[SchemaF, DataType] = { 101 | case StructF(fields) => StructType(fields.map { case (name, value) => StructField(name, value) }.toArray) 102 | case ArrayF(elem) => ArrayType(elem, containsNull = false) 103 | case BooleanF() => BooleanType 104 | case DateF() => DateType 105 | case DoubleF() => DoubleType 106 | case FloatF() => FloatType 107 | case IntegerF() => IntegerType 108 | case LongF() => LongType 109 | case StringF() => StringType 110 | 111 | } 112 | 113 | /** 114 | * And the other way around, a function from DataType to SchemaF[DataType] 115 | */ 116 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = { 117 | case StructType(fields) => StructF(ListMap(fields.map(f => f.name -> f.dataType): _*)) 118 | case ArrayType(elem, _) => ArrayF(elem) 119 | case BooleanType => BooleanF() 120 | case DateType => DateF() 121 | case DoubleType => DoubleF() 122 | case FloatType => FloatF() 123 | case IntegerType => IntegerF() 124 | case LongType => LongF() 125 | case StringType => StringF() 126 | 127 | } 128 | 129 | /** 130 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 131 | * 132 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 133 | * For example, suppose that we have: 134 | * 135 | * {{{ 136 | * val parquet: DataType = ??? 137 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 138 | * }}} 139 | * 140 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 141 | * 142 | * {{{ 143 | * parquet.cata(toAvro) 144 | * }}} 145 | * 146 | * Instead of 147 | * 148 | * {{{ 149 | * parquet.hylo(dataTypeToSchemaf, toAvro) 150 | * }}} 151 | * 152 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 153 | */ 154 | implicit val dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 155 | Birecursive.fromAlgebraIso(schemaFToDataType, dataTypeToSchemaF) 156 | } 157 | 158 | /** 159 | * Everything looks nice, but don't you feel we are missing something? 160 | * 161 | * I mean, think about it for a minute and meet me 20 lines bellow. 162 | * 163 | * 164 | * 165 | * 166 | * 167 | * 168 | * 169 | * 170 | * 171 | * 172 | * 173 | * 174 | * 175 | * 176 | * 177 | * 178 | * 179 | * 180 | * 181 | * Did you guess? 182 | * 183 | * 184 | * 185 | * You're right of course! We still have to write tests! 186 | * 187 | * Let's meet again in `src/test/scala/1-schema/ParquetSpec.scala`. 
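 *
 * (As a teaser, and only as an illustrative sketch of the kind of round-trip property such a spec
 * could check, with Fix and matryoshka.scalacheck.arbitrary._ imported and relying on the
 * Delay[Arbitrary, SchemaF] instance defined below:
 * {{{
 *   property("SchemaF <-> DataType round-trip") = forAll { schema: Fix[SchemaF] =>
 *     schema.cata(schemaFToDataType).ana[Fix[SchemaF]](dataTypeToSchemaF) == schema
 *   }
 * }}}
 * )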
188 | */ 189 | trait SchemaFArbitrary { 190 | 191 | implicit val schemaFDelayArbitrary: Delay[Arbitrary, SchemaF] = new Delay[Arbitrary, SchemaF] { 192 | 193 | def apply[A](A: Arbitrary[A]): Arbitrary[SchemaF[A]] = 194 | Arbitrary( 195 | Gen.oneOf( 196 | Gen.const(BooleanF[A]()), 197 | Gen.const(DateF[A]()), 198 | Gen.const(DoubleF[A]()), 199 | Gen.const(FloatF[A]()), 200 | Gen.const(IntegerF[A]()), 201 | Gen.const(LongF[A]()), 202 | Gen.const(StringF[A]()), 203 | for { 204 | nbFields <- Gen.choose(1, 10) 205 | // we need to make sure that fields' names are unique and non empty 206 | names <- Gen.listOfN(nbFields, Gen.alphaStr).map(_.map("a" ++ _).toSet) 207 | types <- Gen.listOfN(names.size, A.arbitrary) 208 | } yield StructF[A](ListMap((names.toList zip types): _*)), 209 | A.arbitrary.map(ArrayF.apply _) 210 | ) 211 | ) 212 | 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/main/scala/solutions/2-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import org.apache.avro.{LogicalTypes, _} 5 | import matryoshka._, implicits._, patterns.EnvT 6 | import scala.collection.immutable.ListMap 7 | import scalaz._, Scalaz._ 8 | 9 | import scala.language.higherKinds 10 | import scala.collection.JavaConverters._ 11 | 12 | /** 13 | * There is a problem that makes writing SchemaF <-> Avro (co)algebras more difficult. 14 | * 15 | * As a matter of fact Avro mandates that, when building a Schema, all records (the Avro 16 | * equivalent to our StructF) are registered using a unique name. 17 | * 18 | * This is problematic to our algebra-based method because with the algebras we've seen so 19 | * far we only care about one "layer" at a time, so there is no way to know the names we've 20 | * already used for ther records we've registered so far. 21 | * 22 | * Fortunately, we have at least two solutions to that problem. But before going any further, 23 | * maybe you can take a few minutes to try and imagine how we can solve that problem in general, 24 | * even if you don't know how to implement your solution using recursion-schemes yet. 25 | */ 26 | trait SchemaToAvroAlgebras extends Labelling with UsingARegistry with AvroCoalgebra {} 27 | 28 | /** 29 | * The first solution comes from the observation that our schemas are in fact trees. And trees have 30 | * this nice property that each node have a unique path that goes from the root to it. If we can use 31 | * that unique path as the names of our records, we're good to go. So this solution boils down to 32 | * labelling each "node" of a schema with its path, and then use that path to form the names we 33 | * use to register our records. 34 | */ 35 | trait Labelling { 36 | 37 | /** 38 | * So lets define out Path as being simply a list of strings. These strings will be the field names 39 | * we need to traverse from the root to get to a specific element of our schema. 40 | */ 41 | type Path = List[String] 42 | 43 | /** 44 | * Here is the "special trick" of the current solution. 45 | * 46 | * EnvT is a kind of "glorified pair". Given a label type E and a (pattern)-functor F, it allows us 47 | * to label each "node" of a T[F] with a value of type E while retaining the original structure. In 48 | * other words, if F is a functor, then EnvT[E, F, ?] is a functor as well. 
49 | */ 50 | type Labelled[A] = EnvT[Path, SchemaF, A] 51 | 52 | /** 53 | * If we are to label each "node" of a schema with its own path, we obviously need to go from the root 54 | * down to the leaves, so we definitely want to write a coalgebra. 55 | * This one might look a bit scarry though, but fear not, it's not as complcated as it looks. Lets just 56 | * follow the types together. 57 | * 58 | * A Coalgebra[F, A] is just a function A => F[A]. So the coalgebra bellow is just a function 59 | * (Path, T[SchemaF]) => Labelled[(Path, T[SchemaF]) 60 | * Expanding the Labelled alias it becomes 61 | * (Path, T[SchemaF]) => EnvT[Path, SchemaF, (Path, T[SchemaF])] 62 | * 63 | * Ok, maybe it still looks a bit scarry... 64 | * 65 | * Lets try to put it differently. Assume you will be given a "seed" consisting of a whole schema and an 66 | * initial path (that will start empty). Your job is to use that to produce an EnvT that will contain 67 | * the path of the node you just saw (the "root" of the schema that was in the seed), and the node itself 68 | * but modified such that its "content" is not just a "smaller schema" as it was initially, but a new "seed" 69 | * consisting of a (larger) path, and the said "smaller schema". 70 | */ 71 | def labelNodesWithPath[T](implicit T: Recursive.Aux[T, SchemaF]): Coalgebra[Labelled, (Path, T)] = { 72 | case (path, t) => 73 | t.project match { 74 | // paths are formed only using structs' field names so we only need to really care about structs. 75 | // For each field, we "push down" a new path augmented with that field's name. 76 | case StructF(fields) => 77 | EnvT((path, StructF(fields.map { case (k, v) => k -> (path :+ k, v) }))) 78 | // All other cases don't participate to the construction of the path, so we only need to push the 79 | // current path down. 80 | case otherwise => 81 | EnvT((path, otherwise.map(x => (path, x)))) 82 | } 83 | } 84 | 85 | /** 86 | * Now the algebra (that we had no way to write before) becomes trivial. All we have to do is to use 87 | * the path labelling each "node" as the name we need when registering a new avro record. 88 | * 89 | * To extract the label (resp. node) of an EnvT you can use pattern-matching (EnvT contains only a pair 90 | * (label, node)), or you can use the `ask` and `lower` methods that return the label and node respectively. 91 | */ 92 | def labelledToSchema: Algebra[Labelled, Schema] = { envT => 93 | val path = envT.ask 94 | envT.lower match { 95 | case StructF(fields) => 96 | fields 97 | .foldLeft(SchemaBuilder.record(path.mkString("a", ".", "z")).fields) { 98 | case (builder, (key, value)) => 99 | builder.name(key).`type`(value).noDefault() 100 | } 101 | .endRecord() 102 | case ArrayF(element) => 103 | SchemaBuilder.array().items(element) 104 | case BooleanF() => Schema.create(Schema.Type.BOOLEAN) 105 | case DateF() => 106 | LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) 107 | case DoubleF() => Schema.create(Schema.Type.DOUBLE) 108 | case FloatF() => Schema.create(Schema.Type.FLOAT) 109 | case IntegerF() => Schema.create(Schema.Type.INT) 110 | case LongF() => Schema.create(Schema.Type.LONG) 111 | case StringF() => Schema.create(Schema.Type.STRING) 112 | } 113 | } 114 | 115 | /** 116 | * 117 | */ 118 | def schemaFToAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 119 | (List.empty[String], schemaF).hylo(labelledToSchema, labelNodesWithPath) 120 | } 121 | 122 | /** 123 | * That first solution was (relatively) simple but it is not completely satisfying. 
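 * (By the way, the labelling solution above can already be exercised end to end; as a purely
 * illustrative sketch, assuming matryoshka.data.Fix is in scope:
 * {{{
 *   val schema: Fix[SchemaF] = StructF(ListMap("age" -> IntegerF[Fix[SchemaF]]().embed)).embed
 *   schemaFToAvro(schema)  // an Avro record with one int field "age", its name derived from the node's path
 * }}}
 * )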
124 | * We needed both an algebra and a coalgebra to got from our SchemaF to Avro's Schema, which forced us to 125 | * use hylo. 126 | */ 127 | trait UsingARegistry { 128 | 129 | type Registry[A] = State[Map[Int, Schema], A] 130 | 131 | def fingerprint(fields: Map[String, Schema]): Int = fields.hashCode 132 | 133 | def useARegistry: AlgebraM[Registry, SchemaF, Schema] = { 134 | case StructF(fields) => 135 | val fp = fingerprint(fields) 136 | State { (reg: Map[Int, Schema]) => 137 | if (reg contains fp) { 138 | (reg, reg(fp)) 139 | } else { 140 | val record = 141 | fields 142 | .foldLeft(SchemaBuilder.record("r%x".format(fp)).fields) { 143 | case (builder, (k, v)) => 144 | builder.name(k).`type`(v).noDefault 145 | } 146 | .endRecord 147 | (reg + (fp -> record), record) 148 | } 149 | } 150 | case ArrayF(field) => 151 | State.state(SchemaBuilder.array.items(field)) 152 | 153 | case BooleanF() => State.state(Schema.create(Schema.Type.BOOLEAN)) 154 | case DateF() => State.state(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) 155 | case DoubleF() => State.state(Schema.create(Schema.Type.DOUBLE)) 156 | case FloatF() => State.state(Schema.create(Schema.Type.FLOAT)) 157 | case IntegerF() => State.state(Schema.create(Schema.Type.INT)) 158 | case LongF() => State.state(Schema.create(Schema.Type.LONG)) 159 | case StringF() => State.state(Schema.create(Schema.Type.STRING)) 160 | } 161 | 162 | implicit val schemaFTraverse: Traverse[SchemaF] = new Traverse[SchemaF] { 163 | override def traverseImpl[G[_], A, B](fa: SchemaF[A])(f: A => G[B])(implicit G: Applicative[G]): G[SchemaF[B]] = 164 | fa match { 165 | case StructF(fields) => 166 | val (ks, vs) = fields.unzip 167 | vs.toList.traverse(f).map { xs => 168 | StructF(ListMap((ks.toList zip xs): _*)) 169 | } 170 | case ArrayF(elem) => 171 | f(elem).map(ArrayF.apply) 172 | case BooleanF() => G.point(BooleanF()) 173 | case DateF() => G.point(DateF()) 174 | case DoubleF() => G.point(DoubleF()) 175 | case FloatF() => G.point(FloatF()) 176 | case IntegerF() => G.point(IntegerF()) 177 | case LongF() => G.point(LongF()) 178 | case StringF() => G.point(StringF()) 179 | } 180 | } 181 | 182 | def toAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 183 | schemaF.cataM(useARegistry).run(Map.empty)._2 184 | } 185 | 186 | trait AvroCoalgebra { 187 | 188 | def avroToSchemaF: CoalgebraM[Option, SchemaF, Schema] = { schema => 189 | schema.getType match { 190 | case Schema.Type.RECORD => 191 | val fields = schema.getFields.asScala 192 | StructF(ListMap(fields.map(f => f.name -> f.schema): _*)).some 193 | case Schema.Type.ARRAY => ArrayF(schema.getElementType).some 194 | case Schema.Type.BOOLEAN => BooleanF().some 195 | case Schema.Type.DOUBLE => DoubleF().some 196 | case Schema.Type.FLOAT => FloatF().some 197 | case Schema.Type.INT => IntegerF().some 198 | case Schema.Type.LONG => 199 | val lt = schema.getLogicalType 200 | if (lt != null) { 201 | if (lt.getName == LogicalTypes.timestampMillis().getName) { 202 | DateF().some 203 | } else None 204 | } else LongF().some 205 | case Schema.Type.STRING => StringF().some 206 | case _ => None 207 | } 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/scala/solutions/4-spark-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import matryoshka._ 5 | import matryoshka.data.Fix 6 | import matryoshka.implicits._ 7 | import 
matryoshka.patterns.EnvT 8 | import org.apache.avro.Schema 9 | import scalaz._, Scalaz._ 10 | 11 | import scala.language.higherKinds 12 | import org.apache.avro.generic.{GenericContainer, GenericData, GenericRecordBuilder} 13 | import org.apache.spark.sql.Row 14 | 15 | import scala.collection.immutable.ListMap 16 | import scala.language.higherKinds 17 | 18 | /** 19 | * It's time to confront ourselves to the real world of manipulating data with Spark & Avro 20 | * Two specific pain points we have to tackle are : 21 | * 22 | * - Spark's org.apache.spark.sql.Row is basically a wrapper of Array[Any] 23 | * but we need to handle two specifically different behaviour according to the level of the data : 24 | * When we're handling Arrays and Structs, no worry we need to output a Row 25 | * but when we're handling "simple" types, then if it's a top-level value we need to output a Row 26 | * but if it's not, then the value itself must be written. 27 | * 28 | * Exemple : 29 | * - Value("b") will be Row("b") 30 | * but 31 | * - Struct(a -> Value("b")) will be Row("b") as well (the Row now representing the outer struct) 32 | * 33 | * - For Apache Avro, it's a new kind of pain you'll need to overcome, Avro basically represents all of its data 34 | * as if, it will be at one point or another generated into Java classes. 35 | * So every "record" or Struct needs to have a qualified name "unique" otherwise the Avro engine will consider 36 | * the struct as being the same class. 37 | * But as it will obviously have different fields - you'll most likely end up with an error. 38 | * 39 | * Happy hunting. 40 | */ 41 | object SparkConverter extends GDataInstances { 42 | 43 | def isOfSimpleType[D](data: GData[D]) = data match { 44 | case GStruct(_) | GArray(_) => true 45 | case _ => false 46 | } 47 | 48 | /** 49 | * We have a proper way to overcome this problem. There is a `para` scheme that works a little bit like cata. 50 | * Using para, our algebra will "see" not only the result of its application to the level bellow but also 51 | * the structure of that level we just processed. 52 | * 53 | * To use para, we need a special kind of algebra : a GAlgebra. Given a functor F and a comonad W, Galgebra[W, F, A] 54 | * is simply a function F[W[A]] => A, so our carrier is simply wrapped in an additional layer. 55 | * 56 | * For para's GAlgebra we use (T[F], ?) as our comonad, in other words, our carrier will be paired with the "tree" we 57 | * processed during the previous step. 58 | * 59 | * We will use that to know when we need to "unwrap" the value we had wrapped in a Row at the previous step although we 60 | * shouldn't have. 
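 *
 * Concretely (an illustrative sketch, not actual workshop data): for a one-field struct the algebra
 * below is called with every child paired with the subtree it was computed from, e.g.
 * {{{
 *   GStruct(ListMap("name" -> (originalSubtree, Row("Ada"))))
 * }}}
 * where `originalSubtree` is the Fix[GData] (or other fix-point) that child came from; inspecting it
 * tells us whether Row("Ada") merely wraps a simple value (and must be unwrapped) or really is a
 * nested Row.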
61 | */ 62 | def gDataToRow[D](implicit D: Recursive.Aux[D, GData]): GAlgebra[(D, ?), GData, Row] = { 63 | case GArray(elems) => 64 | val values = elems.map { 65 | case (previous, current) => 66 | if (isOfSimpleType(previous.project)) 67 | current 68 | else 69 | current.values.head 70 | } 71 | Row(values) 72 | 73 | case GStruct(fields) => 74 | val values = fields.map { 75 | case (k, (previous, value)) => 76 | if (isOfSimpleType(previous.project)) { 77 | value 78 | } else { 79 | value.values.head 80 | } 81 | } 82 | Row(values.toSeq: _*) 83 | 84 | case GBoolean(el) => Row(el) 85 | case GFloat(el) => Row(el) 86 | case GInteger(el) => Row(el) 87 | case GDate(el) => Row(el) 88 | case GLong(el) => Row(el) 89 | case GDouble(el) => Row(el) 90 | case GString(el) => Row(el) 91 | } 92 | 93 | def fromGDataToSparkRow(row: Fix[GData]): Row = 94 | row.para[Row](gDataToRow) 95 | 96 | } 97 | 98 | /** 99 | * We'll also need Avro to serialize streaming data into Kafka topics. 100 | * 101 | * This is just another kind of pain :). We will be using Avro's GenericContainer interface. 102 | * To build a GenericContainer you need an Avro schema, so we'll have to somehow "zip" the data 103 | * we want to serialize with its schema (this should remind you of something we already did). 104 | */ 105 | object AvroConverter extends SchemaToAvroAlgebras with GDataInstances { 106 | 107 | import scala.collection.JavaConverters._ 108 | 109 | /** 110 | * A generic schema (of type [[SchemaF]]) with each element 111 | * labelled with the corresponding `avro.Schema`. 112 | */ 113 | type SchemaWithAvro[A] = EnvT[Schema, SchemaF, A] 114 | 115 | type DataWithSchema[A] = EnvT[Schema, GData, A] 116 | 117 | case class Incompatibility[D](schema: Schema, data: D) 118 | 119 | /** 120 | * Avro API is not very typesafe, all values inside GenericRecord are treated as mere Objects. 121 | * They didn't defined a GenericContainer for storing simple values (like numbers, strings, etc). 122 | * So we need to define one, for there is no way *we* work on non-types like Any or AnyRef. 123 | */ 124 | case class SimpleValue(value: Any) extends GenericContainer { 125 | override def getSchema: Schema = ??? 126 | } 127 | 128 | /** 129 | * But this is for our convenience only, we still need to feed avro API methods with unwrapped 130 | * simple values, so don't forget to use this method whenever needed. 
131 | */ 132 | def unwrap(container: GenericContainer): Any = { 133 | container match { 134 | case SimpleValue(value) => value 135 | case value => value 136 | } 137 | } 138 | 139 | def fromGDataToAvro[S, D](schema: S, data: D)( 140 | implicit S: Birecursive.Aux[S, SchemaF], 141 | D: Birecursive.Aux[D, GData]): \/[Incompatibility[D], GenericContainer] = { 142 | 143 | val zipWithSchemaAlg: CoalgebraM[\/[Incompatibility[D], ?], DataWithSchema, (S, D)] = { 144 | case (sch, dat) => 145 | (sch.project, dat.project) match { 146 | 147 | case (structF @ StructF(fieldsSchema), GStruct(fields)) => 148 | val withSchema = GStruct( 149 | ListMap(fields.map { case (name, fx) => (name, (fieldsSchema(name), fx)) }.toSeq: _*)) 150 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(structF.embed), withSchema)).right 151 | 152 | case (arrF @ ArrayF(fieldSchema), GArray(elems)) => 153 | val withSchema = GArray(elems.map(fx => (fieldSchema, fx))) 154 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(arrF.embed), withSchema)).right 155 | 156 | case (valueF @ StringF(), GString(value)) => 157 | val withSchema = GString[(S, D)](value) 158 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 159 | 160 | case (valueF @ IntegerF(), GInteger(value)) => 161 | val withSchema = GInteger[(S, D)](value) 162 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 163 | 164 | case (valueF @ LongF(), GLong(value)) => 165 | val withSchema = GLong[(S, D)](value) 166 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 167 | 168 | case (valueF @ BooleanF(), GBoolean(value)) => 169 | val withSchema = GBoolean[(S, D)](value) 170 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 171 | 172 | case (valueF @ FloatF(), GFloat(value)) => 173 | val withSchema = GFloat[(S, D)](value) 174 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 175 | 176 | case (valueF @ DoubleF(), GDouble(value)) => 177 | val withSchema = GDouble[(S, D)](value) 178 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 179 | 180 | case (valueF @ DateF(), GDate(value)) => 181 | val withSchema = GDate[(S, D)](value) 182 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 183 | 184 | case (s, d) => 185 | Incompatibility(schemaFToAvro(s.embed), d.embed).left 186 | } 187 | } 188 | val alg: AlgebraM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer] = { 189 | case EnvT((avroSchema, GStruct(fields))) => 190 | val bldrWithFields = fields.foldLeft(new GenericRecordBuilder(avroSchema)) { (recordBuilder, container) => 191 | val (name, data) = container 192 | recordBuilder.set(name, unwrap(data)) 193 | } 194 | bldrWithFields.build().right 195 | 196 | case EnvT((avroSchema, GArray(elem))) => 197 | new GenericData.Array[Any](avroSchema, elem.map(unwrap).asJavaCollection).right 198 | 199 | case EnvT((_, GBoolean(el))) => SimpleValue(el).right 200 | case EnvT((_, GFloat(el))) => SimpleValue(el).right 201 | case EnvT((_, GInteger(el))) => SimpleValue(el).right 202 | case EnvT((_, GDate(el))) => SimpleValue(el.getTime).right // c.f. 
logical types 203 | case EnvT((_, GLong(el))) => SimpleValue(el).right 204 | case EnvT((_, GDouble(el))) => SimpleValue(el).right 205 | case EnvT((_, GString(el))) => SimpleValue(el).right 206 | } 207 | 208 | (schema, data).hyloM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer](alg, zipWithSchemaAlg) 209 | } 210 | 211 | } 212 | 213 | trait GDataInstances { 214 | 215 | implicit val genericDataFTraverse: Traverse[GData] = new Traverse[GData] { 216 | 217 | override def traverseImpl[G[_], A, B](fa: GData[A])(f: A => G[B])( 218 | implicit evidence$1: Applicative[G]): G[GData[B]] = fa match { 219 | case GArray(elems) => 220 | Functor[G].map(elems.toList traverse f)(GArray.apply) 221 | 222 | case GStruct(fields) => 223 | val (keys, values) = fields.unzip 224 | Functor[G].map(values.toList traverse f)(v => GStruct(ListMap((keys zip v).toSeq: _*))) 225 | 226 | case GString(value) => Applicative[G].point(GString[B](value)) 227 | case GLong(value) => Applicative[G].point(GLong[B](value)) 228 | case GInteger(value) => Applicative[G].point(GInteger[B](value)) 229 | case GDouble(value) => Applicative[G].point(GDouble[B](value)) 230 | case GFloat(value) => Applicative[G].point(GFloat[B](value)) 231 | case GDate(value) => Applicative[G].point(GDate[B](value)) 232 | case GBoolean(value) => Applicative[G].point(GBoolean[B](value)) 233 | } 234 | } 235 | } 236 | --------------------------------------------------------------------------------