├── .scalafmt.conf ├── project ├── build.properties └── plugins.sbt ├── .gitignore ├── spark └── src │ └── main │ └── scala │ ├── Row.scala │ └── DataType.scala ├── src ├── main │ └── scala │ │ ├── package.scala │ │ ├── JsonPatch.scala │ │ ├── 3-validation.scala │ │ ├── solutions │ │ ├── 5-patches.scala │ │ ├── 0-prelude.scala │ │ ├── 3-validation.scala │ │ ├── 1-schema.scala │ │ ├── 2-avro.scala │ │ └── 4-spark-avro.scala │ │ ├── 0-prelude.scala │ │ ├── 1-schema.scala │ │ ├── 2-avro.scala │ │ └── 4-spark-avro.scala └── test │ └── scala │ ├── 2-avro │ └── LabellingSpec.scala │ ├── package.scala │ ├── 1-schema │ └── ParquetSpec.scala │ ├── 4-spark-avro │ ├── SparkConverterSpec.scala │ └── AvroConverterSpec.scala │ ├── 5-patches │ ├── ApplyPatchSpec.scala │ └── ArbitraryPatch.scala │ └── 3-validation │ └── SchemaRules.scala ├── 1-SCHEMA.md ├── 0-PRELUDE.md ├── 4-BIG_DATA.md ├── README.md ├── 2-AVRO.md ├── 3-VALIDATION.md └── 5-PATCHES.md /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align = most 2 | maxColumn = 120 -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.5 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1") 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.class 3 | .ensime* 4 | .idea/ 5 | *.ipr 6 | *.iws 7 | .DS_Store 8 | -------------------------------------------------------------------------------- /spark/src/main/scala/Row.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | final case class Row(values: Any*) 4 | -------------------------------------------------------------------------------- /src/main/scala/package.scala: -------------------------------------------------------------------------------- 1 | package object lc2018 { 2 | 3 | type TODO = Nothing 4 | 5 | def TODO[A]: A = throw new Exception("not implemented") 6 | } 7 | -------------------------------------------------------------------------------- /1-SCHEMA.md: -------------------------------------------------------------------------------- 1 | ## Schemas 2 | 3 | We want to build a data ingestion pipeline that's able to accept new data sources without writing any new code. That means that our pipeline will be completely configured by the schema of the data source. 4 | 5 | So we'll be manipulating schemas all day long. And schemas are inherently recursive, so you see where this is heading, right? :) 6 | 7 | When you're ready, head on to `src/main/scala/1-prelude.scala` you'll find explanatory comments and many more `TODO` to replace. 
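To give you a quick taste of where this is heading, here is roughly what a schema becomes once the `SchemaF` pattern-functor of `src/main/scala/1-schema.scala` is in place: nested layers of that functor, closed under a fix-point. This is only a sketch (the constructors are the ones defined in that file, the field names are made up):

```scala
import matryoshka.data.Fix
import scala.collection.immutable.ListMap

// { name: string, favouriteNumbers: [ integer ] }
val user: Fix[SchemaF] =
  Fix(StructF(ListMap(
    "name"             -> Fix(StringF[Fix[SchemaF]]()),
    "favouriteNumbers" -> Fix(ArrayF(Fix(IntegerF[Fix[SchemaF]]())))
  )))
```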
8 | -------------------------------------------------------------------------------- /src/main/scala/JsonPatch.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.jsonast.JValue 4 | 5 | sealed trait Operation 6 | case object Add extends Operation 7 | case object Remove extends Operation 8 | case object Replace extends Operation 9 | 10 | sealed trait Position 11 | final case class Field(name: String) extends Position 12 | final case class Index(value: Int) extends Position 13 | //final case class Last(pos: Position) extends Position 14 | case object End extends Position 15 | final case class JsonPatch(op: Operation, path: List[Position], value: JValue) 16 | -------------------------------------------------------------------------------- /0-PRELUDE.md: -------------------------------------------------------------------------------- 1 | ## Prelude 2 | 3 | Our HR department told us that we need to provide new hires with a full week of training, but we decided to shrink that to just one hour. 4 | 5 | We'll be using recursion schemes quite intensively for our project, and we've been told that it's a technique that hasn't yet been widely adopted by the industry. 6 | 7 | So let's begin our training by a recursion schemes crash course. There will first be a short presentation given by our ~sales manager~ technical lead (seriously, what kind of developer wears a suit and a tie nowadays?). After that, you'll be tasked to solve the exercises under `src/main/scala/0-prelude.scala`. 8 | -------------------------------------------------------------------------------- /src/test/scala/2-avro/LabellingSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop._ 5 | import matryoshka._, implicits._, data.Fix, Fix._ 6 | import matryoshka.scalacheck.arbitrary._ 7 | import scalaz._, Scalaz._ 8 | 9 | class AvroSpec extends Properties("Avro-related algebras") with SchemaToAvroAlgebras { 10 | 11 | import SchemaF._ 12 | 13 | property("labelling solution") = forAll { (schema: Fix[SchemaF]) => 14 | val avro = schemaFToAvro(schema) 15 | val back = avro.anaM[Fix[SchemaF]](avroToSchemaF) 16 | (Some(schema) == back) :| s"Some($schema) == $back" 17 | } 18 | 19 | property("registry solution") = forAll { (schema: Fix[SchemaF]) => 20 | val avro = toAvro(schema) 21 | val back = avro.anaM[Fix[SchemaF]](avroToSchemaF) 22 | (Some(schema) == back) :| s"Some($schema) == $back" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /4-BIG_DATA.md: -------------------------------------------------------------------------------- 1 | ## Data or Big Data ? 2 | 3 | We now know how to represent and validate incoming Data, but our clients are whining that our ADT 4 | serialized in Kryo is not exactly "usable" for them, and the "Data Management" is saying that this 5 | is not exactly a "serious" and durable way of storing data. 6 | 7 | So we're back at the drawing board ! 8 | 9 | We need to store data in a way that is both durable and usable. 10 | 11 | As we'll be using Apache Spark for our batch processing framework, we now need to be able to read our data 12 | as Apache Spark's Row data structure. Fortunately for your bandwith - we don't "really" need the whole Apache Spark 13 | project - so we replicated the *Row* data structure so you may work offline easily. 
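To make the target concrete, here is what the Spark conversion is expected to produce for a small nested record (these exact expectations appear in `src/test/scala/4-spark-avro/SparkConverterSpec.scala`):

```scala
import org.apache.spark.sql.Row

// {"a": "toto", "b": 12}         => a single Row
val flat: Row = Row("toto", 12)

// {"a": "toto", "b": {"c": 12}}  => nested structs become nested Rows
val nested: Row = Row("toto", Row(12))
```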
14 | 15 | But for our Stream Processing framework it makes more sense to use Apache Avro, so now let's finish the job ! 16 | 17 | Your mission if you accept it : 18 | * Create the Algebra necessary to project any GData into Apache Avro or Apache Spark's data structure. 19 | * you'll find more instructions in the `src/main/scala/4-spark-avro.scala` 20 | 21 | 22 | Good Hunting. -------------------------------------------------------------------------------- /spark/src/main/scala/DataType.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.types 2 | 3 | 4 | final case class Metadata(map: Map[String, Any]) 5 | object Metadata { 6 | def empty: Metadata = new Metadata(Map.empty[String, Any]) 7 | } 8 | 9 | sealed trait DataType 10 | 11 | 12 | case object BooleanType extends DataType 13 | case object DateType extends DataType 14 | case object DoubleType extends DataType 15 | case object FloatType extends DataType 16 | case object IntegerType extends DataType 17 | case object LongType extends DataType 18 | case object StringType extends DataType 19 | final case class StructField(name: String, dataType: DataType, nullable: Boolean = true, metadata: Metadata = Metadata.empty) 20 | final case class StructType(fields: Array[StructField]) extends DataType { 21 | override def toString: String = s"StructType${fields.map(_.toString).mkString("(", ", ", ")")}" 22 | 23 | override def equals(other: Any): Boolean = other match { 24 | case StructType(otherFields) => (fields zip otherFields).forall{ case (l, r) => l == r} 25 | case _ => false 26 | } 27 | } 28 | final case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType 29 | -------------------------------------------------------------------------------- /src/test/scala/package.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.Rule 4 | import jto.validation.jsonast._ 5 | import matryoshka._, implicits._ 6 | import org.joda.time.LocalDateTime 7 | 8 | import org.joda.time.format.ISODateTimeFormat 9 | import play.api.libs.json._ 10 | import scalaz.{Applicative, Functor} 11 | 12 | package object solutions { 13 | def toJson[D](value: D)(implicit D: Recursive.Aux[D, GData], F: Functor[GData]): JValue = { 14 | val alg: Algebra[GData, JValue] = { 15 | case GStruct(fields) => 16 | JObject(fields) 17 | 18 | case GArray(elems) => 19 | JArray(elems) 20 | 21 | case GBoolean(el) => JBoolean(el) 22 | case GFloat(el) => JNumber(el) 23 | case GInteger(el) => JNumber(el) 24 | case GDate(el) => JString(LocalDateTime.fromDateFields(el).toString(ISODateTimeFormat.dateTime())) 25 | case GLong(el) => JNumber(el) 26 | case GDouble(el) => JNumber(el) 27 | case GString(el) => JString(el) 28 | 29 | } 30 | value.cata(alg) 31 | } 32 | 33 | import SchemaRules.JRule 34 | implicit val ruleApplicativeForScalaz: Applicative[JRule] = new Applicative[JRule] { 35 | override def point[A](a: => A): JRule[A] = Rule.pure(a) 36 | 37 | override def ap[A, B](fa: => JRule[A])(f: => JRule[A => B]): JRule[B] = fa.ap(f) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/1-schema/ParquetSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.apache.spark.sql.types._ 4 | import org.scalacheck._ 5 | import org.scalacheck.Prop._ 6 | import matryoshka._, implicits._, data.Fix, Fix._ 7 | 
import matryoshka.scalacheck.arbitrary._ 8 | import scalaz._, Scalaz._ 9 | 10 | /** 11 | * Verifying that our (co)algebras that convert SchemaF from/to DataType are correct should be trivial using 12 | * property-based testing. 13 | * 14 | * But to do that, we first need to be able to generate arbitrary Fix[SchemaF]. So we ned a way to summon instances of 15 | * Arbitrary[T[SchemaF]] for any fix-point T. We'll again need help from our new friend Delay. 16 | * Now we only need to verify that, given an arbitrary Fix[SchemaF], converting it to a DataType using `schemaFToDataType` 17 | * and then convert that back to Fix[SchemaF] using `dataTypeToSchemaF` should produce the initial Fix[SchemaF]. 18 | */ 19 | object SchemaFToDataTypeAlgebrasSpec extends Properties("Parquet-related algebras") with SchemaFToDataTypeAlgebras { 20 | 21 | import SchemaF._ 22 | 23 | property("invertible") = forAll { (schema: Fix[SchemaF]) => 24 | // We want to convert `schema` to DataType and then back to Fix[SchemaF] using the (co)algebras we've just defined. 25 | val roundtrip: Fix[SchemaF] = schema.cata(schemaFToDataType).ana[Fix[SchemaF]](dataTypeToSchemaF) 26 | (roundtrip == schema) :| s"$roundtrip\n==\n$schema" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/4-spark-avro/SparkConverterSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import lc2018.GData._ 4 | import lc2018.SparkConverter._ 5 | import matryoshka._ 6 | import matryoshka.implicits._ 7 | import matryoshka.data.Fix._ 8 | import matryoshka.data._ 9 | import org.apache.spark.sql.Row 10 | import org.scalacheck.Prop._ 11 | import org.scalacheck._ 12 | import org.scalatest.{FlatSpec, Matchers} 13 | 14 | import scala.collection.immutable.ListMap 15 | 16 | class SparkConverterSpec extends Properties("Spark-rules algebras") with SchemaToAvroAlgebras { 17 | 18 | property("should generate valid and compatible data vs schema") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 19 | case (_, data) => fromGDataToSparkRow(data) != null // should not fail 20 | } 21 | } 22 | 23 | class SparkConverterTest extends FlatSpec with Matchers { 24 | 25 | "Spark conversions" should "work with sample schema and data" in { 26 | val body = """{"a": "toto", "b": 12}""" 27 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 28 | val row = fromGDataToSparkRow(data) 29 | row should be(Row("toto", 12)) 30 | } 31 | 32 | it should "work with nested data as well" in { 33 | val body = """{"a": "toto", "b": { "c": 12 }}""" 34 | val data = Fix[GData]( 35 | GStruct( 36 | ListMap( 37 | "a" -> Fix(GString("toto")), 38 | "b" -> Fix(GStruct(ListMap("c" -> Fix(GInteger(12))))) 39 | ))) 40 | val row = fromGDataToSparkRow(data) 41 | row should be(Row("toto", Row(12))) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/5-patches/ApplyPatchSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.jsonast._ 5 | import org.scalacheck.Properties 6 | import org.scalacheck.Prop._ 7 | 8 | import matryoshka._, implicits._ 9 | import matryoshka.data.Fix, Fix._ 10 | import org.scalatest.{FlatSpec, Matchers} 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | import scalaz.\/- 15 | 16 | class ApplyPatchSpec extends FlatSpec with Matchers with 
PatchAlgebras { 17 | 18 | val patch = JsonPatch(Replace, List(Field("foo"), Field("bar"), End), JNumber(42)) 19 | 20 | val schema: Fix[SchemaF] = StructF( 21 | ListMap( 22 | "foo" -> StructF( 23 | ListMap( 24 | "bar" -> IntegerF[Fix[SchemaF]]().embed, 25 | "baz" -> BooleanF[Fix[SchemaF]]().embed 26 | )).embed, 27 | "qux" -> StringF[Fix[SchemaF]]().embed 28 | )).embed 29 | 30 | val initialData: Fix[GData] = GStruct( 31 | ListMap( 32 | "foo" -> GStruct( 33 | ListMap( 34 | "bar" -> GInteger[Fix[GData]](1).embed, 35 | "baz" -> GBoolean[Fix[GData]](true).embed 36 | )).embed, 37 | "qux" -> GString[Fix[GData]]("hoay!").embed 38 | )).embed 39 | 40 | val expected = GStruct( 41 | ListMap( 42 | "foo" -> GStruct( 43 | ListMap( 44 | "bar" -> GInteger[Fix[GData]](42).embed, 45 | "baz" -> GBoolean[Fix[GData]](true).embed 46 | )).embed, 47 | "qux" -> GString[Fix[GData]]("hoay!").embed 48 | )).embed 49 | "Applying a patch" should "update the relevant fields" in { 50 | applyPatch(schema, patch, initialData) should be(\/-(expected)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/scala/4-spark-avro/AvroConverterSpec.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import lc2018.GData._ 4 | import matryoshka._, implicits._ 5 | import matryoshka.data._, Fix._ 6 | import org.apache.avro.generic.GenericData 7 | import org.scalacheck.Prop._ 8 | import org.scalacheck._ 9 | import org.scalatest.{FlatSpec, Matchers} 10 | import AvroConverter._ 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | class AvroConverterSpec extends Properties("Avro-rules algebras") with SchemaToAvroAlgebras { 15 | 16 | property("should generate valid avro data") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 17 | case (schema, data) => 18 | val result = fromGDataToAvro(schema, data) 19 | val avroSchema = schemaFToAvro(schema) 20 | val genericData = GenericData.get() 21 | 22 | result.isRight :| s"Failed to generate avro data" && 23 | genericData.validate(avroSchema, result.toOption.get) :| "Datum generated is not valid according to Avro Schema" 24 | } 25 | } 26 | 27 | class AvroConverterTest extends FlatSpec with Matchers { 28 | 29 | "Avro conversion" should "work with sample schema and data" in { 30 | val body = """{"a": "toto", "b": 12}""" 31 | val schema = Fix[SchemaF](StructF(ListMap("a" -> Fix(StringF()), "b" -> Fix(IntegerF())))) 32 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 33 | val avro = fromGDataToAvro(schema, data) 34 | 35 | avro.isRight should be(true) 36 | val result = avro.toOption.get 37 | result.toString should be("""{"a": "toto", "b": 12}""") 38 | GenericData.get().validate(schemaFToAvro(schema), result) should be(true) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursion schemes without the barbed wire 2 | 3 | Welcome to INC Inc. We are so happy we just hired a whole team of motivated engineers. Here at INC Inc. (INC is a Neat Company), we are proud proponents of statically-typed functional programming on the JVM — well, basically we use Scala. 4 | 5 | On your first day at work, we have a good news and a bad news. 
The good news is: we have an exciting new mission. Our most important client, AcmeCorp, has tasked us with the construction of its "meta data lake", whatever that means. The bad news is: they want it live by tonight. 6 | 7 | But fear not, our architects have already designed the whole system and it works like a charm (on PowerPoint). All you need to do is follow the specs and write a few Scala lines. 8 | 9 | ## Before we begin 10 | 11 | You'll need to fulfill a few requirements in order to get everything working. You'll need to install 12 | 13 | * Java 8 JDK 14 | * sbt 15 | 16 | Everything else should be pretty much working out of the box. This project has a few external dependencies though, so in order to save everyone some network bandwidth, it would be cool if you managed to clone the repository and issue the `sbt update` command in advance of the workshop. 17 | 18 | ## Structure of the workshop 19 | 20 | This workshop is made of a series of practical exercises, 21 | interleaved with a bunch of useful explanations about specific recursion schemes, patterns and techniques. 22 | Each exercise lives in the main package of `src/main/scala` 23 | and a solution to each exercise is available in the `solutions` package. 24 | 25 | ## TOC 26 | 27 | * 0-PRELUDE 28 | * 1-SCHEMA 29 | * 2-AVRO 30 | * 3-VALIDATION 31 | * 4-SPARK-AVRO 32 | * 5-PATCHES 33 | -------------------------------------------------------------------------------- /2-AVRO.md: -------------------------------------------------------------------------------- 1 | ## Avro schemas 2 | 3 | Avro is a library (that has a Java version) and a data representation format. It is widely used in the data industry as it offers interesting features like schema versioning, automatic data upcast and downcast between schema versions, and things like that. 4 | 5 | Unfortunately, it has not been designed with functional programming and strong typing in mind, let alone recursion schemes... So our job will be a little harder this time. 6 | 7 | You should find relevant hints in the comments in `src/main/scala/2-avro.scala` that'll help you tame that Avro beast. 8 | 9 | 10 | ## Avro schema 101 11 | 12 | ### Inspecting schemas 13 | 14 | * a single type: `Schema` 15 | * a `getType` method that gives you the "kind" of schema: RECORD, ARRAY, INT, STRING, etc. 16 | * depending on the result of `getType`, it is safe to call certain methods: 17 | * in case of RECORD, you can call `getFields()` 18 | * in case of ARRAY, you can call `getElementType()` 19 | 20 | ### Building schemas 21 | 22 | * For simple types, you can do something like: `Schema.create(Schema.Type.INT)` 23 | * Some simple types (like Date) do not have a "natural" representation, but you can piggyback on existing schemas using so-called logical types: 24 | * To represent dates as longs: 25 | `LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))` 26 | * For arrays you can do: `SchemaBuilder.array().items(...)` <- you replace `...` with the right Schema 27 | * For structs (called records in the Avro realm) it's a bit more complicated: 28 | ``` 29 | SchemaBuilder 30 | .record("nameOfMyRecord") 31 | .fields 32 | .name("nameOfTheField").`type`(...).noDefault <- replace ...
with the right Schema, don't forget the noDefault 33 | // add more fields as needed 34 | .endRecord 35 | ``` 36 | -------------------------------------------------------------------------------- /src/test/scala/5-patches/ArbitraryPatch.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.jsonast.Ast 5 | import org.scalacheck.{Arbitrary, Gen} 6 | import matryoshka._, implicits._ 7 | 8 | trait ArbitraryPatch extends GDataInstances with SchemaToAvroAlgebras with DataWithSchemaGenerator { 9 | def patchForData[S, D](schema: S, data: D)(implicit S: Recursive.Aux[S, SchemaF], 10 | D: Birecursive.Aux[D, GData]): Gen[JsonPatch] = 11 | for { 12 | depth <- Gen.choose(1, 10) 13 | (path, sch) <- pathInData(depth, schema, data) 14 | patchValue <- sch cata schemaToDataGen 15 | } yield JsonPatch(Replace, path, toJson(patchValue)) 16 | 17 | def pathInData[S, D](depth: Int, schema: S, data: D)(implicit S: Recursive.Aux[S, SchemaF], 18 | D: Recursive.Aux[D, GData]): Gen[(List[Position], S)] = 19 | if (depth == 0) Gen.const((End :: Nil, schema)) 20 | else { 21 | (schema.project, data.project) match { 22 | case (StructF(fieldsS), GStruct(fieldsD)) if fieldsD.nonEmpty => 23 | for { 24 | head <- Gen.oneOf(fieldsS.keys.toSeq) 25 | sch = fieldsS(head) 26 | (tail, sub) <- pathInData(depth - 1, sch, fieldsD(head)) 27 | } yield (Field(head) :: tail, sub) 28 | case (ArrayF(elem), GArray(elems)) if elems.nonEmpty => 29 | for { 30 | head <- Gen.choose(0, elems.size - 1) 31 | (tail, sub) <- pathInData(depth - 1, elem, elems(head)) 32 | } yield (Index(head) :: tail, sub) 33 | case _ => (End :: Nil, schema) 34 | } 35 | } 36 | 37 | implicit def dataAndTwoPatches[S, D](implicit S: Birecursive.Aux[S, SchemaF], 38 | D: Birecursive.Aux[D, GData]): Arbitrary[(S, D, JsonPatch, JsonPatch)] = 39 | Arbitrary { 40 | for { 41 | (s, data) <- genSchemaAndData[S, D] 42 | patch1 <- patchForData(s, data) 43 | patch2 <- patchForData(s, data) 44 | } yield (s, data, patch1, patch2) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /3-VALIDATION.md: -------------------------------------------------------------------------------- 1 | ## It's a tough world out there 2 | 3 | We now have a Schema both in its Pattern-Functor form and usable as an Apache Avro Schema 4 | but it's not enough to have a working DataLake. 5 | 6 | It might be obvious, but we need Data ! 7 | 8 | We managed to negotiate for our MVP with the Product Owner that all incoming data will be in JSON. 9 | 10 | So now we need a way to make it work and ingest any kind of data into our Lake. 11 | But for it not to become, an absurd pile of junk data : a DataSwamp 12 | we can't trust the outside world with the data we'll receive. 13 | 14 | We need to design a system that will validate incoming data according to the expected Schema 15 | and output meaningful errors to our counterparts. 16 | 17 | Our main objective for this part III of our workshop will be to : 18 | 19 | > Leverage the power of JTO Validation (https://github.com/jto/validation) 20 | > and Matryoshka to generate `Rules` that will validate any incoming Data 21 | 22 | Being professionals we need this framework to be properly tested, of course a small sample unit test 23 | will be of great help, but can't possibly be enough to handle the variety of Schema and Data we will be handling. 
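To give you an idea of the kind of `Rule` we're talking about, one "layer" of that machinery, say for a single `StringF` leaf, could look roughly like this. It's only a sketch: it assumes the imports of `src/main/scala/3-validation.scala` plus `matryoshka.data.Fix`, and you should double-check the exact JTO helpers you end up using:

```scala
import SchemaRules.JRule

val stringRule: JRule[Fix[GData]] =
  Rule.fromMapping[JValue, Fix[GData]] {
    case JString(s) => Valid(Fix(GString(s)))
    case _          => Invalid(Seq(ValidationError("error.expected.jsstring")))
  }
```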
24 | 25 | So another of your objective will be to generate arbitrary Schema and Data with ScalaCheck and 26 | test that the validation `Rules` that you'll create *really do in fact* validate your data 27 | The funny thing being that your generated *random* data should of course be compatible with your generated *random* schema. 28 | 29 | The tests that you'll need are already coded in `src/test/scala/3-validation/SchemaRules.scala` 30 | But it relies on a Schema *and* Data Generator in `src/main/scala/3-validation.scala` that you'll need to code. 31 | 32 | You'll be provided with the Pattern-Functor needed to represent data (i.e. `GData`), 33 | so all you need to complete this part is to code in `src/main/scala/3-validation.scala` : 34 | * The `Rules` generation method `SchemaRules.fromSchemaToRules(schema: Fix[SchemaF]): JRule[Fix[GData]]` 35 | * The Schema and Data generator `DataWithSchemaGenerator.genSchemaAndData: Gen[(Fix[SchemaF], Fix[GData])]` 36 | 37 | More specific constraints are included in the comments of the source code. 38 | 39 | Good Hunting. -------------------------------------------------------------------------------- /src/test/scala/3-validation/SchemaRules.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation.jsonast.Ast 4 | import lc2018.GData._ 5 | import lc2018.SchemaRules._ 6 | import matryoshka._ 7 | import matryoshka.data._, Fix._ 8 | import matryoshka.implicits._ 9 | import org.joda.time.LocalDateTime 10 | import org.joda.time.format.ISODateTimeFormat 11 | import org.scalacheck.Prop._ 12 | import org.scalacheck._ 13 | import org.scalatest.{FlatSpec, Matchers} 14 | import play.api.libs.json._ 15 | 16 | import scala.collection.immutable.ListMap 17 | 18 | class SchemaRulesSpec extends Properties("Schema-rules algebras") with SchemaToAvroAlgebras { 19 | 20 | property("should generate valid and compatible data vs schema") = forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { 21 | case (schema, data) => 22 | val rules = fromSchemaToRules(schema) 23 | val jsonPayload = toJson(data) 24 | val result = Ast.from.validate(jsonPayload).map(rules.validate) 25 | result.isValid :| s"Rules did not validate arbitrary data and schema : $schema and $data" 26 | } 27 | 28 | def toJson(value: Fix[GData]): JsValue = { 29 | val alg: Algebra[GData, JsValue] = { 30 | case GStruct(fields) => 31 | JsObject(fields) 32 | 33 | case GArray(elems) => 34 | JsArray(elems) 35 | 36 | case GBoolean(el) => JsBoolean(el) 37 | case GFloat(el) => JsNumber(BigDecimal.decimal(el)) 38 | case GInteger(el) => JsNumber(el) 39 | case GDate(el) => JsString(LocalDateTime.fromDateFields(el).toString(ISODateTimeFormat.basicDateTime())) 40 | case GLong(el) => JsNumber(el) 41 | case GDouble(el) => JsNumber(el) 42 | case GString(el) => JsString(el) 43 | 44 | } 45 | value.cata(alg) 46 | } 47 | } 48 | 49 | class SchemaRulesTest extends FlatSpec with Matchers { 50 | 51 | "Rule generation" should "work with sample schema and data" in { 52 | val body = """{"a": "toto", "b": 12}""" 53 | val schema = Fix[SchemaF](StructF(ListMap("a" -> Fix(StringF()), "b" -> Fix(IntegerF())))) 54 | val data = Fix[GData](GStruct(ListMap("a" -> Fix(GString("toto")), "b" -> Fix(GInteger(12))))) 55 | val rules = fromSchemaToRules(schema) 56 | val result = Ast.from.validate(Json.parse(body)).map(rules.validate) 57 | result.isValid should be(true) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /5-PATCHES.md: 
-------------------------------------------------------------------------------- 1 | ## Applying JSON patches to GenericData 2 | 3 | So far we assumed that each of our data sources has a known schema. Unfortunately, this is not the case for all of our client's sources. For each of their business entities, the client maintains a Kafka topic where they log each modification of the entities, formatted as a JSON patch (as specified by [RFC 6902](https://tools.ietf.org/html/rfc6902)). 4 | 5 | These patches have the following structure: 6 | 7 | ```json 8 | { 9 | "op": "add", 10 | "path": "/profile/phoneNumbers", 11 | "value": { 12 | "type": "landline", 13 | "number": "+33123456789" 14 | } 15 | } 16 | ``` 17 | 18 | They cannot provide us with a schema for such patches, because the *schema* of the `value` field depends on the *value* of the `path` field. Nevertheless, we want to be able to validate that incoming patches are correctly structured with respect to the target entity's schema. Moreover, since we also maintain a copy of the corresponding entities, we want to apply the patches to these entities. 19 | 20 | In other words, we want to write a function that, given a JSON patch, the schema of the target entity (as a `T[SchemaF]`) and the current state of the target entity (as a `T[GData]`): 21 | 1. verifies that the patch's `path` exists in the entity's schema (it points to a subschema) and that the patch's `value` complies with this subschema (producing a representation of `value` as a `T[GData]`), 22 | 2. uses that representation to perform the patch's operation on the current state of the entity. We'll only implement the `replace` operation. 23 | 24 | ### Before you start 25 | 26 | This last assignment is by far the most difficult, but (we hope) it's also the most interesting. After having mastered the various "tactics" (pattern-functors, algebras, etc.), the next hurdle on the path toward using recursion schemes in production is being able to find a "strategy" to combine them to solve the problem at hand. It is often difficult at first to find the pattern-functor that best matches the structure of the problem, the right scheme to use on it, or the carrier for the needed (co)algebras. 27 | 28 | From an educational point of view, it would be better if you tried to build your own solution from scratch and come up with your own strategy. By now you should know all the required tactics to solve this problem. We've encoded a simple representation for JSON patches in `src/main/scala/JsonPatch.scala`; you might also be interested in the definition of `matryoshka.patterns.ListF`. 29 | 30 | Nevertheless, if you find yourself stuck or feel you might lack time to finish, we've laid out a solution in `src/main/scala/solutions/5-patches.scala`. But don't cheat and jump straight to the solution before you've tried to come up with your own.
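For reference, the patch shown at the top of this page corresponds to the following value of the encoding found in `src/main/scala/JsonPatch.scala` (it has the same shape as the patches built in `ApplyPatchSpec.scala`):

```scala
import jto.validation.jsonast._

val addPhoneNumber: JsonPatch = JsonPatch(
  op    = Add,
  path  = List(Field("profile"), Field("phoneNumbers"), End),
  value = JObject(Map(
    "type"   -> JString("landline"),
    "number" -> JString("+33123456789")
  ))
)
```

We'll only implement `replace`, but the encoding covers `add` and `remove` as well.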
31 | 32 | -------------------------------------------------------------------------------- /src/main/scala/3-validation.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import jto.validation._ 4 | import jto.validation.jsonast._ 5 | import matryoshka._ 6 | import matryoshka.data._ 7 | import org.scalacheck.Arbitrary 8 | import scalaz.Scalaz._ 9 | import scalaz._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import scala.language.higherKinds 13 | 14 | /** 15 | * Now that we have a Schema we will need to validate incoming data (JSON) 16 | * and output "validated" data or "errors" with what went wrong for the sources 17 | * to be able to fix their exports. 18 | * 19 | * For that we'll use the JTO Validation library but first we need to define what a "Data" is 20 | */ 21 | sealed trait GData[A] 22 | 23 | final case class GStruct[A](fields: ListMap[String, A]) extends GData[A] 24 | 25 | final case class GArray[A](element: Seq[A]) extends GData[A] 26 | 27 | final case class GBoolean[A](value: Boolean) extends GData[A] 28 | 29 | final case class GDate[A](value: java.util.Date) extends GData[A] 30 | 31 | final case class GDouble[A](value: Double) extends GData[A] 32 | 33 | final case class GFloat[A](value: Float) extends GData[A] 34 | 35 | final case class GInteger[A](value: Int) extends GData[A] 36 | 37 | final case class GLong[A](value: Long) extends GData[A] 38 | 39 | final case class GString[A](value: String) extends GData[A] 40 | 41 | object GData extends GDataInstances with DataWithSchemaGenerator 42 | 43 | /** 44 | * This is where you'll be working your magic. 45 | * This code will need to go through every part of the Schema tree 46 | * and create a `Rule` for each value, field of struct or array. 47 | */ 48 | object SchemaRules { 49 | 50 | /** 51 | * Here we only define a simple type alias to simplify the code later on. 52 | */ 53 | type JRule[A] = Rule[JValue, A] 54 | 55 | /** 56 | * One important thing is that going through a struct 57 | * means going through its fields one-by-one and generate `Rules` 58 | * that will be translated to a `Rule` for the whole struct. 59 | * 60 | * The best way will be to `traverse` the fields (there is an Applicative instance for JRule) 61 | */ 62 | def fromSchemaToRules[T](schema: T)(implicit T: Recursive.Aux[T, SchemaF]): JRule[Fix[GData]] = TODO 63 | 64 | } 65 | 66 | /** 67 | * We need to test that validation - of course specific unit tests can be done 68 | * but we're quite paranoid so let's "generate" abitrary schemas using ScalaCheck 69 | * 70 | * But then again - from a Schema we'll be able to generate Rules 71 | * But to validate those rules we'd need data. 72 | * So let's generate Data as well : 73 | * Data that will, of course, need to be compatible with the Schema itself. 
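 *
 * One possible shape (just a sketch, not the provided solution): generate an arbitrary schema first,
 * then fold it with an Algebra[SchemaF, Gen[D]] so that the data generator is entirely driven by the
 * schema. The leaf cases of such an algebra could look like:
 * {{{
 *   case BooleanF() => Gen.oneOf(true, false).map(b => GBoolean[D](b).embed)
 *   case StringF()  => Gen.alphaStr.map(s => GString[D](s).embed)
 * }}}
 * while the StructF and ArrayF cases combine the generators of their children.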
74 | */ 75 | trait DataWithSchemaGenerator { 76 | 77 | import org.scalacheck.Gen 78 | 79 | import scala.collection.JavaConverters._ 80 | 81 | // Goal : first generate a schema and then recurse on it to generate the appropriate data 82 | // Bonus : handle number of fields 83 | // Bonus : handle max depth to "finish somewhere" 84 | // And don't forget the master defining what to generate is the schema 85 | def genSchemaAndData[S, D](implicit S: Birecursive.Aux[S, SchemaF], D: Corecursive.Aux[D, GData]): Gen[(S, D)] = TODO 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/solutions/5-patches.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import jto.validation.ValidationError 5 | import jto.validation.jsonast.JValue 6 | import matryoshka._, implicits._ 7 | import matryoshka.patterns.{ListF, NilF, ConsF} 8 | import scalaz._, Scalaz._ 9 | 10 | sealed trait Step[S, D] 11 | final case class InnerStep[S, D](position: Position, schema: S, data: D) extends Step[S, D] 12 | final case class LastStep[S, D](data: D) extends Step[S, D] 13 | 14 | object Step { 15 | def inner[S, D](pos: Position, schema: S, data: D): Step[S, D] = InnerStep(pos, schema, data) 16 | def last[S, D](data: D): Step[S, D] = LastStep[S, D](data) 17 | } 18 | 19 | sealed trait EarlyResult extends Product with Serializable 20 | final case class InvalidPath(path: List[Position]) extends EarlyResult 21 | final case class InvalidPatch(value: JValue, errors: Seq[(jto.validation.Path, Seq[ValidationError])]) 22 | extends EarlyResult 23 | 24 | trait PatchAlgebras { 25 | 26 | type Traversal[Schema, Data] = (List[Position], Schema, Data) 27 | 28 | type ShortCircuitable[A] = EarlyResult \/ A 29 | 30 | def lookupS[S](position: Position, schema: S)(implicit S: Recursive.Aux[S, SchemaF]): Option[S] = 31 | (position, schema.project) match { 32 | case (Field(name), StructF(fields)) => fields.get(name) 33 | case (Index(idx), ArrayF(elem)) => Some(elem) 34 | case _ => None 35 | } 36 | 37 | def lookupD[D](position: Position, data: D)(implicit D: Recursive.Aux[D, GData]): Option[D] = 38 | (position, data.project) match { 39 | case (Field(name), GStruct(fields)) => fields.get(name) 40 | case (Index(idx), GArray(elements)) => if (idx >= 0 && idx < elements.size) elements(idx).some else None 41 | case _ => None 42 | } 43 | 44 | def validatePatch[S, D](patchValue: JValue)( 45 | implicit S: Recursive.Aux[S, SchemaF], 46 | D: Birecursive.Aux[D, GData]): CoalgebraM[ShortCircuitable, ListF[Step[S, D], ?], Traversal[S, D]] = { 47 | case (Nil, _, _) => NilF().right 48 | case (End :: Nil, schema, data) => 49 | val validator = SchemaRules.fromSchemaToRules(schema) 50 | \/.fromEither(validator.validate(patchValue).toEither) 51 | .bimap( 52 | InvalidPatch(patchValue, _), { subData => 53 | ConsF(Step.last[S, D](subData), (List.empty[Position], schema, data)) 54 | } 55 | ) 56 | 57 | case (path, schema, data) => 58 | (lookupS(path.head, schema) |@| lookupD(path.head, data)) { (subSchema, subData) => 59 | ConsF(Step.inner(path.head, schema, data), (path.tail, subSchema, subData)).right 60 | }.getOrElse(InvalidPath(path).left) 61 | } 62 | 63 | def updateValue[S, D](implicit S: Recursive.Aux[S, SchemaF], 64 | D: Birecursive.Aux[D, GData]): AlgebraM[ShortCircuitable, ListF[Step[S, D], ?], D] = { 65 | case NilF() => GBoolean[D](true).embed.right // hugly hack 66 | case ConsF(LastStep(data), _) => data.right 67 | case 
ConsF(InnerStep(position, schema, current), newData) => 68 | doUpdate(position, current, newData) 69 | case _ => InvalidPath(Nil).left 70 | } 71 | 72 | def doUpdate[D](position: Position, current: D, newData: D)( 73 | implicit D: Birecursive.Aux[D, GData]): ShortCircuitable[D] = 74 | (position, current.project) match { 75 | case (Field(n), GStruct(fields)) => 76 | GStruct(fields.map { 77 | case (name, field) => 78 | if (name == n) name -> newData else name -> field 79 | }).embed.right 80 | case (Index(i), GArray(elements)) => 81 | GArray(elements.take(i) ++ Seq(newData) ++ elements.drop(i + 1)).embed.right 82 | case _ => InvalidPath(position :: Nil).left 83 | } 84 | 85 | def applyPatch[S, D](schema: S, patch: JsonPatch, current: D)(implicit S: Recursive.Aux[S, SchemaF], 86 | D: Birecursive.Aux[D, GData]): EarlyResult \/ D = 87 | (patch.path, schema, current) 88 | .hyloM[ShortCircuitable, ListF[Step[S, D], ?], D](updateValue, validatePatch(patch.value)) 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/0-prelude.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import matryoshka._ 4 | import matryoshka.data._ 5 | import matryoshka.implicits._ 6 | import scalaz._ 7 | import Scalaz._ 8 | 9 | /** 10 | * Let's begin with what's probably the simplest possible recursive structure: natural numbers 11 | * 12 | * Natural numbers can be defined recursively: 13 | * A number is either 14 | * - zero, noted `Z` 15 | * - the successor of a number, noted `S(n) where n is the notation of some number 16 | * 17 | * This notation is often referred to as the Peano notation. 18 | */ 19 | object PeanoNumbers { 20 | 21 | /** 22 | * We want to encode Peano numbers as a recursive type. 23 | * This encoding will be a type constructor, out so-called "pattern-functor" 24 | * 25 | * Hint: there is a type in the standard library that has exactly the structure we want. 26 | */ 27 | type PeanoNumberF[A] = TODO 28 | 29 | /** 30 | * The problem with the PeanonumberF encoding is that now, different numbers 31 | * will have different types. 32 | * 33 | * We need a fix-point of PeanoNumberF to build a type that can represent all numbers. 34 | */ 35 | type PeanoNumber = TODO 36 | 37 | /** 38 | * Now let's write our very first Algebra! Yay! 39 | * 40 | * We want to transform our Peano representation to Int. It's as simple as counting 41 | * the "layers" of "successor". 42 | */ 43 | def countLayers: Algebra[PeanoNumberF, Int] = TODO 44 | 45 | /** 46 | * We now have all the ingredients needed to use our first recursion scheme. 47 | * 48 | * Hint: this will use the algebra defined above to *destroy* our recursive structure. 49 | */ 50 | def toInt(peano: PeanoNumber): Int = TODO 51 | 52 | /** 53 | * Now we just need a value to test our functions 54 | */ 55 | val three: PeanoNumber = TODO 56 | 57 | assert(toInt(three) == 3) 58 | } 59 | 60 | /** 61 | * We now move on to a more interesting recursive structure: the binary tree. 62 | */ 63 | object BinaryTrees { 64 | 65 | sealed trait Tree 66 | final case class Branch(label: Int, left: Tree, right: Tree) extends Tree 67 | final case class Leaf(label: Int) extends Tree 68 | final case class Empty() extends Tree 69 | 70 | /** 71 | * So the first thing to do is to "translate" our Tree to a pattern-functor. 72 | * This is done by adding a type parameter and replace each recursive occurrences 73 | * of Tree by this type parameter in the ADT. 
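 *
 * For instance (this is exactly what the solutions file does), the Branch case becomes:
 * {{{
 *   final case class BranchF[A](label: Int, left: A, right: A) extends TreeF[A]
 * }}}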
74 | */ 75 | sealed trait TreeF[A] 76 | // TODO 77 | 78 | /** 79 | * Of course, we need to have an instance of Functor[TreeF] for it to be a real pattern-functor. 80 | */ 81 | implicit val treeFFunctor: Functor[TreeF] = TODO 82 | 83 | /** 84 | * It's a good idea to have a pair of (co)algebras that go from Tree to TreeF (and vice versa). 85 | */ 86 | def treeAlg: Algebra[TreeF, Tree] = TODO 87 | def treeCoalg: Coalgebra[TreeF, Tree] = TODO 88 | 89 | /** 90 | * These two (co)algebras make it easy to provide a Birecursive instance for Tree/TreeF. 91 | * This allows to treat Tree as if it were a TreeF, and thus enables to use schemes directly 92 | * on a Tree (rather than having to wrap it in a fixpoint). 93 | */ 94 | implicit val treeBirecursive: Birecursive.Aux[Tree, TreeF] = Birecursive.fromAlgebraIso(treeAlg, treeCoalg) 95 | 96 | import Recursive.ops._ 97 | 98 | /** 99 | * A function TreeF[List[Int]] => List[Int] 100 | * 101 | * The produced list contains the labels of all the nodes in the tree 102 | * as enumerated by a depth-first, left-to-right traversal. 103 | */ 104 | def toList: Algebra[TreeF, List[Int]] = TODO 105 | 106 | val testTree: Recursive.AllOps[Tree, TreeF] = Branch(12, Branch(10, Leaf(1), Empty()), Leaf(15)) 107 | 108 | assert(testTree.cata(toList) == List(1, 10, 12, 15)) 109 | 110 | /** 111 | * A function List[Int] => TreeF[List[Int]] 112 | * 113 | * This function MUST produce a "sort tree", that is, a tree where each 114 | * node has a label that is greater than all the labels in its left subtree 115 | * and lesser than all the labels in its right subtree. 116 | */ 117 | def fromList: Coalgebra[TreeF, List[Int]] = TODO 118 | 119 | /** 120 | * I wonder what this mystery function does… 121 | */ 122 | def mystery(input: List[Int]): List[Int] = input.hylo(toList, fromList) 123 | 124 | } 125 | -------------------------------------------------------------------------------- /src/main/scala/1-schema.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.scalacheck.{Arbitrary, Gen} 4 | import scala.collection.immutable.ListMap 5 | import scalaz._, Scalaz._ 6 | import matryoshka._, implicits._ 7 | 8 | /** 9 | * Without further ado, let's define our main pattern-functor for the remaining of the session. 10 | */ 11 | sealed trait SchemaF[A] 12 | 13 | // we'll use a ListMap to keep the ordering of the fields 14 | final case class StructF[A](fields: ListMap[String, A]) extends SchemaF[A] 15 | final case class ArrayF[A](element: A) extends SchemaF[A] 16 | final case class BooleanF[A]() extends SchemaF[A] 17 | final case class DateF[A]() extends SchemaF[A] 18 | final case class DoubleF[A]() extends SchemaF[A] 19 | final case class FloatF[A]() extends SchemaF[A] 20 | final case class IntegerF[A]() extends SchemaF[A] 21 | final case class LongF[A]() extends SchemaF[A] 22 | final case class StringF[A]() extends SchemaF[A] 23 | 24 | object SchemaF extends SchemaFToDataTypeAlgebras with SchemaFArbitrary { 25 | 26 | /** 27 | * As usual, we need to define a Functor instance for our pattern. 28 | */ 29 | implicit val schemaFScalazFunctor: Functor[SchemaF] = TODO 30 | 31 | /** 32 | * It might be usefull to have a nice string representation of our schemas. 33 | * 34 | * Let say that we want a representation where: 35 | * - simple types like `BooleanF()` or `DateF()` would be represented as `boolean` and `date` respectively. 36 | * - arrays like `ArrayF(IntegerF())` would be represented as `[ integer ]`. 
37 | * - structs like `StructF(ListMap("foo" -> FloatF(), "bar" -> LongF())` would be represented as 38 | * `{ foo: float, bar: long }` 39 | * 40 | * Because of the recursive nature of SchemaF, we cannot eagerly write a Show instance for SchemaF. 41 | * Fortunately matryoshka defines the Delay typeclass that is useful in such cases. It allows to "break 42 | * the infinite loop" by delaying the instantiation of Show[SchemaF[A]]. 43 | * 44 | * matryoshka.implicits contains implicit functions that, given that Delay[Show, SchemaF] instance, 45 | * will provide a Show[T[SchemaF]] for any fix-point T. 46 | * 47 | */ 48 | implicit val schemaFDelayShow: Delay[Show, SchemaF] = new Delay[Show, SchemaF] { 49 | def apply[A](showA: Show[A]): Show[SchemaF[A]] = new Show[SchemaF[A]] { 50 | override def show(schema: SchemaF[A]): Cord = TODO 51 | } 52 | } 53 | 54 | } 55 | 56 | /** 57 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 58 | * our new and shiny SchemaF (and vice versa). 59 | * 60 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 61 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 62 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 63 | * SchemaF/DataType. 64 | * 65 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 66 | * the definition of the DataType trait and its subclasses in the current project under 67 | * `spark/src/main/scala/DataType.scala`. 68 | */ 69 | trait SchemaFToDataTypeAlgebras { 70 | 71 | import org.apache.spark.sql.types._ 72 | 73 | /** 74 | * As usual, simply a function from SchemaF[DataType] to DataType 75 | */ 76 | def schemaFToDataType: Algebra[SchemaF, DataType] = TODO 77 | 78 | /** 79 | * And the other way around, a function from DataType to SchemaF[DataType] 80 | */ 81 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = TODO 82 | 83 | /** 84 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 85 | * 86 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 87 | * For example, suppose that we have: 88 | * 89 | * {{{ 90 | * val parquet: DataType = ??? 91 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 92 | * }}} 93 | * 94 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 95 | * 96 | * {{{ 97 | * parquet.cata(toAvro) 98 | * }}} 99 | * 100 | * Instead of 101 | * 102 | * {{{ 103 | * parquet.hylo(dataTypeToSchemaf, toAvro) 104 | * }}} 105 | * 106 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 107 | */ 108 | implicit def dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 109 | Birecursive.fromAlgebraIso(schemaFToDataType, dataTypeToSchemaF) 110 | } 111 | 112 | /** 113 | * Everything looks nice, but don't you feel we are missing something? 114 | * 115 | * I mean, think about it for a minute and meet me 20 lines bellow. 116 | * 117 | * 118 | * 119 | * 120 | * 121 | * 122 | * 123 | * 124 | * 125 | * 126 | * 127 | * 128 | * 129 | * 130 | * 131 | * 132 | * 133 | * 134 | * 135 | * Did you guess? 136 | * 137 | * 138 | * 139 | * You're right of course! We still have to write tests! 140 | * 141 | * Let's meet again in `src/test/scala/1-schema/ParquetSpec.scala`. 
142 | */ 143 | trait SchemaFArbitrary { 144 | 145 | implicit def schemaFDelayArbitrary: Delay[Arbitrary, SchemaF] = new Delay[Arbitrary, SchemaF] { 146 | 147 | def apply[A](A: Arbitrary[A]): Arbitrary[SchemaF[A]] = TODO 148 | 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/scala/solutions/0-prelude.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import matryoshka._ 4 | import matryoshka.data._ 5 | import matryoshka.implicits._ 6 | import scalaz._ 7 | import Scalaz._ 8 | 9 | /** 10 | * Let's begin with what's probably the simplest possible recursive structure: natural numbers 11 | * 12 | * Natural numbers can be defined recursively: 13 | * A number is either 14 | * - zero, noted `Z` 15 | * - the successor of a number, noted `S(n) where n is the notation of some number 16 | * 17 | * This notation is often referred to as the Peano notation. 18 | */ 19 | object PeanoNumbers { 20 | 21 | /** 22 | * We want to encode Peano numbers as a recursive type. 23 | * This encoding will be a type constructor, out so-called "pattern-functor" 24 | * 25 | * Hint: there is a type in the standard library that has exactly the structure we want. 26 | */ 27 | type PeanoNumberF[A] = Option[A] 28 | 29 | /** 30 | * The problem with the PeanonumberF encoding is that now, different numbers 31 | * will have different types. 32 | * 33 | * We need a fix-point of PeanoNumberF to build a type that can represent all numbers. 34 | */ 35 | type PeanoNumber = Fix[PeanoNumberF] 36 | 37 | /** 38 | * Now let's write our very first Algebra! Yay! 39 | * 40 | * We want to transform our Peano representation to Int. It's as simple as counting 41 | * the "layers" of "successor". 42 | */ 43 | def countLayers: Algebra[PeanoNumberF, Int] = _.fold(0)(_ + 1) 44 | 45 | /** 46 | * We now have all the ingredients needed to use our first recursion scheme. 47 | * 48 | * Hint: this will use the algebra defined above to *destroy* our recursive structure. 49 | */ 50 | def toInt(peano: PeanoNumber): Int = peano cata countLayers 51 | 52 | /** 53 | * Now we just need a value to test our functions 54 | */ 55 | val three: PeanoNumber = Fix(Option(Fix(Option(Fix(Option(Fix(Option.empty[PeanoNumber]))))))) 56 | 57 | assert(toInt(three) == 3) 58 | } 59 | 60 | /** 61 | * We now move one to a more interesting recursive structure: the binary tree. 62 | */ 63 | object BinaryTrees { 64 | 65 | sealed trait Tree 66 | final case class Branch(label: Int, left: Tree, right: Tree) extends Tree 67 | final case class Leaf(label: Int) extends Tree 68 | final case class Empty() extends Tree 69 | 70 | /** 71 | * So the first thing to do is to "translate" our Tree to a pattern-functor. 72 | * This is done by adding a type parameter and replace each recursive occurrences 73 | * of Tree by this type parameter in the ADT. 74 | */ 75 | sealed trait TreeF[A] 76 | final case class BranchF[A](label: Int, left: A, right: A) extends TreeF[A] 77 | final case class LeafF[A](label: Int) extends TreeF[A] 78 | final case class EmptyF[A]() extends TreeF[A] 79 | 80 | /** 81 | * Of course, we need to have an instance of Functor[TreeF] for it to be a real pattern-functor. 
82 | */ 83 | implicit val treeFFunctor: Functor[TreeF] = new Functor[TreeF] { 84 | def map[A, B](fa: TreeF[A])(f: A => B): TreeF[B] = fa match { 85 | case BranchF(label, l, r) => BranchF(label, f(l), f(r)) 86 | case LeafF(label) => LeafF(label) 87 | case EmptyF() => EmptyF() 88 | } 89 | } 90 | 91 | /** 92 | * It's a good idea to have a pair of (co)algebras that go from Tree to TreeF (and vice versa). 93 | */ 94 | def treeAlg: Algebra[TreeF, Tree] = { 95 | case BranchF(label, l, r) => Branch(label, l, r) 96 | case LeafF(label) => Leaf(label) 97 | case EmptyF() => Empty() 98 | } 99 | def treeCoalg: Coalgebra[TreeF, Tree] = { 100 | case Branch(label, l, r) => BranchF(label, l, r) 101 | case Leaf(label) => LeafF(label) 102 | case Empty() => EmptyF() 103 | } 104 | 105 | /** 106 | * These two (co)algebras make it easy to provide a Birecursive instance for Tree/TreeF. 107 | * This allows to treat Tree as if it were a TreeF, and thus enables to use schemes directly 108 | * on a Tree (rather than having to wrap it in a fixpoint). 109 | */ 110 | implicit val treeBirecursive: Birecursive.Aux[Tree, TreeF] = Birecursive.fromAlgebraIso(treeAlg, treeCoalg) 111 | 112 | import Recursive.ops._ 113 | 114 | /** 115 | * A function TreeF[List[Int]] => List[Int] 116 | * 117 | * The produced list contains the labels of all the nodes in the tree 118 | * as enumerated by a depth-first, left-to-right traversal. 119 | */ 120 | def toList: Algebra[TreeF, List[Int]] = { 121 | case BranchF(label, l, r) => l ++ List(label) ++ r 122 | case LeafF(label) => List(label) 123 | case EmptyF() => Nil 124 | } 125 | 126 | val testTree: Recursive.AllOps[Tree, TreeF] = Branch(12, Branch(10, Leaf(1), Empty()), Leaf(15)) 127 | 128 | assert(testTree.cata(toList) == List(1, 10, 12, 15)) 129 | 130 | /** 131 | * A function List[Int] => TreeF[List[Int]] 132 | * 133 | * This function MUST produce a "sort tree", that is, a tree where each 134 | * node has a label that is greater than all the labels in its left subtree 135 | * and lesser than all the labels in its right subtree. 136 | */ 137 | def fromList: Coalgebra[TreeF, List[Int]] = { 138 | case Nil => EmptyF() 139 | case head :: Nil => LeafF(head) 140 | case head :: tail => 141 | val (lesser, greater) = tail.partition(_ < head) 142 | BranchF(head, lesser, greater) 143 | } 144 | 145 | /** 146 | * I wonder what this mystery function does… 147 | */ 148 | def mystery(input: List[Int]): List[Int] = input.hylo(toList, fromList) 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/main/scala/2-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import org.apache.avro.{LogicalTypes, _} 4 | import matryoshka._, implicits._, patterns.EnvT 5 | import scala.collection.immutable.ListMap 6 | import scalaz._, Scalaz._ 7 | 8 | import scala.language.higherKinds 9 | import scala.collection.JavaConverters._ 10 | 11 | /** 12 | * There is a problem that makes writing SchemaF <-> Avro (co)algebras more difficult. 13 | * 14 | * As a matter of fact Avro mandates that, when building a Schema, all records (the Avro 15 | * equivalent to our StructF) are registered using a unique name. 16 | * 17 | * This is problematic to our algebra-based method because with the algebras we've seen so 18 | * far we only care about one "layer" at a time, so there is no way to know the names we've 19 | * already used for ther records we've registered so far. 
20 | * 21 | * Fortunately, we have at least two solutions to that problem. But before going any further, 22 | * maybe you can take a few minutes to try and imagine how we can solve that problem in general, 23 | * even if you don't know how to implement your solution using recursion-schemes yet. 24 | */ 25 | trait SchemaToAvroAlgebras extends Labelling with UsingARegistry with AvroCoalgebra {} 26 | 27 | /** 28 | * The first solution comes from the observation that our schemas are in fact trees. And trees have 29 | * this nice property that each node have a unique path that goes from the root to it. If we can use 30 | * that unique path as the names of our records, we're good to go. So this solution boils down to 31 | * labelling each "node" of a schema with its path, and then use that path to form the names we 32 | * use to register our records. 33 | */ 34 | trait Labelling { 35 | 36 | /** 37 | * So lets define out Path as being simply a list of strings. These strings will be the field names 38 | * we need to traverse from the root to get to a specific element of our schema. 39 | */ 40 | type Path = List[String] 41 | 42 | /** 43 | * Here is the "special trick" of the current solution. 44 | * 45 | * EnvT is a kind of "glorified pair". Given a label type E and a (pattern)-functor F, it allows us 46 | * to label each "node" of a T[F] with a value of type E while retaining the original structure. In 47 | * other words, if F is a functor, then EnvT[E, F, ?] is a functor as well. 48 | */ 49 | type Labelled[A] = EnvT[Path, SchemaF, A] 50 | 51 | /** 52 | * If we are to label each "node" of a schema with its own path, we obviously need to go from the root 53 | * down to the leaves, so we definitely want to write a coalgebra. 54 | * This one might look a bit scarry though, but fear not, it's not as complcated as it looks. Lets just 55 | * follow the types together. 56 | * 57 | * A Coalgebra[F, A] is just a function A => F[A]. So the coalgebra bellow is just a function 58 | * (Path, T[SchemaF]) => Labelled[(Path, T[SchemaF]) 59 | * Expanding the Labelled alias it becomes 60 | * (Path, T[SchemaF]) => EnvT[Path, SchemaF, (Path, T[SchemaF])] 61 | * 62 | * Ok, maybe it still looks a bit scarry... 63 | * 64 | * Lets try to put it differently. Assume you will be given a "seed" consisting of a whole schema and an 65 | * initial path (that will start empty). Your job is to use that to produce an EnvT that will contain 66 | * the path of the node you just saw (the "root" of the schema that was in the seed), and the node itself 67 | * but modified such that its "content" is not just a "smaller schema" as it was initially, but a new "seed" 68 | * consisting of a (larger) path, and the said "smaller schema". 69 | */ 70 | def labelNodesWithPath[T](implicit T: Recursive.Aux[T, SchemaF]): Coalgebra[Labelled, (Path, T)] = TODO 71 | 72 | /** 73 | * Now the algebra (that we had no way to write before) becomes trivial. All we have to do is to use 74 | * the path labelling each "node" as the name we need when registering a new avro record. 75 | * 76 | * To extract the label (resp. node) of an EnvT you can use pattern-matching (EnvT contains only a pair 77 | * (label, node)), or you can use the `ask` and `lower` methods that return the label and node respectively. 
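 *
 * For instance, the simple cases can be matched like this (a sketch; they don't even need the path):
 * {{{
 *   case EnvT((_, BooleanF())) => Schema.create(Schema.Type.BOOLEAN)
 *   case EnvT((_, IntegerF())) => Schema.create(Schema.Type.INT)
 * }}}
 * Only the StructF case really needs to look at the path, to build a unique record name.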
78 | */ 79 | def labelledToSchema: Algebra[Labelled, Schema] = TODO 80 | 81 | def schemaFToAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 82 | (List.empty[String], schemaF).hylo(labelledToSchema, labelNodesWithPath) 83 | } 84 | 85 | /** 86 | * That first solution was (relatively) simple but it is not completely satisfying. 87 | * We needed both an algebra and a coalgebra to go from our SchemaF to Avro's Schema, which forced us to 88 | * use hylo. 89 | * 90 | * Fortunately, every scheme (and the related algebra) comes with a "monadic" version. In this version, we 91 | * have to "wrap" the result of our algebras inside our monad of choice. The scheme will then use this 92 | * monad's bind at each step. That has plenty of cool uses. 93 | * 94 | * We can for example "short-circuit" the traversal by using \/ or Option as our monad. Or, in this very case, 95 | * we can use the State monad to keep track of what records we've already created. 96 | * 97 | * A note though: in order to use monadic schemes, we need a Traverse instance for our pattern-functor. 98 | */ 99 | trait UsingARegistry { 100 | 101 | type Registry[A] = State[Map[Int, Schema], A] 102 | 103 | def fingerprint(fields: Map[String, Schema]): Int = fields.hashCode 104 | 105 | def useARegistry: AlgebraM[Registry, SchemaF, Schema] = TODO 106 | 107 | implicit def schemaFTraverse: Traverse[SchemaF] = TODO 108 | 109 | def toAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 110 | schemaF.cataM(useARegistry).run(Map.empty)._2 111 | } 112 | 113 | trait AvroCoalgebra { 114 | 115 | /** 116 | * Of course, we also need a coalgebra to go from Avro to SchemaF. 117 | * Since there are some Avro schemas that we do not handle here, 118 | * we need a CoalgebraM, but we're not really interested in providing meaningful errors 119 | * here, so we can use Option as our monad. 120 | */ 121 | def avroToSchemaF: CoalgebraM[Option, SchemaF, Schema] = TODO 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/4-spark-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | 3 | import matryoshka._ 4 | import matryoshka.data.Fix 5 | import matryoshka.implicits._ 6 | import matryoshka.patterns.EnvT 7 | import org.apache.avro.Schema 8 | import scalaz._, Scalaz._ 9 | 10 | import scala.language.higherKinds 11 | import org.apache.avro.generic.{GenericContainer, GenericData, GenericRecordBuilder} 12 | import org.apache.spark.sql.Row 13 | 14 | import scala.collection.immutable.ListMap 15 | import scala.language.higherKinds 16 | 17 | /** 18 | * It's time to confront ourselves with the real world of manipulating data with Spark & Avro. 19 | * Two specific pain points we have to tackle are : 20 | * 21 | * - Spark's org.apache.spark.sql.Row is basically a wrapper of Array[Any], 22 | * but we need to handle two different behaviours according to the level of the data : 23 | * when we're handling Arrays and Structs, no worries, we need to output a Row, 24 | * but when we're handling "simple" types, then if it's a top-level value we need to output a Row, 25 | * but if it's not, then the value itself must be written.
26 | * 27 | * Example : 28 | * - Value("b") will be Row("b") 29 | * but 30 | * - Struct(a -> Value("b")) will be Row("b") as well (the Row now representing the outer struct) 31 | * 32 | * - For Apache Avro, it's a new kind of pain you'll need to overcome, Avro basically represents all of its data 33 | * as if it will, at one point or another, be generated into Java classes. 34 | * So every "record" or Struct needs to have a unique qualified name, otherwise the Avro engine will consider 35 | * two structs sharing a name as being the same class. 36 | * But as they will obviously have different fields - you'll most likely end up with an error. 37 | * 38 | * Happy hunting. 39 | */ 40 | object SparkConverter extends GDataInstances { 41 | 42 | def isOfSimpleType[D](data: GData[D]) = data match { 43 | case GStruct(_) | GArray(_) => true 44 | case _ => false 45 | } 46 | 47 | /** 48 | * We have a proper way to overcome this problem. There is a `para` scheme that works a little bit like cata. 49 | * Using para, our algebra will "see" not only the result of its application to the level below but also 50 | * the structure of that level we just processed. 51 | * 52 | * To use para, we need a special kind of algebra : a GAlgebra. Given a functor F and a comonad W, GAlgebra[W, F, A] 53 | * is simply a function F[W[A]] => A, so our carrier is simply wrapped in an additional layer. 54 | * 55 | * For para's GAlgebra we use (T[F], ?) as our comonad, in other words, our carrier will be paired with the "tree" we 56 | * processed during the previous step. 57 | * 58 | * We will use that to know when we need to "unwrap" the value we had wrapped in a Row at the previous step although we 59 | * shouldn't have. 60 | */ 61 | def gDataToRow[D](implicit D: Recursive.Aux[D, GData]): GAlgebra[(D, ?), GData, Row] = TODO 62 | 63 | def fromGDataToSparkRow(row: Fix[GData]): Row = 64 | row.para[Row](gDataToRow) 65 | 66 | } 67 | 68 | /** 69 | * We'll also need Avro to serialize streaming data into Kafka topics. 70 | * 71 | * This is just another kind of pain :). We will be using Avro's GenericContainer interface. 72 | * To build a GenericContainer you need an Avro schema, so we'll have to somehow "zip" the data 73 | * we want to serialize with its schema (this should remind you of something we already did). 74 | */ 75 | object AvroConverter extends SchemaToAvroAlgebras with GDataInstances { 76 | 77 | import scala.collection.JavaConverters._ 78 | 79 | /** 80 | * A generic data value (of type [[GData]]) with each element 81 | * labelled with the corresponding `avro.Schema`. 82 | */ 83 | type DataWithSchema[A] = EnvT[Schema, GData, A] 84 | 85 | /** 86 | * When we zip data and schema, there may be times when those two don't mix; 87 | * we need to handle that case - this is what an Incompatibility is. 88 | */ 89 | case class Incompatibility[D](schema: Schema, data: D) 90 | 91 | /** 92 | * Avro API is not very typesafe, all values inside GenericRecord are treated as mere Objects. 93 | * They didn't define a GenericContainer for storing simple values (like numbers, strings, etc). 94 | * So we need to define one, for there is no way *we* are going to work with non-types like Any or AnyRef. 95 | */ 96 | case class SimpleValue(value: Any) extends GenericContainer { 97 | override def getSchema: Schema = throw new NotImplementedError() // we won't use that anyway 98 | } 99 | 100 | /** 101 | * But this is for our convenience only, we still need to feed Avro API methods with unwrapped 102 | * simple values, so don't forget to use this method whenever needed.
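 * For instance, `unwrap(SimpleValue(42))` should simply give back `42`, while a real Avro record or
 * array should pass through unchanged.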
103 | */ 104 | def unwrap(container: GenericContainer): Any = { 105 | container match { 106 | case SimpleValue(value) => value 107 | case value => value 108 | } 109 | } 110 | 111 | def fromGDataToAvro[S, D](schema: S, data: D)( 112 | implicit S: Birecursive.Aux[S, SchemaF], 113 | D: Birecursive.Aux[D, GData]): \/[Incompatibility[D], GenericContainer] = { 114 | 115 | val zipWithSchemaAlg: CoalgebraM[\/[Incompatibility[D], ?], DataWithSchema, (S, D)] = TODO 116 | 117 | val alg: AlgebraM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer] = TODO 118 | 119 | (schema, data).hyloM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer](alg, zipWithSchemaAlg) 120 | } 121 | 122 | } 123 | 124 | trait GDataInstances { 125 | 126 | implicit val genericDataFTraverse: Traverse[GData] = new Traverse[GData] { 127 | 128 | override def traverseImpl[G[_], A, B](fa: GData[A])(f: A => G[B])( 129 | implicit evidence$1: Applicative[G]): G[GData[B]] = fa match { 130 | case GArray(elems) => 131 | Functor[G].map(elems.toList traverse f)(GArray.apply) 132 | 133 | case GStruct(fields) => 134 | val (keys, values) = fields.unzip 135 | Functor[G].map(values.toList traverse f)(v => GStruct(ListMap((keys zip v).toSeq: _*))) 136 | 137 | case GString(value) => Applicative[G].point(GString[B](value)) 138 | case GLong(value) => Applicative[G].point(GLong[B](value)) 139 | case GInteger(value) => Applicative[G].point(GInteger[B](value)) 140 | case GDouble(value) => Applicative[G].point(GDouble[B](value)) 141 | case GFloat(value) => Applicative[G].point(GFloat[B](value)) 142 | case GDate(value) => Applicative[G].point(GDate[B](value)) 143 | case GBoolean(value) => Applicative[G].point(GBoolean[B](value)) 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/main/scala/solutions/3-validation.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import jto.validation._ 4 | import jto.validation.jsonast._ 5 | import matryoshka._, implicits._ 6 | import matryoshka.data._ 7 | import org.scalacheck.Arbitrary 8 | import scalaz.Scalaz._ 9 | import scalaz._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import scala.language.higherKinds 13 | 14 | /** 15 | * Now that we have a Schema we will need to validate incoming data (JSON) 16 | * and output "validated" data or "errors" with what went wrong for the sources 17 | * to be able to fix their exports. 
18 | * 19 | * For that we'll use the JTO Validation library but first we need to define what a "Data" is 20 | */ 21 | sealed trait GData[A] 22 | final case class GStruct[A](fields: ListMap[String, A]) extends GData[A] 23 | final case class GArray[A](element: Seq[A]) extends GData[A] 24 | final case class GBoolean[A](value: Boolean) extends GData[A] 25 | final case class GDate[A](value: java.util.Date) extends GData[A] 26 | final case class GDouble[A](value: Double) extends GData[A] 27 | final case class GFloat[A](value: Float) extends GData[A] 28 | final case class GInteger[A](value: Int) extends GData[A] 29 | final case class GLong[A](value: Long) extends GData[A] 30 | final case class GString[A](value: String) extends GData[A] 31 | 32 | object GData extends GDataInstances with DataWithSchemaGenerator 33 | 34 | object SchemaRules { 35 | type JRule[A] = Rule[JValue, A] 36 | 37 | implicit val ruleApplicativeForScalaz: Applicative[JRule] = new Applicative[JRule] { 38 | override def point[A](a: => A): JRule[A] = Rule.pure(a) 39 | 40 | override def ap[A, B](fa: => JRule[A])(f: => JRule[A => B]): JRule[B] = fa.ap(f) 41 | } 42 | 43 | def fromSchemaToRules[S, D](schema: S)(implicit S: Recursive.Aux[S, SchemaF], 44 | D: Corecursive.Aux[D, GData]): JRule[D] = { 45 | val alg: Algebra[SchemaF, JRule[D]] = { 46 | case StructF(fields) => 47 | fields.toList 48 | .traverse[JRule, (String, D)] { 49 | case (name, validation) => 50 | (Path \ name).read(_ => validation.map(fx => (name, fx))) 51 | } 52 | .map(fs => GStruct(ListMap(fs: _*)).embed) 53 | 54 | case ArrayF(elem) => Rules.pickSeq(elem).map(elems => GArray(elems).embed) 55 | case BooleanF() => Rules.booleanR.map(x => GBoolean[D](x).embed) 56 | case DateF() => Rules.stringR.andThen(Rules.isoDateR).map(x => GDate[D](x).embed) 57 | case DoubleF() => Rules.doubleR.map(x => GDouble[D](x).embed) 58 | case FloatF() => Rules.floatR.map(x => GFloat[D](x).embed) 59 | case IntegerF() => Rules.intR.map(x => GInteger[D](x).embed) 60 | case LongF() => Rules.longR.map(x => GLong[D](x).embed) 61 | case StringF() => Rules.stringR.map(x => GString[D](x).embed) 62 | } 63 | 64 | schema cata alg 65 | } 66 | 67 | } 68 | 69 | /** 70 | * We need to test that validation - of course specific unit tests can be done 71 | * but we're quite paranoid so let's "generate" abitrary schemas using ScalaCheck 72 | * 73 | * But then again - from a Schema we'll be able to generate Rules 74 | * But to validate those rules we'd need data. 75 | * So let's generate Data as well : 76 | * Data that will, of course, need to be compatible with the Schema itself. 
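 *
 * To give an idea of how these pieces could fit together inside a ScalaCheck Properties, here is a
 * purely illustrative sketch of such a property (`toJson` is a hypothetical GData => JValue
 * conversion, it is not defined in this project):
 * {{{
 *   property("generated data passes the rules derived from its schema") =
 *     Prop.forAll(genSchemaAndData[Fix[SchemaF], Fix[GData]]) { case (schema, data) =>
 *       val rule = SchemaRules.fromSchemaToRules[Fix[SchemaF], Fix[GData]](schema)
 *       rule.validate(toJson(data)).isValid
 *     }
 * }}}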
77 | */ 78 | trait DataWithSchemaGenerator { 79 | import org.scalacheck.Gen 80 | 81 | import scala.collection.JavaConverters._ 82 | 83 | def genSchemaAndData[S, D](implicit S: Birecursive.Aux[S, SchemaF], D: Corecursive.Aux[D, GData]): Gen[(S, D)] = 84 | for { 85 | schemaF <- genSchemaF 86 | dataF <- schemaF cata schemaToDataGen 87 | } yield (schemaF, dataF) 88 | 89 | def schemaToDataGen[D](implicit D: Corecursive.Aux[D, GData]): Algebra[SchemaF, Gen[D]] = { 90 | case ArrayF(elems) => 91 | Gen.listOf(elems).map(lst => GArray(lst).embed) 92 | 93 | case StructF(fields) => 94 | val (names, values) = fields.unzip 95 | Gen.sequence(values).map(fields => GStruct(ListMap((names zip fields.asScala).toSeq: _*)).embed) 96 | 97 | case BooleanF() => 98 | Gen.oneOf(true, false).map(value => GBoolean[D](value).embed) 99 | 100 | case DateF() => 101 | Gen.choose(0, Long.MaxValue).map(value => GDate[D](new java.util.Date(value)).embed) 102 | 103 | case DoubleF() => 104 | Gen.choose(Double.MinValue, Double.MaxValue).map(value => GDouble[D](value).embed) 105 | 106 | case FloatF() => 107 | Gen.choose(Float.MinValue, Float.MaxValue).map(value => GFloat[D](value).embed) 108 | 109 | case IntegerF() => 110 | Gen.choose(Int.MinValue, Int.MaxValue).map(value => GInteger[D](value).embed) 111 | 112 | case LongF() => 113 | Gen.choose(Long.MinValue, Long.MaxValue).map(value => GLong[D](value).embed) 114 | 115 | case StringF() => 116 | Gen.alphaNumStr.map(value => GString[D](value).embed) 117 | } 118 | 119 | def genSchemaF[S](implicit S: Corecursive.Aux[S, SchemaF]): Gen[S] = 120 | for { 121 | depth <- Gen.choose(1, 1) 122 | nbTopLevelColumns <- Gen.choose(1, 1) 123 | columns <- Gen.listOfN(nbTopLevelColumns, genStructSchema(depth)) 124 | } yield StructF(ListMap(columns: _*)).embed 125 | 126 | def genValueSchema[S](implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 127 | for { 128 | name <- Gen.identifier 129 | valueF <- Gen.oneOf( 130 | BooleanF[S]().embed, 131 | DateF[S]().embed, 132 | DoubleF[S]().embed, 133 | FloatF[S]().embed, 134 | IntegerF[S]().embed, 135 | LongF[S]().embed, 136 | StringF[S]().embed, 137 | ) 138 | } yield (name, valueF) 139 | 140 | def genColumnSchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 141 | if (maxDepth > 0) 142 | Gen.oneOf[(String, S)](genValueSchema, genStructSchema(maxDepth)) 143 | else genValueSchema 144 | 145 | def genStructSchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 146 | for { 147 | name <- Gen.identifier 148 | depth <- Gen.choose(1, maxDepth) 149 | nbFields <- Gen.choose(0, 3) 150 | fields <- Gen.listOfN(nbFields, genColumnSchema(maxDepth - depth)) 151 | } yield (name, StructF(ListMap(fields: _*)).embed) 152 | 153 | def genArraySchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 154 | for { 155 | name <- Gen.identifier 156 | depth <- Gen.choose(1, maxDepth) 157 | (_, elems) <- genNonArraySchema(maxDepth - depth) 158 | } yield (name, ArrayF(elems).embed) 159 | 160 | def genNonArraySchema[S](maxDepth: Int)(implicit S: Corecursive.Aux[S, SchemaF]): Gen[(String, S)] = 161 | if (maxDepth > 0) 162 | Gen.oneOf[(String, S)](genValueSchema, genStructSchema(maxDepth)) 163 | else genValueSchema 164 | } 165 | -------------------------------------------------------------------------------- /src/main/scala/solutions/1-schema.scala: -------------------------------------------------------------------------------- 1 | package lc2018.solutions 2 | 3 | import 
org.scalacheck.{Arbitrary, Gen} 4 | import scala.collection.immutable.ListMap 5 | import scalaz._, Scalaz._ 6 | import matryoshka._, implicits._ 7 | 8 | /** 9 | * Without further ado, let's define our main pattern-functor for the remaining of the session. 10 | */ 11 | sealed trait SchemaF[A] 12 | 13 | // we'll use a ListMap to keep the ordering of the fields 14 | final case class StructF[A](fields: ListMap[String, A]) extends SchemaF[A] 15 | final case class ArrayF[A](element: A) extends SchemaF[A] 16 | final case class BooleanF[A]() extends SchemaF[A] 17 | final case class DateF[A]() extends SchemaF[A] 18 | final case class DoubleF[A]() extends SchemaF[A] 19 | final case class FloatF[A]() extends SchemaF[A] 20 | final case class IntegerF[A]() extends SchemaF[A] 21 | final case class LongF[A]() extends SchemaF[A] 22 | final case class StringF[A]() extends SchemaF[A] 23 | 24 | object SchemaF extends SchemaFToDataTypeAlgebras with SchemaFArbitrary { 25 | 26 | /** 27 | * As usual, we need to define a Functor instance for our pattern. 28 | */ 29 | implicit val schemaFScalazFunctor: Functor[SchemaF] = new Functor[SchemaF] { 30 | def map[A, B](fa: SchemaF[A])(f: A => B): SchemaF[B] = fa match { 31 | case StructF(fields) => StructF(fields.map { case (name, value) => name -> f(value) }) 32 | case ArrayF(elem) => ArrayF(f(elem)) 33 | case BooleanF() => BooleanF() 34 | case DateF() => DateF() 35 | case DoubleF() => DoubleF() 36 | case FloatF() => FloatF() 37 | case IntegerF() => IntegerF() 38 | case LongF() => LongF() 39 | case StringF() => StringF() 40 | } 41 | } 42 | 43 | /** 44 | * It might be usefull to have a nice string representation of our schemas. 45 | * 46 | * Let say that we want a representation where: 47 | * - simple types like `BooleanF()` or `DateF()` would be represented as `boolean` and `date` respectively. 48 | * - arrays like `ArrayF(IntegerF())` would be represented as `[ integer ]`. 49 | * - structs like `StructF(ListMap("foo" -> FloatF(), "bar" -> LongF())` would be represented as 50 | * `{ foo: float, bar: long }` 51 | * 52 | * Because of the recursive nature of SchemaF, we cannot eagerly write a Show instance for SchemaF. 53 | * Fortunately matryoshka defines the Delay typeclass that is useful in such cases. It allows to "break 54 | * the infinite loop" by delaying the instantiation of Show[SchemaF[A]]. 55 | * 56 | * matryoshka.implicits contains implicit functions that, given that Delay[Show, SchemaF] instance, 57 | * will provide a Show[T[SchemaF]] for any fix-point T. 58 | * 59 | */ 60 | implicit val schemaFDelayShow: Delay[Show, SchemaF] = new Delay[Show, SchemaF] { 61 | def apply[A](showA: Show[A]): Show[SchemaF[A]] = new Show[SchemaF[A]] { 62 | override def show(schema: SchemaF[A]): Cord = schema match { 63 | case StructF(fields) => 64 | val showFields = fields.map { case (k, v) => Cord(k) ++ Cord(": ") ++ showA.show(v) }.toSeq 65 | Cord("{ ") ++ Cord.mkCord(Cord(", "), showFields: _*) ++ Cord(" }") 66 | case ArrayF(element) => Cord("[ ") ++ showA.show(element) ++ Cord(" ]") 67 | case BooleanF() => Cord("boolean") 68 | case DateF() => Cord("date") 69 | case DoubleF() => Cord("double") 70 | case FloatF() => Cord("float") 71 | case IntegerF() => Cord("integer") 72 | case LongF() => Cord("long") 73 | case StringF() => Cord("string") 74 | } 75 | } 76 | } 77 | 78 | } 79 | 80 | /** 81 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 82 | * our new and shiny SchemaF (and vice versa). 
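 *
 * To make the goal concrete, here is a small, made-up example of the correspondence we are about to
 * encode with Spark's DataType (an illustrative sketch, assuming matryoshka.data.Fix is in scope):
 * {{{
 *   // the SchemaF view of { name: string, scores: [ double ] }
 *   val schema: Fix[SchemaF] =
 *     StructF(ListMap(
 *       "name"   -> StringF[Fix[SchemaF]]().embed,
 *       "scores" -> ArrayF(DoubleF[Fix[SchemaF]]().embed).embed
 *     )).embed
 *
 *   // and its DataType counterpart
 *   StructType(Array(
 *     StructField("name", StringType),
 *     StructField("scores", ArrayType(DoubleType, containsNull = false))
 *   ))
 * }}}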
83 | * 84 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 85 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 86 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 87 | * SchemaF/DataType. 88 | * 89 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 90 | * the definition of the DataType trait and its subclasses in the current project under 91 | * `spark/src/main/scala/DataType.scala`. 92 | */ 93 | trait SchemaFToDataTypeAlgebras { 94 | 95 | import org.apache.spark.sql.types._ 96 | 97 | /** 98 | * As usual, simply a function from SchemaF[DataType] to DataType 99 | */ 100 | def schemaFToDataType: Algebra[SchemaF, DataType] = { 101 | case StructF(fields) => StructType(fields.map { case (name, value) => StructField(name, value) }.toArray) 102 | case ArrayF(elem) => ArrayType(elem, containsNull = false) 103 | case BooleanF() => BooleanType 104 | case DateF() => DateType 105 | case DoubleF() => DoubleType 106 | case FloatF() => FloatType 107 | case IntegerF() => IntegerType 108 | case LongF() => LongType 109 | case StringF() => StringType 110 | 111 | } 112 | 113 | /** 114 | * And the other way around, a function from DataType to SchemaF[DataType] 115 | */ 116 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = { 117 | case StructType(fields) => StructF(ListMap(fields.map(f => f.name -> f.dataType): _*)) 118 | case ArrayType(elem, _) => ArrayF(elem) 119 | case BooleanType => BooleanF() 120 | case DateType => DateF() 121 | case DoubleType => DoubleF() 122 | case FloatType => FloatF() 123 | case IntegerType => IntegerF() 124 | case LongType => LongF() 125 | case StringType => StringF() 126 | 127 | } 128 | 129 | /** 130 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 131 | * 132 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 133 | * For example, suppose that we have: 134 | * 135 | * {{{ 136 | * val parquet: DataType = ??? 137 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 138 | * }}} 139 | * 140 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 141 | * 142 | * {{{ 143 | * parquet.cata(toAvro) 144 | * }}} 145 | * 146 | * Instead of 147 | * 148 | * {{{ 149 | * parquet.hylo(dataTypeToSchemaf, toAvro) 150 | * }}} 151 | * 152 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 153 | */ 154 | implicit val dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 155 | Birecursive.fromAlgebraIso(schemaFToDataType, dataTypeToSchemaF) 156 | } 157 | 158 | /** 159 | * Everything looks nice, but don't you feel we are missing something? 160 | * 161 | * I mean, think about it for a minute and meet me 20 lines bellow. 162 | * 163 | * 164 | * 165 | * 166 | * 167 | * 168 | * 169 | * 170 | * 171 | * 172 | * 173 | * 174 | * 175 | * 176 | * 177 | * 178 | * 179 | * 180 | * 181 | * Did you guess? 182 | * 183 | * 184 | * 185 | * You're right of course! We still have to write tests! 186 | * 187 | * Let's meet again in `src/test/scala/1-schema/ParquetSpec.scala`. 
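 *
 * (As a teaser, and only as an illustrative sketch of the kind of round-trip property such a spec
 * could check, with Fix and matryoshka.scalacheck.arbitrary._ imported and relying on the
 * Delay[Arbitrary, SchemaF] instance defined below:
 * {{{
 *   property("SchemaF <-> DataType round-trip") = forAll { schema: Fix[SchemaF] =>
 *     schema.cata(schemaFToDataType).ana[Fix[SchemaF]](dataTypeToSchemaF) == schema
 *   }
 * }}}
 * )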
188 | */ 189 | trait SchemaFArbitrary { 190 | 191 | implicit val schemaFDelayArbitrary: Delay[Arbitrary, SchemaF] = new Delay[Arbitrary, SchemaF] { 192 | 193 | def apply[A](A: Arbitrary[A]): Arbitrary[SchemaF[A]] = 194 | Arbitrary( 195 | Gen.oneOf( 196 | Gen.const(BooleanF[A]()), 197 | Gen.const(DateF[A]()), 198 | Gen.const(DoubleF[A]()), 199 | Gen.const(FloatF[A]()), 200 | Gen.const(IntegerF[A]()), 201 | Gen.const(LongF[A]()), 202 | Gen.const(StringF[A]()), 203 | for { 204 | nbFields <- Gen.choose(1, 10) 205 | // we need to make sure that fields' names are unique and non empty 206 | names <- Gen.listOfN(nbFields, Gen.alphaStr).map(_.map("a" ++ _).toSet) 207 | types <- Gen.listOfN(names.size, A.arbitrary) 208 | } yield StructF[A](ListMap((names.toList zip types): _*)), 209 | A.arbitrary.map(ArrayF.apply _) 210 | ) 211 | ) 212 | 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/main/scala/solutions/2-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import org.apache.avro.{LogicalTypes, _} 5 | import matryoshka._, implicits._, patterns.EnvT 6 | import scala.collection.immutable.ListMap 7 | import scalaz._, Scalaz._ 8 | 9 | import scala.language.higherKinds 10 | import scala.collection.JavaConverters._ 11 | 12 | /** 13 | * There is a problem that makes writing SchemaF <-> Avro (co)algebras more difficult. 14 | * 15 | * As a matter of fact Avro mandates that, when building a Schema, all records (the Avro 16 | * equivalent to our StructF) are registered using a unique name. 17 | * 18 | * This is problematic to our algebra-based method because with the algebras we've seen so 19 | * far we only care about one "layer" at a time, so there is no way to know the names we've 20 | * already used for ther records we've registered so far. 21 | * 22 | * Fortunately, we have at least two solutions to that problem. But before going any further, 23 | * maybe you can take a few minutes to try and imagine how we can solve that problem in general, 24 | * even if you don't know how to implement your solution using recursion-schemes yet. 25 | */ 26 | trait SchemaToAvroAlgebras extends Labelling with UsingARegistry with AvroCoalgebra {} 27 | 28 | /** 29 | * The first solution comes from the observation that our schemas are in fact trees. And trees have 30 | * this nice property that each node have a unique path that goes from the root to it. If we can use 31 | * that unique path as the names of our records, we're good to go. So this solution boils down to 32 | * labelling each "node" of a schema with its path, and then use that path to form the names we 33 | * use to register our records. 34 | */ 35 | trait Labelling { 36 | 37 | /** 38 | * So lets define out Path as being simply a list of strings. These strings will be the field names 39 | * we need to traverse from the root to get to a specific element of our schema. 40 | */ 41 | type Path = List[String] 42 | 43 | /** 44 | * Here is the "special trick" of the current solution. 45 | * 46 | * EnvT is a kind of "glorified pair". Given a label type E and a (pattern)-functor F, it allows us 47 | * to label each "node" of a T[F] with a value of type E while retaining the original structure. In 48 | * other words, if F is a functor, then EnvT[E, F, ?] is a functor as well. 
49 | */ 50 | type Labelled[A] = EnvT[Path, SchemaF, A] 51 | 52 | /** 53 | * If we are to label each "node" of a schema with its own path, we obviously need to go from the root 54 | * down to the leaves, so we definitely want to write a coalgebra. 55 | * This one might look a bit scarry though, but fear not, it's not as complcated as it looks. Lets just 56 | * follow the types together. 57 | * 58 | * A Coalgebra[F, A] is just a function A => F[A]. So the coalgebra bellow is just a function 59 | * (Path, T[SchemaF]) => Labelled[(Path, T[SchemaF]) 60 | * Expanding the Labelled alias it becomes 61 | * (Path, T[SchemaF]) => EnvT[Path, SchemaF, (Path, T[SchemaF])] 62 | * 63 | * Ok, maybe it still looks a bit scarry... 64 | * 65 | * Lets try to put it differently. Assume you will be given a "seed" consisting of a whole schema and an 66 | * initial path (that will start empty). Your job is to use that to produce an EnvT that will contain 67 | * the path of the node you just saw (the "root" of the schema that was in the seed), and the node itself 68 | * but modified such that its "content" is not just a "smaller schema" as it was initially, but a new "seed" 69 | * consisting of a (larger) path, and the said "smaller schema". 70 | */ 71 | def labelNodesWithPath[T](implicit T: Recursive.Aux[T, SchemaF]): Coalgebra[Labelled, (Path, T)] = { 72 | case (path, t) => 73 | t.project match { 74 | // paths are formed only using structs' field names so we only need to really care about structs. 75 | // For each field, we "push down" a new path augmented with that field's name. 76 | case StructF(fields) => 77 | EnvT((path, StructF(fields.map { case (k, v) => k -> (path :+ k, v) }))) 78 | // All other cases don't participate to the construction of the path, so we only need to push the 79 | // current path down. 80 | case otherwise => 81 | EnvT((path, otherwise.map(x => (path, x)))) 82 | } 83 | } 84 | 85 | /** 86 | * Now the algebra (that we had no way to write before) becomes trivial. All we have to do is to use 87 | * the path labelling each "node" as the name we need when registering a new avro record. 88 | * 89 | * To extract the label (resp. node) of an EnvT you can use pattern-matching (EnvT contains only a pair 90 | * (label, node)), or you can use the `ask` and `lower` methods that return the label and node respectively. 91 | */ 92 | def labelledToSchema: Algebra[Labelled, Schema] = { envT => 93 | val path = envT.ask 94 | envT.lower match { 95 | case StructF(fields) => 96 | fields 97 | .foldLeft(SchemaBuilder.record(path.mkString("a", ".", "z")).fields) { 98 | case (builder, (key, value)) => 99 | builder.name(key).`type`(value).noDefault() 100 | } 101 | .endRecord() 102 | case ArrayF(element) => 103 | SchemaBuilder.array().items(element) 104 | case BooleanF() => Schema.create(Schema.Type.BOOLEAN) 105 | case DateF() => 106 | LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) 107 | case DoubleF() => Schema.create(Schema.Type.DOUBLE) 108 | case FloatF() => Schema.create(Schema.Type.FLOAT) 109 | case IntegerF() => Schema.create(Schema.Type.INT) 110 | case LongF() => Schema.create(Schema.Type.LONG) 111 | case StringF() => Schema.create(Schema.Type.STRING) 112 | } 113 | } 114 | 115 | /** 116 | * 117 | */ 118 | def schemaFToAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 119 | (List.empty[String], schemaF).hylo(labelledToSchema, labelNodesWithPath) 120 | } 121 | 122 | /** 123 | * That first solution was (relatively) simple but it is not completely satisfying. 
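 * (By the way, the labelling solution above can already be exercised end to end; as a purely
 * illustrative sketch, assuming matryoshka.data.Fix is in scope:
 * {{{
 *   val schema: Fix[SchemaF] = StructF(ListMap("age" -> IntegerF[Fix[SchemaF]]().embed)).embed
 *   schemaFToAvro(schema)  // an Avro record with one int field "age", its name derived from the node's path
 * }}}
 * )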
124 | * We needed both an algebra and a coalgebra to got from our SchemaF to Avro's Schema, which forced us to 125 | * use hylo. 126 | */ 127 | trait UsingARegistry { 128 | 129 | type Registry[A] = State[Map[Int, Schema], A] 130 | 131 | def fingerprint(fields: Map[String, Schema]): Int = fields.hashCode 132 | 133 | def useARegistry: AlgebraM[Registry, SchemaF, Schema] = { 134 | case StructF(fields) => 135 | val fp = fingerprint(fields) 136 | State { (reg: Map[Int, Schema]) => 137 | if (reg contains fp) { 138 | (reg, reg(fp)) 139 | } else { 140 | val record = 141 | fields 142 | .foldLeft(SchemaBuilder.record("r%x".format(fp)).fields) { 143 | case (builder, (k, v)) => 144 | builder.name(k).`type`(v).noDefault 145 | } 146 | .endRecord 147 | (reg + (fp -> record), record) 148 | } 149 | } 150 | case ArrayF(field) => 151 | State.state(SchemaBuilder.array.items(field)) 152 | 153 | case BooleanF() => State.state(Schema.create(Schema.Type.BOOLEAN)) 154 | case DateF() => State.state(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) 155 | case DoubleF() => State.state(Schema.create(Schema.Type.DOUBLE)) 156 | case FloatF() => State.state(Schema.create(Schema.Type.FLOAT)) 157 | case IntegerF() => State.state(Schema.create(Schema.Type.INT)) 158 | case LongF() => State.state(Schema.create(Schema.Type.LONG)) 159 | case StringF() => State.state(Schema.create(Schema.Type.STRING)) 160 | } 161 | 162 | implicit val schemaFTraverse: Traverse[SchemaF] = new Traverse[SchemaF] { 163 | override def traverseImpl[G[_], A, B](fa: SchemaF[A])(f: A => G[B])(implicit G: Applicative[G]): G[SchemaF[B]] = 164 | fa match { 165 | case StructF(fields) => 166 | val (ks, vs) = fields.unzip 167 | vs.toList.traverse(f).map { xs => 168 | StructF(ListMap((ks.toList zip xs): _*)) 169 | } 170 | case ArrayF(elem) => 171 | f(elem).map(ArrayF.apply) 172 | case BooleanF() => G.point(BooleanF()) 173 | case DateF() => G.point(DateF()) 174 | case DoubleF() => G.point(DoubleF()) 175 | case FloatF() => G.point(FloatF()) 176 | case IntegerF() => G.point(IntegerF()) 177 | case LongF() => G.point(LongF()) 178 | case StringF() => G.point(StringF()) 179 | } 180 | } 181 | 182 | def toAvro[T](schemaF: T)(implicit T: Recursive.Aux[T, SchemaF]): Schema = 183 | schemaF.cataM(useARegistry).run(Map.empty)._2 184 | } 185 | 186 | trait AvroCoalgebra { 187 | 188 | def avroToSchemaF: CoalgebraM[Option, SchemaF, Schema] = { schema => 189 | schema.getType match { 190 | case Schema.Type.RECORD => 191 | val fields = schema.getFields.asScala 192 | StructF(ListMap(fields.map(f => f.name -> f.schema): _*)).some 193 | case Schema.Type.ARRAY => ArrayF(schema.getElementType).some 194 | case Schema.Type.BOOLEAN => BooleanF().some 195 | case Schema.Type.DOUBLE => DoubleF().some 196 | case Schema.Type.FLOAT => FloatF().some 197 | case Schema.Type.INT => IntegerF().some 198 | case Schema.Type.LONG => 199 | val lt = schema.getLogicalType 200 | if (lt != null) { 201 | if (lt.getName == LogicalTypes.timestampMillis().getName) { 202 | DateF().some 203 | } else None 204 | } else LongF().some 205 | case Schema.Type.STRING => StringF().some 206 | case _ => None 207 | } 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/scala/solutions/4-spark-avro.scala: -------------------------------------------------------------------------------- 1 | package lc2018 2 | package solutions 3 | 4 | import matryoshka._ 5 | import matryoshka.data.Fix 6 | import matryoshka.implicits._ 7 | import 
matryoshka.patterns.EnvT 8 | import org.apache.avro.Schema 9 | import scalaz._, Scalaz._ 10 | 11 | import scala.language.higherKinds 12 | import org.apache.avro.generic.{GenericContainer, GenericData, GenericRecordBuilder} 13 | import org.apache.spark.sql.Row 14 | 15 | import scala.collection.immutable.ListMap 16 | import scala.language.higherKinds 17 | 18 | /** 19 | * It's time to confront ourselves to the real world of manipulating data with Spark & Avro 20 | * Two specific pain points we have to tackle are : 21 | * 22 | * - Spark's org.apache.spark.sql.Row is basically a wrapper of Array[Any] 23 | * but we need to handle two specifically different behaviour according to the level of the data : 24 | * When we're handling Arrays and Structs, no worry we need to output a Row 25 | * but when we're handling "simple" types, then if it's a top-level value we need to output a Row 26 | * but if it's not, then the value itself must be written. 27 | * 28 | * Exemple : 29 | * - Value("b") will be Row("b") 30 | * but 31 | * - Struct(a -> Value("b")) will be Row("b") as well (the Row now representing the outer struct) 32 | * 33 | * - For Apache Avro, it's a new kind of pain you'll need to overcome, Avro basically represents all of its data 34 | * as if, it will be at one point or another generated into Java classes. 35 | * So every "record" or Struct needs to have a qualified name "unique" otherwise the Avro engine will consider 36 | * the struct as being the same class. 37 | * But as it will obviously have different fields - you'll most likely end up with an error. 38 | * 39 | * Happy hunting. 40 | */ 41 | object SparkConverter extends GDataInstances { 42 | 43 | def isOfSimpleType[D](data: GData[D]) = data match { 44 | case GStruct(_) | GArray(_) => true 45 | case _ => false 46 | } 47 | 48 | /** 49 | * We have a proper way to overcome this problem. There is a `para` scheme that works a little bit like cata. 50 | * Using para, our algebra will "see" not only the result of its application to the level bellow but also 51 | * the structure of that level we just processed. 52 | * 53 | * To use para, we need a special kind of algebra : a GAlgebra. Given a functor F and a comonad W, Galgebra[W, F, A] 54 | * is simply a function F[W[A]] => A, so our carrier is simply wrapped in an additional layer. 55 | * 56 | * For para's GAlgebra we use (T[F], ?) as our comonad, in other words, our carrier will be paired with the "tree" we 57 | * processed during the previous step. 58 | * 59 | * We will use that to know when we need to "unwrap" the value we had wrapped in a Row at the previous step although we 60 | * shouldn't have. 
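 *
 * Concretely (an illustrative sketch, not actual workshop data): for a one-field struct the algebra
 * below is called with every child paired with the subtree it was computed from, e.g.
 * {{{
 *   GStruct(ListMap("name" -> (originalSubtree, Row("Ada"))))
 * }}}
 * where `originalSubtree` is the Fix[GData] (or other fix-point) that child came from; inspecting it
 * tells us whether Row("Ada") merely wraps a simple value (and must be unwrapped) or really is a
 * nested Row.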
61 | */ 62 | def gDataToRow[D](implicit D: Recursive.Aux[D, GData]): GAlgebra[(D, ?), GData, Row] = { 63 | case GArray(elems) => 64 | val values = elems.map { 65 | case (previous, current) => 66 | if (isOfSimpleType(previous.project)) 67 | current 68 | else 69 | current.values.head 70 | } 71 | Row(values) 72 | 73 | case GStruct(fields) => 74 | val values = fields.map { 75 | case (k, (previous, value)) => 76 | if (isOfSimpleType(previous.project)) { 77 | value 78 | } else { 79 | value.values.head 80 | } 81 | } 82 | Row(values.toSeq: _*) 83 | 84 | case GBoolean(el) => Row(el) 85 | case GFloat(el) => Row(el) 86 | case GInteger(el) => Row(el) 87 | case GDate(el) => Row(el) 88 | case GLong(el) => Row(el) 89 | case GDouble(el) => Row(el) 90 | case GString(el) => Row(el) 91 | } 92 | 93 | def fromGDataToSparkRow(row: Fix[GData]): Row = 94 | row.para[Row](gDataToRow) 95 | 96 | } 97 | 98 | /** 99 | * We'll also need Avro to serialize streaming data into Kafka topics. 100 | * 101 | * This is just another kind of pain :). We will be using Avro's GenericContainer interface. 102 | * To build a GenericContainer you need an Avro schema, so we'll have to somehow "zip" the data 103 | * we want to serialize with its schema (this should remind you of something we already did). 104 | */ 105 | object AvroConverter extends SchemaToAvroAlgebras with GDataInstances { 106 | 107 | import scala.collection.JavaConverters._ 108 | 109 | /** 110 | * A generic schema (of type [[SchemaF]]) with each element 111 | * labelled with the corresponding `avro.Schema`. 112 | */ 113 | type SchemaWithAvro[A] = EnvT[Schema, SchemaF, A] 114 | 115 | type DataWithSchema[A] = EnvT[Schema, GData, A] 116 | 117 | case class Incompatibility[D](schema: Schema, data: D) 118 | 119 | /** 120 | * Avro API is not very typesafe, all values inside GenericRecord are treated as mere Objects. 121 | * They didn't defined a GenericContainer for storing simple values (like numbers, strings, etc). 122 | * So we need to define one, for there is no way *we* work on non-types like Any or AnyRef. 123 | */ 124 | case class SimpleValue(value: Any) extends GenericContainer { 125 | override def getSchema: Schema = ??? 126 | } 127 | 128 | /** 129 | * But this is for our convenience only, we still need to feed avro API methods with unwrapped 130 | * simple values, so don't forget to use this method whenever needed. 
131 | */ 132 | def unwrap(container: GenericContainer): Any = { 133 | container match { 134 | case SimpleValue(value) => value 135 | case value => value 136 | } 137 | } 138 | 139 | def fromGDataToAvro[S, D](schema: S, data: D)( 140 | implicit S: Birecursive.Aux[S, SchemaF], 141 | D: Birecursive.Aux[D, GData]): \/[Incompatibility[D], GenericContainer] = { 142 | 143 | val zipWithSchemaAlg: CoalgebraM[\/[Incompatibility[D], ?], DataWithSchema, (S, D)] = { 144 | case (sch, dat) => 145 | (sch.project, dat.project) match { 146 | 147 | case (structF @ StructF(fieldsSchema), GStruct(fields)) => 148 | val withSchema = GStruct( 149 | ListMap(fields.map { case (name, fx) => (name, (fieldsSchema(name), fx)) }.toSeq: _*)) 150 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(structF.embed), withSchema)).right 151 | 152 | case (arrF @ ArrayF(fieldSchema), GArray(elems)) => 153 | val withSchema = GArray(elems.map(fx => (fieldSchema, fx))) 154 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(arrF.embed), withSchema)).right 155 | 156 | case (valueF @ StringF(), GString(value)) => 157 | val withSchema = GString[(S, D)](value) 158 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 159 | 160 | case (valueF @ IntegerF(), GInteger(value)) => 161 | val withSchema = GInteger[(S, D)](value) 162 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 163 | 164 | case (valueF @ LongF(), GLong(value)) => 165 | val withSchema = GLong[(S, D)](value) 166 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 167 | 168 | case (valueF @ BooleanF(), GBoolean(value)) => 169 | val withSchema = GBoolean[(S, D)](value) 170 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 171 | 172 | case (valueF @ FloatF(), GFloat(value)) => 173 | val withSchema = GFloat[(S, D)](value) 174 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 175 | 176 | case (valueF @ DoubleF(), GDouble(value)) => 177 | val withSchema = GDouble[(S, D)](value) 178 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 179 | 180 | case (valueF @ DateF(), GDate(value)) => 181 | val withSchema = GDate[(S, D)](value) 182 | EnvT[Schema, GData, (S, D)]((schemaFToAvro(valueF.embed), withSchema)).right 183 | 184 | case (s, d) => 185 | Incompatibility(schemaFToAvro(s.embed), d.embed).left 186 | } 187 | } 188 | val alg: AlgebraM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer] = { 189 | case EnvT((avroSchema, GStruct(fields))) => 190 | val bldrWithFields = fields.foldLeft(new GenericRecordBuilder(avroSchema)) { (recordBuilder, container) => 191 | val (name, data) = container 192 | recordBuilder.set(name, unwrap(data)) 193 | } 194 | bldrWithFields.build().right 195 | 196 | case EnvT((avroSchema, GArray(elem))) => 197 | new GenericData.Array[Any](avroSchema, elem.map(unwrap).asJavaCollection).right 198 | 199 | case EnvT((_, GBoolean(el))) => SimpleValue(el).right 200 | case EnvT((_, GFloat(el))) => SimpleValue(el).right 201 | case EnvT((_, GInteger(el))) => SimpleValue(el).right 202 | case EnvT((_, GDate(el))) => SimpleValue(el.getTime).right // c.f. 
logical types 203 | case EnvT((_, GLong(el))) => SimpleValue(el).right 204 | case EnvT((_, GDouble(el))) => SimpleValue(el).right 205 | case EnvT((_, GString(el))) => SimpleValue(el).right 206 | } 207 | 208 | (schema, data).hyloM[\/[Incompatibility[D], ?], DataWithSchema, GenericContainer](alg, zipWithSchemaAlg) 209 | } 210 | 211 | } 212 | 213 | trait GDataInstances { 214 | 215 | implicit val genericDataFTraverse: Traverse[GData] = new Traverse[GData] { 216 | 217 | override def traverseImpl[G[_], A, B](fa: GData[A])(f: A => G[B])( 218 | implicit evidence$1: Applicative[G]): G[GData[B]] = fa match { 219 | case GArray(elems) => 220 | Functor[G].map(elems.toList traverse f)(GArray.apply) 221 | 222 | case GStruct(fields) => 223 | val (keys, values) = fields.unzip 224 | Functor[G].map(values.toList traverse f)(v => GStruct(ListMap((keys zip v).toSeq: _*))) 225 | 226 | case GString(value) => Applicative[G].point(GString[B](value)) 227 | case GLong(value) => Applicative[G].point(GLong[B](value)) 228 | case GInteger(value) => Applicative[G].point(GInteger[B](value)) 229 | case GDouble(value) => Applicative[G].point(GDouble[B](value)) 230 | case GFloat(value) => Applicative[G].point(GFloat[B](value)) 231 | case GDate(value) => Applicative[G].point(GDate[B](value)) 232 | case GBoolean(value) => Applicative[G].point(GBoolean[B](value)) 233 | } 234 | } 235 | } 236 | --------------------------------------------------------------------------------