├── project
│   ├── build.properties
│   └── plugins.sbt
├── .travis.yml
├── .gitignore
├── src
│   ├── main
│   │   └── scala
│   │       ├── com
│   │       │   └── github
│   │       │       └── scala
│   │       │           └── io
│   │       │               └── talk
│   │       │                   ├── Incompatibility.scala
│   │       │                   ├── privacy
│   │       │                   │   ├── package.scala
│   │       │                   │   ├── matryoshkaEngine.scala
│   │       │                   │   ├── lambdaEngine.scala
│   │       │                   │   └── codegenEngine.scala
│   │       │                   ├── package.scala
│   │       │                   ├── SchemaF.scala
│   │       │                   └── DataF.scala
│   │       └── org
│   │           └── apache
│   │               └── spark
│   │                   └── sql
│   │                       └── utils
│   │                           └── SmartRow.scala
│   └── test
│       └── scala
│           └── com
│               └── github
│                   └── scala
│                       └── io
│                           └── talk
│                               └── PrivacyIntegrationTest.scala
├── README.md
└── LICENSE

/project/build.properties:
--------------------------------------------------------------------------------
 1 | sbt.version=1.2.1
 2 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: scala
 2 | scala:
 3 |   - 2.11.12
 4 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | target/
 4 | .idea/
 5 | *.iml
 6 | *.ipr
 7 | 
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1")
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/Incompatibility.scala:
--------------------------------------------------------------------------------
 1 | package com.github.scala.io.talk
 2 | 
 3 | import matryoshka.data.Fix
 4 | 
 5 | case class Incompatibility(schema: Fix[SchemaF], data: Fix[DataF])
 6 | 
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/utils/SmartRow.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.utils
 2 | 
 3 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
 4 | 
 5 | class SmartRow(values: Array[Any]) extends GenericInternalRow(values)
 6 | 
 7 | object SmartRow {
 8 | 
 9 |   def fromSeq(values: Seq[Any]): SmartRow =
10 |     new SmartRow(values.toArray)
11 | }
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # high-perf-privacy-scalaIO2018
 2 | 
 3 | [![Build Status](https://travis-ci.org/ogirardot/high-perf-privacy-scalaIO2018.svg?branch=master)](https://travis-ci.org/ogirardot/high-perf-privacy-scalaIO2018)
 4 | 
 5 | ## What am I looking at?
 6 | 
 7 | It's a fully functional implementation of the privacy framework we designed as an illustration for "High Performance Privacy By Design using Matryoshka and Spark", the talk we gave at Scala.IO 2018 in Lyon.
 8 | 
 9 | Three engines are provided:
10 | * matryoshka engine: zips data and schema together and matches semantic tags to encrypt the tagged values;
11 | * lambda engine: builds the lambda that does the schema-digging work once, then applies it to every row;
12 | * codegen engine: generates an Apache Spark expression that does the same work directly on the Unsafe/Tungsten data format of Apache Spark SQL.
13 | 
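14 | ## How do I use it?
15 | 
16 | A minimal sketch of the public API (the `df`, `schemaFix` and `strategies` values below are placeholders; see `PrivacyIntegrationTest` for a complete, runnable setup):
17 | 
18 | ```scala
19 | import com.github.scala.io.api._          // brings the DataFrame.encrypt syntax in scope
20 | import com.github.scala.io.talk.privacy._ // engines and strategies
21 | 
22 | // schemaFix: Fix[SchemaF] describing df, carrying semantic tags in its metadata;
23 | // strategies: Map[Seq[(String, String)], PrivacyStrategy], keyed by those tags.
24 | val anonymized = df.encrypt(schemaFix, strategies, CodegenEngine)
25 | ```
26 | 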
27 | ## Where are the slides?
28 | Here you go: https://speakerdeck.com/ogirardot/high-performance-privacy-by-design-using-matryoshka-and-spark
29 | Enjoy!
30 | 
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/privacy/package.scala:
--------------------------------------------------------------------------------
 1 | package com.github.scala.io.talk
 2 | 
 3 | import matryoshka.data.Fix
 4 | 
 5 | package object privacy {
 6 |   sealed trait PrivacyEngine
 7 | 
 8 |   case object MatryoshkaEngine extends PrivacyEngine
 9 | 
10 |   case object LambdaEngine extends PrivacyEngine
11 | 
12 |   case object CodegenEngine extends PrivacyEngine
13 | 
14 |   object PrivacyStrategy {
15 |     type PrivacyStrategies = Map[Seq[(String, String)], PrivacyStrategy]
16 |   }
17 | 
18 |   trait PrivacyStrategy extends Serializable {
19 | 
20 |     val allowedInputTypes: Set[String]
21 | 
22 |     def apply(
23 |         data: Fix[DataF]): Either[List[PrivacyApplicationFailure], Fix[DataF]]
24 | 
25 |     def schema[A](input: SchemaF[A]): SchemaF[A] = input
26 | 
27 |     def applyOrFail(value: Fix[DataF])(onError: String => Unit): Fix[DataF] =
28 |       apply(value).fold(
29 |         errors => {
30 |           if (value != Fix[DataF](GNullF())) {
31 |             errors.foreach(err =>
32 |               onError(s"Error while applying privacy on $value: $err"))
33 |           }
34 |           Fix[DataF](GNullF())
35 |         },
36 |         identity
37 |       )
38 |   }
39 | 
40 |   case class PrivacyApplicationFailure(reason: String)
41 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, Olivier Girardot
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/matryoshkaEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.api.DataWithSchema 4 | import com.github.scala.io.talk._ 5 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 6 | import matryoshka._ 7 | import matryoshka.data.Fix 8 | import matryoshka.implicits._ 9 | import matryoshka.patterns.EnvT 10 | import org.slf4j.LoggerFactory 11 | import scalaz._ 12 | 13 | object matryoshkaEngine { 14 | 15 | private val logger = LoggerFactory.getLogger("ApplyPrivacyMatryoshka") 16 | 17 | def transform(schema: Fix[SchemaF], 18 | data: Fix[DataF], 19 | privacyStrategies: PrivacyStrategies): Fix[DataF] = { 20 | import Scalaz._ 21 | val privacyAlg 22 | : AlgebraM[\/[Incompatibility, ?], DataWithSchema, Fix[DataF]] = { 23 | 24 | case EnvT((Fix(StructF(fieldsType, meta)), gdata @ GStructF(fields))) => 25 | Fix(gdata).right 26 | 27 | case EnvT((Fix(ArrayF(elementType, meta)), gdata @ GArrayF(elems))) => 28 | Fix(gdata).right 29 | 30 | case EnvT((vSchema, value)) => 31 | val tags = vSchema.unFix.metadata.tags 32 | val fixedValue = Fix(value) 33 | privacyStrategies 34 | .get(tags) 35 | .map { privacyStrategy => 36 | privacyStrategy.applyOrFail(fixedValue)(logger.error) 37 | } 38 | .getOrElse(fixedValue) 39 | .right 40 | } 41 | 42 | (schema, data).hyloM[\/[Incompatibility, ?], DataWithSchema, Fix[DataF]]( 43 | privacyAlg, 44 | DataF.zipWithSchema) match { 45 | case -\/(incompatibilities) => 46 | throw new IllegalStateException( 47 | s"Found incompatibilities between the observed data and its expected schema : $incompatibilities") 48 | 49 | case \/-(result) => 50 | result 51 | } 52 | } 53 | 54 | // TODO same as com.github.scala.io.talk.ApplyPrivacyExpression.dataType without spark 55 | def transformSchema(schema: Fix[SchemaF], 56 | privacyStrategies: PrivacyStrategies): Fix[SchemaF] = { 57 | def alg: Algebra[SchemaF, Fix[SchemaF]] = s => changeSchema(privacyStrategies, Fix(s)) 58 | 59 | schema.cata(alg) 60 | } 61 | 62 | def changeSchema(privacyStrategies: PrivacyStrategies, 63 | schemaF: Fix[SchemaF]): Fix[SchemaF] = { 64 | val s = schemaF.unFix 65 | privacyStrategies 66 | .find { 67 | case (tags, _) => tags.size == s.metadata.tags.size && tags.toSet == s.metadata.tags.toSet 68 | } 69 | .fold(schemaF) { case (_, strategy) => Fix(strategy.schema(s)) } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/package.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io 2 | 3 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 4 | import com.github.scala.io.talk.privacy._ 5 | import com.github.scala.io.talk.{DataF, SchemaF, SparkDataConverter} 6 | import matryoshka.data.Fix 7 | import matryoshka.patterns.EnvT 8 | import org.apache.spark.sql.types.StructType 9 | import org.apache.spark.sql.{Column, DataFrame} 10 | 11 | package object api { 12 | 13 | type DataWithSchema[A] = EnvT[Fix[SchemaF], DataF, A] 14 | 15 | type SchemaWithPath[A] = EnvT[Fix[SchemaF], DataF, A] 16 | 17 | implicit class DFEncrypt(val df: DataFrame) extends AnyVal { 18 | 19 | def encrypt(schema: Fix[SchemaF], 20 | privacyStrategies: PrivacyStrategies, 21 | engine: PrivacyEngine) = { 22 | engine 
match { 23 | case MatryoshkaEngine => 24 | val structSchema = df.schema 25 | val mutated = df.rdd.map { row => 26 | val gdata = SparkDataConverter.toGenericData(row, structSchema) 27 | val result = 28 | matryoshkaEngine.transform(schema, gdata, privacyStrategies) 29 | SparkDataConverter.fromGenericData(result) 30 | } 31 | val mutatedSchema = 32 | matryoshkaEngine.transformSchema(schema, privacyStrategies) 33 | val mutatedDataType = 34 | Fix.birecursiveT.cataT(mutatedSchema)(SchemaF.schemaFToDataType) 35 | df.sparkSession 36 | .createDataFrame(mutated, mutatedDataType.asInstanceOf[StructType]) 37 | 38 | case LambdaEngine => 39 | val mutatedSchema = 40 | matryoshkaEngine.transformSchema(schema, privacyStrategies) 41 | val mutatedDataType = 42 | Fix.birecursiveT.cataT(mutatedSchema)(SchemaF.schemaFToDataType) 43 | val preparedLambda = 44 | ApplyPrivacyLambda.prepareTransform(schema, privacyStrategies) 45 | val structSchema = df.schema 46 | val mutated = df.rdd.map { row => 47 | val gdata = SparkDataConverter.toGenericData(row, structSchema) 48 | val result = preparedLambda.apply(gdata) 49 | SparkDataConverter.fromGenericData(result) 50 | } 51 | df.sparkSession 52 | .createDataFrame(mutated, mutatedDataType.asInstanceOf[StructType]) 53 | 54 | case CodegenEngine => 55 | val expression = ApplyPrivacyExpression( 56 | schema, 57 | privacyStrategies, 58 | df.schema.fieldNames.map(c => df.col(c).expr) 59 | ) 60 | 61 | df.withColumn( 62 | "structMeUp", 63 | new Column( 64 | expression 65 | ) 66 | ) 67 | .select("structMeUp.*") 68 | } 69 | } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/lambdaEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.talk._ 4 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 5 | import matryoshka.Algebra 6 | import matryoshka.data.Fix 7 | import org.slf4j.LoggerFactory 8 | 9 | object ApplyPrivacyLambda { 10 | private val logger = LoggerFactory.getLogger("ApplyPrivacyLambda") 11 | 12 | def prepareTransform(schema: Fix[SchemaF], 13 | privacyStrategies: PrivacyStrategies): MutationOp = { 14 | 15 | val alg: Algebra[SchemaF, MutationOp] = { 16 | 17 | case StructF(fields, _) => 18 | if (fields.map(_._2).forall(_ == NoMutationOp)) { 19 | // all fields are not to be privacied 20 | NoMutationOp 21 | } else { 22 | val lambda: Fix[DataF] => Fix[DataF] = { 23 | case Fix(GStructF(dataFields)) => 24 | val newFields = fields.zip(dataFields).map { 25 | case ((fieldName, innerOp), (_, data)) => 26 | if (innerOp == NoMutationOp || data == Fix[DataF](GNullF())) { 27 | (fieldName, data) 28 | } else { 29 | val privacied = innerOp(data) 30 | (fieldName, privacied) 31 | } 32 | } 33 | Fix(GStructF(newFields)) 34 | 35 | case gdata => 36 | gdata // should not happen 37 | } 38 | GoDown(lambda) 39 | } 40 | 41 | case ArrayF(elementType, metadata) => 42 | elementType match { 43 | case NoMutationOp => 44 | NoMutationOp 45 | 46 | case op => 47 | GoDown { 48 | case Fix(GArrayF(elems)) => 49 | val result = elems.map(elementType.apply) 50 | Fix(GArrayF(result)) 51 | case otherData => 52 | otherData // should not happen 53 | } 54 | } 55 | 56 | case value: ValueF[MutationOp] if value.metadata.tags.nonEmpty => 57 | privacyStrategies 58 | .get(value.metadata.tags) 59 | .map { strat => 60 | val lambda: Fix[DataF] => Fix[DataF] = 61 | cypherWithContext(strat) 62 
|             GoDown(lambda)
 63 |           }
 64 |           .getOrElse(NoMutationOp)
 65 | 
 66 |       case _ => NoMutationOp
 67 |     }
 68 | 
 69 |     Fix.birecursiveT.cataT(schema)(alg)
 70 |   }
 71 | 
 72 |   private def cypherWithContext(cypher: PrivacyStrategy)(
 73 |       value: Fix[DataF]): Fix[DataF] = {
 74 |     cypher(value).fold(
 75 |       errors => {
 76 |         if (value != Fix[DataF](GNullF())) {
 77 |           errors.foreach(err =>
 78 |             logger.warn(s"Error while applying privacy on $value: $err"))
 79 |         }
 80 |         Fix[DataF](GNullF())
 81 |       },
 82 |       x => x
 83 |     )
 84 |   }
 85 | }
 86 | 
 87 | /**
 88 |   * Represents a nested op that applies a function to a GenericData and
 89 |   * can be composed
 90 |   */
 91 | sealed trait MutationOp extends Serializable {
 92 | 
 93 |   def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp
 94 | 
 95 |   def apply(gdata: Fix[DataF]): Fix[DataF]
 96 | }
 97 | 
 98 | /**
 99 |   * A specific [[MutationOp]] that goes "down" and applies a function to the data
100 |   *
101 |   * @param apply0 the function to apply
102 |   */
103 | private case class GoDown(apply0: Fix[DataF] => Fix[DataF])
104 |     extends MutationOp
105 |     with Serializable {
106 | 
107 |   override def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp = {
108 |     GoDown(apply0.andThen(f))
109 |   }
110 | 
111 |   override def apply(gdata: Fix[DataF]): Fix[DataF] = apply0(gdata)
112 | }
113 | 
114 | /**
115 |   * NoOp: nothing comes out of this, there is nothing to do!
116 |   */
117 | private case object NoMutationOp extends MutationOp with Serializable {
118 | 
119 |   override def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp = GoDown(f)
120 | 
121 |   override def apply(gdata: Fix[DataF]): Fix[DataF] = gdata
122 | }
123 | 
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/SchemaF.scala:
--------------------------------------------------------------------------------
  1 | package com.github.scala.io.talk
  2 | 
  3 | import com.github.scala.io.talk.ColumnMetadata.SemanticTag
  4 | import matryoshka.{Algebra, Birecursive, Coalgebra}
  5 | import scalaz.Functor
  6 | 
  7 | case class ColumnMetadata(nullable: Boolean, tags: List[SemanticTag])
  8 | 
  9 | object ColumnMetadata {
 10 |   type SemanticTag = (String, String)
 11 | 
 12 |   def empty = ColumnMetadata(nullable = true, Nil)
 13 | 
 14 | }
 15 | 
 16 | /**
 17 |   * Without further ado, let's define our main pattern-functor for the remainder of the session.
 18 |   */
 19 | sealed trait SchemaF[A] {
 20 |   val metadata: ColumnMetadata
 21 | }
 22 | 
 23 | // we use a List of pairs to keep the ordering of the fields
 24 | final case class StructF[A](fields: List[(String, A)], metadata: ColumnMetadata)
 25 |     extends SchemaF[A]
 26 | final case class ArrayF[A](element: A, metadata: ColumnMetadata)
 27 |     extends SchemaF[A]
 28 | 
 29 | sealed trait ValueF[A] extends SchemaF[A] {
 30 |   val metadata: ColumnMetadata
 31 | }
 32 | final case class BooleanF[A](metadata: ColumnMetadata) extends ValueF[A]
 33 | final case class DateF[A](metadata: ColumnMetadata) extends ValueF[A]
 34 | final case class DoubleF[A](metadata: ColumnMetadata) extends ValueF[A]
 35 | final case class FloatF[A](metadata: ColumnMetadata) extends ValueF[A]
 36 | final case class IntegerF[A](metadata: ColumnMetadata) extends ValueF[A]
 37 | final case class LongF[A](metadata: ColumnMetadata) extends ValueF[A]
 38 | final case class StringF[A](metadata: ColumnMetadata) extends ValueF[A]
 39 | 
 40 | object SchemaF extends SchemaFToDataTypeAlgebras {
 41 | 
 42 |   /**
 43 |     * As usual, we need to define a Functor instance for our pattern.
44 | */ 45 | implicit val schemaFScalazFunctor: Functor[SchemaF] = new Functor[SchemaF] { 46 | def map[A, B](fa: SchemaF[A])(f: A => B): SchemaF[B] = fa match { 47 | case StructF(fields, m) => 48 | StructF(List( 49 | fields 50 | .map { case (name, value) => name -> f(value) }: _* 51 | ), 52 | m) 53 | case ArrayF(elem, m) => ArrayF(f(elem), m) 54 | case BooleanF(m) => BooleanF(m) 55 | case DateF(m) => DateF(m) 56 | case DoubleF(m) => DoubleF(m) 57 | case FloatF(m) => FloatF(m) 58 | case IntegerF(m) => IntegerF(m) 59 | case LongF(m) => LongF(m) 60 | case StringF(m) => StringF(m) 61 | } 62 | } 63 | } 64 | 65 | /** 66 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 67 | * our new and shiny SchemaF (and vice versa). 68 | * 69 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 70 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 71 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 72 | * SchemaF/DataType. 73 | * 74 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 75 | * the definition of the DataType trait and its subclasses in the current project under 76 | * `spark/src/main/scala/DataType.scala`. 77 | */ 78 | trait SchemaFToDataTypeAlgebras { 79 | 80 | import org.apache.spark.sql.types._ 81 | 82 | /** 83 | * As usual, simply a function from SchemaF[DataType] to DataType 84 | */ 85 | def schemaFToDataType: Algebra[SchemaF, DataType] = { 86 | case StructF(fields, _) => 87 | StructType( 88 | fields.map { case (name, value) => StructField(name, value) }.toArray) 89 | case ArrayF(elem, m) => ArrayType(elem, containsNull = false) 90 | case BooleanF(_) => BooleanType 91 | case DateF(_) => DateType 92 | case DoubleF(_) => DoubleType 93 | case FloatF(_) => FloatType 94 | case IntegerF(_) => IntegerType 95 | case LongF(_) => LongType 96 | case StringF(_) => StringType 97 | 98 | } 99 | 100 | /** 101 | * And the other way around, a function from DataType to SchemaF[DataType] 102 | */ 103 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = { 104 | case StructType(fields) => 105 | StructF(List(fields.map(f => f.name -> f.dataType): _*), 106 | ColumnMetadata.empty) 107 | case ArrayType(elem, _) => ArrayF(elem, ColumnMetadata.empty) 108 | case BooleanType => BooleanF(ColumnMetadata.empty) 109 | case DateType => DateF(ColumnMetadata.empty) 110 | case DoubleType => DoubleF(ColumnMetadata.empty) 111 | case FloatType => FloatF(ColumnMetadata.empty) 112 | case IntegerType => IntegerF(ColumnMetadata.empty) 113 | case LongType => LongF(ColumnMetadata.empty) 114 | case StringType => StringF(ColumnMetadata.empty) 115 | 116 | } 117 | 118 | /** 119 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 120 | * 121 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 122 | * For example, suppose that we have: 123 | * 124 | * {{{ 125 | * val parquet: DataType = ??? 126 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 
127 | * }}} 128 | * 129 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 130 | * 131 | * {{{ 132 | * parquet.cata(toAvro) 133 | * }}} 134 | * 135 | * Instead of 136 | * 137 | * {{{ 138 | * parquet.hylo(dataTypeToSchemaf, toAvro) 139 | * }}} 140 | * 141 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 142 | */ 143 | implicit val dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 144 | Birecursive.algebraIso(schemaFToDataType, dataTypeToSchemaF) 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/DataF.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk 2 | 3 | import com.github.scala.io.api.DataWithSchema 4 | import matryoshka.{CoalgebraM, Recursive} 5 | import matryoshka.data.Fix 6 | import matryoshka.implicits._ 7 | import matryoshka.patterns.EnvT 8 | import org.apache.spark.sql.Row 9 | import org.apache.spark.sql.types.{ArrayType, DataType, StructType} 10 | import org.apache.spark.unsafe.types.UTF8String 11 | import scalaz.Scalaz._ 12 | import scalaz._ 13 | 14 | sealed trait DataF[A] 15 | 16 | /** 17 | * Marker trait for "terminal" data types 18 | */ 19 | sealed trait GValueF[A] extends DataF[A] { 20 | 21 | def value: Any 22 | } 23 | 24 | final case class GNullF[A]() extends GValueF[A] { 25 | 26 | def value: Any = null 27 | } 28 | 29 | final case class GArrayF[A](elems: Seq[A]) extends DataF[A] 30 | 31 | final case class GStructF[A](fields: List[(String, A)]) extends DataF[A] 32 | 33 | final case class GStringF[A](value: String) extends GValueF[A] 34 | 35 | final case class GLongF[A](value: Long) extends GValueF[A] 36 | 37 | final case class GIntF[A](value: Int) extends GValueF[A] 38 | 39 | final case class GDoubleF[A](value: Double) extends GValueF[A] 40 | 41 | final case class GFloatF[A](value: Float) extends GValueF[A] 42 | 43 | final case class GDateF[A](value: java.sql.Date) extends GValueF[A] 44 | 45 | final case class GTimestampF[A](value: java.sql.Timestamp) extends GValueF[A] 46 | 47 | final case class GBooleanF[A](value: Boolean) extends GValueF[A] 48 | 49 | trait DataFInstances { 50 | implicit val genericDataFTraverse: Traverse[DataF] = new Traverse[DataF] { 51 | 52 | override def traverseImpl[G[_], A, B]( 53 | fa: DataF[A] 54 | )(f: A => G[B])(implicit evidence$1: Applicative[G]): G[DataF[B]] = 55 | fa match { 56 | case GNullF() => Applicative[G].point(GNullF[B]()) 57 | case GArrayF(elems) => 58 | Functor[G].map(elems.toList traverse f)(GArrayF.apply) 59 | 60 | case GStructF(fields) => 61 | val (keys, values) = fields.unzip 62 | Functor[G].map(values.toList traverse f)(v => 63 | GStructF(List((keys zip v).toSeq: _*))) 64 | 65 | case GStringF(value) => Applicative[G].point(GStringF[B](value)) 66 | case GLongF(value) => Applicative[G].point(GLongF[B](value)) 67 | case GIntF(value) => Applicative[G].point(GIntF[B](value)) 68 | case GDoubleF(value) => Applicative[G].point(GDoubleF[B](value)) 69 | case GFloatF(value) => Applicative[G].point(GFloatF[B](value)) 70 | case GDateF(value) => Applicative[G].point(GDateF[B](value)) 71 | case GTimestampF(value) => Applicative[G].point(GTimestampF[B](value)) 72 | case GBooleanF(value) => Applicative[G].point(GBooleanF[B](value)) 73 | } 74 | } 75 | } 76 | 77 | trait DataFunctions { 78 | 79 | /** 80 | * @group coalgebras 81 | * 82 | * This coalgebra can be used to label each element of a 
`Fix[DataF]` with its schema. 83 | * 84 | * Given a schema and some data, return either a [[DataWithSchema]] or a [[Incompatibility]]. 85 | */ 86 | def zipWithSchema: CoalgebraM[\/[Incompatibility, ?], 87 | DataWithSchema, 88 | (Fix[SchemaF], Fix[DataF])] = { 89 | 90 | case (structf @ Fix(StructF(fields, metadata)), Fix(GStructF(values))) => 91 | val fieldMap = fields 92 | val zipped = values.map { 93 | case (name, value) => (name, (fieldMap.toMap.apply(name), value)) 94 | } 95 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 96 | (structf, DataF.struct(zipped))).right[Incompatibility] 97 | 98 | case (structf @ Fix(StructF(_, _)), Fix(GNullF())) => 99 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]((structf, GNullF())) 100 | .right[Incompatibility] 101 | 102 | case (arrayF @ Fix(ArrayF(n, m)), Fix(GArrayF(elements))) => 103 | val fieldSchema: Fix[SchemaF] = arrayF // schemaFor(arrayF) FIXME 104 | // no patch infos allowed on an array 105 | val arrayColumnSchema = arrayF 106 | //.copy(metadata = m.copy(patchInfo = None)) FIXME 107 | val arrayFa = DataF.array(elements.toList map { e => 108 | fieldSchema -> e 109 | }) 110 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 111 | (arrayColumnSchema, arrayFa)).right[Incompatibility] 112 | 113 | case (arrayF @ Fix(ArrayF(_, m)), Fix(GNullF())) => 114 | // no patch infos allowed on an array 115 | val arrayColumnSchema = arrayF //.copy(metadata = m.copy(patchInfo = None)) FIXME 116 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 117 | (arrayColumnSchema, GNullF())).right[Incompatibility] 118 | 119 | case (valueF, Fix(lower)) => 120 | val dataF = lower.map((valueF, _)) 121 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]((valueF, dataF)) 122 | .right[Incompatibility] 123 | 124 | case (s, d) => Incompatibility(s, d).left 125 | } 126 | } 127 | 128 | object DataF extends DataFInstances with DataFunctions { 129 | 130 | def struct[A](fields: List[(String, A)]): DataF[A] = GStructF(fields) 131 | 132 | def array[A](elements: List[A]): DataF[A] = GArrayF(elements) 133 | } 134 | 135 | object SparkDataConverter { 136 | 137 | /** 138 | * Convert from our GenericData container to a Spark SQL compatible Row 139 | * first and last step before creating a dataframe 140 | * 141 | * @param row data 142 | * @return spark's Row 143 | */ 144 | def fromGenericData[T](row: T)(implicit T: Recursive.Aux[T, DataF]): Row = { 145 | import matryoshka._ 146 | 147 | import scala.language.higherKinds 148 | 149 | val gAlgebra: GAlgebra[(T, ?), DataF, Row] = { 150 | case GArrayF(elems) => 151 | val values = elems.map { 152 | case (previous, current) => 153 | if (previous.project.isInstanceOf[GValueF[_]]) 154 | current.get(0) 155 | else 156 | current 157 | } 158 | Row(values) 159 | 160 | case GStructF(fields) => 161 | val values = fields.map { field => 162 | val (fx, value) = field._2 163 | if (fx.project.isInstanceOf[GValueF[_]] || fx.project 164 | .isInstanceOf[GArrayF[_]]) { 165 | value.get(0) 166 | } else { 167 | value 168 | } 169 | } 170 | Row(values: _*) 171 | 172 | case el: GValueF[_] => 173 | Row(el.value) 174 | } 175 | 176 | row.para[Row](gAlgebra) 177 | } 178 | 179 | def toGenericData(row: Row, schema: StructType): Fix[DataF] = { 180 | def handleElement(element: Any, schema: DataType): Fix[DataF] = { 181 | element match { 182 | case arr: Seq[Any] => 183 | val arrayType = schema.asInstanceOf[ArrayType] 184 | Fix(GArrayF(arr.map(el => handleElement(el, arrayType.elementType)))) 185 | 186 | case struct: Row => 187 | val structType = 
schema.asInstanceOf[StructType] 188 | val dataset = struct.toSeq.zipWithIndex.map { 189 | case (el, idx) => 190 | val field = structType(idx) 191 | val elementType = field.dataType 192 | (field.name, handleElement(el, elementType)) 193 | } 194 | Fix(GStructF(dataset.toList)) 195 | case value: java.sql.Timestamp => 196 | Fix(GTimestampF(value)) 197 | case value: java.sql.Date => 198 | Fix(GDateF(value)) 199 | case value: Boolean => 200 | Fix(GBooleanF(value)) 201 | case value: Int => 202 | Fix(GIntF(value)) 203 | case value: Float => 204 | Fix(GFloatF(value)) 205 | case value: Double => 206 | Fix(GDoubleF(value)) 207 | case value: Long => 208 | Fix(GLongF(value)) 209 | case value: String => 210 | Fix(GStringF(value)) 211 | case value: UTF8String => 212 | Fix(GStringF(value.toString)) 213 | case null => 214 | Fix(GNullF()) 215 | } 216 | } 217 | 218 | handleElement(row, schema) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/codegenEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.talk._ 4 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 5 | import matryoshka.Algebra 6 | import matryoshka.data.Fix 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.catalyst.InternalRow 9 | import org.apache.spark.sql.catalyst.expressions.Expression 10 | import org.apache.spark.sql.catalyst.expressions.codegen.{ 11 | CodegenContext, 12 | ExprCode 13 | } 14 | import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} 15 | import org.apache.spark.sql.utils.SmartRow 16 | import org.apache.spark.unsafe.types.UTF8String 17 | 18 | case class InputVariable(name: String) extends AnyVal 19 | 20 | sealed trait CatalystOp 21 | 22 | case class CatalystCode(code: InputVariable => String, outputVariable: String) 23 | extends CatalystOp 24 | 25 | case object NoOp extends CatalystOp 26 | 27 | case class ApplyMe(lambda: Any => Any) { 28 | def apply(value: Any): Any = lambda(value) 29 | } 30 | 31 | case class ApplyPrivacyExpression(schema: Fix[SchemaF], 32 | privacyStrategies: PrivacyStrategies, 33 | children: Seq[Expression]) 34 | extends Expression { 35 | 36 | type FieldName = String 37 | type FieldWithInfos = (DataType, CatalystOp) 38 | 39 | override def nullable: Boolean = children.forall(_.nullable) 40 | 41 | override def eval(input: InternalRow): Any = { 42 | // privacy "manually" #DelegateToMatryoshka 43 | val structType = Fix.birecursiveT 44 | .cataT(schema)(SchemaF.schemaFToDataType) 45 | .asInstanceOf[StructType] 46 | val gdata = SparkDataConverter.toGenericData( 47 | Row(input.toSeq(structType): _*), 48 | structType) 49 | val res = matryoshkaEngine.transform(schema, gdata, privacyStrategies) 50 | SmartRow.fromSeq(SparkDataConverter.fromGenericData(res).toSeq.map { 51 | case s: String => UTF8String.fromString(s) 52 | case a => a 53 | }) 54 | } 55 | 56 | /** 57 | * The mutate schema : 58 | * TODO mutate the schema through privacy .schema application 59 | * 60 | * @return 61 | */ 62 | override def dataType: DataType = { 63 | import SchemaF._ 64 | import matryoshka.data._ 65 | // check if any privacy strategy needs to be applied an mutate the schema accordingly 66 | def ifPrivacy[A](input: SchemaF[A], 67 | metadata: ColumnMetadata): SchemaF[A] = { 68 | privacyStrategies 69 | .find { 70 | case (tags, _) => 71 | // we do not check 
here if the strat is "applicable" only if the tags match 72 | tags.size == metadata.tags.size && tags.toSet == metadata.tags.toSet 73 | } 74 | .map { 75 | case (_, strategy) => 76 | strategy.schema(input) 77 | } 78 | .getOrElse(input) 79 | } 80 | 81 | val alg: Algebra[SchemaF, (Boolean, DataType)] = { 82 | case struct @ StructF(fields, metadata) => 83 | val res = StructType(fields.map { 84 | case (name, (isNullable, field)) => 85 | StructField(name, field, isNullable) 86 | }) 87 | (metadata.nullable, res) 88 | 89 | case v @ ArrayF(element, metadata) => 90 | val res = ArrayType(element._2, element._1) 91 | (metadata.nullable, res) 92 | 93 | case v: ValueF[(Boolean, DataType)] => 94 | val res = ifPrivacy(v, v.metadata) 95 | (v.metadata.nullable, 96 | schemaFToDataType.apply(schemaFScalazFunctor.map(res)(_._2))) 97 | } 98 | val res = Fix.birecursiveT.cataT(schema)(alg) 99 | res._2 100 | } 101 | 102 | override protected def doGenCode(ctx: CodegenContext, 103 | ev: ExprCode): ExprCode = { 104 | import SchemaF._ 105 | 106 | val input = "inputadapter_row_0" 107 | 108 | val privacyAlg: Algebra[SchemaF, FieldWithInfos] = { 109 | case StructF(fieldsWithDataTypes, metadata) => 110 | val tmp = ctx.freshName("toto") 111 | val inputTmp = ctx.freshName("inputTmp") 112 | 113 | val CatalystCode(fieldsCode, _) = 114 | generateCodeForStruct(ctx, fieldsWithDataTypes, tmp) 115 | val outputDataType = fieldsToSparkDataType(fieldsWithDataTypes) 116 | val outputDataTypeForCodegen = 117 | ctx.addReferenceObj("outputDataType", outputDataType) 118 | val code = (inputVariable: InputVariable) => { 119 | s""" 120 | org.apache.spark.sql.catalyst.InternalRow $inputTmp = (org.apache.spark.sql.catalyst.InternalRow ) ${inputVariable.name}; 121 | org.apache.spark.sql.utils.SmartRow $tmp = (org.apache.spark.sql.utils.SmartRow) org.apache.spark.sql.utils.SmartRow.fromSeq($inputTmp.toSeq($outputDataTypeForCodegen)); 122 | ${fieldsCode.apply(InputVariable(tmp))} 123 | """ 124 | } 125 | (outputDataType, CatalystCode(code, tmp)) 126 | 127 | case ArrayF(elementType, metadata) => 128 | val (elementSparkDataType, innerOp) = elementType 129 | val arrayDataType = ArrayType(elementSparkDataType) 130 | val resOp = if (innerOp == NoOp) { 131 | innerOp 132 | } else { 133 | val tags = metadata.tags 134 | val elementTypeBoxed = ctx.boxedType(elementSparkDataType) 135 | val tpeName = ctx.addReferenceObj("tpe", elementSparkDataType) 136 | val CatalystCode(innerCode, innerOuput) = innerOp 137 | val tempVariable = ctx.freshName("tmp") 138 | val pos = ctx.freshName("pos") 139 | val output = ctx.freshName("output") 140 | val code = (inputVariable: InputVariable) => 141 | s""" 142 | Object[] $tempVariable = new Object[${inputVariable.name}.numElements()]; 143 | for (int $pos = 0; $pos < ${inputVariable.name}.numElements(); $pos++) { 144 | if (!${inputVariable.name}.isNullAt($pos)) { 145 | ${innerCode.apply( 146 | InputVariable(s"(${inputVariable.name}.get($pos, $tpeName))") 147 | )} 148 | $tempVariable[$pos] = $innerOuput; 149 | } else { 150 | $tempVariable[$pos] = null; 151 | } 152 | } 153 | org.apache.spark.sql.catalyst.util.ArrayData $output = new org.apache.spark.sql.catalyst.util.GenericArrayData($tempVariable); 154 | """ 155 | CatalystCode(code, output) 156 | } 157 | (arrayDataType, resOp) 158 | 159 | case valueColumnSchema: ValueF[FieldWithInfos] 160 | if valueColumnSchema.metadata.tags.nonEmpty => 161 | val tags: List[(String, String)] = valueColumnSchema.metadata.tags 162 | val elementDataType: DataType = 163 | 
schemaFToDataType.apply(schemaFScalazFunctor(valueColumnSchema)(_._1)) 164 | val resOp = privacyStrategies 165 | .get(tags) 166 | .map { strat => 167 | val output = ctx.freshName("output") 168 | val outputSchema = strat.schema(valueColumnSchema) 169 | val outputDataType = 170 | schemaFToDataType.apply(schemaFScalazFunctor(outputSchema)(_._1)) 171 | val javaType = ctx.boxedType(outputDataType) 172 | val cypherInSpark = 173 | ctx.addReferenceObj("cypherMe", transTypePrivacyStrategy(strat)) 174 | val code = (inputVariable: InputVariable) => s""" 175 | $javaType $output = ($javaType) $cypherInSpark.apply(${inputVariable.name}); 176 | """ 177 | CatalystCode(code, output) 178 | 179 | } 180 | .getOrElse(NoOp) 181 | (elementDataType, resOp) 182 | 183 | case value: ValueF[FieldWithInfos] if value.metadata.tags.isEmpty => 184 | val elementDataType = 185 | schemaFToDataType.apply(schemaFScalazFunctor(value)(_._1)) 186 | (elementDataType, NoOp) 187 | } 188 | 189 | ev.copy(code = Fix.birecursiveT.cataT(schema)(privacyAlg) match { 190 | case (_, NoOp) => 191 | s""" 192 | final boolean ${ev.isNull} = ($input != null) ? false : true; 193 | final InternalRow ${ev.value} = $input; 194 | """ 195 | 196 | case rec @ (topLevelDataType, CatalystCode(method, outputVariable)) => 197 | s""" 198 | ${method(InputVariable(input))} 199 | final boolean ${ev.isNull} = ($input != null) ? false : true; 200 | final InternalRow ${ev.value} = $outputVariable; 201 | """ 202 | }) 203 | } 204 | 205 | /** 206 | * Generate Catalyst Code for a struct 207 | * 208 | * @param fieldsWithDataType all the fields of the inner struct 209 | * @param tmp the variable we want to mutate 210 | * @return the code necessary to mutate a struct 211 | */ 212 | def generateCodeForStruct( 213 | ctx: CodegenContext, 214 | fieldsWithDataType: Seq[(FieldName, FieldWithInfos)], 215 | tmp: String 216 | ): CatalystCode = { 217 | fieldsWithDataType.zipWithIndex.foldLeft(CatalystCode(_ => "", tmp)) { 218 | case (buffer, ((fieldName, (elementDataType, op)), idx)) => 219 | if (op == NoOp) { 220 | buffer 221 | } else { 222 | val CatalystCode(code, intermediateOutput) = op 223 | val fieldTpe = ctx.addReferenceObj("dt", elementDataType) 224 | // we need top extract the data properly according to its element type 225 | val fieldExtractor = elementDataType match { 226 | case StructType(fields) => 227 | val numFields = fields.length 228 | s"getStruct($idx, $numFields)" 229 | case ArrayType(_, _) => 230 | s"getArray($idx)" 231 | case _ => 232 | s"get($idx, $fieldTpe)" 233 | } 234 | 235 | CatalystCode( 236 | (inputVariable: InputVariable) => 237 | s""" 238 | ${buffer.code(inputVariable)} 239 | if (!${inputVariable.name}.isNullAt($idx)) { 240 | // $fieldName 241 | ${code.apply( 242 | InputVariable(s"${inputVariable.name}.$fieldExtractor"))} 243 | $tmp.update($idx, $intermediateOutput); 244 | } 245 | """, 246 | tmp 247 | ) 248 | } 249 | } 250 | } 251 | 252 | def transTypePrivacyStrategy(strat: PrivacyStrategy): ApplyMe = { 253 | ApplyMe((value: Any) => { 254 | strat 255 | .apply(wrap(value)) 256 | .fold( 257 | errors => { 258 | errors.foreach(println) 259 | null 260 | }, 261 | x => unwrap(x.unFix) 262 | ) 263 | }) 264 | } 265 | 266 | def wrap(input: Any): Fix[DataF] = { 267 | input match { 268 | case null => Fix(GNullF()) 269 | case a: String => Fix(GStringF(a)) 270 | case a: Long => Fix(GLongF(a)) 271 | case a: java.lang.Long => Fix(GLongF(a)) 272 | case a: UTF8String => Fix(GStringF(a.toString)) 273 | case a: Double => Fix(GDoubleF(a)) 274 | case a: java.lang.Double => 
Fix(GDoubleF(a)) 275 | case a: Int => Fix(GIntF(a)) 276 | case a: java.lang.Integer => Fix(GIntF(a)) 277 | case a: Float => Fix(GFloatF(a)) 278 | case a: java.lang.Float => Fix(GFloatF(a)) 279 | case a: java.sql.Date => Fix(GDateF(a)) 280 | case a: java.sql.Timestamp => Fix(GTimestampF(a)) 281 | case _ => 282 | throw new UnsupportedOperationException( 283 | s"Input data is not supported : $input of type ${input.getClass}") 284 | } 285 | } 286 | 287 | def unwrap[A](input: DataF[A]): Any = input match { 288 | case GNullF() => null 289 | case x: GStringF[A] => UTF8String.fromString(x.value) 290 | case x: GValueF[A] => x.value 291 | case _ => 292 | throw new UnsupportedOperationException( 293 | s"Input data is not supported : $input of type ${input.getClass}") 294 | } 295 | 296 | /** 297 | * Re-construct the Spark StructType data type, from the fields after privacy 298 | * 299 | * @param fieldsWithDataType all the fields transformed after privacy 300 | * @return 301 | */ 302 | private def fieldsToSparkDataType( 303 | fieldsWithDataType: List[(FieldName, FieldWithInfos)]): StructType = { 304 | StructType(fieldsWithDataType.map { 305 | case (fieldName, (fieldDataType, _)) => 306 | StructField(fieldName, fieldDataType, nullable = true) 307 | }) 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /src/test/scala/com/github/scala/io/talk/PrivacyIntegrationTest.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk 2 | 3 | import java.io.ByteArrayOutputStream 4 | import java.security.MessageDigest 5 | import java.util.Base64 6 | 7 | import com.github.scala.io.api._ 8 | import com.github.scala.io.talk.privacy._ 9 | import javax.crypto.Cipher 10 | import javax.crypto.spec.{IvParameterSpec, SecretKeySpec} 11 | import matryoshka.data.Fix 12 | import org.apache.log4j.{Level, Logger} 13 | import org.apache.spark.sql.types.StructType 14 | import org.apache.spark.sql.{Encoders, Row, SparkSession} 15 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 16 | 17 | class PrivacyIntegrationTest 18 | extends FlatSpec 19 | with Matchers 20 | with BeforeAndAfterAll { 21 | 22 | var spark: SparkSession = _ 23 | val engines = List(LambdaEngine, CodegenEngine, MatryoshkaEngine) 24 | 25 | override def beforeAll { 26 | spark = SparkSession 27 | .builder() 28 | .appName("Dataframe encryption test") 29 | .master("local[2]") 30 | .getOrCreate() 31 | Logger.getLogger("org.apache.spark.executor.Executor").setLevel(Level.OFF) 32 | } 33 | 34 | override def afterAll { 35 | spark.stop() 36 | Logger.getLogger("org.apache.spark.executor.Executor").setLevel(Level.WARN) 37 | } 38 | 39 | def testWithEngine(engine: PrivacyEngine): Unit = { 40 | it should s"handle simple flat datasets with engine: $engine" in { 41 | val dataset = 42 | List(("AAAA", "BBBB", "CCCC", "DDDD"), ("EEEE", "FFFF", "GGGG", "HHHH")) 43 | val input = spark 44 | .createDataFrame(dataset) 45 | .toDF("first", "second", "third", "fourth") 46 | 47 | val strategies = Map( 48 | Seq(("rdfs:type", "http://schema.org/Person#pseudo")) -> new PrivacyStrategy { 49 | override val allowedInputTypes: Set[String] = Set() 50 | 51 | override def apply(data: Fix[DataF]) 52 | : Either[List[PrivacyApplicationFailure], Fix[DataF]] = { 53 | data match { 54 | case Fix(GStringF(value)) => 55 | val res = new String( 56 | Base64.getEncoder.encode( 57 | MessageDigest 58 | .getInstance("SHA1") 59 | .digest(value.toString.getBytes("UTF-8")))) 60 | 
Right(Fix(GStringF(res))) 61 | 62 | case _ => 63 | Right(Fix[DataF](GNullF())) 64 | } 65 | } 66 | } 67 | ) 68 | 69 | val schemaFix = Fix[SchemaF]( 70 | StructF( 71 | List( 72 | ( 73 | "first", 74 | Fix( 75 | StringF( 76 | ColumnMetadata.empty.copy(tags = 77 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 78 | )) 79 | ), 80 | ("second", Fix(StringF(ColumnMetadata.empty))), 81 | ("third", Fix(StringF(ColumnMetadata.empty))), 82 | ("fourth", Fix(StringF(ColumnMetadata.empty))) 83 | ), 84 | ColumnMetadata.empty 85 | )) 86 | 87 | val output = input.encrypt(schemaFix, strategies, engine) 88 | input.schema should be(input.schema) 89 | 90 | output.first() should be( 91 | Row("4lEhcqv4zJ9n/dSetsrPLfcbutM=", "BBBB", "CCCC", "DDDD")) 92 | output.collect()(1) should be( 93 | Row("xJw04gFKLaJw7q4H1zByb/3dMZY=", "FFFF", "GGGG", "HHHH")) 94 | } 95 | 96 | it should s"handle complex nested structs with engine: $engine" in { 97 | val data = 98 | """{"civility": {"familyName": "MARTIN", "gender": 1, "givenName": "FABIEN", "inner": {"civility": {"familyName": "MARTIN", "gender": 1, "givenName": "FABIEN"}, "gender": 2}}, "kind": "user#part", "lastUpdatedBy": "FICHECLIENT", "userId": "0211123586445"}""" 99 | 100 | val input = spark.read.json( 101 | spark.createDataset[String](List(data))(Encoders.STRING)) 102 | 103 | val cypher = new PrivacyStrategy { 104 | override val allowedInputTypes: Set[String] = Set() 105 | 106 | override def apply(data: Fix[DataF]) 107 | : Either[List[PrivacyApplicationFailure], Fix[DataF]] = { 108 | data match { 109 | case Fix(value: GValueF[_]) => 110 | val res = new String( 111 | Base64.getEncoder.encode( 112 | MessageDigest 113 | .getInstance("SHA1") 114 | .digest(value.value.toString.getBytes("UTF-8")))) 115 | Right(Fix(GStringF(res))) 116 | 117 | case _ => 118 | Right(Fix[DataF](GNullF())) 119 | } 120 | } 121 | 122 | override def schema[A](input: SchemaF[A]): SchemaF[A] = 123 | StringF(input.metadata) 124 | } 125 | val strategies = Map( 126 | Seq(("rdfs:type", "http://schema.org/Person#pseudo")) -> cypher, 127 | Seq(("rdfs:type", "http://schema.org/Person#interv")) -> cypher 128 | ) 129 | 130 | val tableSchema = Fix[SchemaF]( 131 | StructF( 132 | List( 133 | ( 134 | "civility", 135 | Fix(StructF( 136 | List( 137 | ( 138 | "familyName", 139 | Fix(StringF( 140 | ColumnMetadata.empty.copy(tags = 141 | List(("rdfs:type", "http://schema.org/Person#mask"))) 142 | )) 143 | ), 144 | ( 145 | "gender", 146 | Fix(LongF( 147 | ColumnMetadata.empty.copy(tags = 148 | List(("rdfs:type", "http://schema.org/Person#interv"))) 149 | )) 150 | ), 151 | ( 152 | "givenName", 153 | Fix(StringF( 154 | ColumnMetadata.empty.copy(tags = 155 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 156 | )) 157 | ), 158 | ("inner", 159 | Fix(StructF( 160 | List( 161 | ( 162 | "civility", 163 | Fix(StructF( 164 | List( 165 | ( 166 | "familyName", 167 | Fix(StringF( 168 | ColumnMetadata.empty.copy(tags = 169 | List(("rdfs:type", 170 | "http://schema.org/Person#mask"))) 171 | )) 172 | ), 173 | ( 174 | "gender", 175 | Fix(LongF( 176 | ColumnMetadata.empty.copy(tags = 177 | List(("rdfs:type", 178 | "http://schema.org/Person#interv"))) 179 | )) 180 | ), 181 | ( 182 | "givenName", 183 | Fix(StringF( 184 | ColumnMetadata.empty.copy(tags = 185 | List(("rdfs:type", 186 | "http://schema.org/Person#pseudo"))) 187 | )) 188 | ) 189 | ), 190 | ColumnMetadata.empty 191 | )) 192 | ), 193 | ( 194 | "gender", 195 | Fix(LongF( 196 | ColumnMetadata.empty.copy(tags = List( 197 | ("rdfs:type", 
"http://schema.org/Person#interv"))) 198 | )) 199 | ) 200 | ), 201 | ColumnMetadata.empty 202 | ))) 203 | ), 204 | ColumnMetadata.empty 205 | )) 206 | ), 207 | ( 208 | "kind", 209 | Fix( 210 | StringF( 211 | ColumnMetadata.empty.copy(tags = 212 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 213 | )) 214 | ), 215 | ("lastUpdatedBy", Fix(StringF(ColumnMetadata.empty))), 216 | ("userId", Fix(StringF(ColumnMetadata.empty))) 217 | ), 218 | ColumnMetadata.empty 219 | )) 220 | 221 | val outputSchemaWithPrivacy = Fix[SchemaF]( 222 | StructF( 223 | List( 224 | ( 225 | "civility", 226 | Fix(StructF( 227 | List( 228 | ( 229 | "familyName", 230 | Fix(StringF( 231 | ColumnMetadata.empty.copy(tags = 232 | List(("rdfs:type", "http://schema.org/Person#mask"))) 233 | )) 234 | ), 235 | ( 236 | "gender", 237 | Fix(StringF( 238 | ColumnMetadata.empty.copy(tags = 239 | List(("rdfs:type", "http://schema.org/Person#interv"))) 240 | )) 241 | ), 242 | ( 243 | "givenName", 244 | Fix(StringF( 245 | ColumnMetadata.empty.copy(tags = 246 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 247 | )) 248 | ), 249 | ("inner", 250 | Fix(StructF( 251 | List( 252 | ( 253 | "civility", 254 | Fix(StructF( 255 | List( 256 | ( 257 | "familyName", 258 | Fix(StringF( 259 | ColumnMetadata.empty.copy(tags = 260 | List(("rdfs:type", 261 | "http://schema.org/Person#mask"))) 262 | )) 263 | ), 264 | ( 265 | "gender", 266 | Fix(StringF( 267 | ColumnMetadata.empty.copy(tags = 268 | List(("rdfs:type", 269 | "http://schema.org/Person#interv"))) 270 | )) 271 | ), 272 | ( 273 | "givenName", 274 | Fix(StringF( 275 | ColumnMetadata.empty.copy(tags = 276 | List(("rdfs:type", 277 | "http://schema.org/Person#pseudo"))) 278 | )) 279 | ) 280 | ), 281 | ColumnMetadata.empty 282 | )) 283 | ), 284 | ( 285 | "gender", 286 | Fix(StringF( 287 | ColumnMetadata.empty.copy(tags = List( 288 | ("rdfs:type", "http://schema.org/Person#interv"))) 289 | )) 290 | ) 291 | ), 292 | ColumnMetadata.empty 293 | ))) 294 | ), 295 | ColumnMetadata.empty 296 | )) 297 | ), 298 | ( 299 | "kind", 300 | Fix( 301 | StringF( 302 | ColumnMetadata.empty.copy(tags = 303 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 304 | )) 305 | ), 306 | ("lastUpdatedBy", Fix(StringF(ColumnMetadata.empty))), 307 | ("userId", Fix(StringF(ColumnMetadata.empty))) 308 | ), 309 | ColumnMetadata.empty 310 | )) 311 | 312 | val output = input.encrypt(tableSchema, strategies, engine) 313 | 314 | val schemaAsDT = Fix.birecursiveT.cataT(outputSchemaWithPrivacy)( 315 | SchemaF.schemaFToDataType) 316 | output.schema should be(schemaAsDT.asInstanceOf[StructType]) 317 | 318 | val row = output.first() 319 | row should be( 320 | Row( 321 | Row( 322 | "MARTIN", 323 | "NWoZK3kTsExUV00Ywo1G5jlUKKs=", 324 | "ZHmSvjodAvqIT7x0Lu6YDXA8D9g=", 325 | Row(Row("MARTIN", 326 | "NWoZK3kTsExUV00Ywo1G5jlUKKs=", 327 | "ZHmSvjodAvqIT7x0Lu6YDXA8D9g="), 328 | "2kuSN7rMzfGcB2DKt67EqDWQELA=") 329 | ), 330 | "HgoSOFkFjIGGbMqW1Uz6LPIwG/M=", 331 | "FICHECLIENT", 332 | "0211123586445" 333 | )) 334 | } 335 | } 336 | 337 | engines.foreach(testWithEngine) 338 | } 339 | 340 | object SymmetricCrypt extends Serializable { 341 | 342 | def cryptoSecretToBytes(cryptoSecret: String, 343 | hexSecret: Boolean): Array[Byte] = { 344 | if (hexSecret) hexToBytes(cryptoSecret) 345 | else cryptoSecret.getBytes 346 | } 347 | 348 | def encrypt(clearText: String, 349 | cryptoSecret: String, 350 | cryptoAlgorithm: String, 351 | hexSecret: Boolean = false): String = { 352 | val textToEncrypt = Option(clearText).getOrElse("") 353 | val 
stream: ByteArrayOutputStream = new ByteArrayOutputStream
354 |     stream.write(textToEncrypt.getBytes)
355 |     var bytes: Array[Byte] = stream.toByteArray
356 |     val cipher: Cipher = Cipher.getInstance(cryptoAlgorithm)
357 |     val cryptoKey: SecretKeySpec =
358 |       new SecretKeySpec(cryptoSecretToBytes(cryptoSecret, hexSecret),
359 |                         cryptoAlgorithm.split("/")(0))
360 |     cipher.init(Cipher.ENCRYPT_MODE, cryptoKey)
361 |     bytes = cipher.doFinal(bytes)
362 |     val useInitializationVector: Boolean =
363 |       if (cryptoAlgorithm.indexOf('/') < 0) false
364 |       else cryptoAlgorithm.split("/")(1).toUpperCase != "ECB"
365 |     if (useInitializationVector) {
366 |       val iv: Array[Byte] = cipher.getIV
367 |       val out2: Array[Byte] = new Array[Byte](iv.length + 1 + bytes.length)
368 |       out2(0) = iv.length.toByte
369 |       System.arraycopy(iv, 0, out2, 1, iv.length)
370 |       System.arraycopy(bytes, 0, out2, 1 + iv.length, bytes.length)
371 |       bytes = out2
372 |     }
373 |     val cryptedData: String = Base64.getUrlEncoder.encodeToString(bytes)
374 |     cryptedData
375 |   }
376 | 
377 |   def hexToBytes(str: String): Array[Byte] = {
378 |     if (str == null) {
379 |       null
380 |     } else if (str.length < 2) {
381 |       null
382 |     } else {
383 |       val len = str.length / 2
384 |       val buffer = new Array[Byte](len)
385 |       var i = 0
386 |       while (i < len) {
387 |         buffer(i) = Integer.parseInt(str.substring(i * 2, i * 2 + 2), 16).toByte
388 |         i = i + 1
389 |       }
390 |       buffer
391 |     }
392 |   }
393 | 
394 |   def decrypt(
395 |       cryptedData: String,
396 |       cryptoSecret: String,
397 |       cryptoAlgorithm: String,
398 |       hexSecret: Boolean = false
399 |   ): String = {
400 |     val cipher: Cipher = Cipher.getInstance(cryptoAlgorithm)
401 |     val cryptoKey: SecretKeySpec =
402 |       new SecretKeySpec(cryptoSecretToBytes(cryptoSecret, hexSecret),
403 |                         cryptoAlgorithm.split("/")(0))
404 |     val useInitializationVector: Boolean =
405 |       if (cryptoAlgorithm.indexOf('/') < 0) false
406 |       else cryptoAlgorithm.split("/")(1).toUpperCase != "ECB"
407 |     var cryptedBytes: Array[Byte] = Base64.getUrlDecoder().decode(cryptedData)
408 |     if (useInitializationVector) {
409 |       val ivLen: Int = cryptedBytes(0)
410 |       val ivSpec: IvParameterSpec = new IvParameterSpec(cryptedBytes, 1, ivLen)
411 |       cipher.init(Cipher.DECRYPT_MODE, cryptoKey, ivSpec)
412 |       cryptedBytes =
413 |         cipher.doFinal(cryptedBytes, 1 + ivLen, cryptedBytes.length - 1 - ivLen)
414 |     } else {
415 |       cipher.init(Cipher.DECRYPT_MODE, cryptoKey)
416 |       cryptedBytes = cipher.doFinal(cryptedBytes)
417 |     }
418 |     new String(cryptedBytes)
419 |   }
420 | }
--------------------------------------------------------------------------------
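As a closing illustration, here is a minimal sketch of how `SymmetricCrypt` could back a reusable `PrivacyStrategy`. The `SymmetricCryptStrategy` class and the tag/secret values below are illustrative assumptions, not part of the repository, and the secret must be a valid AES key (16, 24 or 32 bytes):

```scala
import com.github.scala.io.talk._
import com.github.scala.io.talk.privacy._
import matryoshka.data.Fix

// Hypothetical strategy wrapping SymmetricCrypt: encrypts tagged string
// values in place and leaves nulls untouched.
class SymmetricCryptStrategy(secret: String,
                             algorithm: String = "AES/CBC/PKCS5Padding")
    extends PrivacyStrategy {

  override val allowedInputTypes: Set[String] = Set("string")

  override def apply(data: Fix[DataF])
    : Either[List[PrivacyApplicationFailure], Fix[DataF]] =
    data match {
      case Fix(GStringF(value)) =>
        Right(Fix(GStringF(SymmetricCrypt.encrypt(value, secret, algorithm))))
      case Fix(GNullF()) =>
        Right(data)
      case other =>
        Left(List(PrivacyApplicationFailure(s"cannot encrypt non-string value $other")))
    }
}

// Keyed by the same semantic tag used in the integration test;
// any of the three engines can then apply it via df.encrypt(...).
val strategies: PrivacyStrategy.PrivacyStrategies = Map(
  Seq(("rdfs:type", "http://schema.org/Person#pseudo")) ->
    new SymmetricCryptStrategy("0123456789abcdef")
)
```

Unlike the one-way SHA-1 digests used in the tests above, this transformation stays reversible through `SymmetricCrypt.decrypt` for whoever holds the secret.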