├── project
│   ├── build.properties
│   └── plugins.sbt
├── .travis.yml
├── .gitignore
├── src
│   ├── main
│   │   └── scala
│   │       ├── com
│   │       │   └── github
│   │       │       └── scala
│   │       │           └── io
│   │       │               └── talk
│   │       │                   ├── Incompatibility.scala
│   │       │                   ├── privacy
│   │       │                   │   ├── package.scala
│   │       │                   │   ├── matryoshkaEngine.scala
│   │       │                   │   ├── lambdaEngine.scala
│   │       │                   │   └── codegenEngine.scala
│   │       │                   ├── package.scala
│   │       │                   ├── SchemaF.scala
│   │       │                   └── DataF.scala
│   │       └── org
│   │           └── apache
│   │               └── spark
│   │                   └── sql
│   │                       └── utils
│   │                           └── SmartRow.scala
│   └── test
│       └── scala
│           └── com
│               └── github
│                   └── scala
│                       └── io
│                           └── talk
│                               └── PrivacyIntegrationTest.scala
├── README.md
└── LICENSE

/project/build.properties:
--------------------------------------------------------------------------------
 1 | sbt.version=1.2.1
 2 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: scala
 2 | scala:
 3 |   - 2.11.12
 4 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | target/
 4 | .idea/
 5 | *.iml
 6 | *.ipr
 7 | 
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1")
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/Incompatibility.scala:
--------------------------------------------------------------------------------
 1 | package com.github.scala.io.talk
 2 | 
 3 | import matryoshka.data.Fix
 4 | 
 5 | case class Incompatibility(schema: Fix[SchemaF], data: Fix[DataF])
 6 | 
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/utils/SmartRow.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.utils
 2 | 
 3 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
 4 | 
 5 | class SmartRow(values: Array[Any]) extends GenericInternalRow(values)
 6 | 
 7 | object SmartRow {
 8 | 
 9 |   def fromSeq(values: Seq[Any]): SmartRow =
10 |     new SmartRow(values.toArray)
11 | }
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # high-perf-privacy-scalaIO2018
 2 | 
 3 | [![Build Status](https://travis-ci.org/ogirardot/high-perf-privacy-scalaIO2018.svg?branch=master)](https://travis-ci.org/ogirardot/high-perf-privacy-scalaIO2018)
 4 | 
 5 | ## What am I looking at?
 6 | 
 7 | It's a fully functional implementation of the privacy framework we designed as an illustration for "High Performance Privacy By Design using Matryoshka and Spark", the talk we gave at Scala.IO 2018 in Lyon.
 8 | 
 9 | Three engines are provided:
10 | * matryoshka engine: zips data and schema together and matches semantic tags to encrypt the tagged values;
11 | * lambda engine: builds the lambda that does the schema-digging work once, then applies it to every row;
12 | * codegen engine: generates an Apache Spark expression that does the same work directly on the Unsafe/Tungsten data format of Apache Spark SQL.
13 | 
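14 | ## How do I use it?
15 | 
16 | A minimal sketch of the public API (the `df`, `schemaFix` and `strategies` values below are placeholders; see `PrivacyIntegrationTest` for a complete, runnable setup):
17 | 
18 | ```scala
19 | import com.github.scala.io.api._          // brings the DataFrame.encrypt syntax in scope
20 | import com.github.scala.io.talk.privacy._ // engines and strategies
21 | 
22 | // schemaFix: Fix[SchemaF] describing df, carrying semantic tags in its metadata;
23 | // strategies: Map[Seq[(String, String)], PrivacyStrategy], keyed by those tags.
24 | val anonymized = df.encrypt(schemaFix, strategies, CodegenEngine)
25 | ```
26 | 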
27 | ## Where are the slides?
28 | Here you go: https://speakerdeck.com/ogirardot/high-performance-privacy-by-design-using-matryoshka-and-spark
29 | Enjoy!
30 | 
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/privacy/package.scala:
--------------------------------------------------------------------------------
 1 | package com.github.scala.io.talk
 2 | 
 3 | import matryoshka.data.Fix
 4 | 
 5 | package object privacy {
 6 |   sealed trait PrivacyEngine
 7 | 
 8 |   case object MatryoshkaEngine extends PrivacyEngine
 9 | 
10 |   case object LambdaEngine extends PrivacyEngine
11 | 
12 |   case object CodegenEngine extends PrivacyEngine
13 | 
14 |   object PrivacyStrategy {
15 |     type PrivacyStrategies = Map[Seq[(String, String)], PrivacyStrategy]
16 |   }
17 | 
18 |   trait PrivacyStrategy extends Serializable {
19 | 
20 |     val allowedInputTypes: Set[String]
21 | 
22 |     def apply(
23 |         data: Fix[DataF]): Either[List[PrivacyApplicationFailure], Fix[DataF]]
24 | 
25 |     def schema[A](input: SchemaF[A]): SchemaF[A] = input
26 | 
27 |     def applyOrFail(value: Fix[DataF])(onError: String => Unit): Fix[DataF] =
28 |       apply(value).fold(
29 |         errors => {
30 |           if (value != Fix[DataF](GNullF())) {
31 |             errors.foreach(err =>
32 |               onError(s"Error while applying privacy on $value: $err"))
33 |           }
34 |           Fix[DataF](GNullF())
35 |         },
36 |         identity
37 |       )
38 |   }
39 | 
40 |   case class PrivacyApplicationFailure(reason: String)
41 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, Olivier Girardot
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/matryoshkaEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.api.DataWithSchema 4 | import com.github.scala.io.talk._ 5 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 6 | import matryoshka._ 7 | import matryoshka.data.Fix 8 | import matryoshka.implicits._ 9 | import matryoshka.patterns.EnvT 10 | import org.slf4j.LoggerFactory 11 | import scalaz._ 12 | 13 | object matryoshkaEngine { 14 | 15 | private val logger = LoggerFactory.getLogger("ApplyPrivacyMatryoshka") 16 | 17 | def transform(schema: Fix[SchemaF], 18 | data: Fix[DataF], 19 | privacyStrategies: PrivacyStrategies): Fix[DataF] = { 20 | import Scalaz._ 21 | val privacyAlg 22 | : AlgebraM[\/[Incompatibility, ?], DataWithSchema, Fix[DataF]] = { 23 | 24 | case EnvT((Fix(StructF(fieldsType, meta)), gdata @ GStructF(fields))) => 25 | Fix(gdata).right 26 | 27 | case EnvT((Fix(ArrayF(elementType, meta)), gdata @ GArrayF(elems))) => 28 | Fix(gdata).right 29 | 30 | case EnvT((vSchema, value)) => 31 | val tags = vSchema.unFix.metadata.tags 32 | val fixedValue = Fix(value) 33 | privacyStrategies 34 | .get(tags) 35 | .map { privacyStrategy => 36 | privacyStrategy.applyOrFail(fixedValue)(logger.error) 37 | } 38 | .getOrElse(fixedValue) 39 | .right 40 | } 41 | 42 | (schema, data).hyloM[\/[Incompatibility, ?], DataWithSchema, Fix[DataF]]( 43 | privacyAlg, 44 | DataF.zipWithSchema) match { 45 | case -\/(incompatibilities) => 46 | throw new IllegalStateException( 47 | s"Found incompatibilities between the observed data and its expected schema : $incompatibilities") 48 | 49 | case \/-(result) => 50 | result 51 | } 52 | } 53 | 54 | // TODO same as com.github.scala.io.talk.ApplyPrivacyExpression.dataType without spark 55 | def transformSchema(schema: Fix[SchemaF], 56 | privacyStrategies: PrivacyStrategies): Fix[SchemaF] = { 57 | def alg: Algebra[SchemaF, Fix[SchemaF]] = s => changeSchema(privacyStrategies, Fix(s)) 58 | 59 | schema.cata(alg) 60 | } 61 | 62 | def changeSchema(privacyStrategies: PrivacyStrategies, 63 | schemaF: Fix[SchemaF]): Fix[SchemaF] = { 64 | val s = schemaF.unFix 65 | privacyStrategies 66 | .find { 67 | case (tags, _) => tags.size == s.metadata.tags.size && tags.toSet == s.metadata.tags.toSet 68 | } 69 | .fold(schemaF) { case (_, strategy) => Fix(strategy.schema(s)) } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/package.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io 2 | 3 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 4 | import com.github.scala.io.talk.privacy._ 5 | import com.github.scala.io.talk.{DataF, SchemaF, SparkDataConverter} 6 | import matryoshka.data.Fix 7 | import matryoshka.patterns.EnvT 8 | import org.apache.spark.sql.types.StructType 9 | import org.apache.spark.sql.{Column, DataFrame} 10 | 11 | package object api { 12 | 13 | type DataWithSchema[A] = EnvT[Fix[SchemaF], DataF, A] 14 | 15 | type SchemaWithPath[A] = EnvT[Fix[SchemaF], DataF, A] 16 | 17 | implicit class DFEncrypt(val df: DataFrame) extends AnyVal { 18 | 19 | def encrypt(schema: Fix[SchemaF], 20 | privacyStrategies: PrivacyStrategies, 21 | engine: PrivacyEngine) = { 22 | engine 
match { 23 | case MatryoshkaEngine => 24 | val structSchema = df.schema 25 | val mutated = df.rdd.map { row => 26 | val gdata = SparkDataConverter.toGenericData(row, structSchema) 27 | val result = 28 | matryoshkaEngine.transform(schema, gdata, privacyStrategies) 29 | SparkDataConverter.fromGenericData(result) 30 | } 31 | val mutatedSchema = 32 | matryoshkaEngine.transformSchema(schema, privacyStrategies) 33 | val mutatedDataType = 34 | Fix.birecursiveT.cataT(mutatedSchema)(SchemaF.schemaFToDataType) 35 | df.sparkSession 36 | .createDataFrame(mutated, mutatedDataType.asInstanceOf[StructType]) 37 | 38 | case LambdaEngine => 39 | val mutatedSchema = 40 | matryoshkaEngine.transformSchema(schema, privacyStrategies) 41 | val mutatedDataType = 42 | Fix.birecursiveT.cataT(mutatedSchema)(SchemaF.schemaFToDataType) 43 | val preparedLambda = 44 | ApplyPrivacyLambda.prepareTransform(schema, privacyStrategies) 45 | val structSchema = df.schema 46 | val mutated = df.rdd.map { row => 47 | val gdata = SparkDataConverter.toGenericData(row, structSchema) 48 | val result = preparedLambda.apply(gdata) 49 | SparkDataConverter.fromGenericData(result) 50 | } 51 | df.sparkSession 52 | .createDataFrame(mutated, mutatedDataType.asInstanceOf[StructType]) 53 | 54 | case CodegenEngine => 55 | val expression = ApplyPrivacyExpression( 56 | schema, 57 | privacyStrategies, 58 | df.schema.fieldNames.map(c => df.col(c).expr) 59 | ) 60 | 61 | df.withColumn( 62 | "structMeUp", 63 | new Column( 64 | expression 65 | ) 66 | ) 67 | .select("structMeUp.*") 68 | } 69 | } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/lambdaEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.talk._ 4 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 5 | import matryoshka.Algebra 6 | import matryoshka.data.Fix 7 | import org.slf4j.LoggerFactory 8 | 9 | object ApplyPrivacyLambda { 10 | private val logger = LoggerFactory.getLogger("ApplyPrivacyLambda") 11 | 12 | def prepareTransform(schema: Fix[SchemaF], 13 | privacyStrategies: PrivacyStrategies): MutationOp = { 14 | 15 | val alg: Algebra[SchemaF, MutationOp] = { 16 | 17 | case StructF(fields, _) => 18 | if (fields.map(_._2).forall(_ == NoMutationOp)) { 19 | // all fields are not to be privacied 20 | NoMutationOp 21 | } else { 22 | val lambda: Fix[DataF] => Fix[DataF] = { 23 | case Fix(GStructF(dataFields)) => 24 | val newFields = fields.zip(dataFields).map { 25 | case ((fieldName, innerOp), (_, data)) => 26 | if (innerOp == NoMutationOp || data == Fix[DataF](GNullF())) { 27 | (fieldName, data) 28 | } else { 29 | val privacied = innerOp(data) 30 | (fieldName, privacied) 31 | } 32 | } 33 | Fix(GStructF(newFields)) 34 | 35 | case gdata => 36 | gdata // should not happen 37 | } 38 | GoDown(lambda) 39 | } 40 | 41 | case ArrayF(elementType, metadata) => 42 | elementType match { 43 | case NoMutationOp => 44 | NoMutationOp 45 | 46 | case op => 47 | GoDown { 48 | case Fix(GArrayF(elems)) => 49 | val result = elems.map(elementType.apply) 50 | Fix(GArrayF(result)) 51 | case otherData => 52 | otherData // should not happen 53 | } 54 | } 55 | 56 | case value: ValueF[MutationOp] if value.metadata.tags.nonEmpty => 57 | privacyStrategies 58 | .get(value.metadata.tags) 59 | .map { strat => 60 | val lambda: Fix[DataF] => Fix[DataF] = 61 | cypherWithContext(strat) 62 
|             GoDown(lambda)
 63 |           }
 64 |           .getOrElse(NoMutationOp)
 65 | 
 66 |       case _ => NoMutationOp
 67 |     }
 68 | 
 69 |     Fix.birecursiveT.cataT(schema)(alg)
 70 |   }
 71 | 
 72 |   private def cypherWithContext(cypher: PrivacyStrategy)(
 73 |       value: Fix[DataF]): Fix[DataF] = {
 74 |     cypher(value).fold(
 75 |       errors => {
 76 |         if (value != Fix[DataF](GNullF())) {
 77 |           errors.foreach(err =>
 78 |             logger.warn(s"Error while applying privacy on $value: $err"))
 79 |         }
 80 |         Fix[DataF](GNullF())
 81 |       },
 82 |       x => x
 83 |     )
 84 |   }
 85 | }
 86 | 
 87 | /**
 88 |   * Represents a nested op that applies a function to a GenericData and
 89 |   * can be composed
 90 |   */
 91 | sealed trait MutationOp extends Serializable {
 92 | 
 93 |   def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp
 94 | 
 95 |   def apply(gdata: Fix[DataF]): Fix[DataF]
 96 | }
 97 | 
 98 | /**
 99 |   * A specific [[MutationOp]] that goes "down" and applies a function to the data
100 |   *
101 |   * @param apply0 the function to apply
102 |   */
103 | private case class GoDown(apply0: Fix[DataF] => Fix[DataF])
104 |     extends MutationOp
105 |     with Serializable {
106 | 
107 |   override def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp = {
108 |     GoDown(apply0.andThen(f))
109 |   }
110 | 
111 |   override def apply(gdata: Fix[DataF]): Fix[DataF] = apply0(gdata)
112 | }
113 | 
114 | /**
115 |   * NoOp: nothing comes out of this, there is nothing to do!
116 |   */
117 | private case object NoMutationOp extends MutationOp with Serializable {
118 | 
119 |   override def andThen(f: Fix[DataF] => Fix[DataF]): MutationOp = GoDown(f)
120 | 
121 |   override def apply(gdata: Fix[DataF]): Fix[DataF] = gdata
122 | }
123 | 
--------------------------------------------------------------------------------
/src/main/scala/com/github/scala/io/talk/SchemaF.scala:
--------------------------------------------------------------------------------
  1 | package com.github.scala.io.talk
  2 | 
  3 | import com.github.scala.io.talk.ColumnMetadata.SemanticTag
  4 | import matryoshka.{Algebra, Birecursive, Coalgebra}
  5 | import scalaz.Functor
  6 | 
  7 | case class ColumnMetadata(nullable: Boolean, tags: List[SemanticTag])
  8 | 
  9 | object ColumnMetadata {
 10 |   type SemanticTag = (String, String)
 11 | 
 12 |   def empty = ColumnMetadata(nullable = true, Nil)
 13 | 
 14 | }
 15 | 
 16 | /**
 17 |   * Without further ado, let's define our main pattern-functor for the remainder of the session.
 18 |   */
 19 | sealed trait SchemaF[A] {
 20 |   val metadata: ColumnMetadata
 21 | }
 22 | 
 23 | // we use a List of pairs to keep the ordering of the fields
 24 | final case class StructF[A](fields: List[(String, A)], metadata: ColumnMetadata)
 25 |     extends SchemaF[A]
 26 | final case class ArrayF[A](element: A, metadata: ColumnMetadata)
 27 |     extends SchemaF[A]
 28 | 
 29 | sealed trait ValueF[A] extends SchemaF[A] {
 30 |   val metadata: ColumnMetadata
 31 | }
 32 | final case class BooleanF[A](metadata: ColumnMetadata) extends ValueF[A]
 33 | final case class DateF[A](metadata: ColumnMetadata) extends ValueF[A]
 34 | final case class DoubleF[A](metadata: ColumnMetadata) extends ValueF[A]
 35 | final case class FloatF[A](metadata: ColumnMetadata) extends ValueF[A]
 36 | final case class IntegerF[A](metadata: ColumnMetadata) extends ValueF[A]
 37 | final case class LongF[A](metadata: ColumnMetadata) extends ValueF[A]
 38 | final case class StringF[A](metadata: ColumnMetadata) extends ValueF[A]
 39 | 
 40 | object SchemaF extends SchemaFToDataTypeAlgebras {
 41 | 
 42 |   /**
 43 |     * As usual, we need to define a Functor instance for our pattern.
44 | */ 45 | implicit val schemaFScalazFunctor: Functor[SchemaF] = new Functor[SchemaF] { 46 | def map[A, B](fa: SchemaF[A])(f: A => B): SchemaF[B] = fa match { 47 | case StructF(fields, m) => 48 | StructF(List( 49 | fields 50 | .map { case (name, value) => name -> f(value) }: _* 51 | ), 52 | m) 53 | case ArrayF(elem, m) => ArrayF(f(elem), m) 54 | case BooleanF(m) => BooleanF(m) 55 | case DateF(m) => DateF(m) 56 | case DoubleF(m) => DoubleF(m) 57 | case FloatF(m) => FloatF(m) 58 | case IntegerF(m) => IntegerF(m) 59 | case LongF(m) => LongF(m) 60 | case StringF(m) => StringF(m) 61 | } 62 | } 63 | } 64 | 65 | /** 66 | * Now that we have a proper pattern-functor, we need (co)algebras to go from our "standard" schemas to 67 | * our new and shiny SchemaF (and vice versa). 68 | * 69 | * Lets focus on Parquet schemas first. Parquet is a columnar data format that allows efficient processing 70 | * of large datasets in a distributed environment (eg Spark). In the Spark API, Parquet schemas are represented 71 | * as instances of the DataType type. So what we want to write here is a pair of (co)algebras that go from/to 72 | * SchemaF/DataType. 73 | * 74 | * NOTE: in order not to depend directly on Spark (and, hence, transitively on half of maven-central), we've copied 75 | * the definition of the DataType trait and its subclasses in the current project under 76 | * `spark/src/main/scala/DataType.scala`. 77 | */ 78 | trait SchemaFToDataTypeAlgebras { 79 | 80 | import org.apache.spark.sql.types._ 81 | 82 | /** 83 | * As usual, simply a function from SchemaF[DataType] to DataType 84 | */ 85 | def schemaFToDataType: Algebra[SchemaF, DataType] = { 86 | case StructF(fields, _) => 87 | StructType( 88 | fields.map { case (name, value) => StructField(name, value) }.toArray) 89 | case ArrayF(elem, m) => ArrayType(elem, containsNull = false) 90 | case BooleanF(_) => BooleanType 91 | case DateF(_) => DateType 92 | case DoubleF(_) => DoubleType 93 | case FloatF(_) => FloatType 94 | case IntegerF(_) => IntegerType 95 | case LongF(_) => LongType 96 | case StringF(_) => StringType 97 | 98 | } 99 | 100 | /** 101 | * And the other way around, a function from DataType to SchemaF[DataType] 102 | */ 103 | def dataTypeToSchemaF: Coalgebra[SchemaF, DataType] = { 104 | case StructType(fields) => 105 | StructF(List(fields.map(f => f.name -> f.dataType): _*), 106 | ColumnMetadata.empty) 107 | case ArrayType(elem, _) => ArrayF(elem, ColumnMetadata.empty) 108 | case BooleanType => BooleanF(ColumnMetadata.empty) 109 | case DateType => DateF(ColumnMetadata.empty) 110 | case DoubleType => DoubleF(ColumnMetadata.empty) 111 | case FloatType => FloatF(ColumnMetadata.empty) 112 | case IntegerType => IntegerF(ColumnMetadata.empty) 113 | case LongType => LongF(ColumnMetadata.empty) 114 | case StringType => StringF(ColumnMetadata.empty) 115 | 116 | } 117 | 118 | /** 119 | * This pair of (co)algebras allows us to create a Birecursive[DataType, SchemaF] instance "for free". 120 | * 121 | * Such instance witnesses the fact that we can use a DataType in schemes that would normally apply to SchemaF. 122 | * For example, suppose that we have: 123 | * 124 | * {{{ 125 | * val parquet: DataType = ??? 126 | * val toAvro: Algebra[SchemaF, avro.Schema] = ??? 
127 | * }}} 128 | * 129 | * If we have the instance bellow in scope (and the necessary implicits from matryoshka.implicits), we can now write 130 | * 131 | * {{{ 132 | * parquet.cata(toAvro) 133 | * }}} 134 | * 135 | * Instead of 136 | * 137 | * {{{ 138 | * parquet.hylo(dataTypeToSchemaf, toAvro) 139 | * }}} 140 | * 141 | * And the same goes with `ana` and any Coalgebra[SchemaF, X]. 142 | */ 143 | implicit val dataTypeSchemaBirecursive: Birecursive.Aux[DataType, SchemaF] = 144 | Birecursive.algebraIso(schemaFToDataType, dataTypeToSchemaF) 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/DataF.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk 2 | 3 | import com.github.scala.io.api.DataWithSchema 4 | import matryoshka.{CoalgebraM, Recursive} 5 | import matryoshka.data.Fix 6 | import matryoshka.implicits._ 7 | import matryoshka.patterns.EnvT 8 | import org.apache.spark.sql.Row 9 | import org.apache.spark.sql.types.{ArrayType, DataType, StructType} 10 | import org.apache.spark.unsafe.types.UTF8String 11 | import scalaz.Scalaz._ 12 | import scalaz._ 13 | 14 | sealed trait DataF[A] 15 | 16 | /** 17 | * Marker trait for "terminal" data types 18 | */ 19 | sealed trait GValueF[A] extends DataF[A] { 20 | 21 | def value: Any 22 | } 23 | 24 | final case class GNullF[A]() extends GValueF[A] { 25 | 26 | def value: Any = null 27 | } 28 | 29 | final case class GArrayF[A](elems: Seq[A]) extends DataF[A] 30 | 31 | final case class GStructF[A](fields: List[(String, A)]) extends DataF[A] 32 | 33 | final case class GStringF[A](value: String) extends GValueF[A] 34 | 35 | final case class GLongF[A](value: Long) extends GValueF[A] 36 | 37 | final case class GIntF[A](value: Int) extends GValueF[A] 38 | 39 | final case class GDoubleF[A](value: Double) extends GValueF[A] 40 | 41 | final case class GFloatF[A](value: Float) extends GValueF[A] 42 | 43 | final case class GDateF[A](value: java.sql.Date) extends GValueF[A] 44 | 45 | final case class GTimestampF[A](value: java.sql.Timestamp) extends GValueF[A] 46 | 47 | final case class GBooleanF[A](value: Boolean) extends GValueF[A] 48 | 49 | trait DataFInstances { 50 | implicit val genericDataFTraverse: Traverse[DataF] = new Traverse[DataF] { 51 | 52 | override def traverseImpl[G[_], A, B]( 53 | fa: DataF[A] 54 | )(f: A => G[B])(implicit evidence$1: Applicative[G]): G[DataF[B]] = 55 | fa match { 56 | case GNullF() => Applicative[G].point(GNullF[B]()) 57 | case GArrayF(elems) => 58 | Functor[G].map(elems.toList traverse f)(GArrayF.apply) 59 | 60 | case GStructF(fields) => 61 | val (keys, values) = fields.unzip 62 | Functor[G].map(values.toList traverse f)(v => 63 | GStructF(List((keys zip v).toSeq: _*))) 64 | 65 | case GStringF(value) => Applicative[G].point(GStringF[B](value)) 66 | case GLongF(value) => Applicative[G].point(GLongF[B](value)) 67 | case GIntF(value) => Applicative[G].point(GIntF[B](value)) 68 | case GDoubleF(value) => Applicative[G].point(GDoubleF[B](value)) 69 | case GFloatF(value) => Applicative[G].point(GFloatF[B](value)) 70 | case GDateF(value) => Applicative[G].point(GDateF[B](value)) 71 | case GTimestampF(value) => Applicative[G].point(GTimestampF[B](value)) 72 | case GBooleanF(value) => Applicative[G].point(GBooleanF[B](value)) 73 | } 74 | } 75 | } 76 | 77 | trait DataFunctions { 78 | 79 | /** 80 | * @group coalgebras 81 | * 82 | * This coalgebra can be used to label each element of a 
`Fix[DataF]` with its schema. 83 | * 84 | * Given a schema and some data, return either a [[DataWithSchema]] or a [[Incompatibility]]. 85 | */ 86 | def zipWithSchema: CoalgebraM[\/[Incompatibility, ?], 87 | DataWithSchema, 88 | (Fix[SchemaF], Fix[DataF])] = { 89 | 90 | case (structf @ Fix(StructF(fields, metadata)), Fix(GStructF(values))) => 91 | val fieldMap = fields 92 | val zipped = values.map { 93 | case (name, value) => (name, (fieldMap.toMap.apply(name), value)) 94 | } 95 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 96 | (structf, DataF.struct(zipped))).right[Incompatibility] 97 | 98 | case (structf @ Fix(StructF(_, _)), Fix(GNullF())) => 99 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]((structf, GNullF())) 100 | .right[Incompatibility] 101 | 102 | case (arrayF @ Fix(ArrayF(n, m)), Fix(GArrayF(elements))) => 103 | val fieldSchema: Fix[SchemaF] = arrayF // schemaFor(arrayF) FIXME 104 | // no patch infos allowed on an array 105 | val arrayColumnSchema = arrayF 106 | //.copy(metadata = m.copy(patchInfo = None)) FIXME 107 | val arrayFa = DataF.array(elements.toList map { e => 108 | fieldSchema -> e 109 | }) 110 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 111 | (arrayColumnSchema, arrayFa)).right[Incompatibility] 112 | 113 | case (arrayF @ Fix(ArrayF(_, m)), Fix(GNullF())) => 114 | // no patch infos allowed on an array 115 | val arrayColumnSchema = arrayF //.copy(metadata = m.copy(patchInfo = None)) FIXME 116 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]( 117 | (arrayColumnSchema, GNullF())).right[Incompatibility] 118 | 119 | case (valueF, Fix(lower)) => 120 | val dataF = lower.map((valueF, _)) 121 | EnvT[Fix[SchemaF], DataF, (Fix[SchemaF], Fix[DataF])]((valueF, dataF)) 122 | .right[Incompatibility] 123 | 124 | case (s, d) => Incompatibility(s, d).left 125 | } 126 | } 127 | 128 | object DataF extends DataFInstances with DataFunctions { 129 | 130 | def struct[A](fields: List[(String, A)]): DataF[A] = GStructF(fields) 131 | 132 | def array[A](elements: List[A]): DataF[A] = GArrayF(elements) 133 | } 134 | 135 | object SparkDataConverter { 136 | 137 | /** 138 | * Convert from our GenericData container to a Spark SQL compatible Row 139 | * first and last step before creating a dataframe 140 | * 141 | * @param row data 142 | * @return spark's Row 143 | */ 144 | def fromGenericData[T](row: T)(implicit T: Recursive.Aux[T, DataF]): Row = { 145 | import matryoshka._ 146 | 147 | import scala.language.higherKinds 148 | 149 | val gAlgebra: GAlgebra[(T, ?), DataF, Row] = { 150 | case GArrayF(elems) => 151 | val values = elems.map { 152 | case (previous, current) => 153 | if (previous.project.isInstanceOf[GValueF[_]]) 154 | current.get(0) 155 | else 156 | current 157 | } 158 | Row(values) 159 | 160 | case GStructF(fields) => 161 | val values = fields.map { field => 162 | val (fx, value) = field._2 163 | if (fx.project.isInstanceOf[GValueF[_]] || fx.project 164 | .isInstanceOf[GArrayF[_]]) { 165 | value.get(0) 166 | } else { 167 | value 168 | } 169 | } 170 | Row(values: _*) 171 | 172 | case el: GValueF[_] => 173 | Row(el.value) 174 | } 175 | 176 | row.para[Row](gAlgebra) 177 | } 178 | 179 | def toGenericData(row: Row, schema: StructType): Fix[DataF] = { 180 | def handleElement(element: Any, schema: DataType): Fix[DataF] = { 181 | element match { 182 | case arr: Seq[Any] => 183 | val arrayType = schema.asInstanceOf[ArrayType] 184 | Fix(GArrayF(arr.map(el => handleElement(el, arrayType.elementType)))) 185 | 186 | case struct: Row => 187 | val structType = 
schema.asInstanceOf[StructType] 188 | val dataset = struct.toSeq.zipWithIndex.map { 189 | case (el, idx) => 190 | val field = structType(idx) 191 | val elementType = field.dataType 192 | (field.name, handleElement(el, elementType)) 193 | } 194 | Fix(GStructF(dataset.toList)) 195 | case value: java.sql.Timestamp => 196 | Fix(GTimestampF(value)) 197 | case value: java.sql.Date => 198 | Fix(GDateF(value)) 199 | case value: Boolean => 200 | Fix(GBooleanF(value)) 201 | case value: Int => 202 | Fix(GIntF(value)) 203 | case value: Float => 204 | Fix(GFloatF(value)) 205 | case value: Double => 206 | Fix(GDoubleF(value)) 207 | case value: Long => 208 | Fix(GLongF(value)) 209 | case value: String => 210 | Fix(GStringF(value)) 211 | case value: UTF8String => 212 | Fix(GStringF(value.toString)) 213 | case null => 214 | Fix(GNullF()) 215 | } 216 | } 217 | 218 | handleElement(row, schema) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/main/scala/com/github/scala/io/talk/privacy/codegenEngine.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk.privacy 2 | 3 | import com.github.scala.io.talk._ 4 | import com.github.scala.io.talk.privacy.PrivacyStrategy.PrivacyStrategies 5 | import matryoshka.Algebra 6 | import matryoshka.data.Fix 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.catalyst.InternalRow 9 | import org.apache.spark.sql.catalyst.expressions.Expression 10 | import org.apache.spark.sql.catalyst.expressions.codegen.{ 11 | CodegenContext, 12 | ExprCode 13 | } 14 | import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} 15 | import org.apache.spark.sql.utils.SmartRow 16 | import org.apache.spark.unsafe.types.UTF8String 17 | 18 | case class InputVariable(name: String) extends AnyVal 19 | 20 | sealed trait CatalystOp 21 | 22 | case class CatalystCode(code: InputVariable => String, outputVariable: String) 23 | extends CatalystOp 24 | 25 | case object NoOp extends CatalystOp 26 | 27 | case class ApplyMe(lambda: Any => Any) { 28 | def apply(value: Any): Any = lambda(value) 29 | } 30 | 31 | case class ApplyPrivacyExpression(schema: Fix[SchemaF], 32 | privacyStrategies: PrivacyStrategies, 33 | children: Seq[Expression]) 34 | extends Expression { 35 | 36 | type FieldName = String 37 | type FieldWithInfos = (DataType, CatalystOp) 38 | 39 | override def nullable: Boolean = children.forall(_.nullable) 40 | 41 | override def eval(input: InternalRow): Any = { 42 | // privacy "manually" #DelegateToMatryoshka 43 | val structType = Fix.birecursiveT 44 | .cataT(schema)(SchemaF.schemaFToDataType) 45 | .asInstanceOf[StructType] 46 | val gdata = SparkDataConverter.toGenericData( 47 | Row(input.toSeq(structType): _*), 48 | structType) 49 | val res = matryoshkaEngine.transform(schema, gdata, privacyStrategies) 50 | SmartRow.fromSeq(SparkDataConverter.fromGenericData(res).toSeq.map { 51 | case s: String => UTF8String.fromString(s) 52 | case a => a 53 | }) 54 | } 55 | 56 | /** 57 | * The mutate schema : 58 | * TODO mutate the schema through privacy .schema application 59 | * 60 | * @return 61 | */ 62 | override def dataType: DataType = { 63 | import SchemaF._ 64 | import matryoshka.data._ 65 | // check if any privacy strategy needs to be applied an mutate the schema accordingly 66 | def ifPrivacy[A](input: SchemaF[A], 67 | metadata: ColumnMetadata): SchemaF[A] = { 68 | privacyStrategies 69 | .find { 70 | case (tags, _) => 71 | // we do not check 
here if the strat is "applicable" only if the tags match 72 | tags.size == metadata.tags.size && tags.toSet == metadata.tags.toSet 73 | } 74 | .map { 75 | case (_, strategy) => 76 | strategy.schema(input) 77 | } 78 | .getOrElse(input) 79 | } 80 | 81 | val alg: Algebra[SchemaF, (Boolean, DataType)] = { 82 | case struct @ StructF(fields, metadata) => 83 | val res = StructType(fields.map { 84 | case (name, (isNullable, field)) => 85 | StructField(name, field, isNullable) 86 | }) 87 | (metadata.nullable, res) 88 | 89 | case v @ ArrayF(element, metadata) => 90 | val res = ArrayType(element._2, element._1) 91 | (metadata.nullable, res) 92 | 93 | case v: ValueF[(Boolean, DataType)] => 94 | val res = ifPrivacy(v, v.metadata) 95 | (v.metadata.nullable, 96 | schemaFToDataType.apply(schemaFScalazFunctor.map(res)(_._2))) 97 | } 98 | val res = Fix.birecursiveT.cataT(schema)(alg) 99 | res._2 100 | } 101 | 102 | override protected def doGenCode(ctx: CodegenContext, 103 | ev: ExprCode): ExprCode = { 104 | import SchemaF._ 105 | 106 | val input = "inputadapter_row_0" 107 | 108 | val privacyAlg: Algebra[SchemaF, FieldWithInfos] = { 109 | case StructF(fieldsWithDataTypes, metadata) => 110 | val tmp = ctx.freshName("toto") 111 | val inputTmp = ctx.freshName("inputTmp") 112 | 113 | val CatalystCode(fieldsCode, _) = 114 | generateCodeForStruct(ctx, fieldsWithDataTypes, tmp) 115 | val outputDataType = fieldsToSparkDataType(fieldsWithDataTypes) 116 | val outputDataTypeForCodegen = 117 | ctx.addReferenceObj("outputDataType", outputDataType) 118 | val code = (inputVariable: InputVariable) => { 119 | s""" 120 | org.apache.spark.sql.catalyst.InternalRow $inputTmp = (org.apache.spark.sql.catalyst.InternalRow ) ${inputVariable.name}; 121 | org.apache.spark.sql.utils.SmartRow $tmp = (org.apache.spark.sql.utils.SmartRow) org.apache.spark.sql.utils.SmartRow.fromSeq($inputTmp.toSeq($outputDataTypeForCodegen)); 122 | ${fieldsCode.apply(InputVariable(tmp))} 123 | """ 124 | } 125 | (outputDataType, CatalystCode(code, tmp)) 126 | 127 | case ArrayF(elementType, metadata) => 128 | val (elementSparkDataType, innerOp) = elementType 129 | val arrayDataType = ArrayType(elementSparkDataType) 130 | val resOp = if (innerOp == NoOp) { 131 | innerOp 132 | } else { 133 | val tags = metadata.tags 134 | val elementTypeBoxed = ctx.boxedType(elementSparkDataType) 135 | val tpeName = ctx.addReferenceObj("tpe", elementSparkDataType) 136 | val CatalystCode(innerCode, innerOuput) = innerOp 137 | val tempVariable = ctx.freshName("tmp") 138 | val pos = ctx.freshName("pos") 139 | val output = ctx.freshName("output") 140 | val code = (inputVariable: InputVariable) => 141 | s""" 142 | Object[] $tempVariable = new Object[${inputVariable.name}.numElements()]; 143 | for (int $pos = 0; $pos < ${inputVariable.name}.numElements(); $pos++) { 144 | if (!${inputVariable.name}.isNullAt($pos)) { 145 | ${innerCode.apply( 146 | InputVariable(s"(${inputVariable.name}.get($pos, $tpeName))") 147 | )} 148 | $tempVariable[$pos] = $innerOuput; 149 | } else { 150 | $tempVariable[$pos] = null; 151 | } 152 | } 153 | org.apache.spark.sql.catalyst.util.ArrayData $output = new org.apache.spark.sql.catalyst.util.GenericArrayData($tempVariable); 154 | """ 155 | CatalystCode(code, output) 156 | } 157 | (arrayDataType, resOp) 158 | 159 | case valueColumnSchema: ValueF[FieldWithInfos] 160 | if valueColumnSchema.metadata.tags.nonEmpty => 161 | val tags: List[(String, String)] = valueColumnSchema.metadata.tags 162 | val elementDataType: DataType = 163 | 
schemaFToDataType.apply(schemaFScalazFunctor(valueColumnSchema)(_._1)) 164 | val resOp = privacyStrategies 165 | .get(tags) 166 | .map { strat => 167 | val output = ctx.freshName("output") 168 | val outputSchema = strat.schema(valueColumnSchema) 169 | val outputDataType = 170 | schemaFToDataType.apply(schemaFScalazFunctor(outputSchema)(_._1)) 171 | val javaType = ctx.boxedType(outputDataType) 172 | val cypherInSpark = 173 | ctx.addReferenceObj("cypherMe", transTypePrivacyStrategy(strat)) 174 | val code = (inputVariable: InputVariable) => s""" 175 | $javaType $output = ($javaType) $cypherInSpark.apply(${inputVariable.name}); 176 | """ 177 | CatalystCode(code, output) 178 | 179 | } 180 | .getOrElse(NoOp) 181 | (elementDataType, resOp) 182 | 183 | case value: ValueF[FieldWithInfos] if value.metadata.tags.isEmpty => 184 | val elementDataType = 185 | schemaFToDataType.apply(schemaFScalazFunctor(value)(_._1)) 186 | (elementDataType, NoOp) 187 | } 188 | 189 | ev.copy(code = Fix.birecursiveT.cataT(schema)(privacyAlg) match { 190 | case (_, NoOp) => 191 | s""" 192 | final boolean ${ev.isNull} = ($input != null) ? false : true; 193 | final InternalRow ${ev.value} = $input; 194 | """ 195 | 196 | case rec @ (topLevelDataType, CatalystCode(method, outputVariable)) => 197 | s""" 198 | ${method(InputVariable(input))} 199 | final boolean ${ev.isNull} = ($input != null) ? false : true; 200 | final InternalRow ${ev.value} = $outputVariable; 201 | """ 202 | }) 203 | } 204 | 205 | /** 206 | * Generate Catalyst Code for a struct 207 | * 208 | * @param fieldsWithDataType all the fields of the inner struct 209 | * @param tmp the variable we want to mutate 210 | * @return the code necessary to mutate a struct 211 | */ 212 | def generateCodeForStruct( 213 | ctx: CodegenContext, 214 | fieldsWithDataType: Seq[(FieldName, FieldWithInfos)], 215 | tmp: String 216 | ): CatalystCode = { 217 | fieldsWithDataType.zipWithIndex.foldLeft(CatalystCode(_ => "", tmp)) { 218 | case (buffer, ((fieldName, (elementDataType, op)), idx)) => 219 | if (op == NoOp) { 220 | buffer 221 | } else { 222 | val CatalystCode(code, intermediateOutput) = op 223 | val fieldTpe = ctx.addReferenceObj("dt", elementDataType) 224 | // we need top extract the data properly according to its element type 225 | val fieldExtractor = elementDataType match { 226 | case StructType(fields) => 227 | val numFields = fields.length 228 | s"getStruct($idx, $numFields)" 229 | case ArrayType(_, _) => 230 | s"getArray($idx)" 231 | case _ => 232 | s"get($idx, $fieldTpe)" 233 | } 234 | 235 | CatalystCode( 236 | (inputVariable: InputVariable) => 237 | s""" 238 | ${buffer.code(inputVariable)} 239 | if (!${inputVariable.name}.isNullAt($idx)) { 240 | // $fieldName 241 | ${code.apply( 242 | InputVariable(s"${inputVariable.name}.$fieldExtractor"))} 243 | $tmp.update($idx, $intermediateOutput); 244 | } 245 | """, 246 | tmp 247 | ) 248 | } 249 | } 250 | } 251 | 252 | def transTypePrivacyStrategy(strat: PrivacyStrategy): ApplyMe = { 253 | ApplyMe((value: Any) => { 254 | strat 255 | .apply(wrap(value)) 256 | .fold( 257 | errors => { 258 | errors.foreach(println) 259 | null 260 | }, 261 | x => unwrap(x.unFix) 262 | ) 263 | }) 264 | } 265 | 266 | def wrap(input: Any): Fix[DataF] = { 267 | input match { 268 | case null => Fix(GNullF()) 269 | case a: String => Fix(GStringF(a)) 270 | case a: Long => Fix(GLongF(a)) 271 | case a: java.lang.Long => Fix(GLongF(a)) 272 | case a: UTF8String => Fix(GStringF(a.toString)) 273 | case a: Double => Fix(GDoubleF(a)) 274 | case a: java.lang.Double => 
Fix(GDoubleF(a)) 275 | case a: Int => Fix(GIntF(a)) 276 | case a: java.lang.Integer => Fix(GIntF(a)) 277 | case a: Float => Fix(GFloatF(a)) 278 | case a: java.lang.Float => Fix(GFloatF(a)) 279 | case a: java.sql.Date => Fix(GDateF(a)) 280 | case a: java.sql.Timestamp => Fix(GTimestampF(a)) 281 | case _ => 282 | throw new UnsupportedOperationException( 283 | s"Input data is not supported : $input of type ${input.getClass}") 284 | } 285 | } 286 | 287 | def unwrap[A](input: DataF[A]): Any = input match { 288 | case GNullF() => null 289 | case x: GStringF[A] => UTF8String.fromString(x.value) 290 | case x: GValueF[A] => x.value 291 | case _ => 292 | throw new UnsupportedOperationException( 293 | s"Input data is not supported : $input of type ${input.getClass}") 294 | } 295 | 296 | /** 297 | * Re-construct the Spark StructType data type, from the fields after privacy 298 | * 299 | * @param fieldsWithDataType all the fields transformed after privacy 300 | * @return 301 | */ 302 | private def fieldsToSparkDataType( 303 | fieldsWithDataType: List[(FieldName, FieldWithInfos)]): StructType = { 304 | StructType(fieldsWithDataType.map { 305 | case (fieldName, (fieldDataType, _)) => 306 | StructField(fieldName, fieldDataType, nullable = true) 307 | }) 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /src/test/scala/com/github/scala/io/talk/PrivacyIntegrationTest.scala: -------------------------------------------------------------------------------- 1 | package com.github.scala.io.talk 2 | 3 | import java.io.ByteArrayOutputStream 4 | import java.security.MessageDigest 5 | import java.util.Base64 6 | 7 | import com.github.scala.io.api._ 8 | import com.github.scala.io.talk.privacy._ 9 | import javax.crypto.Cipher 10 | import javax.crypto.spec.{IvParameterSpec, SecretKeySpec} 11 | import matryoshka.data.Fix 12 | import org.apache.log4j.{Level, Logger} 13 | import org.apache.spark.sql.types.StructType 14 | import org.apache.spark.sql.{Encoders, Row, SparkSession} 15 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 16 | 17 | class PrivacyIntegrationTest 18 | extends FlatSpec 19 | with Matchers 20 | with BeforeAndAfterAll { 21 | 22 | var spark: SparkSession = _ 23 | val engines = List(LambdaEngine, CodegenEngine, MatryoshkaEngine) 24 | 25 | override def beforeAll { 26 | spark = SparkSession 27 | .builder() 28 | .appName("Dataframe encryption test") 29 | .master("local[2]") 30 | .getOrCreate() 31 | Logger.getLogger("org.apache.spark.executor.Executor").setLevel(Level.OFF) 32 | } 33 | 34 | override def afterAll { 35 | spark.stop() 36 | Logger.getLogger("org.apache.spark.executor.Executor").setLevel(Level.WARN) 37 | } 38 | 39 | def testWithEngine(engine: PrivacyEngine): Unit = { 40 | it should s"handle simple flat datasets with engine: $engine" in { 41 | val dataset = 42 | List(("AAAA", "BBBB", "CCCC", "DDDD"), ("EEEE", "FFFF", "GGGG", "HHHH")) 43 | val input = spark 44 | .createDataFrame(dataset) 45 | .toDF("first", "second", "third", "fourth") 46 | 47 | val strategies = Map( 48 | Seq(("rdfs:type", "http://schema.org/Person#pseudo")) -> new PrivacyStrategy { 49 | override val allowedInputTypes: Set[String] = Set() 50 | 51 | override def apply(data: Fix[DataF]) 52 | : Either[List[PrivacyApplicationFailure], Fix[DataF]] = { 53 | data match { 54 | case Fix(GStringF(value)) => 55 | val res = new String( 56 | Base64.getEncoder.encode( 57 | MessageDigest 58 | .getInstance("SHA1") 59 | .digest(value.toString.getBytes("UTF-8")))) 60 | 
Right(Fix(GStringF(res))) 61 | 62 | case _ => 63 | Right(Fix[DataF](GNullF())) 64 | } 65 | } 66 | } 67 | ) 68 | 69 | val schemaFix = Fix[SchemaF]( 70 | StructF( 71 | List( 72 | ( 73 | "first", 74 | Fix( 75 | StringF( 76 | ColumnMetadata.empty.copy(tags = 77 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 78 | )) 79 | ), 80 | ("second", Fix(StringF(ColumnMetadata.empty))), 81 | ("third", Fix(StringF(ColumnMetadata.empty))), 82 | ("fourth", Fix(StringF(ColumnMetadata.empty))) 83 | ), 84 | ColumnMetadata.empty 85 | )) 86 | 87 | val output = input.encrypt(schemaFix, strategies, engine) 88 | input.schema should be(input.schema) 89 | 90 | output.first() should be( 91 | Row("4lEhcqv4zJ9n/dSetsrPLfcbutM=", "BBBB", "CCCC", "DDDD")) 92 | output.collect()(1) should be( 93 | Row("xJw04gFKLaJw7q4H1zByb/3dMZY=", "FFFF", "GGGG", "HHHH")) 94 | } 95 | 96 | it should s"handle complex nested structs with engine: $engine" in { 97 | val data = 98 | """{"civility": {"familyName": "MARTIN", "gender": 1, "givenName": "FABIEN", "inner": {"civility": {"familyName": "MARTIN", "gender": 1, "givenName": "FABIEN"}, "gender": 2}}, "kind": "user#part", "lastUpdatedBy": "FICHECLIENT", "userId": "0211123586445"}""" 99 | 100 | val input = spark.read.json( 101 | spark.createDataset[String](List(data))(Encoders.STRING)) 102 | 103 | val cypher = new PrivacyStrategy { 104 | override val allowedInputTypes: Set[String] = Set() 105 | 106 | override def apply(data: Fix[DataF]) 107 | : Either[List[PrivacyApplicationFailure], Fix[DataF]] = { 108 | data match { 109 | case Fix(value: GValueF[_]) => 110 | val res = new String( 111 | Base64.getEncoder.encode( 112 | MessageDigest 113 | .getInstance("SHA1") 114 | .digest(value.value.toString.getBytes("UTF-8")))) 115 | Right(Fix(GStringF(res))) 116 | 117 | case _ => 118 | Right(Fix[DataF](GNullF())) 119 | } 120 | } 121 | 122 | override def schema[A](input: SchemaF[A]): SchemaF[A] = 123 | StringF(input.metadata) 124 | } 125 | val strategies = Map( 126 | Seq(("rdfs:type", "http://schema.org/Person#pseudo")) -> cypher, 127 | Seq(("rdfs:type", "http://schema.org/Person#interv")) -> cypher 128 | ) 129 | 130 | val tableSchema = Fix[SchemaF]( 131 | StructF( 132 | List( 133 | ( 134 | "civility", 135 | Fix(StructF( 136 | List( 137 | ( 138 | "familyName", 139 | Fix(StringF( 140 | ColumnMetadata.empty.copy(tags = 141 | List(("rdfs:type", "http://schema.org/Person#mask"))) 142 | )) 143 | ), 144 | ( 145 | "gender", 146 | Fix(LongF( 147 | ColumnMetadata.empty.copy(tags = 148 | List(("rdfs:type", "http://schema.org/Person#interv"))) 149 | )) 150 | ), 151 | ( 152 | "givenName", 153 | Fix(StringF( 154 | ColumnMetadata.empty.copy(tags = 155 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 156 | )) 157 | ), 158 | ("inner", 159 | Fix(StructF( 160 | List( 161 | ( 162 | "civility", 163 | Fix(StructF( 164 | List( 165 | ( 166 | "familyName", 167 | Fix(StringF( 168 | ColumnMetadata.empty.copy(tags = 169 | List(("rdfs:type", 170 | "http://schema.org/Person#mask"))) 171 | )) 172 | ), 173 | ( 174 | "gender", 175 | Fix(LongF( 176 | ColumnMetadata.empty.copy(tags = 177 | List(("rdfs:type", 178 | "http://schema.org/Person#interv"))) 179 | )) 180 | ), 181 | ( 182 | "givenName", 183 | Fix(StringF( 184 | ColumnMetadata.empty.copy(tags = 185 | List(("rdfs:type", 186 | "http://schema.org/Person#pseudo"))) 187 | )) 188 | ) 189 | ), 190 | ColumnMetadata.empty 191 | )) 192 | ), 193 | ( 194 | "gender", 195 | Fix(LongF( 196 | ColumnMetadata.empty.copy(tags = List( 197 | ("rdfs:type", 
"http://schema.org/Person#interv"))) 198 | )) 199 | ) 200 | ), 201 | ColumnMetadata.empty 202 | ))) 203 | ), 204 | ColumnMetadata.empty 205 | )) 206 | ), 207 | ( 208 | "kind", 209 | Fix( 210 | StringF( 211 | ColumnMetadata.empty.copy(tags = 212 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 213 | )) 214 | ), 215 | ("lastUpdatedBy", Fix(StringF(ColumnMetadata.empty))), 216 | ("userId", Fix(StringF(ColumnMetadata.empty))) 217 | ), 218 | ColumnMetadata.empty 219 | )) 220 | 221 | val outputSchemaWithPrivacy = Fix[SchemaF]( 222 | StructF( 223 | List( 224 | ( 225 | "civility", 226 | Fix(StructF( 227 | List( 228 | ( 229 | "familyName", 230 | Fix(StringF( 231 | ColumnMetadata.empty.copy(tags = 232 | List(("rdfs:type", "http://schema.org/Person#mask"))) 233 | )) 234 | ), 235 | ( 236 | "gender", 237 | Fix(StringF( 238 | ColumnMetadata.empty.copy(tags = 239 | List(("rdfs:type", "http://schema.org/Person#interv"))) 240 | )) 241 | ), 242 | ( 243 | "givenName", 244 | Fix(StringF( 245 | ColumnMetadata.empty.copy(tags = 246 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 247 | )) 248 | ), 249 | ("inner", 250 | Fix(StructF( 251 | List( 252 | ( 253 | "civility", 254 | Fix(StructF( 255 | List( 256 | ( 257 | "familyName", 258 | Fix(StringF( 259 | ColumnMetadata.empty.copy(tags = 260 | List(("rdfs:type", 261 | "http://schema.org/Person#mask"))) 262 | )) 263 | ), 264 | ( 265 | "gender", 266 | Fix(StringF( 267 | ColumnMetadata.empty.copy(tags = 268 | List(("rdfs:type", 269 | "http://schema.org/Person#interv"))) 270 | )) 271 | ), 272 | ( 273 | "givenName", 274 | Fix(StringF( 275 | ColumnMetadata.empty.copy(tags = 276 | List(("rdfs:type", 277 | "http://schema.org/Person#pseudo"))) 278 | )) 279 | ) 280 | ), 281 | ColumnMetadata.empty 282 | )) 283 | ), 284 | ( 285 | "gender", 286 | Fix(StringF( 287 | ColumnMetadata.empty.copy(tags = List( 288 | ("rdfs:type", "http://schema.org/Person#interv"))) 289 | )) 290 | ) 291 | ), 292 | ColumnMetadata.empty 293 | ))) 294 | ), 295 | ColumnMetadata.empty 296 | )) 297 | ), 298 | ( 299 | "kind", 300 | Fix( 301 | StringF( 302 | ColumnMetadata.empty.copy(tags = 303 | List(("rdfs:type", "http://schema.org/Person#pseudo"))) 304 | )) 305 | ), 306 | ("lastUpdatedBy", Fix(StringF(ColumnMetadata.empty))), 307 | ("userId", Fix(StringF(ColumnMetadata.empty))) 308 | ), 309 | ColumnMetadata.empty 310 | )) 311 | 312 | val output = input.encrypt(tableSchema, strategies, engine) 313 | 314 | val schemaAsDT = Fix.birecursiveT.cataT(outputSchemaWithPrivacy)( 315 | SchemaF.schemaFToDataType) 316 | output.schema should be(schemaAsDT.asInstanceOf[StructType]) 317 | 318 | val row = output.first() 319 | row should be( 320 | Row( 321 | Row( 322 | "MARTIN", 323 | "NWoZK3kTsExUV00Ywo1G5jlUKKs=", 324 | "ZHmSvjodAvqIT7x0Lu6YDXA8D9g=", 325 | Row(Row("MARTIN", 326 | "NWoZK3kTsExUV00Ywo1G5jlUKKs=", 327 | "ZHmSvjodAvqIT7x0Lu6YDXA8D9g="), 328 | "2kuSN7rMzfGcB2DKt67EqDWQELA=") 329 | ), 330 | "HgoSOFkFjIGGbMqW1Uz6LPIwG/M=", 331 | "FICHECLIENT", 332 | "0211123586445" 333 | )) 334 | } 335 | } 336 | 337 | engines.foreach(testWithEngine) 338 | } 339 | 340 | object SymmetricCrypt extends Serializable { 341 | 342 | def cryptoSecretToBytes(cryptoSecret: String, 343 | hexSecret: Boolean): Array[Byte] = { 344 | if (hexSecret) hexToBytes(cryptoSecret) 345 | else cryptoSecret.getBytes 346 | } 347 | 348 | def encrypt(clearText: String, 349 | cryptoSecret: String, 350 | cryptoAlgorithm: String, 351 | hexSecret: Boolean = false): String = { 352 | val textToEncrypt = Option(clearText).getOrElse("") 353 | val 
stream: ByteArrayOutputStream = new ByteArrayOutputStream
354 |     stream.write(textToEncrypt.getBytes)
355 |     var bytes: Array[Byte] = stream.toByteArray
356 |     val cipher: Cipher = Cipher.getInstance(cryptoAlgorithm)
357 |     val cryptoKey: SecretKeySpec =
358 |       new SecretKeySpec(cryptoSecretToBytes(cryptoSecret, hexSecret),
359 |                         cryptoAlgorithm.split("/")(0))
360 |     cipher.init(Cipher.ENCRYPT_MODE, cryptoKey)
361 |     bytes = cipher.doFinal(bytes)
362 |     val useInitializationVector: Boolean =
363 |       if (cryptoAlgorithm.indexOf('/') < 0) false
364 |       else cryptoAlgorithm.split("/")(1).toUpperCase != "ECB"
365 |     if (useInitializationVector) {
366 |       val iv: Array[Byte] = cipher.getIV
367 |       val out2: Array[Byte] = new Array[Byte](iv.length + 1 + bytes.length)
368 |       out2(0) = iv.length.toByte
369 |       System.arraycopy(iv, 0, out2, 1, iv.length)
370 |       System.arraycopy(bytes, 0, out2, 1 + iv.length, bytes.length)
371 |       bytes = out2
372 |     }
373 |     val cryptedData: String = Base64.getUrlEncoder.encodeToString(bytes)
374 |     cryptedData
375 |   }
376 | 
377 |   def hexToBytes(str: String): Array[Byte] = {
378 |     if (str == null) {
379 |       null
380 |     } else if (str.length < 2) {
381 |       null
382 |     } else {
383 |       val len = str.length / 2
384 |       val buffer = new Array[Byte](len)
385 |       var i = 0
386 |       while (i < len) {
387 |         buffer(i) = Integer.parseInt(str.substring(i * 2, i * 2 + 2), 16).toByte
388 |         i = i + 1
389 |       }
390 |       buffer
391 |     }
392 |   }
393 | 
394 |   def decrypt(
395 |       cryptedData: String,
396 |       cryptoSecret: String,
397 |       cryptoAlgorithm: String,
398 |       hexSecret: Boolean = false
399 |   ): String = {
400 |     val cipher: Cipher = Cipher.getInstance(cryptoAlgorithm)
401 |     val cryptoKey: SecretKeySpec =
402 |       new SecretKeySpec(cryptoSecretToBytes(cryptoSecret, hexSecret),
403 |                         cryptoAlgorithm.split("/")(0))
404 |     val useInitializationVector: Boolean =
405 |       if (cryptoAlgorithm.indexOf('/') < 0) false
406 |       else cryptoAlgorithm.split("/")(1).toUpperCase != "ECB"
407 |     var cryptedBytes: Array[Byte] = Base64.getUrlDecoder().decode(cryptedData)
408 |     if (useInitializationVector) {
409 |       val ivLen: Int = cryptedBytes(0)
410 |       val ivSpec: IvParameterSpec = new IvParameterSpec(cryptedBytes, 1, ivLen)
411 |       cipher.init(Cipher.DECRYPT_MODE, cryptoKey, ivSpec)
412 |       cryptedBytes =
413 |         cipher.doFinal(cryptedBytes, 1 + ivLen, cryptedBytes.length - 1 - ivLen)
414 |     } else {
415 |       cipher.init(Cipher.DECRYPT_MODE, cryptoKey)
416 |       cryptedBytes = cipher.doFinal(cryptedBytes)
417 |     }
418 |     new String(cryptedBytes)
419 |   }
420 | }
--------------------------------------------------------------------------------
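As a closing illustration, here is a minimal sketch of how `SymmetricCrypt` could back a reusable `PrivacyStrategy`. The `SymmetricCryptStrategy` class and the tag/secret values below are illustrative assumptions, not part of the repository, and the secret must be a valid AES key (16, 24 or 32 bytes):

```scala
import com.github.scala.io.talk._
import com.github.scala.io.talk.privacy._
import matryoshka.data.Fix

// Hypothetical strategy wrapping SymmetricCrypt: encrypts tagged string
// values in place and leaves nulls untouched.
class SymmetricCryptStrategy(secret: String,
                             algorithm: String = "AES/CBC/PKCS5Padding")
    extends PrivacyStrategy {

  override val allowedInputTypes: Set[String] = Set("string")

  override def apply(data: Fix[DataF])
    : Either[List[PrivacyApplicationFailure], Fix[DataF]] =
    data match {
      case Fix(GStringF(value)) =>
        Right(Fix(GStringF(SymmetricCrypt.encrypt(value, secret, algorithm))))
      case Fix(GNullF()) =>
        Right(data)
      case other =>
        Left(List(PrivacyApplicationFailure(s"cannot encrypt non-string value $other")))
    }
}

// Keyed by the same semantic tag used in the integration test;
// any of the three engines can then apply it via df.encrypt(...).
val strategies: PrivacyStrategy.PrivacyStrategies = Map(
  Seq(("rdfs:type", "http://schema.org/Person#pseudo")) ->
    new SymmetricCryptStrategy("0123456789abcdef")
)
```

Unlike the one-way SHA-1 digests used in the tests above, this transformation stays reversible through `SymmetricCrypt.decrypt` for whoever holds the secret.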