├── src
    ├── main
    │   └── scala
    │   │   └── com
    │   │       └── zeotap
    │   │           └── utility
    │   │               └── spark
    │   │                   ├── types
    │   │                       ├── SparkDataframe.scala
    │   │                       ├── MapColumn.scala
    │   │                       ├── DataColumn.scala
    │   │                       └── ArrayColumn.scala
    │   │                   ├── traits
    │   │                       ├── DColumn.scala
    │   │                       ├── DataGenerator.scala
    │   │                       ├── DataType.scala
    │   │                       └── DataOption.scala
    │   │                   ├── ops
    │   │                       ├── ArrayColumnOps.scala
    │   │                       ├── SparkDataframeOps.scala
    │   │                       ├── DataGenerationOps.scala
    │   │                       └── DataColumnOps.scala
    │   │                   └── example
    │   │                       ├── types
    │   │                           └── CookieArrayColumn.scala
    │   │                       ├── helper
    │   │                           ├── ColumnConstants.scala
    │   │                           └── UserDefinedColumns.scala
    │   │                       └── generator
    │   │                           └── RandomDataGenerator.scala
    └── test
    │   └── scala
    │       └── com
    │           └── zeotap
    │               └── utility
    │                   └── spark
    │                       └── SparkDataframeOpsTest.scala
├── .gitignore
├── CONTRIBUTING.md
├── README.md
├── CODE_OF_CONDUCT.md
└── LICENSE


/src/main/scala/com/zeotap/utility/spark/types/SparkDataframe.scala:
--------------------------------------------------------------------------------
1 | package com.zeotap.utility.spark.types
2 | 
3 | import com.zeotap.utility.spark.traits.DColumn
4 | 
5 | case class SparkDataframe(dataColumns: DColumn*)
6 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/traits/DColumn.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.traits
 2 | 
 3 | import org.apache.spark.sql.types.StructField
 4 | import org.scalacheck.Gen
 5 | 
 6 | trait DColumn { // Contract for a column that can describe its Spark schema and generate its own data.
 7 |   def generateSchema: StructField // Spark StructField describing this column
 8 |   // Column name; SparkDataframeOps passes it to the custom-field generator.
 9 |   def getName: String
10 |   // ScalaCheck generator for this column's values; the implementations in this project cast to Gen[A] unchecked.
11 |   def dataGenerator[A]: Gen[A]
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/traits/DataGenerator.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.traits
 2 | 
 3 | import org.scalacheck.Gen
 4 | 
 5 | trait DataGenerator[A] { // Type class: builds a Gen[A] from a DataOption and a list of candidate values.
 6 |   def get(data: DataOption, values: List[A]): Gen[A]
 7 | }
 8 | object DataGenerator{ // Summoner, so callers can write DataGenerator[T].get(...)
 9 |   def apply[A](implicit instance:DataGenerator[A]): DataGenerator[A] = instance // returns the implicit instance in scope
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/traits/DataType.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.traits
 2 | 
 3 | sealed trait DataType
 4 | 
 5 | final case object DString extends DataType
 6 | 
 7 | final case object DInteger extends DataType
 8 | 
 9 | final case object DBoolean extends DataType
10 | 
11 | final case object DDouble extends DataType
12 | 
13 | final case object DLong extends DataType
14 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/ops/ArrayColumnOps.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.ops
 2 | 
 3 | import com.zeotap.utility.spark.ops.DataColumnOps.DataColumnUtils
 4 | import com.zeotap.utility.spark.types.ArrayColumn
 5 | 
 6 | object ArrayColumnOps {
 7 |   implicit class ArrayColumnExt(arr: ArrayColumn) {
 8 |     def withJunk = arr.copy(dataColumn = arr.dataColumn.withJunk)
 9 | 
10 |     def withNull = arr.copy(dataColumn = arr.dataColumn.withNull)
11 |   }
12 | }
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | target/
 2 | .project
 3 | .idea
 4 | .bsp
 5 | .iml
 6 | .settings
 7 | .classpath
 8 | *.class
 9 | 
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 | 
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.ear
17 | 
18 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
19 | hs_err_pid*
20 | 
21 | # intellij files
22 | *.iml
23 | .idea/
24 | 
25 | # sqlite file
26 | *.db
27 | 
28 | # log file
29 | *.log
30 | 
31 | *.DS_Store
32 | 
33 | **/dependency-reduced-pom.xml
34 | 
35 | **/*.pyc
36 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/ops/SparkDataframeOps.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.ops
 2 | 
 3 | import com.holdenkarau.spark.testing.{Column, DataframeGenerator}
 4 | import com.zeotap.utility.spark.types.SparkDataframe
 5 | import org.apache.spark.sql.types.StructType
 6 | import org.apache.spark.sql.{DataFrame, SparkSession}
 7 | 
 8 | object SparkDataframeOps {
 9 |   // Extension methods that turn a SparkDataframe column description into generated Spark DataFrames.
10 |   implicit class SparkOps(dataGenerators: SparkDataframe) {
11 |     def getOne()(implicit sparkSession: SparkSession): Option[DataFrame] = getArbitraryGenerator().sample // a single sample of the generator
12 |     // Builds a DataFrame generator from getSchema(), using each column's own dataGenerator as its custom field generator.
13 |     def getArbitraryGenerator()(implicit sparkSession: SparkSession) =
14 |       DataframeGenerator.arbitraryDataFrameWithCustomFields(sparkSession.sqlContext, getSchema())(dataGenerators
15 |         .dataColumns.map(a => new Column(a.getName, a.dataGenerator)): _*).arbitrary
16 |     // One StructField per column, in the order the columns were passed to SparkDataframe.
17 |     def getSchema(): StructType = StructType(dataGenerators.dataColumns.map(x => x.generateSchema))
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | >First off, thank you for considering contributing to spark-property-tests.
 2 | 
 3 | ## Contributions
 4 | spark-property-tests is an open source project and we love to receive contributions from our community — you! There are many ways to contribute, from writing tutorials or blog posts, improving the documentation, submitting bug reports and feature requests or writing code which can be incorporated into spark-property-tests itself.
 5 | 
 6 | ## How
 7 | Create issues for any major bugs and enhancements that you wish to make. Discuss things transparently and get community feedback.
 8 | 
 9 | For **something that is bigger** than a one or two line fix:
10 | 
11 | - Create your own fork of the code
12 | - Do the changes in your fork
13 | - If you like the change and think the project could use it: 
14 |   * Be sure you have followed the code style for the project. 
15 |   * Note the Code of Conduct. 
16 |   * Raise a pull request.
17 | 
18 | 
19 | ## Avoid
20 | Please don't use the issue tracker for support questions. Send an email to team.data-engineering@zeotap.com instead and we will be happy to get back to you.
21 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/example/types/CookieArrayColumn.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.example.types
 2 | 
 3 | import com.zeotap.utility.spark.example.helper.ColumnConstants.ID_TYPE
 4 | import com.zeotap.utility.spark.example.helper.ColumnConstants
 5 | import com.zeotap.utility.spark.traits.DColumn
 6 | import com.zeotap.utility.spark.types.DataColumn
 7 | import org.apache.spark.sql.Row
 8 | import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}
 9 | import org.scalacheck.Gen
10 | 
11 | 
12 | case class CookieArrayColumn(name: String, idType: DataColumn,
13 |                              idValue: DataColumn, maxSize: Int) extends DColumn {
14 | 
15 |   override def generateSchema: StructField = StructField(name,
16 |     ArrayType(new StructType().add("id_type", StringType).add("id_value", StringType)))
17 | 
18 |   override def getName: String = name
19 | 
20 |   override def dataGenerator[A]: Gen[A] = {
21 |     val arrayGen = for {
22 |       k <- idType.dataGenerator[String]
23 |       v <- idValue.dataGenerator[String]
24 |     } yield Row(k, v)
25 |     Gen.containerOfN[Array, Row](maxSize, arrayGen)
26 |   }.asInstanceOf[Gen[A]]
27 | }
28 | 
29 | object CookieArrayColumn {
30 |   def cookieArrayColumn(name: String = "_cookieArray", maxSize: Int = 5)
31 |   = CookieArrayColumn(name, ID_TYPE, ColumnConstants.ADID.copy(name = "id_value"), maxSize)
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/ops/DataGenerationOps.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.ops
 2 | 
 3 | import com.zeotap.utility.spark.traits._
 4 | import org.scalacheck.Gen
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | 
 9 | object DataGenerationOps { // Implicit DataGenerator instances for the supported boxed types, plus the shared sampling helpers.
10 |   implicit val booleanGen: DataGenerator[java.lang.Boolean] = new DataGenerator[java.lang.Boolean] {
11 |     override def get(data: DataOption, values: List[java.lang.Boolean]): Gen[java.lang.Boolean] = getHelper(data, values)
12 |   }
13 | 
14 |   implicit val longGen: DataGenerator[java.lang.Long] = new DataGenerator[java.lang.Long] {
15 |     override def get(data: DataOption, values: List[java.lang.Long]): Gen[java.lang.Long] = getHelper(data, values)
16 |   }
17 | 
18 |   implicit val doubleGen: DataGenerator[java.lang.Double] = new DataGenerator[java.lang.Double] {
19 |     override def get(data: DataOption, values: List[java.lang.Double]): Gen[java.lang.Double] = getHelper(data, values)
20 |   }
21 | 
22 |   implicit val integerGen: DataGenerator[java.lang.Integer] = new DataGenerator[java.lang.Integer] {
23 |     override def get(data: DataOption, values: List[java.lang.Integer]): Gen[java.lang.Integer] = getHelper(data, values)
24 |   }
25 | 
26 |   implicit val stringGen: DataGenerator[String] = new DataGenerator[String] {
27 |     override def get(data: DataOption, values: List[String]): Gen[String] = getHelper(data, values)
28 |   }
29 |   // Maps a DataOption to a sampling strategy: plain pick, equal-weight frequency, or skewed frequency.
30 |   def getHelper[T](data: DataOption, values: List[T]): Gen[T] = data match {
31 |     case AlwaysPresent => Gen.oneOf(values)
32 |     case AlwaysUniform => generatorWithFrequency(values, List.fill(values.length)(1))
33 |     case AlwaysSkewed => generatorWithFrequency(values, getSkewedFrequency(values.length))
34 |   }
35 |   // Weights for AlwaysSkewed: two weights of 9, then a random 0 or 1 for each remaining value.
36 |   def getSkewedFrequency(length: Int): List[Int] = List.fill(2)(9) ::: List.fill(length - 2)(Random.nextInt(2))
37 |   // Pairs the weights with a shuffled copy of values; zip drops whatever the shorter list cannot pair.
38 |   def generatorWithFrequency[A](values: List[A], frequency: List[Int]): Gen[A] =
39 |     Gen.frequency((frequency zip Random.shuffle(values)).map { case (weight, value) => (weight, Gen.const(value)) }: _*)
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/traits/DataOption.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.traits
 2 | 
 3 | sealed trait DataOption
 4 | 
 5 | final case object AlwaysPresent extends DataOption
 6 | 
 7 | //  Using AlwaysUniform one can expect approximately uniform distribution
 8 | //          +-----------------+-----+         +-----------------+-----+
 9 | //          |Income_preprocess|count|         |Income_preprocess|count|
10 | //          +-----------------+-----+         +-----------------+-----+
11 | //          |              0.7|    5|         |              0.7|   18|
12 | //          |              0.1|    5|         |              0.1|   14|
13 | //          |              0.8|    3|         |              0.8|   20|
14 | //          |              0.4|    7|         |              0.4|   18|
15 | //          +-----------------+-----+         +-----------------+-----+
16 | 
17 | final case object AlwaysUniform extends DataOption
18 | 
19 | //  Using AlwaysSkewed one can expect a distribution having two values with frequency 6-10 times the others
20 | //          +-----------------+-----+           +-----------------+-----+         +-----------------+-----+
21 | //          |Income_preprocess|count|           |Income_preprocess|count|         |Income_preprocess|count|
22 | //          +-----------------+-----+           +-----------------+-----+         +-----------------+-----+
23 | //          |              0.7|    4|           |              0.7|    4|         |              0.7|    2|
24 | //          |              0.1|   14|           |              0.1|   23|         |              0.1|   12|
25 | //          |              0.8|    2|           |              0.8|    4|         |              0.4|    6|
26 | //          |              0.4|   10|           |              0.4|   19|         +-----------------+-----+
27 | //          +-----------------+-----+           +-----------------+-----+
28 | 
29 | final case object AlwaysSkewed extends DataOption
30 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/types/MapColumn.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.types
 2 | 
 3 | import com.zeotap.utility.spark.example.helper.ColumnConstants.DefaultCollectionSize
 4 | import com.zeotap.utility.spark.ops.DataColumnOps.DataColumnUtils
 5 | import com.zeotap.utility.spark.traits.{AlwaysPresent, DColumn, DataType}
 6 | import com.zeotap.utility.spark.types.DataColumn.dataColumn
 7 | import org.apache.spark.sql.types.{MapType, StructField}
 8 | import org.scalacheck.Gen
 9 | 
10 | case class MapColumn(name: String, key: DataColumn, value: DataColumn, maxMapSize: Int) extends DColumn {
11 |   override def generateSchema: StructField = StructField(name,
12 |     MapType(key.getSparkCompatiblePrimitiveTypes, value.getSparkCompatiblePrimitiveTypes, valueContainsNull = false), nullable = true)
13 |   // Column name as given at construction.
14 |   override def getName: String = name
15 |   // Generates maps whose entries pair a value from the key column with a value from the value column.
16 |   override def dataGenerator[A]: Gen[A] = {
17 |     val tupleGen = for {
18 |       k <- key.dataGenerator[Any]
19 |       v <- value.dataGenerator[Any]
20 |     } yield (k, v)
21 |     // Random.nextInt rejects a non-positive bound, so a maxMapSize <= 0 now yields empty maps instead of throwing.
22 |     val containerSize = if (maxMapSize > 0) scala.util.Random.nextInt(maxMapSize) else 0
23 |     Gen.mapOfN(containerSize, tupleGen)
24 |   }.asInstanceOf[Gen[A]]
25 | }
26 | 
27 | object MapColumn {
28 |   def mapColumn(name: String, key: DataColumn, value: DataColumn, maxMapSize: Int = DefaultCollectionSize): MapColumn =
29 |     MapColumn(name, key, value, maxMapSize)
30 | 
31 |   def mapColumn(name: String, keys: List[String], values: List[String], keyType: DataType,
32 |                 valueType: DataType): MapColumn = MapColumn(name,
33 |     dataColumn(name, keyType, AlwaysPresent, keys), dataColumn(name, valueType, AlwaysPresent, values), DefaultCollectionSize)
34 | 
35 |   def mapColumn(name: String, keys: List[String], values: List[String], keyType: DataType,
36 |                 valueType: DataType, maxMapSize: Int): MapColumn = MapColumn(name,
37 |     dataColumn(name, keyType, AlwaysPresent, keys), dataColumn(name, valueType, AlwaysPresent, values), maxMapSize)
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/types/DataColumn.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.types
 2 | 
 3 | 
 4 | import com.zeotap.utility.spark.ops.DataColumnOps.{DataColumnUtils, getBoolean, getDouble, getInteger, getLong}
 5 | import com.zeotap.utility.spark.ops.DataGenerationOps
 6 | import com.zeotap.utility.spark.traits._
 7 | import org.apache.spark.sql.types.{DataType => _, _}
 8 | import org.scalacheck.Gen
 9 | 
10 | case class DataColumn(name: String, dataType: DataType, options: DataOption, values: List[String]) extends DColumn {
11 |   override def generateSchema: StructField = StructField(name, this.getSparkCompatiblePrimitiveTypes, true)
12 | 
13 |   override def getName: String = name
14 | 
15 |   override def dataGenerator[A]: Gen[A] = {
16 |     import DataGenerationOps._
17 |     dataType match {
18 |       case DString => DataGenerator[String].get(options, values)
19 |       case DBoolean => DataGenerator[java.lang.Boolean].get(options, getBoolean(values))
20 |       case DDouble => DataGenerator[java.lang.Double].get(options, getDouble(values))
21 |       case DLong => DataGenerator[java.lang.Long].get(options, getLong(values))
22 |       case DInteger => DataGenerator[java.lang.Integer].get(options, getInteger(values))
23 |     }
24 |   }.asInstanceOf[Gen[A]]
25 | }
26 | 
27 | object DataColumn {
28 |   def dataColumn(name: String, datatype: DataType, options: DataOption, values: List[String]) =
29 |     DataColumn(name, datatype, options, values)
30 | 
31 |   def stringColumn(name: String, values: List[String]) = DataColumn(name, DString, AlwaysPresent, values)
32 | 
33 |   def intColumn(name: String, values: List[String]) = DataColumn(name, DInteger, AlwaysPresent, values)
34 | 
35 |   def boolColumn(name: String, values: List[String]) = DataColumn(name, DBoolean, AlwaysPresent, values)
36 | 
37 |   def doubleColumn(name: String, values: List[String]) = DataColumn(name, DDouble, AlwaysPresent, values)
38 | 
39 |   def longColumn(name: String, values: List[String]) = DataColumn(name, DLong, AlwaysPresent, values)
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/example/helper/ColumnConstants.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.example.helper
 2 | 
 3 | import com.zeotap.utility.spark.example.generator.RandomDataGenerator
 4 | import com.zeotap.utility.spark.traits._
 5 | import com.zeotap.utility.spark.types.DataColumn
 6 | 
 7 | object ColumnConstants {
 8 |   final val DefaultCollectionSize = 120
 9 |   final val JavaNull = null
10 | 
11 |   final val ZUID = DataColumn("zuid", DString, AlwaysPresent, RandomDataGenerator.UUID(20))
12 | 
13 |   final val AGE = DataColumn("age", DInteger, AlwaysPresent, RandomDataGenerator.age(20, 12, 100))
14 | 
15 |   final val GENDER = DataColumn("gender", DString, AlwaysPresent, List("Male", "Female"))
16 | 
17 |   final val APPUSAGE = DataColumn("appusage", DString, AlwaysPresent, List("[[com.picsart.studio, android, BRA, 2021-03-24]]",
18 |     "[[com.vidfake.scarymo, android, BRA, 2021-03-24]]", "[[341232718, ios, BRA, 2021-03-27], [997362197, ios, BRA, 2021-03-24]]"))
19 | 
20 |   final val APPCATEGORY = DataColumn("appcategory", DString, AlwaysPresent, RandomDataGenerator.appCategory(20))
21 | 
22 |   final val RAW_IAB = DataColumn("rawIAB", DString, AlwaysPresent, RandomDataGenerator.rawIAB(20))
23 | 
24 |   final val ADID = DataColumn("adid", DString, AlwaysPresent, RandomDataGenerator.UUID(20))
25 | 
26 |   final val DEVICEOS = DataColumn("deviceos", DString, AlwaysPresent, List("iOS", "Android"))
27 | 
28 |   final val COUNTRYCODE = DataColumn("countrycode", DString, AlwaysPresent, RandomDataGenerator.country(20))
29 | 
30 |   final val OTR = DataColumn("otr", DDouble, AlwaysPresent, RandomDataGenerator.OTR(20))
31 | 
32 |   final val BUNDLEID = DataColumn("bundleid", DString, AlwaysPresent, RandomDataGenerator.bundleid(20))
33 | 
34 |   final val TIMESTAMP = DataColumn("timestamp", DLong, AlwaysPresent, RandomDataGenerator.timestamp(20))
35 | 
36 |   final val DATE = DataColumn("date", DString, AlwaysPresent, RandomDataGenerator.date(20))
37 | 
38 |   final val ID_TYPE = DataColumn("id_type", DString, AlwaysPresent, RandomDataGenerator.idType(20))
39 | }
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # spark-property-tests
 2 | 
 3 | [![Join the chat at https://gitter.im/zeotap/spark-property-tests](https://badges.gitter.im/zeotap/spark-property-tests.svg)](https://gitter.im/zeotap/spark-property-tests?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 4 | 
 5 | Write property based tests easily on spark dataframes
 6 | 
 7 | ## Why
 8 | While writing tests for Spark code, we tend to write a lot of boilerplate just to create a test Spark dataframe initialised with some test data. Not only are these test sets hard to read, but they also do not adhere to property-based testing standards.
 9 | 
10 | We needed a utility that would have 
11 | 
12 | 🥇 less boilerplate code
13 | 
14 | 🥇 easily extensible interface for your custom use-cases
15 | 
16 | 🥇 easily build out-of-box support for most common attributes in your data/project
17 | 
18 | 🥇 promote usage of Property-based tests
19 | 
20 | This utility is based on the [spark-testing-base library by Holden Karau](https://github.com/holdenk/spark-testing-base)
21 | 
22 | ## Usage
23 | Please go through the [Wiki](https://github.com/zeotap/spark-property-tests/wiki) to understand the usage of the library. 
24 | 
25 | We have made use of Typeclasses in Scala and Generators in [scalacheck](https://github.com/typelevel/scalacheck/blob/main/doc/UserGuide.md) to present some simple interfaces to write easy property-based-tests in spark.
26 | 
27 | Additionally, we have provided examples of how you can leverage the library for your own organization under package `com.zeotap.utility.spark.example`
28 | 
29 | ## Dependency Management
30 | The list of available versions can be found at [this Maven Repo Link](https://mvnrepository.com/artifact/com.zeotap/spark-property-tests)
31 | 
32 | Choose one of the available versions - in general, each release of this library uses the Spark version it targets as its own version number
33 | 
34 | sbt
35 | ```
36 | "com.zeotap" %% "spark-property-tests" % "2.4.5" % "test"
37 | ```
38 | maven
39 | ```
40 | <dependency>
41 |     <groupId>com.zeotap</groupId>
42 |     <artifactId>spark-property-tests_2.11</artifactId>
43 |     <version>2.4.5</version>
44 |     <scope>test</scope>
45 | </dependency>
46 | ```
47 | 
48 | ## Build
49 | The project is built using `sbt`
50 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/types/ArrayColumn.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.types
 2 | 
 3 | import com.zeotap.utility.spark.example.helper.ColumnConstants.DefaultCollectionSize
 4 | import com.zeotap.utility.spark.ops.DataColumnOps.DataColumnUtils
 5 | import com.zeotap.utility.spark.traits._
 6 | import com.zeotap.utility.spark.types.DataColumn.dataColumn
 7 | import org.apache.spark.sql.types.{DataType => _, _}
 8 | import org.scalacheck.Gen
 9 | 
10 | case class ArrayColumn(dataColumn: DataColumn, maxArraySize: Int) extends DColumn {
11 |   override def generateSchema: StructField = StructField(dataColumn.name, ArrayType(dataColumn.getSparkCompatiblePrimitiveTypes), nullable = true)
12 |   // The array column reuses the name of the element column it wraps.
13 |   override def getName: String = dataColumn.getName
14 |   // Generates arrays of the element column's values; the length is drawn once, when the generator is built.
15 |   override def dataGenerator[A]: Gen[A] = {
16 |     val containerSize = if (maxArraySize > 0) scala.util.Random.nextInt(maxArraySize) else 0 // nextInt rejects a non-positive bound
17 |     dataColumn.dataType match {
18 |       case DString => Gen.containerOfN[Array, String](containerSize, dataColumn.dataGenerator.asInstanceOf[Gen[String]])
19 |       case DBoolean => Gen.containerOfN[Array, java.lang.Boolean](containerSize, dataColumn.dataGenerator.asInstanceOf[Gen[java.lang.Boolean]])
20 |       case DDouble => Gen.containerOfN[Array, java.lang.Double](containerSize, dataColumn.dataGenerator.asInstanceOf[Gen[java.lang.Double]])
21 |       case DLong => Gen.containerOfN[Array, java.lang.Long](containerSize, dataColumn.dataGenerator.asInstanceOf[Gen[java.lang.Long]])
22 |       case DInteger => Gen.containerOfN[Array, java.lang.Integer](containerSize, dataColumn.dataGenerator.asInstanceOf[Gen[java.lang.Integer]])
23 |     }
24 |   }.asInstanceOf[Gen[A]]
25 | }
26 | 
27 | object ArrayColumn {
28 | 
29 |   def arrayColumn(dataColumn: DataColumn, maxArraySize: Int = DefaultCollectionSize): ArrayColumn = ArrayColumn(dataColumn, maxArraySize)
30 | 
31 |   def arrayColumn(name: String, dataType: DataType, options: DataOption, values: List[String], maxArraySize: Int): ArrayColumn =
32 |     ArrayColumn(dataColumn(name, dataType, options, values), maxArraySize)
33 | 
34 |   def arrayColumn(name: String, dataType: DataType, options: DataOption, values: List[String]): ArrayColumn =
35 |     ArrayColumn(dataColumn(name, dataType, options, values), DefaultCollectionSize)
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/ops/DataColumnOps.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.ops
 2 | 
 3 | import com.zeotap.utility.spark.traits._
 4 | import com.zeotap.utility.spark.types._
 5 | import org.apache.spark.sql.types
 6 | import org.apache.spark.sql.types._
 7 | 
 8 | 
 9 | object DataColumnOps {
10 |   // Extension methods on DataColumn, plus the String-to-boxed-value converters used by DataColumn.dataGenerator.
11 |   implicit class DataColumnUtils(dc: DataColumn) {
12 |     // The as* methods return a copy of the column with only its declared data type changed.
13 |     def asString: DataColumn = dc.copy(dataType = DString)
14 | 
15 |     def asInt: DataColumn = dc.copy(dataType = DInteger)
16 | 
17 |     def asLong: DataColumn = dc.copy(dataType = DLong)
18 | 
19 |     def asDouble: DataColumn = dc.copy(dataType = DDouble)
20 | 
21 |     def asBoolean: DataColumn = dc.copy(dataType = DBoolean)
22 |     // Prepends a null entry to the candidate values so generated data can contain nulls.
23 |     def withNull: DataColumn = dc.copy(values = null :: dc.values)
24 | 
25 |     /* *
26 |     * Using withJunk for the below datatypes you can expect these junk values in your DF
27 |     * DString => "junkValue", "null", "", " "
28 |     * DInteger => 2147483647, -2147483648
29 |     * DLong => 9223372036854775807, -9223372036854775808
30 |     * DDouble => 1.7976931348623157E308, -1.7976931348623157E308
31 |     * DBoolean => No junk values will be there for boolean type
32 |     */
33 | 
34 |     def withJunk: DataColumn = dc.dataType match {
35 |       case DString => dc.copy(values = List("junkValue", "null", "", " ") ::: dc.values)
36 |       case DInteger => dc.copy(values = List(Int.MaxValue, Int.MinValue).map(_.toString) ::: dc.values)
37 |       case DLong => dc.copy(values = List(Long.MaxValue, Long.MinValue).map(_.toString) ::: dc.values)
38 |       case DDouble => dc.copy(values = List(Double.MaxValue, Double.MinValue).map(_.toString) ::: dc.values)
39 |       case DBoolean => dc
40 |     }
41 |     // Spark SQL type matching the column's declared data type.
42 |     def getSparkCompatiblePrimitiveTypes: types.DataType = dc.dataType match {
43 |       case DString => StringType
44 |       case DInteger => IntegerType
45 |       case DLong => LongType
46 |       case DDouble => DoubleType
47 |       case DBoolean => BooleanType
48 |     }
49 |   }
50 |   // The converters below keep null entries as null and throw on text that does not parse as the target type.
51 |   def getInteger(values: List[String]): List[java.lang.Integer] = // valueOf replaces the deprecated boxed constructors
52 |     values.map(x => if (x == null) null else java.lang.Integer.valueOf(x.toInt))
53 | 
54 |   def getDouble(values: List[String]): List[java.lang.Double] =
55 |     values.map(x => if (x == null) null else java.lang.Double.valueOf(x.toDouble))
56 | 
57 |   def getBoolean(values: List[String]): List[java.lang.Boolean] =
58 |     values.map(x => if (x == null) null else java.lang.Boolean.valueOf(x.toBoolean))
59 | 
60 |   def getLong(values: List[String]): List[java.lang.Long] =
61 |     values.map(x => if (x == null) null else java.lang.Long.valueOf(x.toLong))
62 | }


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/example/generator/RandomDataGenerator.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.example.generator
 2 | 
 3 | import java.text.SimpleDateFormat
 4 | import java.util.Date
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | object RandomDataGenerator {
 9 | 
10 |   def age(count: Int, start: Int = 12, end: Int = 100): List[String] = {
11 |     List.fill(count)(start + Random.nextInt((end - start) + 1)).map(x => x.toString)
12 |   }
13 | 
14 |   def UUID(count: Int): List[String] = {
15 |     List.fill(count)(java.util.UUID.randomUUID().toString)
16 |   }
17 | 
18 |   def OTR(count: Int): List[String] = {
19 |     List.fill(count)((Random.nextDouble() * 100).ceil.toString())
20 |   }
21 | 
22 |   def bundleid(count: Int): List[String] = {
23 |     val values = List("droom.sleepIfUCan", "com.Deven.Arrow3D", "com.grindrapp.android", "net.zedge.android", "call.recorder.automatic.acr",
24 |       "com.rubygames.assassin", "997362197", "com.pixel.art.coloring.color.number", "com.mobisystems.msdict.embedded.wireless.svcon.tlen.full",
25 |       "com.lyrebirdstudio.collage", "com.milleniumapps.freealarmclock", "bp.free.puzzle.game.mahjong.onet", "art.color.planet.paint.by.number.game.puzzle.free",
26 |       "com.bandagames.mpuzzle.gp", "com.nextwave.wcc_lt", "short.video.app", "com.crazylabs.acrylic.nails", "cn.wps.moffice_eng", "com.best.lucky.forecast",
27 |       "com.easybrain.nonogram", "1533397036", "io.voodoo.crowdcity", "kik.android", "com.hideitpro", "com.ohmgames.cheatandrun", "com.smule.singandroid",
28 |       "com.hld.anzenbokusucal", "fast.phone.clean", "com.thinkyeah.galleryvault", "multi.parallel.dualspace.cloner")
29 |     text(count, values)
30 |   }
31 | 
32 |   def rawIAB(count: Int): List[String] = {
33 |     val values = List("IAB20_14", "IAB19", "IAB19,IAB19,IAB19,IAB19,IAB1_5", "IAB19,IAB19_47", "IAB19_18,IAB1_5,IAB1_5",
34 |       "IAB19_9", "IAB19,IAB19,IAB19,IAB10", "IAB2_1,IAB16_3,IAB16_3", "IAB19_9,IAB19_56,IAB19")
35 |     text(count, values)
36 |   }
37 | 
38 |   def text(count: Int, values: List[String]): List[String] = {
39 |     Random.shuffle(values).take(count)
40 |   }
41 | 
42 |   def appCategory(count: Int): List[String] = {
43 |     val values = List("Entertainment", "Games", "Social", "News")
44 |     text(count, values)
45 |   }
46 | 
47 |   def date(count: Int): List[String] = {
48 |     val jdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
49 |     timestamp(count).map(x => {
50 |       jdf.format(new Date(x.toLong * 1000L))
51 |     })
52 |   }
53 | 
54 |   def timestamp(count: Int): List[String] = {
55 |     List.fill(count)((System.currentTimeMillis / 1000 - Random.nextInt(50000000)).toString)
56 |   }
57 | 
58 |   def country(count: Int): List[String] = {
59 |     val values = List("CAN", "USA", "MEX", "ITA", "FRA", "BGD", "DEU", "FIN", "POL", "COL", "CHL")
60 |     text(count, values)
61 |   }
62 | 
63 |   def idType(count: Int): List[String] = text(count, List("id_mid_1", "id_mid_12", "id_mid_13", "id_mid_4",
64 |     "id_mid_5", "id_mid_6", "id_mid_7", "id_mid_8", "id_mid_9", "id_mid_10", "id_mid_11"))
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/scala/com/zeotap/utility/spark/example/helper/UserDefinedColumns.scala:
--------------------------------------------------------------------------------
 1 | package com.zeotap.utility.spark.example.helper
 2 | 
 3 | import com.zeotap.utility.spark.example.helper.ColumnConstants._
 4 | import com.zeotap.utility.spark.traits.DataOption
 5 | 
 6 | object UserDefinedColumns { // Factories for the columns in ColumnConstants; each comes as (), (option) and (option, values)
 7 |   def zuid() = ZUID // the predefined column, unchanged
 8 | 
 9 |   def zuid(option: DataOption) = ZUID.copy(options = option) // copy of the predefined column with its options replaced
10 | 
11 |   def zuid(option: DataOption, values: List[String]) = ZUID.copy(options = option, values = values) // copy with both options and values replaced
12 | 
13 |   def age() = AGE
14 | 
15 |   def age(option: DataOption) = AGE.copy(options = option)
16 | 
17 |   def age(option: DataOption, values: List[String]) = AGE.copy(options = option, values = values)
18 | 
19 |   def gender() = GENDER
20 | 
21 |   def gender(option: DataOption) = GENDER.copy(options = option)
22 | 
23 |   def gender(option: DataOption, values: List[String]) = GENDER.copy(options = option, values = values)
24 | 
25 |   def appUsageAsText = APPUSAGE // NOTE(review): the only no-argument factory here that is declared without ()
26 | 
27 |   def appUsageAsText(option: DataOption) = APPUSAGE.copy(options = option)
28 | 
29 |   def appUsageAsText(option: DataOption, values: List[String]) = APPUSAGE.copy(options = option, values = values)
30 | 
31 |   def appCategory() = APPCATEGORY
32 | 
33 |   def appCategory(option: DataOption) = APPCATEGORY.copy(options = option)
34 | 
35 |   def appCategory(option: DataOption, values: List[String]) = APPCATEGORY.copy(options = option, values = values)
36 | 
37 |   def rawIAB() = RAW_IAB
38 | 
39 |   def rawIAB(option: DataOption) = RAW_IAB.copy(options = option)
40 | 
41 |   def rawIAB(option: DataOption, values: List[String]) = RAW_IAB.copy(options = option, values = values)
42 | 
43 |   def adid() = ADID
44 | 
45 |   def adid(option: DataOption) = ADID.copy(options = option)
46 | 
47 |   def adid(option: DataOption, values: List[String]) = ADID.copy(options = option, values = values)
48 | 
49 |   def deviceOS() = DEVICEOS
50 | 
51 |   def deviceOS(option: DataOption) = DEVICEOS.copy(options = option)
52 | 
53 |   def deviceOS(option: DataOption, values: List[String]) = DEVICEOS.copy(options = option, values = values)
54 | 
55 |   def countryCode() = COUNTRYCODE
56 | 
57 |   def countryCode(option: DataOption) = COUNTRYCODE.copy(options = option)
58 | 
59 |   def countryCode(option: DataOption, values: List[String]) = COUNTRYCODE.copy(options = option, values = values)
60 | 
61 |   def otr() = OTR
62 | 
63 |   def otr(option: DataOption) = OTR.copy(options = option)
64 | 
65 |   def otr(option: DataOption, values: List[String]) = OTR.copy(options = option, values = values)
66 | 
67 |   def bundleid() = BUNDLEID
68 | 
69 |   def bundleid(option: DataOption) = BUNDLEID.copy(options = option)
70 | 
71 |   def bundleid(option: DataOption, values: List[String]) = BUNDLEID.copy(options = option, values = values)
72 | 
73 |   def timestamp() = TIMESTAMP
74 | 
75 |   def timestamp(option: DataOption) = TIMESTAMP.copy(options = option)
76 | 
77 |   def timestamp(option: DataOption, values: List[String]) = TIMESTAMP.copy(options = option, values = values)
78 | 
79 |   def date() = DATE
80 | 
81 |   def date(option: DataOption) = DATE.copy(options = option)
82 | 
83 |   def date(option: DataOption, values: List[String]) = DATE.copy(options = option, values = values)
84 | }
85 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at team.data-engineering@zeotap.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/src/test/scala/com/zeotap/utility/spark/SparkDataframeOpsTest.scala:
--------------------------------------------------------------------------------
  1 | package com.zeotap.utility.spark
  2 | 
  3 | import com.holdenkarau.spark.testing.DataFrameSuiteBase
  4 | import com.zeotap.utility.spark.example.generator.RandomDataGenerator
  5 | import com.zeotap.utility.spark.example.helper.ColumnConstants.JavaNull
  6 | import com.zeotap.utility.spark.example.helper.UserDefinedColumns._
  7 | import com.zeotap.utility.spark.example.types.CookieArrayColumn
  8 | import com.zeotap.utility.spark.example.types.CookieArrayColumn.cookieArrayColumn
  9 | import com.zeotap.utility.spark.ops.DataColumnOps._
 10 | import com.zeotap.utility.spark.ops.SparkDataframeOps.SparkOps
 11 | import com.zeotap.utility.spark.traits._
 12 | import com.zeotap.utility.spark.types.ArrayColumn.arrayColumn
 13 | import com.zeotap.utility.spark.types.DataColumn._
 14 | import com.zeotap.utility.spark.types.MapColumn.mapColumn
 15 | import com.zeotap.utility.spark.types._
 16 | import org.apache.spark.sql.functions.{col, max, min}
 17 | import org.apache.spark.sql.types.{DataType => _, _}
 18 | import org.apache.spark.sql.{DataFrame, Encoders, Row, SparkSession}
 19 | import org.scalacheck.Prop.forAll
 20 | import org.scalatest.FunSuite
 21 | import org.scalatest.prop.Checkers.check
 22 | 
 23 | class SparkDataframeOpsTest extends FunSuite with DataFrameSuiteBase {
 24 | 
 25 |   test("test primitive and array column generation - schema and data") {
 26 |     val dataColumns = SparkDataframe(
 27 |       zuid(AlwaysSkewed),
 28 |       gender().withJunk,
 29 |       rawIAB().withNull.withJunk,
 30 |       age().asString,
 31 |       otr(AlwaysUniform),
 32 |       countryCode(AlwaysUniform).withNull,
 33 |       dataColumn("Income_preprocess", DInteger, AlwaysPresent, List("32000", "20000", "45000", "70000")).withNull,
 34 |       dataColumn("Common_ts", DLong, AlwaysSkewed, RandomDataGenerator.timestamp(15)),
 35 |       dataColumn("creditCardAvailable", DBoolean, AlwaysUniform, List("True", "False")),
 36 |       arrayColumn(dataColumn("Income_preprocess_array", DInteger, AlwaysSkewed, List("32000", "20000", "45000", "70000", null)), 10000),
 37 |       arrayColumn(adid().withNull.withJunk),
 38 |       mapColumn("adid_gender_map", adid(AlwaysPresent, RandomDataGenerator.UUID(5000)), bundleid(AlwaysSkewed), 1000),
 39 |       mapColumn("adid_to_age_mapping", adid().values, age().values, DString, DInteger),
 40 |       cookieArrayColumn()
 41 |     )
 42 |     implicit val sc: SparkSession = spark // implicit session in scope for the generator call below
 43 |     val generatedFramesAreValid = forAll(dataColumns.getArbitraryGenerator()) { generated =>
 44 |       testValues(dataColumns, generated).forall(identity) && testSchema(dataColumns, generated).forall(identity)
 45 |     }
 46 |     check(generatedFramesAreValid)
 47 |   }
 48 | 
 49 |   def testValues(sparkDataframe: SparkDataframe, df: DataFrame): List[Boolean] = sparkDataframe.dataColumns.toList.map {
 50 |     case primitive: DataColumn => testDataColumnValues(df, primitive)
 51 |     case arr: ArrayColumn => testArrayColumnValues(df, arr)
 52 |     case keyed: MapColumn => df.count() <= 0 || testNonPrimitiveColumnValues(df, keyed) // an empty frame passes trivially
 53 |     case cookies: CookieArrayColumn => df.count() <= 0 || testNonPrimitiveColumnValues(df, cookies)
 54 |   } // one Boolean per declared column, in declaration order
 55 | 
 56 |   /**
 57 |    * Map Column and Cookie Array test
 58 |    *  1. Populates key-value pair/cookie array column based on input DataColumns or List of String
 59 |    *  2. Empty maps/cookie array column may be present but no Java Null as instance reference
 60 |    *  3. AlwaysSkewed or AlwaysUniform - is not supported for MapColumn/Cookie Array type
 61 |    * @param df DataFrame
 62 |    * @param d  MapColumn  or CookieArrayColumn
 63 |    * @return result as Boolean
 64 |    */
 65 |   // TODO : The AlwaysSkewed feature does not make a lot of sense now. Define Skew in Map and come up with a better implementation
 66 |   def testNonPrimitiveColumnValues(df: DataFrame, d: DColumn): Boolean = df.map(row => {
 67 |     val nonPrimitiveColumn = d match {
 68 |       case m: MapColumn => row.getAs[Map[Any, Any]](m.name)
 69 |       case c: CookieArrayColumn => row.getAs[Seq[Row]](c.getName)
 70 |     }
 71 |     // Compare against JavaNull before touching the value: calling isEmpty on a null cell would
 72 |     // throw a NullPointerException inside the Spark task rather than report a failed check.
 73 |     // Empty and non-empty collections are both acceptable, so a non-null reference is all that
 74 |     // has to be verified (assumes JavaNull is the null reference; confirm in ColumnConstants).
 75 |     nonPrimitiveColumn != JavaNull
 76 |   })(Encoders.scalaBoolean).reduce(_ && _) // reduce needs at least one row; callers check df.count() > 0 first
 77 | 
 78 |   /**
 79 |    * Array Column test
 80 |    * 1. The uniformity test is not very robust. Our observation is that the data is uniform most of the time, but at times it can be skewed as well
 81 |    * 2. For the skew test, the following was observed
 82 |    *    a. In a dataframe array column, the ratio of the occurrence count of the most frequent element to that of
 83 |    *       the least frequent element is greater than or equal to 3:1
 84 |    *    b. We observed cases where only skewed arrays are present as well as cases where only uniform arrays are present, with no certainty
 85 |    * // TODO : The AlwaysSkewed feature does not make a lot of sense now. Define Skew in Array and come up with a better implementation
 86 |    *
 87 |    * @param df DataFrame
 88 |    * @param a  ArrayColumn
 89 |    * @return result as Boolean
 90 |    */
 91 |   def testArrayColumnValues(df: DataFrame, a: ArrayColumn): Boolean = {
 92 |     if (df.count() == 0)
 93 |       true
 94 |     else {
 95 |       // Per-row verdict: true when the row's array counts as skewed.
 96 |       val resultDF = df.map(x => {
 97 |         val arrayColumnValues = x.getAs[Seq[Any]](a.dataColumn.name)
 98 |         val countMap = arrayColumnValues.groupBy(identity).mapValues(_.size)
 99 | 
100 |         if (countMap.isEmpty) {
101 |           // An empty array is only acceptable for an AlwaysSkewed column.
102 |           a.dataColumn.options match {
103 |             case AlwaysPresent => false
104 |             case AlwaysUniform => false
105 |             case AlwaysSkewed => true
106 |           }
107 |         } else {
108 |           // Skewed: the most frequent element occurs at least three times as often as the
109 |           // least frequent one.
110 |           val minTuple = countMap.minBy(_._2)
111 |           val maxTuple = countMap.maxBy(_._2)
112 |           maxTuple._2 >= minTuple._2 * 3
113 |         }
114 |       })(Encoders.scalaBoolean)
115 |       val actualResultDF = resultDF.groupBy("value").count()
116 |       a.dataColumn.options match {
117 |         case AlwaysSkewed =>
118 |           // Row.getAs is given explicit type arguments: when the type parameter is left to
119 |           // inference it becomes Nothing, and the cast the compiler inserts can fail at runtime
120 |           // with a ClassCastException.
121 |           val rowsPerVerdict = actualResultDF.collect()
122 |             .map(r => r.getAs[Boolean]("value") -> r.getAs[Long]("count"))
123 |             .toMap
124 |           // A single verdict (every row skewed, or none) is accepted as-is. When both verdicts
125 |           // occur, skewed rows must be at least as numerous as non-skewed ones. Looking the
126 |           // counts up by key keeps this total, with no partial (true, false) / (false, true) match.
127 |           rowsPerVerdict.size == 1 ||
128 |             rowsPerVerdict.getOrElse(true, 0L) >= rowsPerVerdict.getOrElse(false, 0L)
129 |         case _ => true
130 |       }
131 |     }
132 |   }
133 | 
134 |   private def testDataColumnValues(df: DataFrame, dc: DataColumn) =
135 |     // Convert the declared values to the column's data type, then run the count check once.
136 |     primitiveColumnCountCheck[Any](dc, df, dc.dataType match {
137 |       case DString => dc.values
138 |       case DInteger => getInteger(dc.values)
139 |       case DBoolean => getBoolean(dc.values)
140 |       case DLong => getLong(dc.values)
141 |       case DDouble => getDouble(dc.values)
142 |     })
143 | 
144 |   def primitiveColumnCountCheck[A](x: DataColumn, df: DataFrame, values: List[A]): Boolean = x.options match {
145 |     case AlwaysSkewed => assertCountsSkewedDistribution(x.name, df) // skew is judged on group sizes only; `values` is not consulted
146 |     case AlwaysPresent | AlwaysUniform => assertTotalCountEqualsFilterCount(x.name, values, df) // every row must hold a declared value
147 |   }
148 | 
149 |   def assertTotalCountEqualsFilterCount[A](colName: String, values: List[A], df: DataFrame) = {
150 |     // A declared null is admitted through an explicit isNull test in addition to the isin filter.
151 |     val declared = col(colName).isin(values: _*)
152 |     val accepted = if (values.contains(null)) declared or col(colName).isNull else declared
153 |     df.filter(accepted).count == df.count
154 |   }
155 | 
156 |   def assertCountsSkewedDistribution(colName: String, df: DataFrame) = {
157 |     // With four or fewer distinct values the column is accepted without measuring skew.
158 |     df.select(colName).distinct().count() <= 4 || {
159 |       val extremes = df.groupBy(colName).count.agg(min("count"), max("count")).head()
160 |       extremes.getLong(0) == extremes.getLong(1) || 1.2 * extremes.getLong(0) <= extremes.getLong(1)
161 |     }
162 |   }
163 | 
164 |   def testSchema(sparkDataframe: SparkDataframe, df: DataFrame) = { // one Boolean per declared column: does the Spark type match the declaration?
165 |     (sparkDataframe.dataColumns zip df.schema.fields.map(_.dataType)).map { // columns and schema fields are paired by position
166 |       case (a: ArrayColumn, b: ArrayType) => primitiveSchemaCheck(a.dataColumn.dataType, b.elementType)
167 |       case (d: DataColumn, b) => primitiveSchemaCheck(d.dataType, b)
168 |       case (m: MapColumn, b: MapType) => primitiveSchemaCheck(m.key.dataType, b.keyType) && primitiveSchemaCheck(m.value.dataType, b.valueType)
169 |       case (c: CookieArrayColumn, b: ArrayType) => cookieArraySchemaCheck(b, c)
170 |       case _ => false // column kind and Spark type disagree: report a failed check instead of throwing MatchError
171 |     }
172 |   }.toList
173 | 
174 |   def cookieArraySchemaCheck(sparkDataType: ArrayType, dType: CookieArrayColumn) = sparkDataType.elementType match {
175 |     case struct: StructType if struct.size == 2 => // elements must be two-field structs: id_type then id_value (names compared case-insensitively)
176 |       struct.fields(0).name.equalsIgnoreCase("id_type") &&
177 |         struct.fields(1).name.equalsIgnoreCase("id_value") &&
178 |         primitiveSchemaCheck(dType.idType.dataType, struct.fields(0).dataType) &&
179 |         primitiveSchemaCheck(dType.idValue.dataType, struct.fields(1).dataType)
180 |     case _ => false
181 |   }
182 | 
183 |   def primitiveSchemaCheck(dataColumnDType: com.zeotap.utility.spark.traits.DataType, sparkDataType: org.apache.spark.sql.types.DataType) = sparkDataType == (dataColumnDType match { // true when the Spark type is the one expected for the declared type
184 |     case DString => StringType
185 |     case DBoolean => BooleanType
186 |     case DDouble => DoubleType
187 |     case DLong => LongType
188 |     case DInteger => IntegerType
189 |   })
190 | }
191 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------