├── src
│   └── main
│       └── scala
│           └── lexaneon
│               └── amazon
│                   └── deequ
│                       ├── repository
│                       │   └── influxdb
│                       │       ├── InfluxDBConnectionProperties.scala
│                       │       ├── InfluxDBMetricsRepository.scala
│                       │       ├── InfluxDBMetricsRepositoryMultipleResultsLoader.scala
│                       │       ├── PointDecorator.scala
│                       │       └── InfluxDBAnalysisResultSerde.scala
│                       └── example
│                           └── InfluxDBMetricRepository.scala
├── README.md
└── LICENSE

/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBConnectionProperties.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

case class InfluxDBConnectionProperties(
  serverURLWithPort: String,
  dbName: String,
  measurementName: String)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Amazon Deequ addons
You can find out what Amazon Deequ is [here].

This repository contains the following add-ons:

## InfluxDBMetricsRepository
Saves Deequ analysis results to InfluxDB for further visualization, for example in Grafana. See the usage example below.

Features:
- Save results to InfluxDB

To do:
- Read all data from InfluxDB
- Read data from InfluxDB by some predicates
- Add examples

[here]: https://github.com/awslabs/deequ
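
## Example
A minimal usage sketch (assuming a local InfluxDB 1.x instance at http://localhost:8086 with a database named `example`; `df` is any Spark DataFrame and the property values are illustrative):

```scala
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.repository.ResultKey
import lexaneon.amazon.deequ.repository.influxdb.{InfluxDBConnectionProperties, InfluxDBMetricsRepository}

// server URL with port, database name, measurement name
val props = InfluxDBConnectionProperties("http://localhost:8086", "example", "deequ_metrics")

AnalysisRunner
  .onData(df)
  .useRepository(new InfluxDBMetricsRepository(props))
  .saveOrAppendResult(ResultKey(System.currentTimeMillis(), Map("dataSetName" -> "orders")))
  .addAnalyzer(Size())
  .run()
```

A fuller, runnable version lives in `src/main/scala/lexaneon/amazon/deequ/example/InfluxDBMetricRepository.scala`.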
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Alexey Artemov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBMetricsRepository.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.repository.{AnalysisResult, MetricsRepository, MetricsRepositoryMultipleResultsLoader, ResultKey}
import org.influxdb.{InfluxDB, InfluxDBFactory}

/** A Repository implementation backed by InfluxDB
 *
 * @param influxDBConnectionProperties connection properties: server URL with port
 *                                     (example: http://localhost:8086), database name
 *                                     and measurement name
 */
class InfluxDBMetricsRepository(influxDBConnectionProperties: InfluxDBConnectionProperties) extends MetricsRepository {

  implicit val influxDBConnect: InfluxDB = initInfluxDBConnect()

  def initInfluxDBConnect(): InfluxDB = {
    val influxDB = InfluxDBFactory.connect(influxDBConnectionProperties.serverURLWithPort)
    influxDB.setDatabase(influxDBConnectionProperties.dbName)
    influxDB
  }

  override def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit = {
    // persist only the metrics that were computed successfully
    val successfulMetrics = analyzerContext.metricMap
      .filter { case (_, metric) => metric.value.isSuccess }

    val analyzerContextWithSuccessfulValues = AnalyzerContext(successfulMetrics)
    val points =
      InfluxDBAnalysisResultSerde
        .analysisResultToInfluxPointObject(resultKey, analyzerContextWithSuccessfulValues, influxDBConnectionProperties.measurementName)

    points.foreach(point => InfluxDBMetricsRepositoryMultipleResultsLoader.writeToInfluxDB(point))
  }

  override def loadByKey(resultKey: ResultKey): Option[AnalyzerContext] = None // TODO should be finished

  override def load(): MetricsRepositoryMultipleResultsLoader = new InfluxDBMetricsRepositoryMultipleResultsLoader // TODO should be finished
}
--------------------------------------------------------------------------------
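For intuition, each successfully computed metric becomes one InfluxDB point: the metric's entity, instance and name become tags, the ResultKey tags are added with a "tags_" prefix, and the metric value plus dataSetDate become fields. A Size metric saved through this repository would come out roughly as the following line protocol (illustrative values; the exact encoding is handled by the influxdb-java client):

InfluxDBMetricsRepository,entity=Dataset,instance=*,name=Size,tags_dataSetName=orders dataSetDate=1612345678901i,value=100.0 1612345678901
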
/src/main/scala/lexaneon/amazon/deequ/example/InfluxDBMetricRepository.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.example

import com.amazon.deequ.analyzers.{Completeness, CountDistinct, Distinctness, Size, Uniqueness}
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.repository.ResultKey
import lexaneon.amazon.deequ.repository.influxdb.{InfluxDBConnectionProperties, InfluxDBMetricsRepository}
import org.apache.spark.sql.SparkSession

object InfluxDBMetricRepository extends App {

  val spark = initSpark()
  val filePath = "src/main/resources/dataForExample/data.csv"
  val df = spark.read.option("header", "true").csv(filePath)

  val influxDBConnectionProperties = InfluxDBConnectionProperties("http://localhost:8086", "example", "InfluxDBMetricsRepository")

  val resultKey = ResultKey(
    System.currentTimeMillis(),
    Map("dataSetFilePath" -> filePath, "dataSetName" -> "orders"))

  val analysisResult = AnalysisRunner
    .onData(df)
    .useRepository(new InfluxDBMetricsRepository(influxDBConnectionProperties))
    .saveOrAppendResult(resultKey)
    .addAnalyzer(Size())
    .addAnalyzer(Distinctness("customer_id"))
    .addAnalyzer(CountDistinct("customer_id"))
    .addAnalyzer(Uniqueness(Seq("customer_id", "id")))
    .addAnalyzer(Uniqueness("id"))
    .addAnalyzer(Completeness("trans_date"))
    .addAnalyzer(Completeness("id"))
    .run()

  val metric = successMetricsAsDataFrame(spark, analysisResult)

  metric.show(false)

  spark.close()

  def initSpark(isLocalRun: Boolean = true): SparkSession = {
    val sparkSessionBuilder =
      SparkSession
        .builder
        .appName(this.getClass.getSimpleName)

    val spark =
      if (isLocalRun) {
        sparkSessionBuilder
          .master("local[*]")
          .getOrCreate()
      } else {
        sparkSessionBuilder.getOrCreate()
      }

    spark.sparkContext.setLogLevel("ERROR")
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    spark
  }
}
--------------------------------------------------------------------------------
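The example reads src/main/resources/dataForExample/data.csv, which is not included in this listing. Any CSV with a header row containing the referenced columns works; a minimal illustrative file (hypothetical contents) would be:

id,customer_id,trans_date
1,100,2021-01-01
2,100,2021-01-02
3,101,2021-01-02
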
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBMetricsRepositoryMultipleResultsLoader.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.Analyzer
import com.amazon.deequ.metrics.Metric
import com.amazon.deequ.repository.{AnalysisResult, MetricsRepositoryMultipleResultsLoader}
import org.influxdb.InfluxDB
import org.influxdb.dto.Query

class InfluxDBMetricsRepositoryMultipleResultsLoader extends MetricsRepositoryMultipleResultsLoader {

  private[this] var tagValues: Option[Map[String, String]] = None
  private[this] var forAnalyzers: Option[Seq[Analyzer[_, Metric[_]]]] = None
  private[this] var before: Option[Long] = None
  private[this] var after: Option[Long] = None

  /**
   * Filter out results that don't have specific values for specific tags
   *
   * @param tagValues Map with tag names and the corresponding values to filter for
   */
  def withTagValues(tagValues: Map[String, String]): MetricsRepositoryMultipleResultsLoader = {
    this.tagValues = Option(tagValues)
    this
  }

  /**
   * Choose all metrics that you want to load
   *
   * @param analyzers A sequence of analyzers whose resulting metrics you want to load
   */
  def forAnalyzers(analyzers: Seq[Analyzer[_, Metric[_]]]): MetricsRepositoryMultipleResultsLoader = {
    this.forAnalyzers = Option(analyzers)
    this
  }

  /**
   * Only look at AnalysisResults with a result key with a smaller value
   *
   * @param dateTime The maximum dateTime of AnalysisResults to look at
   */
  def before(dateTime: Long): MetricsRepositoryMultipleResultsLoader = {
    this.before = Option(dateTime)
    this
  }

  /**
   * Only look at AnalysisResults with a result key with a greater value
   *
   * @param dateTime The minimum dateTime of AnalysisResults to look at
   */
  def after(dateTime: Long): MetricsRepositoryMultipleResultsLoader = {
    this.after = Option(dateTime)
    this
  }

  /** Get the AnalysisResult */
  override def get(): Seq[AnalysisResult] = Seq.empty[AnalysisResult] // TODO should be finished
}

object InfluxDBMetricsRepositoryMultipleResultsLoader {

  def apply(influxDBConnectionProperties: InfluxDBConnectionProperties): InfluxDBMetricsRepositoryMultipleResultsLoader = {
    // TODO: the connection properties are not used yet and still need to be wired into the loader
    new InfluxDBMetricsRepositoryMultipleResultsLoader
  }

  def writeToInfluxDB(point: PointBuilderDecorator)(implicit influxDBConnect: InfluxDB): Unit = {
    influxDBConnect.write(point.build())
  }

  def readFromInfluxDB(measurementName: String)(implicit influxDBConnect: InfluxDB): Option[Seq[AnalysisResult]] = {
    val queryResult = influxDBConnect.query(new Query(s"select * from $measurementName"))
    InfluxDBAnalysisResultSerde.queryToAnalysisResult(queryResult)
  }
}
--------------------------------------------------------------------------------
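The loader is meant to plug into Deequ's standard fluent filtering interface. The filter methods already chain correctly, but get() still returns an empty sequence; a usage sketch (props is an InfluxDBConnectionProperties instance as in the README example):

val results = new InfluxDBMetricsRepository(props)
  .load()
  .withTagValues(Map("dataSetName" -> "orders"))             // only results tagged for the orders data set
  .after(System.currentTimeMillis() - 24 * 60 * 60 * 1000L)  // only results recorded in the last 24 hours
  .get()                                                     // currently Seq.empty until the TODO is finished
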
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/PointDecorator.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import org.influxdb.dto.Point

import java.{lang, util}
import java.util.concurrent.TimeUnit

object PointDecorator {
  def measurement(measurement: String): PointBuilderDecorator = new PointBuilderDecorator(Point.measurement(measurement))
}

class PointBuilderDecorator(val pointBuilder: Point.Builder) {
  def tag(tagName: String, value: String): PointBuilderDecorator = {
    pointBuilder.tag(tagName, value)
    this
  }

  def tag(tagsToAdd: util.Map[String, String]): PointBuilderDecorator = {
    pointBuilder.tag(tagsToAdd)
    this
  }

  def addField(field: String, value: Boolean): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Long): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Double): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Int): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Float): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Short): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Number): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: String): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Any): PointBuilderDecorator = {
    value match {
      case k: Double => pointBuilder.addField(field, k)
      case k: Float => pointBuilder.addField(field, k)
      case k: Long => pointBuilder.addField(field, k)
      case k: Int => pointBuilder.addField(field, k)
      case k: Short => pointBuilder.addField(field, k)
      case k: Byte => pointBuilder.addField(field, k)
      case k: Char => pointBuilder.addField(field, k)
      case k: Boolean => pointBuilder.addField(field, k)
      case k: String => pointBuilder.addField(field, k)
      case _ => throw new ClassCastException(s"Unsupported field value type: ${value.getClass.getName}")
    }
    this
  }

  def fields(fieldsToAdd: util.Map[String, AnyRef]): PointBuilderDecorator = {
    pointBuilder.fields(fieldsToAdd)
    this
  }

  def time(timeToSet: Number, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def time(timeToSet: Long, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def time(timeToSet: lang.Long, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def hasFields: Boolean = pointBuilder.hasFields

  def addFieldsFromPOJO(pojo: Any): PointBuilderDecorator = {
    pointBuilder.addFieldsFromPOJO(pojo)
    this
  }

  def build(): Point = pointBuilder.build()
}
--------------------------------------------------------------------------------
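A short sketch of assembling a point through the decorator (measurement, tag and field names are illustrative); build() hands back the underlying org.influxdb.dto.Point:

import java.util.concurrent.TimeUnit

val point = PointDecorator
  .measurement("deequ_metrics")
  .tag("name", "Size")
  .addField("value", 100.0)
  .time(System.currentTimeMillis(), TimeUnit.MILLISECONDS)
  .build()
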
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBAnalysisResultSerde.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.repository.{AnalysisResult, ResultKey}
import org.influxdb.dto.QueryResult

import java.util.concurrent.TimeUnit

object InfluxDBAnalysisResultSerde {

  val DATASET_DATE_FIELD = "dataSetDate"
  val TAGS_PREFIX_FIELD = "tags_"
  val ENTITY_FIELD = "entity"
  val INSTANCE_FIELD = "instance"
  val NAME_FIELD = "name"
  val VALUE_FIELD = "value"

  /** Create InfluxDB Point objects from an AnalysisResult */
  def analysisResultToInfluxPointObject(resultKey: ResultKey, analyzerContext: AnalyzerContext, measurementName: String): Seq[PointBuilderDecorator] = {
    analyzerContext
      .allMetrics
      .map { el =>
        val point = PointDecorator
          .measurement(measurementName)
          .time(System.currentTimeMillis(), TimeUnit.MILLISECONDS)
          .addField(DATASET_DATE_FIELD, resultKey.dataSetDate)
          .addField(VALUE_FIELD, el.value.toOption.get) // callers are expected to pass successful metrics only
          .tag(ENTITY_FIELD, el.entity.toString)
          .tag(INSTANCE_FIELD, el.instance)
          .tag(NAME_FIELD, el.name)

        // ResultKey tags are stored with the "tags_" prefix so they can be recovered on read
        resultKey
          .tags
          .foldLeft(point)((acc, pair) => acc.tag(s"$TAGS_PREFIX_FIELD${pair._1}", pair._2))
      }
  }

  def queryToAnalysisResult(queryResult: QueryResult): Option[Seq[AnalysisResult]] = {
    val columns = getColumnsFromQueryResult(queryResult)
    val result = getResultMapFromQueryResult(queryResult)

    // TODO should be finished; the println calls below are temporary debug output
    println(s"Columns: ${columns.mkString(";")}")

    result.foreach { rec =>
      rec.foreach(el => print(s"key: ${el._1} value: ${el._2};"))
      println()
    }

    val tagColumns = columns.filter(_.startsWith(TAGS_PREFIX_FIELD))

    // group the flat rows back into (ResultKey, rows) pairs; currently unused intermediate step
    val resultTable =
      result
        .groupBy { el =>
          val tagMap = tagColumns.map(tags => tags.stripPrefix(TAGS_PREFIX_FIELD) -> el(tags).asInstanceOf[String]).toMap
          val dataSetMap = Map(DATASET_DATE_FIELD -> el(DATASET_DATE_FIELD).asInstanceOf[String])
          tagMap ++ dataSetMap
        }
        .map(el =>
          (ResultKey(
            el._1(DATASET_DATE_FIELD).asInstanceOf[Long],
            el._1.filter(key => key._1 != DATASET_DATE_FIELD)), el._2.distinct)
        )

    val resultKey =
      result
        .map(record =>
          ResultKey(
            record(DATASET_DATE_FIELD).asInstanceOf[Long],
            tagColumns.map(tags => tags.stripPrefix(TAGS_PREFIX_FIELD) -> record(tags).asInstanceOf[String]).toMap)
        )
        .distinct

    // TODO finish: reconstruct the metrics and combine them with the result keys, e.g.:
    // val metricMap =
    //   result
    //     .map(record =>
    //       DoubleMetric(
    //         Entity.withName(record(ENTITY_FIELD).asInstanceOf[String]),
    //         record(NAME_FIELD).asInstanceOf[String],
    //         record(INSTANCE_FIELD).asInstanceOf[String],
    //         Try(record(VALUE_FIELD).asInstanceOf[Double])))
    // AnalysisResult(resultKey(0), AnalyzerContext(metricMap))
    None
  }

  private def getSeriesFromQueryResult(queryResult: QueryResult): Seq[QueryResult.Series] = {
    import collection.JavaConverters._
    queryResult
      .getResults
      .asScala
      .flatMap(record => record.getSeries.asScala)
  }

  private def getColumnsFromQueryResult(queryResult: QueryResult): Seq[String] = {
    import collection.JavaConverters._
    getSeriesFromQueryResult(queryResult)
      .flatMap(series => series.getColumns.asScala)
  }

  private def getResultMapFromQueryResult(queryResult: QueryResult): Seq[Map[String, AnyRef]] = {
    import collection.JavaConverters._
    getSeriesFromQueryResult(queryResult)
      .flatMap(series =>
        series
          .getValues
          .asScala
          .map(values => series.getColumns.asScala.zip(values.asScala).toMap))
  }
}
--------------------------------------------------------------------------------
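A sketch of the intended read path once queryToAnalysisResult is finished (connection values are illustrative; today the call prints debug output and returns None):

import org.influxdb.{InfluxDB, InfluxDBFactory}

implicit val influx: InfluxDB = InfluxDBFactory.connect("http://localhost:8086")
influx.setDatabase("example")

// runs "select * from InfluxDBMetricsRepository" and deserializes the rows
val restored = InfluxDBMetricsRepositoryMultipleResultsLoader.readFromInfluxDB("InfluxDBMetricsRepository")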