├── src
│   └── main
│       └── scala
│           └── lexaneon
│               └── amazon
│                   └── deequ
│                       ├── repository
│                       │   └── influxdb
│                       │       ├── InfluxDBConnectionProperties.scala
│                       │       ├── InfluxDBMetricsRepository.scala
│                       │       ├── InfluxDBMetricsRepositoryMultipleResultsLoader.scala
│                       │       ├── PointDecorator.scala
│                       │       └── InfluxDBAnalysisResultSerde.scala
│                       └── example
│                           └── InfluxDBMetricRepository.scala
├── README.md
└── LICENSE

/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBConnectionProperties.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

case class InfluxDBConnectionProperties(
  serverURLWithPort: String,
  dbName: String,
  measurementName: String)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Amazon Deequ addons
You can find out what Amazon Deequ is [here].

This repository contains the following add-ons:

## InfluxDBMetricsRepository
Saves Deequ analysis results to InfluxDB for further visualization, for example in Grafana. See the usage example below.

Features:
- Save results to InfluxDB

To do:
- Read all data from InfluxDB
- Read data from InfluxDB by some predicates
- Add examples

[here]: https://github.com/awslabs/deequ
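
## Example
A minimal usage sketch (assuming a local InfluxDB 1.x instance at http://localhost:8086 with a database named `example`; `df` is any Spark DataFrame and the property values are illustrative):

```scala
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.repository.ResultKey
import lexaneon.amazon.deequ.repository.influxdb.{InfluxDBConnectionProperties, InfluxDBMetricsRepository}

// server URL with port, database name, measurement name
val props = InfluxDBConnectionProperties("http://localhost:8086", "example", "deequ_metrics")

AnalysisRunner
  .onData(df)
  .useRepository(new InfluxDBMetricsRepository(props))
  .saveOrAppendResult(ResultKey(System.currentTimeMillis(), Map("dataSetName" -> "orders")))
  .addAnalyzer(Size())
  .run()
```

A fuller, runnable version lives in `src/main/scala/lexaneon/amazon/deequ/example/InfluxDBMetricRepository.scala`.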
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Alexey Artemov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBMetricsRepository.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.repository.{AnalysisResult, MetricsRepository, MetricsRepositoryMultipleResultsLoader, ResultKey}
import org.influxdb.{InfluxDB, InfluxDBFactory}

/** A Repository implementation backed by InfluxDB
 *
 * @param influxDBConnectionProperties connection properties: server URL with port
 *                                     (example: http://localhost:8086), database name
 *                                     and measurement name
 */
class InfluxDBMetricsRepository(influxDBConnectionProperties: InfluxDBConnectionProperties) extends MetricsRepository {

  implicit val influxDBConnect: InfluxDB = initInfluxDBConnect()

  def initInfluxDBConnect(): InfluxDB = {
    val influxDB = InfluxDBFactory.connect(influxDBConnectionProperties.serverURLWithPort)
    influxDB.setDatabase(influxDBConnectionProperties.dbName)
    influxDB
  }

  override def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit = {
    // persist only the metrics that were computed successfully
    val successfulMetrics = analyzerContext.metricMap
      .filter { case (_, metric) => metric.value.isSuccess }

    val analyzerContextWithSuccessfulValues = AnalyzerContext(successfulMetrics)
    val points =
      InfluxDBAnalysisResultSerde
        .analysisResultToInfluxPointObject(resultKey, analyzerContextWithSuccessfulValues, influxDBConnectionProperties.measurementName)

    points.foreach(point => InfluxDBMetricsRepositoryMultipleResultsLoader.writeToInfluxDB(point))
  }

  override def loadByKey(resultKey: ResultKey): Option[AnalyzerContext] = None // TODO should be finished

  override def load(): MetricsRepositoryMultipleResultsLoader = new InfluxDBMetricsRepositoryMultipleResultsLoader // TODO should be finished
}
--------------------------------------------------------------------------------
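For intuition, each successfully computed metric becomes one InfluxDB point: the metric's entity, instance and name become tags, the ResultKey tags are added with a "tags_" prefix, and the metric value plus dataSetDate become fields. A Size metric saved through this repository would come out roughly as the following line protocol (illustrative values; the exact encoding is handled by the influxdb-java client):

InfluxDBMetricsRepository,entity=Dataset,instance=*,name=Size,tags_dataSetName=orders dataSetDate=1612345678901i,value=100.0 1612345678901
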
/src/main/scala/lexaneon/amazon/deequ/example/InfluxDBMetricRepository.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.example

import com.amazon.deequ.analyzers.{Completeness, CountDistinct, Distinctness, Size, Uniqueness}
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.repository.ResultKey
import lexaneon.amazon.deequ.repository.influxdb.{InfluxDBConnectionProperties, InfluxDBMetricsRepository}
import org.apache.spark.sql.SparkSession

object InfluxDBMetricRepository extends App {

  val spark = initSpark()
  val filePath = "src/main/resources/dataForExample/data.csv"
  val df = spark.read.option("header", "true").csv(filePath)

  val influxDBConnectionProperties = InfluxDBConnectionProperties("http://localhost:8086", "example", "InfluxDBMetricsRepository")

  val resultKey = ResultKey(
    System.currentTimeMillis(),
    Map("dataSetFilePath" -> filePath, "dataSetName" -> "orders"))

  val analysisResult = AnalysisRunner
    .onData(df)
    .useRepository(new InfluxDBMetricsRepository(influxDBConnectionProperties))
    .saveOrAppendResult(resultKey)
    .addAnalyzer(Size())
    .addAnalyzer(Distinctness("customer_id"))
    .addAnalyzer(CountDistinct("customer_id"))
    .addAnalyzer(Uniqueness(Seq("customer_id", "id")))
    .addAnalyzer(Uniqueness("id"))
    .addAnalyzer(Completeness("trans_date"))
    .addAnalyzer(Completeness("id"))
    .run()

  val metric = successMetricsAsDataFrame(spark, analysisResult)

  metric.show(false)

  spark.close()

  def initSpark(isLocalRun: Boolean = true): SparkSession = {
    val sparkSessionBuilder =
      SparkSession
        .builder
        .appName(this.getClass.getSimpleName)

    val spark =
      if (isLocalRun) {
        sparkSessionBuilder
          .master("local[*]")
          .getOrCreate()
      } else {
        sparkSessionBuilder.getOrCreate()
      }

    spark.sparkContext.setLogLevel("ERROR")
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    spark
  }
}
--------------------------------------------------------------------------------
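The example reads src/main/resources/dataForExample/data.csv, which is not included in this listing. Any CSV with a header row containing the referenced columns works; a minimal illustrative file (hypothetical contents) would be:

id,customer_id,trans_date
1,100,2021-01-01
2,100,2021-01-02
3,101,2021-01-02
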
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBMetricsRepositoryMultipleResultsLoader.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.Analyzer
import com.amazon.deequ.metrics.Metric
import com.amazon.deequ.repository.{AnalysisResult, MetricsRepositoryMultipleResultsLoader}
import org.influxdb.InfluxDB
import org.influxdb.dto.Query

class InfluxDBMetricsRepositoryMultipleResultsLoader extends MetricsRepositoryMultipleResultsLoader {

  private[this] var tagValues: Option[Map[String, String]] = None
  private[this] var forAnalyzers: Option[Seq[Analyzer[_, Metric[_]]]] = None
  private[this] var before: Option[Long] = None
  private[this] var after: Option[Long] = None

  /**
   * Filter out results that don't have specific values for specific tags
   *
   * @param tagValues Map with tag names and the corresponding values to filter for
   */
  def withTagValues(tagValues: Map[String, String]): MetricsRepositoryMultipleResultsLoader = {
    this.tagValues = Option(tagValues)
    this
  }

  /**
   * Choose all metrics that you want to load
   *
   * @param analyzers A sequence of analyzers whose resulting metrics you want to load
   */
  def forAnalyzers(analyzers: Seq[Analyzer[_, Metric[_]]]): MetricsRepositoryMultipleResultsLoader = {
    this.forAnalyzers = Option(analyzers)
    this
  }

  /**
   * Only look at AnalysisResults with a result key with a smaller value
   *
   * @param dateTime The maximum dateTime of AnalysisResults to look at
   */
  def before(dateTime: Long): MetricsRepositoryMultipleResultsLoader = {
    this.before = Option(dateTime)
    this
  }

  /**
   * Only look at AnalysisResults with a result key with a greater value
   *
   * @param dateTime The minimum dateTime of AnalysisResults to look at
   */
  def after(dateTime: Long): MetricsRepositoryMultipleResultsLoader = {
    this.after = Option(dateTime)
    this
  }

  /** Get the AnalysisResult */
  override def get(): Seq[AnalysisResult] = Seq.empty[AnalysisResult] // TODO should be finished
}

object InfluxDBMetricsRepositoryMultipleResultsLoader {

  def apply(influxDBConnectionProperties: InfluxDBConnectionProperties): InfluxDBMetricsRepositoryMultipleResultsLoader = {
    // TODO: the connection properties are not used yet and still need to be wired into the loader
    new InfluxDBMetricsRepositoryMultipleResultsLoader
  }

  def writeToInfluxDB(point: PointBuilderDecorator)(implicit influxDBConnect: InfluxDB): Unit = {
    influxDBConnect.write(point.build())
  }

  def readFromInfluxDB(measurementName: String)(implicit influxDBConnect: InfluxDB): Option[Seq[AnalysisResult]] = {
    val queryResult = influxDBConnect.query(new Query(s"select * from $measurementName"))
    InfluxDBAnalysisResultSerde.queryToAnalysisResult(queryResult)
  }
}
--------------------------------------------------------------------------------
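The loader is meant to plug into Deequ's standard fluent filtering interface. The filter methods already chain correctly, but get() still returns an empty sequence; a usage sketch (props is an InfluxDBConnectionProperties instance as in the README example):

val results = new InfluxDBMetricsRepository(props)
  .load()
  .withTagValues(Map("dataSetName" -> "orders"))             // only results tagged for the orders data set
  .after(System.currentTimeMillis() - 24 * 60 * 60 * 1000L)  // only results recorded in the last 24 hours
  .get()                                                     // currently Seq.empty until the TODO is finished
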
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/PointDecorator.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import org.influxdb.dto.Point

import java.{lang, util}
import java.util.concurrent.TimeUnit

object PointDecorator {
  def measurement(measurement: String): PointBuilderDecorator = new PointBuilderDecorator(Point.measurement(measurement))
}

class PointBuilderDecorator(val pointBuilder: Point.Builder) {
  def tag(tagName: String, value: String): PointBuilderDecorator = {
    pointBuilder.tag(tagName, value)
    this
  }

  def tag(tagsToAdd: util.Map[String, String]): PointBuilderDecorator = {
    pointBuilder.tag(tagsToAdd)
    this
  }

  def addField(field: String, value: Boolean): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Long): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Double): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Int): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Float): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Short): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Number): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: String): PointBuilderDecorator = {
    pointBuilder.addField(field, value)
    this
  }

  def addField(field: String, value: Any): PointBuilderDecorator = {
    value match {
      case k: Double => pointBuilder.addField(field, k)
      case k: Float => pointBuilder.addField(field, k)
      case k: Long => pointBuilder.addField(field, k)
      case k: Int => pointBuilder.addField(field, k)
      case k: Short => pointBuilder.addField(field, k)
      case k: Byte => pointBuilder.addField(field, k)
      case k: Char => pointBuilder.addField(field, k)
      case k: Boolean => pointBuilder.addField(field, k)
      case k: String => pointBuilder.addField(field, k)
      case _ => throw new ClassCastException(s"Unsupported field value type: ${value.getClass.getName}")
    }
    this
  }

  def fields(fieldsToAdd: util.Map[String, AnyRef]): PointBuilderDecorator = {
    pointBuilder.fields(fieldsToAdd)
    this
  }

  def time(timeToSet: Number, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def time(timeToSet: Long, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def time(timeToSet: lang.Long, precisionToSet: TimeUnit): PointBuilderDecorator = {
    pointBuilder.time(timeToSet, precisionToSet)
    this
  }

  def hasFields: Boolean = pointBuilder.hasFields

  def addFieldsFromPOJO(pojo: Any): PointBuilderDecorator = {
    pointBuilder.addFieldsFromPOJO(pojo)
    this
  }

  def build(): Point = pointBuilder.build()
}
--------------------------------------------------------------------------------
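A short sketch of assembling a point through the decorator (measurement, tag and field names are illustrative); build() hands back the underlying org.influxdb.dto.Point:

import java.util.concurrent.TimeUnit

val point = PointDecorator
  .measurement("deequ_metrics")
  .tag("name", "Size")
  .addField("value", 100.0)
  .time(System.currentTimeMillis(), TimeUnit.MILLISECONDS)
  .build()
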
/src/main/scala/lexaneon/amazon/deequ/repository/influxdb/InfluxDBAnalysisResultSerde.scala:
--------------------------------------------------------------------------------
package lexaneon.amazon.deequ.repository.influxdb

import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.repository.{AnalysisResult, ResultKey}
import org.influxdb.dto.QueryResult

import java.util.concurrent.TimeUnit

object InfluxDBAnalysisResultSerde {

  val DATASET_DATE_FIELD = "dataSetDate"
  val TAGS_PREFIX_FIELD = "tags_"
  val ENTITY_FIELD = "entity"
  val INSTANCE_FIELD = "instance"
  val NAME_FIELD = "name"
  val VALUE_FIELD = "value"

  /** Create InfluxDB Point objects from an AnalysisResult */
  def analysisResultToInfluxPointObject(resultKey: ResultKey, analyzerContext: AnalyzerContext, measurementName: String): Seq[PointBuilderDecorator] = {
    analyzerContext
      .allMetrics
      .map { el =>
        val point = PointDecorator
          .measurement(measurementName)
          .time(System.currentTimeMillis(), TimeUnit.MILLISECONDS)
          .addField(DATASET_DATE_FIELD, resultKey.dataSetDate)
          .addField(VALUE_FIELD, el.value.toOption.get) // callers are expected to pass successful metrics only
          .tag(ENTITY_FIELD, el.entity.toString)
          .tag(INSTANCE_FIELD, el.instance)
          .tag(NAME_FIELD, el.name)

        // ResultKey tags are stored with the "tags_" prefix so they can be recovered on read
        resultKey
          .tags
          .foldLeft(point)((acc, pair) => acc.tag(s"$TAGS_PREFIX_FIELD${pair._1}", pair._2))
      }
  }

  def queryToAnalysisResult(queryResult: QueryResult): Option[Seq[AnalysisResult]] = {
    val columns = getColumnsFromQueryResult(queryResult)
    val result = getResultMapFromQueryResult(queryResult)

    // TODO should be finished; the println calls below are temporary debug output
    println(s"Columns: ${columns.mkString(";")}")

    result.foreach { rec =>
      rec.foreach(el => print(s"key: ${el._1} value: ${el._2};"))
      println()
    }

    val tagColumns = columns.filter(_.startsWith(TAGS_PREFIX_FIELD))

    // group the flat rows back into (ResultKey, rows) pairs; currently unused intermediate step
    val resultTable =
      result
        .groupBy { el =>
          val tagMap = tagColumns.map(tags => tags.stripPrefix(TAGS_PREFIX_FIELD) -> el(tags).asInstanceOf[String]).toMap
          val dataSetMap = Map(DATASET_DATE_FIELD -> el(DATASET_DATE_FIELD).asInstanceOf[String])
          tagMap ++ dataSetMap
        }
        .map(el =>
          (ResultKey(
            el._1(DATASET_DATE_FIELD).asInstanceOf[Long],
            el._1.filter(key => key._1 != DATASET_DATE_FIELD)), el._2.distinct)
        )

    val resultKey =
      result
        .map(record =>
          ResultKey(
            record(DATASET_DATE_FIELD).asInstanceOf[Long],
            tagColumns.map(tags => tags.stripPrefix(TAGS_PREFIX_FIELD) -> record(tags).asInstanceOf[String]).toMap)
        )
        .distinct

    // TODO finish: reconstruct the metrics and combine them with the result keys, e.g.:
    // val metricMap =
    //   result
    //     .map(record =>
    //       DoubleMetric(
    //         Entity.withName(record(ENTITY_FIELD).asInstanceOf[String]),
    //         record(NAME_FIELD).asInstanceOf[String],
    //         record(INSTANCE_FIELD).asInstanceOf[String],
    //         Try(record(VALUE_FIELD).asInstanceOf[Double])))
    // AnalysisResult(resultKey(0), AnalyzerContext(metricMap))
    None
  }

  private def getSeriesFromQueryResult(queryResult: QueryResult): Seq[QueryResult.Series] = {
    import collection.JavaConverters._
    queryResult
      .getResults
      .asScala
      .flatMap(record => record.getSeries.asScala)
  }

  private def getColumnsFromQueryResult(queryResult: QueryResult): Seq[String] = {
    import collection.JavaConverters._
    getSeriesFromQueryResult(queryResult)
      .flatMap(series => series.getColumns.asScala)
  }

  private def getResultMapFromQueryResult(queryResult: QueryResult): Seq[Map[String, AnyRef]] = {
    import collection.JavaConverters._
    getSeriesFromQueryResult(queryResult)
      .flatMap(series =>
        series
          .getValues
          .asScala
          .map(values => series.getColumns.asScala.zip(values.asScala).toMap))
  }
}
--------------------------------------------------------------------------------
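A sketch of the intended read path once queryToAnalysisResult is finished (connection values are illustrative; today the call prints debug output and returns None):

import org.influxdb.{InfluxDB, InfluxDBFactory}

implicit val influx: InfluxDB = InfluxDBFactory.connect("http://localhost:8086")
influx.setDatabase("example")

// runs "select * from InfluxDBMetricsRepository" and deserializes the rows
val restored = InfluxDBMetricsRepositoryMultipleResultsLoader.readFromInfluxDB("InfluxDBMetricsRepository")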