├── NOTICE ├── .gitignore ├── .travis.yml ├── test-data └── README.md ├── .github └── PULL_REQUEST_TEMPLATE.md ├── CODE_OF_CONDUCT.md ├── src ├── test │ ├── resources │ │ ├── log4j.properties │ │ └── EMRSparkShellTest.scala │ └── scala │ │ └── com │ │ └── amazon │ │ └── deequ │ │ ├── KLL │ │ ├── KLLBenchmarkHelper.scala │ │ ├── KLLBenchmark.java │ │ └── KLLDistanceTest.scala │ │ ├── utils │ │ ├── TempFileUtils.scala │ │ ├── CollectionUtils.scala │ │ └── AssertionUtils.scala │ │ ├── constraints │ │ └── ConstraintUtils.scala │ │ ├── examples │ │ └── ExamplesTest.scala │ │ ├── package.scala │ │ ├── checks │ │ ├── ColumnConditionTest.scala │ │ └── FilterableCheckTest.scala │ │ ├── anomalydetection │ │ ├── RateOfChangeStrategyTest.scala │ │ ├── AnomalyDetectionTestUtilsTest.scala │ │ ├── HistoryUtilsTest.scala │ │ ├── AnomalyDetectionTestUtils.scala │ │ └── SimpleThresholdStrategyTest.scala │ │ ├── DatatypeSuggestionTest.scala │ │ ├── SparkBasicTest.scala │ │ ├── analyzers │ │ ├── StatesTest.scala │ │ └── UniquenessTest.scala │ │ ├── metrics │ │ └── MetricsTests.scala │ │ ├── SparkMonitor.scala │ │ └── SparkContextSpec.scala └── main │ └── scala │ └── com │ └── amazon │ └── deequ │ ├── analyzers │ ├── FilterableAnalyzer.scala │ ├── CountDistinct.scala │ ├── catalyst │ │ ├── StatefulStdDevPop.scala │ │ ├── StatefulCorrelation.scala │ │ ├── DeequFunctions.scala │ │ ├── StatefulDataType.scala │ │ └── StatefulKLLSketch.scala │ ├── Distinctness.scala │ ├── Entropy.scala │ ├── Uniqueness.scala │ ├── Size.scala │ ├── MaxLength.scala │ ├── MinLength.scala │ ├── Sum.scala │ ├── Completeness.scala │ ├── UniqueValueRatio.scala │ ├── Maximum.scala │ ├── Minimum.scala │ ├── Mean.scala │ ├── NonSampleCompactor.scala │ ├── Compliance.scala │ ├── ApproxCountDistinct.scala │ ├── Analysis.scala │ ├── StandardDeviation.scala │ ├── runners │ │ └── MetricCalculationException.scala │ ├── Distance.scala │ ├── Correlation.scala │ ├── PatternMatch.scala │ └── MutualInformation.scala │ ├── examples │ ├── entities.scala │ ├── ExampleUtils.scala │ ├── KLLCheckExample.scala │ ├── IncrementalMetricsExample.scala │ ├── BasicExample.scala │ ├── ConstraintSuggestionExample.scala │ ├── DataProfilingExample.scala │ ├── MetricsRepositoryExample.scala │ └── AnomalyDetectionExample.scala │ ├── constraints │ └── ConstrainableDataTypes.scala │ ├── checks │ ├── ColumnCondition.scala │ └── CheckWithLastConstraintFilterable.scala │ ├── anomalydetection │ ├── RateOfChangeStrategy.scala │ ├── AnomalyDetectionStrategy.scala │ ├── AbsoluteChangeStrategy.scala │ ├── HistoryUtils.scala │ ├── DetectionResult.scala │ ├── SimpleThresholdStrategy.scala │ ├── RelativeRateOfChangeStrategy.scala │ └── BatchNormalStrategy.scala │ ├── suggestions │ ├── rules │ │ ├── ConstraintRule.scala │ │ ├── CompleteIfCompleteRule.scala │ │ ├── NonNegativeNumbersRule.scala │ │ ├── UniqueIfApproximatelyUniqueRule.scala │ │ ├── RetainTypeRule.scala │ │ ├── RetainCompletenessRule.scala │ │ └── CategoricalRangeRule.scala │ ├── ConstraintSuggestionResult.scala │ └── ConstraintSuggestion.scala │ ├── repository │ └── MetricsRepository.scala │ ├── metrics │ ├── HistogramMetric.scala │ ├── Metric.scala │ └── KLLMetric.scala │ ├── io │ └── DfsUtils.scala │ └── profiles │ └── ColumnProfilerRunner.scala ├── Makefile ├── settings.xml ├── docs └── key-concepts.md └── CONTRIBUTING.md /NOTICE: -------------------------------------------------------------------------------- 1 | Deequ 2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | **/*.iml 4 | target/.travis/public-signing-key.gpg 5 | target/ 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: oraclejdk8 3 | dist: trusty 4 | 5 | script: make build 6 | -------------------------------------------------------------------------------- /test-data/README.md: -------------------------------------------------------------------------------- 1 | # Dataset used for testing 2 | 3 | * [titanic.csv](https://www.kaggle.com/c/titanic/data) -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Change this to set Spark log level 2 | log4j.logger.org.apache.spark=WARN 3 | 4 | # Silence akka remoting 5 | log4j.logger.Remoting=WARN 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | 10 | # INFO log level not required for tests 11 | log4j.logger.org.apache=WARN 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # mvn profiles for the different supported 2 | # Spark and Scala versions. Uncomment 3 | # the one that you want to use. You can also 4 | # override the profile on the command line: 5 | # `make MVN_PROFILE=spark-2.4-scala-2.11 build` 6 | MVN_PROFILE := spark-3.0-scala-2.12 7 | # MVN_PROFILE := spark-2.4-scala-2.11 8 | # MVN_PROFILE := spark-2.3-scala-2.11 9 | # MVN_PROFILE := spark-2.2-scala-2.11 10 | 11 | # Build the project for specific Spark and 12 | # Scala versions. You can change the profile 13 | # variable to use a different Scala or Spark 14 | # version (see list above). 15 | # If you need more log output, remove the -q flag.
16 | build: 17 | 	mvn clean install -q -P $(MVN_PROFILE) 18 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | <settings> 2 | <servers> 3 | <server> 4 | <id>ossrh</id> 5 | <username>${env.MAVEN_REPO_USERNAME}</username> 6 | <password>${env.MAVEN_REPO_PASSWORD}</password> 7 | </server> 8 | </servers> 9 | <profiles> 10 | <profile> 11 | <id>release</id> 12 | <activation> 13 | <activeByDefault>true</activeByDefault> 14 | </activation> 15 | <properties> 16 | <gpg.executable>gpg</gpg.executable> 17 | <gpg.keyname>72A07B34207DF21F2CD468178D0084713489CE20</gpg.keyname> 18 | <gpg.passphrase>${env.MAVEN_GPG_PASSPHRASE}</gpg.passphrase> 19 | </properties> 20 | </profile> 21 | </profiles> 22 | </settings> 23 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/FilterableAnalyzer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | /** 20 | * Common trait for Analyzers that support dataset filtering 21 | */ 22 | trait FilterableAnalyzer { 23 | def filterCondition: Option[String] 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/entities.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | private[deequ] case class Item( 20 | id: Long, 21 | productName: String, 22 | description: String, 23 | priority: String, 24 | numViews: Long 25 | ) 26 | 27 | private[deequ] case class Manufacturer( 28 | id: Long, 29 | manufacturerName: String, 30 | countryCode: String 31 | ) 32 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/KLL/KLLBenchmarkHelper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
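The FilterableAnalyzer trait above is the hook through which analyzers expose an optional SparkSQL row filter. A minimal sketch of an implementing analyzer (`MyRowCount` is hypothetical, not part of deequ):

```scala
import com.amazon.deequ.analyzers.FilterableAnalyzer

// Hypothetical analyzer that carries an optional `where` predicate and
// surfaces it through the FilterableAnalyzer trait.
case class MyRowCount(where: Option[String] = None) extends FilterableAnalyzer {
  override def filterCondition: Option[String] = where
}

val analyzer = MyRowCount(Some("status = 'active'"))
// Callers can inspect the filter uniformly across analyzer types.
analyzer.filterCondition.foreach(cond => println(s"filtered by: $cond"))
```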
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL 18 | 19 | import com.amazon.deequ.analyzers.{KLLSketch, QuantileNonSample} 20 | 21 | object KLLBenchmarkHelper { 22 | 23 | def floatSketch(): QuantileNonSample[java.lang.Float] = { 24 | new QuantileNonSample[java.lang.Float](KLLSketch.DEFAULT_SKETCH_SIZE, 25 | KLLSketch.DEFAULT_SHRINKING_FACTOR) 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/TempFileUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | import java.nio.file.Files 20 | import java.util.UUID 21 | 22 | object TempFileUtils { 23 | def tempDir(prefix: String = UUID.randomUUID().toString): String = { 24 | val tempDir = Files.createTempDirectory(prefix).toFile 25 | tempDir.deleteOnExit() 26 | tempDir.getAbsolutePath 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/constraints/ConstrainableDataTypes.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.constraints 18 | 19 | object ConstrainableDataTypes extends Enumeration { 20 | val Null: Value = Value(0) 21 | val Fractional: Value = Value(1) 22 | val Integral: Value = Value(2) 23 | val Boolean: Value = Value(3) 24 | val String: Value = Value(4) 25 | val Numeric: Value = Value(5) // Union of integral and fractional 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/CollectionUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. 
This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | object CollectionUtils { 20 | 21 | implicit class SeqExtensions[A](val source: Seq[A]) { 22 | def forEachOrder(f: Seq[A] => Any): Unit = { 23 | source.combinations(source.size) 24 | .flatMap { _.permutations } 25 | .foreach { distinctOrder => f(distinctOrder) } 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/checks/ColumnCondition.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | import org.apache.spark.sql.functions.{col} 20 | 21 | private[checks] object ColumnCondition { 22 | 23 | def isEachNotNull(cols: Seq[String]): String = { 24 | cols 25 | .map(col(_).isNotNull) 26 | .reduce(_ and _) 27 | .toString() 28 | } 29 | 30 | def isAnyNotNull(cols: Seq[String]): String = { 31 | cols 32 | .map(col(_).isNotNull) 33 | .reduce(_ or _) 34 | .toString() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.constraints 18 | 19 | import org.apache.spark.sql.DataFrame 20 | 21 | object ConstraintUtils { 22 | 23 | def calculate(constraint: Constraint, df: DataFrame): ConstraintResult = { 24 | 25 | val analysisBasedConstraint = constraint match { 26 | case nc: ConstraintDecorator => nc.inner 27 | case c: Constraint => c 28 | } 29 | 30 | analysisBasedConstraint.asInstanceOf[AnalysisBasedConstraint[_, _, _]].calculateAndEvaluate(df) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/examples/ExamplesTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. 
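The `forEachOrder` extension above runs a function against every distinct ordering of a sequence, which lets tests catch accidental order-sensitivity. A usage sketch:

```scala
import com.amazon.deequ.utils.CollectionUtils.SeqExtensions

// The block runs once per ordering of the input:
// (1,2,3), (1,3,2), (2,1,3), (2,3,1), (3,1,2), (3,2,1)
Seq(1, 2, 3).forEachOrder { order =>
  assert(order.sum == 6) // the property under test must not depend on order
}
```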
All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import org.scalatest.WordSpec 20 | 21 | class ExamplesTest extends WordSpec { 22 | 23 | "all examples" should { 24 | "run without errors" in { 25 | BasicExample.main(Array.empty) 26 | IncrementalMetricsExample.main(Array.empty) 27 | MetricsRepositoryExample.main(Array.empty) 28 | UpdateMetricsOnPartitionedDataExample.main(Array.empty) 29 | DataProfilingExample.main(Array.empty) 30 | AnomalyDetectionExample.main(Array.empty) 31 | ConstraintSuggestionExample.main(Array.empty) 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon 18 | 19 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 20 | import org.apache.spark.sql.types.{StructField, StructType} 21 | import org.apache.spark.sql.types.{ DataType => SparkDT } 22 | 23 | package object deequ { 24 | def dataFrameWithColumn( 25 | name: String, 26 | columnType: SparkDT, 27 | sparkSession: SparkSession, 28 | values: Row*) 29 | : DataFrame = { 30 | 31 | import scala.collection.JavaConverters._ 32 | val struct = StructType(StructField(name, columnType) :: Nil) 33 | sparkSession.createDataFrame(values.asJava, struct).toDF(name) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
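The `dataFrameWithColumn` helper above builds single-column test DataFrames. A usage sketch; `session` is assumed to be a SparkSession provided by the surrounding test (e.g. via SparkContextSpec):

```scala
import com.amazon.deequ.dataFrameWithColumn
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StringType

// Three-row DataFrame with a single nullable string column "att1".
val df = dataFrameWithColumn("att1", StringType, session,
  Row("foo"), Row(null), Row("bar"))
```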
14 | * 15 | */ 16 | 17 | 18 | package com.amazon.deequ.anomalydetection 19 | 20 | /** 21 | * Provided for backwards compatibility. 22 | * The old [[RateOfChangeStrategy]] actually detects absolute changes, 23 | * so it has been migrated to [[AbsoluteChangeStrategy]]. 24 | * Use [[RelativeRateOfChangeStrategy]] if you want to 25 | * detect changes relative to the previous values. 26 | */ 27 | @deprecated("use AbsoluteChangeStrategy instead, which describes the strategy more accurately") 28 | case class RateOfChangeStrategy( 29 | maxRateDecrease: Option[Double] = None, 30 | maxRateIncrease: Option[Double] = None, 31 | order: Int = 1) extends BaseChangeStrategy -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** Interface for all strategies that spot anomalies in a series of data points. */ 20 | trait AnomalyDetectionStrategy { 21 | 22 | /** 23 | * Search for anomalies in a series of data points. 24 | * 25 | * @param dataSeries The data contained in a Vector of Doubles 26 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 27 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 28 | */ 29 | def detect( 30 | dataSeries: Vector[Double], 31 | searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/checks/ColumnConditionTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
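To make the `detect` contract concrete, here is a minimal hand-rolled strategy. It is illustrative only; the repository's own SimpleThresholdStrategy is the real counterpart:

```scala
import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionStrategy}

// Flags every data point above a fixed threshold as an anomaly.
case class AboveThresholdStrategy(threshold: Double) extends AnomalyDetectionStrategy {

  override def detect(
      dataSeries: Vector[Double],
      searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] = {

    val (start, end) = searchInterval
    dataSeries.zipWithIndex
      .slice(start, end) // [a, b), as documented on the trait
      .collect { case (value, index) if value > threshold =>
        index -> Anomaly(Option(value), confidence = 1.0)
      }
  }
}
```

A call like `AboveThresholdStrategy(4.0).detect(Vector(1.0, 5.0, 2.0))` would return `Seq((1, Anomaly(Some(5.0), 1.0)))`.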
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | 20 | import org.scalatest.wordspec.AnyWordSpec 21 | 22 | class ColumnConditionTest extends AnyWordSpec { 23 | 24 | "ColumnCondition" should { 25 | 26 | "return the correct isEachNotNull condition" in { 27 | assert( 28 | ColumnCondition.isEachNotNull(Seq("att1", "att2", "att3")) == 29 | "(((att1 IS NOT NULL) AND (att2 IS NOT NULL)) AND (att3 IS NOT NULL))" 30 | ) 31 | } 32 | 33 | "return the correct isAnyNotNull condition" in { 34 | assert( 35 | ColumnCondition.isAnyNotNull(Seq("att1", "att2", "att3")) == 36 | "(((att1 IS NOT NULL) OR (att2 IS NOT NULL)) OR (att3 IS NOT NULL))" 37 | ) 38 | } 39 | } 40 | 41 | } 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/CountDistinct.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.metrics.DoubleMetric 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.count 22 | import Analyzers._ 23 | 24 | case class CountDistinct(columns: Seq[String]) 25 | extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) { 26 | 27 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 28 | count("*") :: Nil 29 | } 30 | 31 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { 32 | toSuccessMetric(result.getLong(offset).toDouble) 33 | } 34 | } 35 | 36 | object CountDistinct { 37 | def apply(column: String): CountDistinct = { 38 | new CountDistinct(column :: Nil) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulStdDevPop.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
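CountDistinct above counts rows in a pre-grouped frequency table; in typical use it is not called directly but scheduled through deequ's analysis runner. A hedged sketch (AnalysisRunner and AnalyzerContext live elsewhere in this repository and are not shown in this excerpt; `df` is an assumed input DataFrame):

```scala
import com.amazon.deequ.analyzers.CountDistinct
import com.amazon.deequ.analyzers.runners.AnalysisRunner

val metrics = AnalysisRunner
  .onData(df)
  .addAnalyzer(CountDistinct("att1"))        // single column via the apply helper
  .addAnalyzer(CountDistinct(Seq("a", "b"))) // distinct combinations of two columns
  .run()

// The resulting context maps each analyzer to its computed Metric.
metrics.metricMap.foreach { case (analyzer, metric) =>
  println(s"$analyzer -> ${metric.value}")
}
```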
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions._ 20 | import org.apache.spark.sql.catalyst.expressions.aggregate.CentralMomentAgg 21 | import org.apache.spark.sql.types._ 22 | 23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop */ 24 | private[sql] case class StatefulStdDevPop(child: Expression) extends CentralMomentAgg(child) { 25 | 26 | override protected def momentOrder = 2 27 | 28 | override def dataType: DataType = StructType(StructField("n", DoubleType) :: 29 | StructField("avg", DoubleType) :: StructField("m2", DoubleType) :: Nil) 30 | 31 | override val evaluateExpression: Expression = CreateStruct(n :: avg :: m2 :: Nil) 32 | 33 | override def prettyName: String = "stateful_stddev_pop" 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/ConstraintRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.profiles.ColumnProfile 20 | import com.amazon.deequ.suggestions._ 21 | 22 | /** Abstract base class for all constraint suggestion rules */ 23 | abstract class ConstraintRule[P <: ColumnProfile] { 24 | 25 | val ruleDescription: String 26 | 27 | /** 28 | * Decides whether the rule should be applied to a particular column 29 | * 30 | * @param profile profile of the column 31 | * @param numRecords overall number of records 32 | * @return 33 | */ 34 | def shouldBeApplied(profile: P, numRecords: Long): Boolean 35 | 36 | /** 37 | * Generates a suggested constraint for the column 38 | * 39 | * @param profile profile of the column 40 | * @param numRecords overall number of records 41 | * @return 42 | */ 43 | def candidate(profile: P, numRecords: Long): ConstraintSuggestion 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Distinctness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
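A rule is a shouldBeApplied/candidate pair; the suggestion engine only asks for a candidate when the predicate holds. A sketch of that driving logic (the real engine lives in the suggestions package, not shown here):

```scala
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.rules.ConstraintRule

// Assumed-shape helper: evaluate one rule against one column profile.
def suggestFor(
    rule: ConstraintRule[ColumnProfile],
    profile: ColumnProfile,
    numRecords: Long): Option[ConstraintSuggestion] = {
  if (rule.shouldBeApplied(profile, numRecords)) {
    Some(rule.candidate(profile, numRecords))
  } else {
    None
  }
}
```

A concrete rule implementation (CompleteIfCompleteRule) appears further down in this listing.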
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.functions.{col, sum} 21 | import org.apache.spark.sql.types.DoubleType 22 | import org.apache.spark.sql.Column 23 | 24 | /** 25 | * Distinctness is the fraction of distinct values in one or more columns. 26 | * 27 | * @param columns the column(s) for which to compute distinctness 28 | */ 29 | case class Distinctness(columns: Seq[String], where: Option[String] = None) 30 | extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns) 31 | with FilterableAnalyzer { 32 | 33 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 34 | (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil 35 | } 36 | 37 | override def filterCondition: Option[String] = where 38 | } 39 | 40 | object Distinctness { 41 | def apply(column: String): Distinctness = { 42 | new Distinctness(column :: Nil) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | /** 22 | * The tested class RateOfChangeStrategy is deprecated. 23 | * This test ensures backwards compatibility for deequ checks that still rely on this strategy. 24 | */ 25 | class RateOfChangeStrategyTest extends WordSpec with Matchers { 26 | 27 | "RateOfChange Strategy" should { 28 | 29 | val strategy = RateOfChangeStrategy(Some(-2.0), Some(2.0)) 30 | val data = (for (i <- 0 to 50) yield { 31 | if (i < 20 || i > 30) { 32 | 1.0 33 | } else { 34 | if (i % 2 == 0) i else -i 35 | } 36 | }).toVector 37 | 38 | "detect all anomalies if no interval specified" in { 39 | val anomalyResult = strategy.detect(data) 40 | val expected = for (i <- 20 to 31) yield { 41 | (i, Anomaly(Option(data(i)), 1.0)) 42 | } 43 | assert(anomalyResult == expected) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Entropy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
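Distinctness counts every value that occurs at least once (`geq(1)`), whereas Uniqueness, shown further down in this listing, counts only values occurring exactly once. As a worked example on a column with rows "a", "a", "b": distinctness is 2/3 (values "a" and "b") while uniqueness is 1/3 (only "b" occurs once). A sketch, assuming deequ's analysis runner and an existing DataFrame `df`:

```scala
import com.amazon.deequ.analyzers.{Distinctness, Uniqueness}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

// For att1 = ["a", "a", "b"]:
//   Distinctness("att1") -> 2.0 / 3.0
//   Uniqueness("att1")   -> 1.0 / 3.0
val result = AnalysisRunner
  .onData(df)
  .addAnalyzer(Distinctness("att1"))
  .addAnalyzer(Uniqueness("att1"))
  .run()
```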
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, sum, udf} 22 | 23 | /** 24 | * Entropy is a measure of the level of information contained in a message. Given the probability 25 | * distribution over values in a column, it describes how many bits are required to identify a 26 | * value. 27 | */ 28 | case class Entropy(column: String, where: Option[String] = None) 29 | extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil) 30 | with FilterableAnalyzer { 31 | 32 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 33 | val summands = udf { (count: Double) => 34 | if (count == 0.0) { 35 | 0.0 36 | } else { 37 | -(count / numRows) * math.log(count / numRows) 38 | } 39 | } 40 | 41 | sum(summands(col(COUNT_COL))) :: Nil 42 | } 43 | 44 | override def filterCondition: Option[String] = where 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** 20 | * Detects anomalies based on the values' absolute change. 21 | * The order of the difference can be set manually. 22 | * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]]. 23 | * 24 | * For example, AbsoluteChangeStrategy(Some(-10.0), Some(10.0), 1) 25 | * calculates the first discrete difference 26 | * and flags a point as an anomaly if its value changes by more than 10.0 in one timestep. 27 | * 28 | * @param maxRateDecrease Upper bound of accepted decrease (lower bound of increase). 29 | * @param maxRateIncrease Upper bound of accepted growth. 30 | * @param order Order of the calculated difference. 31 | * Set to 1, it calculates the difference between two consecutive values. 32 | */ 33 | case class AbsoluteChangeStrategy( 34 | maxRateDecrease: Option[Double] = None, 35 | maxRateIncrease: Option[Double] = None, 36 | order: Int = 1) extends BaseChangeStrategy 37 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/ExampleUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
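The summand UDF above contributes -(c/n) * ln(c/n) per distinct value, so the metric is computed with natural logarithms (nats). A self-contained worked example of the same arithmetic:

```scala
// Column with rows "a", "a", "b", "b": two distinct values, each with count 2.
val counts = Seq(2.0, 2.0)
val numRows = counts.sum

// -(0.5 * ln 0.5) - (0.5 * ln 0.5) = ln 2
val entropy = counts.map { c => -(c / numRows) * math.log(c / numRows) }.sum
// entropy == 0.6931471805599453 (ln 2)
```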
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import org.apache.spark.sql.{DataFrame, SparkSession} 20 | 21 | private[deequ] object ExampleUtils { 22 | 23 | def withSpark(func: SparkSession => Unit): Unit = { 24 | val session = SparkSession.builder() 25 | .master("local") 26 | .appName("test") 27 | .config("spark.ui.enabled", "false") 28 | .getOrCreate() 29 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir")) 30 | 31 | try { 32 | func(session) 33 | } finally { 34 | session.stop() 35 | System.clearProperty("spark.driver.port") 36 | } 37 | } 38 | 39 | def itemsAsDataframe(session: SparkSession, items: Item*): DataFrame = { 40 | val rdd = session.sparkContext.parallelize(items) 41 | session.createDataFrame(rdd) 42 | } 43 | 44 | def manufacturersAsDataframe(session: SparkSession, manufacturers: Manufacturer*): DataFrame = { 45 | val rdd = session.sparkContext.parallelize(manufacturers) 46 | session.createDataFrame(rdd) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Uniqueness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, lit, sum} 22 | import org.apache.spark.sql.types.DoubleType 23 | 24 | /** Uniqueness is the fraction of unique values in one or more columns, i.e., 25 | * values that occur exactly once. */ 26 | case class Uniqueness(columns: Seq[String], where: Option[String] = None) 27 | extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns) 28 | with FilterableAnalyzer { 29 | 30 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 31 | (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil 32 | } 33 | 34 | override def filterCondition: Option[String] = where 35 | } 36 | 37 | object Uniqueness { 38 | def apply(column: String): Uniqueness = { 39 | new Uniqueness(column :: Nil) 40 | } 41 | 42 | def apply(column: String, where: Option[String]): Uniqueness = { 43 | new Uniqueness(column :: Nil, where) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Size.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc.
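withSpark above wraps a body in a short-lived local SparkSession and guarantees cleanup. A usage sketch built on the Item entity defined earlier in this listing (the row values are illustrative only):

```scala
import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark}

withSpark { session =>
  val items = itemsAsDataframe(session,
    Item(1, "Thingy A", "awesome thing.", "high", 0),
    Item(2, "Thingy B", "available tomorrow", "low", 0))
  items.show()
} // the session is stopped even if the body throws
```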
or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.metrics.Entity 20 | import org.apache.spark.sql.{Column, Row} 21 | import Analyzers._ 22 | 23 | case class NumMatches(numMatches: Long) extends DoubleValuedState[NumMatches] { 24 | 25 | override def sum(other: NumMatches): NumMatches = { 26 | NumMatches(numMatches + other.numMatches) 27 | } 28 | 29 | override def metricValue(): Double = { 30 | numMatches.toDouble 31 | } 32 | 33 | } 34 | 35 | /** Size is the number of rows in a DataFrame. */ 36 | case class Size(where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[NumMatches]("Size", "*", Entity.Dataset) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | conditionalCount(where) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatches] = { 45 | ifNoNullsIn(result, offset) { _ => 46 | NumMatches(result.getLong(offset)) 47 | } 48 | } 49 | 50 | override def filterCondition: Option[String] = where 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
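Because Size implements FilterableAnalyzer, the same analyzer can count all rows or only rows matching a SparkSQL predicate. A hedged sketch using the analysis runner (`df` is an assumed DataFrame):

```scala
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner

val sizes = AnalysisRunner
  .onData(df)
  .addAnalyzer(Size())                                  // all rows
  .addAnalyzer(Size(where = Some("priority = 'high'"))) // filtered subset
  .run()
```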
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.functions.{length, max} 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | case class MaxLength(column: String, where: Option[String] = None) 26 | extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(): Seq[Column] = { 30 | max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { 34 | ifNoNullsIn(result, offset) { _ => 35 | MaxState(result.getDouble(offset)) 36 | } 37 | } 38 | 39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 40 | hasColumn(column) :: isString(column) :: Nil 41 | } 42 | 43 | override def filterCondition: Option[String] = where 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/DatatypeSuggestionTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | 19 | import com.amazon.deequ.profiles.{ColumnProfiler, ColumnProfiles, StandardColumnProfile} 20 | import com.amazon.deequ.utils.FixtureSupport 21 | import org.apache.spark.sql.{DataFrame, SparkSession} 22 | import org.scalamock.scalatest.MockFactory 23 | import org.scalatest.{Matchers, WordSpec} 24 | 25 | class DatatypeSuggestionTest extends WordSpec with Matchers with SparkContextSpec 26 | with FixtureSupport with MockFactory { 27 | 28 | "Column Profiler" should { 29 | "return the correct datatype (String) when profiling empty string columns" in 30 | withSparkSession { sparkSession => 31 | 32 | val df = getEmptyColumnDataDf(sparkSession = sparkSession) 33 | 34 | val profile = ColumnProfiler 35 | .profile(df, Option(Seq("att1"))) 36 | .profiles("att1") 37 | 38 | assert(profile.isInstanceOf[StandardColumnProfile]) 39 | assert(profile.isDataTypeInferred && profile.dataType.toString.equalsIgnoreCase("String")) 40 | } 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MinLength.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.functions.{length, min} 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | case class MinLength(column: String, where: Option[String] = None) 26 | extends StandardScanShareableAnalyzer[MinState]("MinLength", column) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(): Seq[Column] = { 30 | min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { 34 | ifNoNullsIn(result, offset) { _ => 35 | MinState(result.getDouble(offset)) 36 | } 37 | } 38 | 39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 40 | hasColumn(column) :: isString(column) :: Nil 41 | } 42 | 43 | override def filterCondition: Option[String] = where 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/HistoryUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import com.amazon.deequ.metrics.Metric 20 | 21 | /** 22 | * Contains utility methods to convert tuples of date and metric to a DataPoint 23 | */ 24 | private[deequ] object HistoryUtils { 25 | 26 | /** 27 | * Given a sequence of dated optional metrics, return sequence of dated optional metric values. 
28 | * 29 | * @param metrics Sequence of dated optional metrics 30 | * @tparam M Type of the metric value 31 | * @return Sequence of dated optional metric values 32 | */ 33 | def extractMetricValues[M](metrics: Seq[(Long, Option[Metric[M]])]): Seq[DataPoint[M]] = { 34 | metrics.map { case (date, metric) => DataPoint(date, extractMetricValue[M](metric)) } 35 | } 36 | 37 | /** 38 | * Given an optional metric, returns the optional metric value 39 | * 40 | * @param metric Optional metric 41 | * @tparam M Type of the metric value 42 | * @return Optional metric value 43 | */ 44 | def extractMetricValue[M](metric: Option[Metric[M]]): Option[M] = { 45 | metric.flatMap(_.value.toOption) 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtilsTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | class AnomalyDetectionTestUtilsTest extends WordSpec with Matchers { 22 | 23 | "AnomalyDetectionTestUtilsTest" should { 24 | 25 | "throw an exception if no value found" in { 26 | intercept[IllegalArgumentException] { 27 | AnomalyDetectionTestUtils.firstDoubleFromString("noNumber") 28 | } 29 | intercept[IllegalArgumentException] { 30 | AnomalyDetectionTestUtils.firstThreeDoublesFromString("noNumber") 31 | } 32 | } 33 | 34 | "find first value" in { 35 | val str = "xx3.141yyu4.2" 36 | val value = AnomalyDetectionTestUtils.firstDoubleFromString(str) 37 | assert(value == 3.141) 38 | } 39 | 40 | "find all 3 values" in { 41 | val str = "In this 1 string are 3.000 values, not 42.01" 42 | 43 | val (first, second, third) = AnomalyDetectionTestUtils.firstThreeDoublesFromString(str) 44 | assert(first === 1) 45 | assert(second === 3.0) 46 | assert(third === 42.01) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/AssertionUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
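A small sketch of extractMetricValues at work. It assumes the DoubleMetric(entity, name, instance, value) constructor from this repository's metrics package (not shown in this excerpt); failed and missing metrics both collapse to None:

```scala
import com.amazon.deequ.anomalydetection.HistoryUtils
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import scala.util.{Failure, Success}

val ok = DoubleMetric(Entity.Column, "Completeness", "att1", Success(0.98))
val failed = DoubleMetric(Entity.Column, "Completeness", "att1",
  Failure(new IllegalStateException("could not be computed")))

val series = Seq(
  (1000L, Some(ok)),     // -> DataPoint(1000, Some(0.98))
  (2000L, None),         // -> DataPoint(2000, None)
  (3000L, Some(failed))  // -> DataPoint(3000, None), a Failure has no value
)

val dataPoints = HistoryUtils.extractMetricValues[Double](series)
```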
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | object AssertionUtils { 22 | 23 | implicit class TryUtils[A](something: Try[A]) { 24 | def compare[B](other: Try[B]): Boolean = { 25 | (something, other) match { 26 | case (Success(a), Success(b)) => a == b 27 | case (Failure(a), Failure(b)) => a.getClass == b.getClass && (a.getMessage == b.getMessage) 28 | case (_, _) => false 29 | } 30 | } 31 | def compareFailureTypes[B](other: Try[B]): Boolean = { 32 | (something, other) match { 33 | case (Failure(a), Failure(b)) => a.getClass == b.getClass 34 | case (_, _) => false 35 | } 36 | } 37 | def compareOuterAndInnerFailureTypes[B](other: Try[B]): Boolean = { 38 | (something, other) match { 39 | case (Failure(a: Throwable), Failure(b: Throwable)) 40 | if (a.getCause != null) && (b.getCause != null) => 41 | (a.getClass == b.getClass) && (a.getCause.getClass == b.getCause.getClass) 42 | case (_, _) => false 43 | } 44 | } 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/CompleteIfCompleteRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.completenessConstraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** If a column is complete in the sample, we suggest a NOT NULL constraint */ 25 | case class CompleteIfCompleteRule() extends ConstraintRule[ColumnProfile] { 26 | 27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 28 | profile.completeness == 1.0 29 | } 30 | 31 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 32 | 33 | val constraint = completenessConstraint(profile.column, Check.IsOne) 34 | 35 | ConstraintSuggestion( 36 | constraint, 37 | profile.column, 38 | "Completeness: " + profile.completeness.toString, 39 | s"'${profile.column}' is not null", 40 | this, 41 | s""".isComplete("${profile.column}")""" 42 | ) 43 | } 44 | 45 | override val ruleDescription: String = "If a column is complete in the sample, " + 46 | "we suggest a NOT NULL constraint" 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/DetectionResult.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. 
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | class Anomaly( 20 | val value: Option[Double], 21 | val confidence: Double, 22 | val detail: Option[String] = None) { 23 | 24 | def canEqual(that: Any): Boolean = { 25 | that.isInstanceOf[Anomaly] 26 | } 27 | 28 | /** 29 | * Tests anomalies for equality. Ignores the detail field. 30 | * 31 | * @param obj The object/anomaly to compare against 32 | * @return true, if and only if the value and confidence are the same 33 | */ 34 | override def equals(obj: Any): Boolean = { 35 | obj match { 36 | case anomaly: Anomaly => anomaly.value == value && anomaly.confidence == confidence 37 | case _ => false 38 | } 39 | } 40 | 41 | override def hashCode: Int = { 42 | val prime = 31 43 | var result = 1 44 | result = prime * result + (if (value == null) 0 else value.hashCode) 45 | prime * result + confidence.hashCode 46 | } 47 | 48 | } 49 | 50 | object Anomaly { 51 | def apply(value: Option[Double], confidence: Double, detail: Option[String] = None): Anomaly = { 52 | new Anomaly(value, confidence, detail) 53 | } 54 | } 55 | 56 | case class DetectionResult(anomalies: Seq[(Long, Anomaly)] = Seq.empty) 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Sum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
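The equality contract above deliberately ignores detail, which keeps anomaly comparisons in tests stable regardless of explanatory text:

```scala
import com.amazon.deequ.anomalydetection.Anomaly

val a = Anomaly(Some(4.2), 1.0, Some("value jumped by 4.2"))
val b = Anomaly(Some(4.2), 1.0, None)

assert(a == b)                        // detail is ignored
assert(a != Anomaly(Some(4.2), 0.5))  // confidence is not
assert(a != Anomaly(Some(9.9), 1.0))  // neither is the value
```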
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.functions.sum 21 | import org.apache.spark.sql.types.{DoubleType, StructType} 22 | import org.apache.spark.sql.{Column, Row} 23 | import Analyzers._ 24 | 25 | case class SumState(sum: Double) extends DoubleValuedState[SumState] { 26 | 27 | override def sum(other: SumState): SumState = { 28 | SumState(sum + other.sum) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | sum 33 | } 34 | } 35 | 36 | case class Sum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[SumState]("Sum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = { 45 | ifNoNullsIn(result, offset) { _ => 46 | SumState(result.getDouble(offset)) 47 | } 48 | } 49 | 50 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 51 | hasColumn(column) :: isNumeric(column) :: Nil 52 | } 53 | 54 | override def filterCondition: Option[String] = where 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Completeness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested} 20 | import org.apache.spark.sql.functions.sum 21 | import org.apache.spark.sql.types.{IntegerType, StructType} 22 | import Analyzers._ 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | /** Completeness is the fraction of non-null values in a column of a DataFrame. 
*/ 26 | case class Completeness(column: String, where: Option[String] = None) extends 27 | StandardScanShareableAnalyzer[NumMatchesAndCount]("Completeness", column) with 28 | FilterableAnalyzer { 29 | 30 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 31 | 32 | ifNoNullsIn(result, offset, howMany = 2) { _ => 33 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 34 | } 35 | } 36 | 37 | override def aggregationFunctions(): Seq[Column] = { 38 | 39 | val summation = sum(conditionalSelection(column, where).isNotNull.cast(IntegerType)) 40 | 41 | summation :: conditionalCount(where) :: Nil 42 | } 43 | 44 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 45 | hasColumn(column) :: isNotNested(column) :: Nil 46 | } 47 | 48 | override def filterCondition: Option[String] = where 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/checks/CheckWithLastConstraintFilterable.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | import com.amazon.deequ.constraints.Constraint 20 | 21 | /** Allows replacing the last configured constraint in a check with a filtered version */ 22 | class CheckWithLastConstraintFilterable( 23 | level: CheckLevel.Value, 24 | description: String, 25 | constraints: Seq[Constraint], 26 | createReplacement: Option[String] => Constraint) 27 | extends Check(level, description, constraints) { 28 | 29 | /** 30 | * Defines a filter to apply before evaluating the previous constraint 31 | * 32 | * @param filter SparkSQL predicate to apply 33 | * @return a Check in which the last constraint is replaced with its filtered version 34 | */ 35 | def where(filter: String): Check = { 36 | 37 | val adjustedConstraints = 38 | constraints.take(constraints.size - 1) :+ createReplacement(Option(filter)) 39 | 40 | Check(level, description, adjustedConstraints) 41 | } 42 | } 43 | 44 | object CheckWithLastConstraintFilterable { 45 | def apply( 46 | level: CheckLevel.Value, 47 | description: String, 48 | constraints: Seq[Constraint], 49 | createReplacement: Option[String] => Constraint 50 | ): CheckWithLastConstraintFilterable = { 51 | 52 | new CheckWithLastConstraintFilterable(level, description, constraints, createReplacement) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/repository/MetricsRepository.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
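A usage sketch for the where-filtering above (REPL-style; the DataFrame `df` and its columns are assumptions, not part of this file): `where` swaps the last constraint for a filtered version, so the completeness of col2 is only evaluated on EU rows.

import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.checks.{Check, CheckLevel}

val check = Check(CheckLevel.Error, "completeness checks")
  .isComplete("col1")
  .isComplete("col2").where("marketplace = 'EU'")

val result = VerificationSuite().onData(df).addCheck(check).run()
println(result.status)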
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.repository 18 | 19 | import com.amazon.deequ.analyzers.runners.AnalyzerContext 20 | 21 | /** 22 | * Common trait for RepositoryIndexes where deequ runs can be stored. 23 | * A repository provides methods to store AnalysisResults (metrics) and VerificationResults (if any) 24 | */ 25 | trait MetricsRepository { 26 | 27 | /** 28 | * Saves Analysis results (metrics) 29 | * 30 | * @param resultKey A ResultKey that uniquely identifies an AnalysisResult 31 | * @param analyzerContext The resulting AnalyzerContext of an Analysis 32 | */ 33 | def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit 34 | 35 | /** 36 | * Get an AnalyzerContext saved using exactly the same resultKey, if present 37 | */ 38 | def loadByKey(resultKey: ResultKey): Option[AnalyzerContext] 39 | 40 | /** Get a builder class to construct a loading query to get AnalysisResults */ 41 | def load(): MetricsRepositoryMultipleResultsLoader 42 | 43 | } 44 | 45 | /** 46 | * Information that uniquely identifies an AnalysisResult 47 | * 48 | * @param dataSetDate A date related to the AnalysisResult 49 | * @param tags A map with additional annotations 50 | */ 51 | case class ResultKey(dataSetDate: Long, tags: Map[String, String] = Map.empty) 52 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/SparkBasicTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
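A sketch of storing and re-loading metrics through this trait, using the InMemoryMetricsRepository implementation that ships with deequ (REPL-style; the DataFrame `df` is assumed):

import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.repository.ResultKey
import com.amazon.deequ.repository.memory.InMemoryMetricsRepository

val repository = new InMemoryMetricsRepository()
val key = ResultKey(System.currentTimeMillis(), Map("dataset" -> "orders"))

AnalysisRunner.onData(df)
  .addAnalyzer(Size())
  .useRepository(repository)
  .saveOrAppendResult(key)
  .run()

// loadByKey returns the stored AnalyzerContext, if present
println(repository.loadByKey(key).isDefined)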
14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | class SparkBasicTest extends WordSpec with Matchers with SparkContextSpec { 22 | "check that initializing a spark context and a basic example works" in 23 | withSparkSession { sparkSession => 24 | val sc = sparkSession.sparkContext 25 | val xs = sc.parallelize(1 to 100) 26 | val res = xs.sum() 27 | res should be(5050) 28 | } 29 | 30 | "check that monitoring spark session works" in 31 | withMonitorableSparkSession { (sparkSession, sparkMonitor) => 32 | val sc = sparkSession.sparkContext 33 | val xs = sc.parallelize(1 to 100) 34 | 35 | 36 | (1 to 2).foreach { index => 37 | val res = sparkMonitor.withMonitoringSession { stat => 38 | val sum = xs.map(_ * index).sum() 39 | // Spark jobs are running in different monitoring sessions 40 | assert(stat.jobCount == 1) 41 | sum 42 | } 43 | res should be(5050 * index) 44 | } 45 | 46 | sparkMonitor.withMonitoringSession { stat => 47 | (1 to 2).foreach { index => 48 | xs.map(_ * index).sum() 49 | } 50 | // Spark jobs are running in the same monitoring session 51 | assert(stat.jobCount == 2) 52 | } 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/UniqueValueRatio.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import com.amazon.deequ.metrics.DoubleMetric 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.functions.{col, count, lit, sum} 23 | import org.apache.spark.sql.types.DoubleType 24 | 25 | case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) 26 | extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 30 | sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { 34 | val numUniqueValues = result.getDouble(offset) 35 | val numDistinctValues = result.getLong(offset + 1).toDouble 36 | 37 | toSuccessMetric(numUniqueValues / numDistinctValues) 38 | } 39 | 40 | override def filterCondition: Option[String] = where 41 | } 42 | 43 | object UniqueValueRatio { 44 | def apply(column: String): UniqueValueRatio = { 45 | new UniqueValueRatio(column :: Nil) 46 | } 47 | 48 | def apply(column: String, where: Option[String]): UniqueValueRatio = { 49 | new UniqueValueRatio(column :: Nil, where) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Maximum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
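A worked sketch of the ratio computed above: with values A, A, B, C, two of the three distinct values (B and C) occur exactly once, so the metric is 2/3. REPL-style, assuming a SparkSession `spark`:

import com.amazon.deequ.analyzers.UniqueValueRatio
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val data = Seq("A", "A", "B", "C").toDF("att1")

val context = AnalysisRunner.onData(data)
  .addAnalyzer(UniqueValueRatio(Seq("att1")))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// UniqueValueRatio of att1 ~ 0.6666...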
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.max 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import Analyzers._ 24 | 25 | case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] { 26 | 27 | override def sum(other: MaxState): MaxState = { 28 | MaxState(math.max(maxValue, other.maxValue)) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | maxValue 33 | } 34 | } 35 | 36 | case class Maximum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MaxState]("Maximum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | max(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { 45 | 46 | ifNoNullsIn(result, offset) { _ => 47 | MaxState(result.getDouble(offset)) 48 | } 49 | } 50 | 51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 52 | hasColumn(column) :: isNumeric(column) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Minimum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
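A small sketch of how the state above merges: partial maxima from different partitions combine by taking the larger value (REPL-style, deequ on the classpath):

import com.amazon.deequ.analyzers.MaxState

val partitionA = MaxState(3.0)
val partitionB = MaxState(5.0)
partitionA.sum(partitionB).metricValue() // 5.0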
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.min 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import Analyzers._ 24 | 25 | case class MinState(minValue: Double) extends DoubleValuedState[MinState] { 26 | 27 | override def sum(other: MinState): MinState = { 28 | MinState(math.min(minValue, other.minValue)) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | minValue 33 | } 34 | } 35 | 36 | case class Minimum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MinState]("Minimum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | min(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { 45 | 46 | ifNoNullsIn(result, offset) { _ => 47 | MinState(result.getDouble(offset)) 48 | } 49 | } 50 | 51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 52 | hasColumn(column) :: isNumeric(column) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/HistoryUtilsTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
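A sketch computing Minimum and Maximum together in a single pass (REPL-style, assuming a SparkSession `spark`):

import com.amazon.deequ.analyzers.{Maximum, Minimum}
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val numbers = Seq(1.0, 4.0, -2.0, 9.0).toDF("att1")

val context = AnalysisRunner.onData(numbers)
  .addAnalyzer(Minimum("att1"))
  .addAnalyzer(Maximum("att1"))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// Minimum -> -2.0, Maximum -> 9.0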
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import com.amazon.deequ.metrics.{DoubleMetric, Entity} 20 | import org.scalatest.{Matchers, WordSpec} 21 | 22 | import scala.util.{Failure, Success} 23 | 24 | class HistoryUtilsTest extends WordSpec with Matchers { 25 | 26 | "History Utils" should { 27 | val sampleException = new IllegalArgumentException() 28 | 29 | val noneMetric = None 30 | val metricWithNoValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name", 31 | Failure(sampleException))) 32 | val metricWithValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name", 33 | Success(50))) 34 | 35 | "extract optional metric value" in { 36 | assert(HistoryUtils.extractMetricValue[Double](noneMetric).isEmpty) 37 | assert(HistoryUtils.extractMetricValue[Double](metricWithNoValue).isEmpty) 38 | assert(HistoryUtils.extractMetricValue[Double](metricWithValue).contains(50)) 39 | 40 | } 41 | "extract optional metric values" in { 42 | val metrics = Seq(0L -> noneMetric, 1L -> metricWithNoValue, 2L -> metricWithValue) 43 | assert(HistoryUtils.extractMetricValues[Double](metrics) == Seq(DataPoint[Double](0L, None), 44 | DataPoint[Double](1L, None), DataPoint[Double](2, Some(50)))) 45 | } 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulCorrelation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.aggregate.Corr 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.types._ 22 | 23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.Corr */ 24 | private[sql] class StatefulCorrelation(x: Expression, y: Expression) extends Corr(x, y) { 25 | 26 | override def dataType: org.apache.spark.sql.types.DataType = 27 | StructType(StructField("n", DoubleType) :: StructField("xAvg", DoubleType) :: 28 | StructField("yAvg", DoubleType) :: StructField("ck", DoubleType) :: 29 | StructField("xMk", DoubleType) :: StructField("yMk", DoubleType) :: Nil) 30 | 31 | override val evaluateExpression: Expression = { 32 | CreateStruct(n :: xAvg :: yAvg :: ck :: xMk :: yMk :: Nil) 33 | } 34 | 35 | override def prettyName: String = "stateful_corr" 36 | 37 | override def canEqual(other: Any): Boolean = other.isInstanceOf[StatefulCorrelation] 38 | 39 | override def equals(other: Any): Boolean = other match { 40 | case that: StatefulCorrelation => 41 | (that canEqual this) && evaluateExpression == that.evaluateExpression 42 | case _ => false 43 | } 44 | 45 | override def hashCode(): Int = { 46 | val state = Seq(super.hashCode(), evaluateExpression) 47 | state.map { _.hashCode() }.foldLeft(0) {(a, b) => 31 * a + b } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Mean.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.{count, sum} 22 | import org.apache.spark.sql.types.{DoubleType, StructType, LongType} 23 | import Analyzers._ 24 | 25 | case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] { 26 | 27 | override def sum(other: MeanState): MeanState = { 28 | MeanState(sum + other.sum, count + other.count) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | if (count == 0L) Double.NaN else sum / count 33 | } 34 | } 35 | 36 | case class Mean(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MeanState]("Mean", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | sum(conditionalSelection(column, where)).cast(DoubleType) :: 42 | count(conditionalSelection(column, where)).cast(LongType) :: Nil 43 | } 44 | 45 | override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = { 46 | 47 | ifNoNullsIn(result, offset, howMany = 2) { _ => 48 | MeanState(result.getDouble(offset), result.getLong(offset + 1)) 49 | } 50 | } 51 | 52 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 53 | hasColumn(column) :: isNumeric(column) :: Nil 54 | } 55 | 56 | override def filterCondition: Option[String] = where 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/HistogramMetric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
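A worked sketch of why MeanState keeps sum and count separately: merged partial means combine as (10 + 2) / (4 + 1) = 2.4, which differs from naively averaging the two partial means (REPL-style):

import com.amazon.deequ.analyzers.MeanState

val left = MeanState(sum = 10.0, count = 4L)
val right = MeanState(sum = 2.0, count = 1L)
left.sum(right).metricValue() // 2.4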
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | case class DistributionValue(absolute: Long, ratio: Double) 22 | 23 | case class Distribution(values: Map[String, DistributionValue], numberOfBins: Long) { 24 | 25 | def apply(key: String): DistributionValue = { 26 | values(key) 27 | } 28 | 29 | def argmax: String = { 30 | val (distributionKey, _) = values.toSeq 31 | .maxBy { case (_, distributionValue) => distributionValue.absolute } 32 | 33 | distributionKey 34 | } 35 | } 36 | 37 | case class HistogramMetric(column: String, value: Try[Distribution]) extends Metric[Distribution] { 38 | val entity: Entity.Value = Entity.Column 39 | val instance: String = column 40 | val name = "Histogram" 41 | 42 | def flatten(): Seq[DoubleMetric] = { 43 | value 44 | .map { distribution => 45 | val numberOfBins = Seq(DoubleMetric(entity, s"$name.bins", instance, 46 | Success(distribution.numberOfBins.toDouble))) 47 | 48 | val details = distribution.values 49 | .flatMap { case (key, distValue) => 50 | DoubleMetric(entity, s"$name.abs.$key", instance, Success(distValue.absolute)) :: 51 | DoubleMetric(entity, s"$name.ratio.$key", instance, Success(distValue.ratio)) :: Nil 52 | } 53 | numberOfBins ++ details 54 | } 55 | .recover { 56 | case e: Exception => Seq(DoubleMetric(entity, s"$name.bins", instance, Failure(e))) 57 | } 58 | .get 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
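A sketch of flattening a HistogramMetric into per-bin DoubleMetrics, as implemented above (REPL-style; the distribution values are made up):

import com.amazon.deequ.metrics.{Distribution, DistributionValue, HistogramMetric}
import scala.util.Success

val distribution = Distribution(
  Map("US" -> DistributionValue(6, 0.75), "EU" -> DistributionValue(2, 0.25)),
  numberOfBins = 2)

val metric = HistogramMetric("marketplace", Success(distribution))

distribution.argmax          // "US": the bin with the largest absolute count
metric.flatten().foreach(println)
// yields Histogram.bins, Histogram.abs.US, Histogram.ratio.US, Histogram.abs.EU, ...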
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import scala.util.matching.Regex 20 | 21 | /** 22 | * Utilities to test Anomaly Detection methods and related modules 23 | */ 24 | object AnomalyDetectionTestUtils { 25 | 26 | private val numericalValueRegex: Regex = """([+-]?([0-9]*[.])?[0-9]+([Ee][0-9]+)?)""".r 27 | 28 | /** 29 | * Finds the first numerical value in a string 30 | * 31 | * @param details The string containing a numerical value 32 | * @throws IllegalArgumentException Thrown if no value could be found 33 | * @return The value itself 34 | */ 35 | def firstDoubleFromString(details: String): Double = { 36 | val firstValue = numericalValueRegex.findFirstIn(details) 37 | 38 | require(firstValue.isDefined, "Input string did not contain a numerical value") 39 | 40 | firstValue.get.toString.toDouble 41 | } 42 | 43 | /** 44 | * Finds the first three numerical values in a string 45 | * 46 | * @param details The string containing at least three numerical values 47 | * @throws IllegalArgumentException Thrown if fewer than 3 values could be found 48 | * @return The values themselves 49 | */ 50 | def firstThreeDoublesFromString(details: String): (Double, Double, Double) = { 51 | val values = numericalValueRegex.findAllIn(details).toVector.map(_.toString.toDouble) 52 | 53 | require(values.length >= 3, "Input string did not contain at least 3 numerical values.") 54 | 55 | (values(0), values(1), values(2)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/Metric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | object Entity extends Enumeration { 22 | val Dataset, Column, Mutlicolumn = Value 23 | } 24 | 25 | /** Common trait for all data quality metrics */ 26 | trait Metric[T] { 27 | val entity: Entity.Value 28 | val instance: String 29 | val name: String 30 | val value: Try[T] 31 | 32 | /* 33 | * Composite metric objects, e.g. histograms, can implement this method to 34 | * return a flattened view of the internal values in terms of double metrics.
35 | * @see HistogramMetric for a sample 36 | */ 37 | def flatten(): Seq[DoubleMetric] 38 | } 39 | 40 | /** A data quality metric whose value is a double */ 41 | case class DoubleMetric( 42 | entity: Entity.Value, 43 | name: String, 44 | instance: String, 45 | value: Try[Double]) 46 | extends Metric[Double] { 47 | 48 | override def flatten(): Seq[DoubleMetric] = Seq(this) 49 | } 50 | 51 | case class KeyedDoubleMetric( 52 | entity: Entity.Value, 53 | name: String, 54 | instance: String, 55 | value: Try[Map[String, Double]]) 56 | extends Metric[Map[String, Double]] { 57 | 58 | override def flatten(): Seq[DoubleMetric] = { 59 | if (value.isSuccess) { 60 | value.get.map { case (key, correspondingValue) => 61 | DoubleMetric(entity, s"$name-$key", instance, Success(correspondingValue)) 62 | } 63 | .toSeq 64 | } else { 65 | Seq(DoubleMetric(entity, s"$name", instance, Failure(value.failed.get))) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
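A sketch of KeyedDoubleMetric.flatten as defined above: each key becomes its own DoubleMetric with the key appended to the metric name (REPL-style; the quantile values are made up):

import com.amazon.deequ.metrics.{Entity, KeyedDoubleMetric}
import scala.util.Success

val keyed = KeyedDoubleMetric(Entity.Column, "quantiles", "att1",
  Success(Map("0.5" -> 10.0, "0.9" -> 42.0)))

keyed.flatten().foreach(println)
// DoubleMetric(Column, quantiles-0.5, att1, Success(10.0)), ...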
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.utils.FixtureSupport 21 | import org.scalatest.matchers.should.Matchers 22 | import org.scalatest.wordspec.AnyWordSpec 23 | 24 | class StatesTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { 25 | 26 | "FrequenciesAndNumRows" should { 27 | "merge correctly" in withSparkSession { session => 28 | 29 | import session.implicits._ 30 | 31 | val dataA = Seq("A", "A", "B").toDF("att1") 32 | val dataB = Seq("A", "C", "C").toDF("att1") 33 | 34 | val stateA = FrequencyBasedAnalyzer.computeFrequencies(dataA, "att1" :: Nil) 35 | val stateB = FrequencyBasedAnalyzer.computeFrequencies(dataB, "att1" :: Nil) 36 | 37 | val stateAB = stateA.sum(stateB) 38 | 39 | println(stateA.frequencies.schema) 40 | stateA.frequencies.collect().foreach { println } 41 | println() 42 | 43 | println(stateB.frequencies.schema) 44 | stateB.frequencies.collect().foreach { println } 45 | println() 46 | 47 | println(stateAB.frequencies.schema) 48 | stateAB.frequencies.collect().foreach { println } 49 | 50 | val mergedFrequencies = stateAB.frequencies.collect() 51 | .map { row => row.getString(0) -> row.getLong(1) } 52 | .toMap 53 | 54 | assert(mergedFrequencies.size == 3) 55 | assert(mergedFrequencies.get("A").contains(3)) 56 | assert(mergedFrequencies.get("B").contains(1)) 57 | assert(mergedFrequencies.get("C").contains(2)) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** 20 | * A simple anomaly detection method that checks if values are in a specified range. 21 | * 22 | * @param lowerBound Lower bound of accepted range of values 23 | * @param upperBound Upper bound of accepted range of values 24 | */ 25 | case class SimpleThresholdStrategy( 26 | lowerBound: Double = Double.MinValue, 27 | upperBound: Double) 28 | extends AnomalyDetectionStrategy { 29 | 30 | require(lowerBound <= upperBound, "The lower bound must be smaller or equal to the upper bound.") 31 | 32 | /** 33 | * Search for anomalies in a series of data points. 34 | * 35 | * @param dataSeries The data contained in a Vector of Doubles 36 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 37 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 
38 | */ 39 | override def detect( 40 | dataSeries: Vector[Double], 41 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = { 42 | 43 | val (searchStart, searchEnd) = searchInterval 44 | 45 | require (searchStart <= searchEnd, "The start of the interval can't be larger than the end.") 46 | 47 | dataSeries.zipWithIndex 48 | .slice(searchStart, searchEnd) 49 | .filter { case (value, _) => value < lowerBound || value > upperBound } 50 | .map { case (value, index) => 51 | 52 | val detail = Some(s"[SimpleThresholdStrategy]: Value $value is not in " + 53 | s"bounds [$lowerBound, $upperBound]") 54 | 55 | (index, Anomaly(Option(value), 1.0, detail)) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/NonNegativeNumbersRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.complianceConstraint 21 | import com.amazon.deequ.profiles.{ColumnProfile, NumericColumnProfile} 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** If we see only non-negative numbers in a column, we suggest a corresponding constraint */ 25 | case class NonNegativeNumbersRule() extends ConstraintRule[ColumnProfile] { 26 | 27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 28 | profile match { 29 | case numericProfile: NumericColumnProfile => numericProfile.minimum.exists(_ >= 0.0) 30 | case _ => false 31 | } 32 | } 33 | 34 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 35 | 36 | val description = s"'${profile.column}' has no negative values" 37 | val constraint = complianceConstraint(description, s"${profile.column} >= 0", Check.IsOne) 38 | 39 | val minimum = profile match { 40 | case numericProfile: NumericColumnProfile 41 | if numericProfile.minimum.isDefined => numericProfile.minimum.get.toString 42 | case _ => "Error while calculating minimum!" 43 | } 44 | 45 | ConstraintSuggestion( 46 | constraint, 47 | profile.column, 48 | "Minimum: " + minimum, 49 | description, 50 | this, 51 | s""".isNonNegative("${profile.column}")""" 52 | ) 53 | } 54 | 55 | override val ruleDescription: String = "If we see only non-negative numbers in a " + 56 | "column, we suggest a corresponding constraint" 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/UniqueIfApproximatelyUniqueRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). 
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.uniquenessConstraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** 25 | * If the number of approximate distinct values in a column is close to the number of records 26 | * (within the error of the HLL sketch), we suggest a UNIQUE constraint 27 | */ 28 | case class UniqueIfApproximatelyUniqueRule() extends ConstraintRule[ColumnProfile] { 29 | 30 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 31 | 32 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords 33 | 34 | // TODO This bound depends on the error guarantees of the HLL sketch 35 | profile.completeness == 1.0 && math.abs(1.0 - approximateDistinctness) <= 0.08 36 | } 37 | 38 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 39 | 40 | val constraint = uniquenessConstraint(Seq(profile.column), Check.IsOne) 41 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords 42 | 43 | ConstraintSuggestion( 44 | constraint, 45 | profile.column, 46 | "ApproxDistinctness: " + approximateDistinctness.toString, 47 | s"'${profile.column}' is unique", 48 | this, 49 | s""".isUnique("${profile.column}")""" 50 | ) 51 | } 52 | 53 | override val ruleDescription: String = "If the number of approximate distinct values " + 54 | "in a column is close to the number of records (within the error of the HLL sketch), " + 55 | "we suggest a UNIQUE constraint" 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/NonSampleCompactor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | import scala.reflect.ClassTag 21 | import scala.util.Random 22 | 23 | /** 24 | * A quantile sketcher whose output is half the size of its input. 25 | * 26 | * @tparam T type of the items being sketched.
There should be an ordering 27 | * over this item type. 28 | */ 29 | class NonSampleCompactor[T]() 30 | (implicit ordering: Ordering[T], 31 | ct: ClassTag[T]) 32 | extends Serializable { 33 | 34 | var numOfCompress = 0 35 | var offset = 0 36 | var buffer: ArrayBuffer[T] = ArrayBuffer[T]() 37 | 38 | private def findOdd(items: Int): Option[T] = items % 2 match { 39 | case 1 => Some(buffer(math.max(items - 1, 0))) 40 | case _ => None 41 | } 42 | 43 | def compact : Array[T] = { 44 | var items = buffer.length 45 | val len = items - (items % 2) 46 | if (numOfCompress % 2 == 1) { 47 | offset = 1 - offset 48 | } 49 | // else { 50 | // offset = if (Random.nextBoolean()) 1 else 0 51 | // } 52 | val sortedBuffer = buffer.toArray.slice(0, len).sorted 53 | 54 | /** Selects half of the items from this level compactor to the next level compactor. 55 | * e.g. if sortedBuffer is Array(1,2,3,4), if offset is 1, output = Array(2,4), 56 | * and if offset is 0, output = Array(1,3), this will be the input to the next level compactor. 57 | */ 58 | val output = (offset until len by 2).map(sortedBuffer(_)).toArray 59 | val tail = findOdd(items) 60 | items = items % 2 61 | var newBuffer = ArrayBuffer[T]() 62 | if (tail.isDefined) { 63 | newBuffer = newBuffer :+ tail.get 64 | } 65 | buffer = newBuffer 66 | numOfCompress = numOfCompress + 1 67 | output 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/test/resources/EMRSparkShellTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | /* 18 | For testing inside EMR or other flavors of Spark cluster. Run the commands after building the git repo from source.
19 | Add additional test classes as needed 20 | scala 2.12 21 | spark-shell -i /src/test/resources/EMRSparkShellTest.txt \ 22 | --packages org.scalatest:scalatest_2.12:3.1.2,org.scalamock:scalamock_2.12:4.4.0,org.scala-lang:scala-compiler:2.12.10,\ 23 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \ 24 | --jars /target/deequ_2.12-1.1.0-SNAPSHOT.jar,/target/deequ_2.12-1.1.0-SNAPSHOT-tests.jar 25 | 26 | scala 2.11 27 | spark-shell -i /src/test/resources/EMRSparkShellTest.txt \ 28 | --packages org.scalatest:scalatest_2.11:3.1.2,org.scalamock:scalamock_2.11:4.4.0,org.scala-lang:scala-compiler:2.11.10,\ 29 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \ 30 | --jars /target/deequ-1.1.0-SNAPSHOT.jar,/target/spark-deequ-testing/deequ-1.1.0-SNAPSHOT-tests.jar 31 | */ 32 | 33 | import com.amazon.deequ.analyzers.{AnalysisTest, AnalyzerTests, IncrementalAnalysisTest} 34 | import com.amazon.deequ.analyzers.runners.{AnalysisRunnerTests, AnalyzerContextTest} 35 | import com.amazon.deequ.{VerificationResultTest, VerificationSuiteTest} 36 | 37 | (new VerificationSuiteTest).execute() 38 | (new VerificationResultTest).execute() 39 | (new AnalysisRunnerTests).execute() 40 | (new AnalyzerContextTest).execute() 41 | (new AnalysisTest).execute() 42 | (new AnalyzerTests).execute() 43 | (new IncrementalAnalysisTest).execute() 44 | //Add additional test classes as needed 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/checks/FilterableCheckTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | package checks 19 | 20 | import com.amazon.deequ.analyzers.{Completeness, Compliance} 21 | import com.amazon.deequ.utils.FixtureSupport 22 | import org.scalatest.matchers.should.Matchers 23 | import org.scalatest.wordspec.AnyWordSpec 24 | 25 | 26 | class FilterableCheckTest extends AnyWordSpec 27 | with Matchers 28 | with SparkContextSpec 29 | with FixtureSupport { 30 | 31 | "Filterable checks" should { 32 | "build correctly" in { 33 | 34 | val check = Check(CheckLevel.Error, "someCheck") 35 | .isComplete("col1") 36 | .isComplete("col2").where("marketplace = 'EU'") 37 | .hasCompleteness("col3", _ >= 0.9).where("marketplace = 'NA'") 38 | .satisfies("someCol > 5", "const1") 39 | .satisfies("someCol > 10", "const2").where("marketplace = 'EU'") 40 | 41 | val completenessAnalyzers = 42 | check.requiredAnalyzers() 43 | .filter { _.isInstanceOf[Completeness] } 44 | .map { _.asInstanceOf[Completeness] } 45 | .toArray 46 | .sortBy { _.column } 47 | 48 | assert(completenessAnalyzers.length == 3) 49 | assert(completenessAnalyzers.head.where.isEmpty) 50 | assert(completenessAnalyzers(1).where.contains("marketplace = 'EU'")) 51 | assert(completenessAnalyzers(2).where.contains("marketplace = 'NA'")) 52 | 53 | val complianceAnalyzers = 54 | check.requiredAnalyzers() 55 | .filter { _.isInstanceOf[Compliance] } 56 | .map { _.asInstanceOf[Compliance] } 57 | .toArray 58 | .sortBy { _.instance } 59 | 60 | assert(complianceAnalyzers.length == 2) 61 | assert(complianceAnalyzers.head.where.isEmpty) 62 | assert(complianceAnalyzers(1).where.contains("marketplace = 'EU'")) 63 | } 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Compliance.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import org.apache.spark.sql.types.IntegerType 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions._ 22 | import Analyzers._ 23 | 24 | /** 25 | * Compliance is a measure of the fraction of rows that comply with the given column constraint. 26 | * E.g. if the constraint is "att1>3" and the data frame has 5 rows with an att1 value greater than 27 | * 3 and 10 rows at or below 3, a DoubleMetric with value 0.33 would be returned. 28 | * 29 | * @param instance Unlike other column analyzers (e.g. completeness), this analyzer cannot 30 | * infer the metric instance name from the column name. 31 | * Also, the constraint given here can refer to multiple columns, 32 | * so a metric instance name should be provided, 33 | * describing what the analysis is being done for. 34 | * @param predicate SQL-predicate to apply per row 35 | * @param where Additional filter to apply before the analyzer is run.
36 | */ 37 | case class Compliance(instance: String, predicate: String, where: Option[String] = None) 38 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("Compliance", instance) 39 | with FilterableAnalyzer { 40 | 41 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 42 | 43 | ifNoNullsIn(result, offset, howMany = 2) { _ => 44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 45 | } 46 | } 47 | 48 | override def aggregationFunctions(): Seq[Column] = { 49 | 50 | val summation = sum(conditionalSelection(expr(predicate), where).cast(IntegerType)) 51 | 52 | summation :: conditionalCount(where) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/RetainTypeRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.analyzers.DataTypeInstances 20 | import com.amazon.deequ.checks.Check 21 | import com.amazon.deequ.constraints.ConstrainableDataTypes 22 | import com.amazon.deequ.constraints.Constraint.dataTypeConstraint 23 | import com.amazon.deequ.profiles.ColumnProfile 24 | import com.amazon.deequ.suggestions.ConstraintSuggestion 25 | 26 | /** If we detect a non-string type, we suggest a type constraint */ 27 | case class RetainTypeRule() extends ConstraintRule[ColumnProfile] { 28 | 29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 30 | val testableType = profile.dataType match { 31 | case DataTypeInstances.Integral | DataTypeInstances.Fractional | DataTypeInstances.Boolean => 32 | true 33 | case _ => false 34 | } 35 | 36 | profile.isDataTypeInferred && testableType 37 | } 38 | 39 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 40 | 41 | val typeToCheck = profile.dataType match { 42 | case DataTypeInstances.Fractional => ConstrainableDataTypes.Fractional 43 | case DataTypeInstances.Integral => ConstrainableDataTypes.Integral 44 | case DataTypeInstances.Boolean => ConstrainableDataTypes.Boolean 45 | } 46 | 47 | val constraint = dataTypeConstraint(profile.column, typeToCheck, Check.IsOne) 48 | 49 | ConstraintSuggestion( 50 | constraint, 51 | profile.column, 52 | "DataType: " + profile.dataType.toString, 53 | s"'${profile.column}' has type ${profile.dataType}", 54 | this, 55 | s""".hasDataType("${profile.column}", ConstrainableDataTypes.${profile.dataType})""" 56 | ) 57 | } 58 | 59 | override val ruleDescription: String = "If we detect a non-string type, we suggest a " + 60 | "type constraint" 61 | } 62 | -------------------------------------------------------------------------------- 
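A sketch of the Compliance analyzer documented above, computing the fraction of rows where att1 > 3 (REPL-style, assuming a SparkSession `spark`):

import com.amazon.deequ.analyzers.Compliance
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val data = Seq(1, 2, 4, 5, 6).toDF("att1")

val context = AnalysisRunner.onData(data)
  .addAnalyzer(Compliance("att1 greater than 3", "att1 > 3"))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// value = 0.6, since 3 of the 5 rows comply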
/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResult.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions 18 | 19 | import com.amazon.deequ.VerificationResult 20 | import com.amazon.deequ.checks.CheckStatus 21 | import com.amazon.deequ.profiles.{ColumnProfile, ColumnProfiles} 22 | 23 | /** 24 | * The result returned from the ConstraintSuggestionSuite 25 | * 26 | * @param columnProfiles The column profiles 27 | * @param numRecordsUsedForProfiling The number of records that were used for computing 28 | * the column profiles 29 | * @param constraintSuggestions The suggested constraints 30 | * @param verificationResult The verificationResult in case a train/test split was used 31 | */ 32 | case class ConstraintSuggestionResult( 33 | columnProfiles: Map[String, ColumnProfile], 34 | numRecordsUsedForProfiling: Long, 35 | constraintSuggestions: Map[String, Seq[ConstraintSuggestion]], 36 | verificationResult: Option[VerificationResult] = None) 37 | 38 | 39 | object ConstraintSuggestionResult { 40 | 41 | def getColumnProfilesAsJson(constraintSuggestionResult: ConstraintSuggestionResult): String = { 42 | 43 | ColumnProfiles 44 | .toJson(constraintSuggestionResult.columnProfiles.values.toSeq) 45 | } 46 | 47 | def getConstraintSuggestionsAsJson(constraintSuggestionResult: ConstraintSuggestionResult) 48 | : String = { 49 | ConstraintSuggestions 50 | .toJson(constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)( _ ++ _)) 51 | } 52 | 53 | def getEvaluationResultsAsJson(constraintSuggestionResult: ConstraintSuggestionResult) 54 | : String = { 55 | 56 | ConstraintSuggestions 57 | .evaluationResultsToJson( 58 | constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)( _ ++ _), 59 | constraintSuggestionResult.verificationResult.getOrElse( 60 | VerificationResult(CheckStatus.Warning, Map.empty, Map.empty))) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/ApproxCountDistinct.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
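A sketch of producing a ConstraintSuggestionResult with deequ's suggestion runner and default rule set, then serializing it with the helpers above (REPL-style; the DataFrame `df` is assumed):

import com.amazon.deequ.suggestions.{ConstraintSuggestionResult, ConstraintSuggestionRunner, Rules}

val suggestionResult = ConstraintSuggestionRunner()
  .onData(df)
  .addConstraintRules(Rules.DEFAULT)
  .run()

println(ConstraintSuggestionResult.getConstraintSuggestionsAsJson(suggestionResult))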
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.hasColumn 20 | import org.apache.spark.sql.DeequFunctions.stateful_approx_count_distinct 21 | import org.apache.spark.sql.catalyst.expressions.aggregate.DeequHyperLogLogPlusPlusUtils 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.{Column, Row} 24 | import Analyzers._ 25 | 26 | case class ApproxCountDistinctState(words: Array[Long]) 27 | extends DoubleValuedState[ApproxCountDistinctState] { 28 | 29 | override def sum(other: ApproxCountDistinctState): ApproxCountDistinctState = { 30 | ApproxCountDistinctState(DeequHyperLogLogPlusPlusUtils.merge(words, other.words)) 31 | } 32 | 33 | override def metricValue(): Double = { 34 | DeequHyperLogLogPlusPlusUtils.count(words) 35 | } 36 | 37 | override def toString: String = { 38 | s"ApproxCountDistinctState(${words.mkString(",")})" 39 | } 40 | } 41 | 42 | /** 43 | * Compute approximated count distinct with HyperLogLogPlusPlus. 44 | * 45 | * @param column Which column to compute this aggregation on. 46 | */ 47 | case class ApproxCountDistinct(column: String, where: Option[String] = None) 48 | extends StandardScanShareableAnalyzer[ApproxCountDistinctState]("ApproxCountDistinct", column) 49 | with FilterableAnalyzer { 50 | 51 | override def aggregationFunctions(): Seq[Column] = { 52 | stateful_approx_count_distinct(conditionalSelection(column, where)) :: Nil 53 | } 54 | 55 | override def fromAggregationResult(result: Row, offset: Int): Option[ApproxCountDistinctState] = { 56 | 57 | ifNoNullsIn(result, offset) { _ => 58 | DeequHyperLogLogPlusPlusUtils.wordsFromBytes(result.getAs[Array[Byte]](offset)) 59 | } 60 | } 61 | 62 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 63 | hasColumn(column) :: Nil 64 | } 65 | 66 | override def filterCondition: Option[String] = where 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Analysis.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext} 20 | import com.amazon.deequ.metrics.Metric 21 | import org.apache.spark.sql.DataFrame 22 | import org.apache.spark.storage.StorageLevel 23 | 24 | /** 25 | * Defines a set of analyzers to run on data. 
26 |  *
27 |  * @param analyzers the analyzers to apply to the data
28 |  */
29 | case class Analysis(analyzers: Seq[Analyzer[_, Metric[_]]] = Seq.empty) {
30 | 
31 |   def addAnalyzer(analyzer: Analyzer[_, Metric[_]]): Analysis = {
32 |     Analysis(analyzers :+ analyzer)
33 |   }
34 | 
35 |   def addAnalyzers(otherAnalyzers: Seq[Analyzer[_, Metric[_]]]): Analysis = {
36 |     Analysis(analyzers ++ otherAnalyzers)
37 |   }
38 | 
39 |   /**
40 |     * Compute the metrics from the analyzers configured in the analysis
41 |     *
42 |     * @param data data on which to operate
43 |     * @param aggregateWith load existing states for the configured analyzers and aggregate them
44 |     *                      (optional)
45 |     * @param saveStatesWith persist resulting states for the configured analyzers (optional)
46 |     * @param storageLevelOfGroupedDataForMultiplePasses caching level for grouped data that must
47 |     *                                                   be accessed multiple times (use
48 |     *                                                   StorageLevel.NONE to completely disable
49 |     *                                                   caching)
50 |     * @return the AnalyzerContext holding the computed metrics
51 |     */
52 |   @deprecated("Use the AnalysisRunner instead (the onData method there)", "24-09-2019")
53 |   def run(
54 |       data: DataFrame,
55 |       aggregateWith: Option[StateLoader] = None,
56 |       saveStatesWith: Option[StatePersister] = None,
57 |       storageLevelOfGroupedDataForMultiplePasses: StorageLevel = StorageLevel.MEMORY_AND_DISK)
58 |     : AnalyzerContext = {
59 | 
60 |     AnalysisRunner.doAnalysisRun(data, analyzers, aggregateWith = aggregateWith,
61 |       saveStatesWith = saveStatesWith)
62 |   }
63 | }
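A minimal sketch of how this API composes, assuming an existing DataFrame named `data` (the AnalysisRunner entry point shown here is the one the deprecation note above points to):

import com.amazon.deequ.analyzers.{Analysis, Completeness, Size}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

// compose several analyzers so that their metrics are computed together,
// in as few passes over the data as possible
val analysis = Analysis()
  .addAnalyzer(Size())
  .addAnalyzer(Completeness("productName"))

val analyzerContext = AnalysisRunner.run(data = data, analysis = analysis)

analyzerContext.metricMap.foreach { case (analyzer, metric) =>
  println(s"$analyzer: ${metric.value}")
}

64 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/StandardDeviation.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.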
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.DeequFunctions.stateful_stddev_pop 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.types.StructType 23 | import Analyzers._ 24 | 25 | case class StandardDeviationState( 26 | n: Double, 27 | avg: Double, 28 | m2: Double) 29 | extends DoubleValuedState[StandardDeviationState] { 30 | 31 | require(n > 0.0, "Standard deviation is undefined for n = 0.") 32 | 33 | override def metricValue(): Double = { 34 | math.sqrt(m2 / n) 35 | } 36 | 37 | override def sum(other: StandardDeviationState): StandardDeviationState = { 38 | val newN = n + other.n 39 | val delta = other.avg - avg 40 | val deltaN = if (newN == 0.0) 0.0 else delta / newN 41 | 42 | StandardDeviationState(newN, avg + deltaN * other.n, 43 | m2 + other.m2 + delta * deltaN * n * other.n) 44 | } 45 | } 46 | 47 | case class StandardDeviation(column: String, where: Option[String] = None) 48 | extends StandardScanShareableAnalyzer[StandardDeviationState]("StandardDeviation", column) 49 | with FilterableAnalyzer { 50 | 51 | override def aggregationFunctions(): Seq[Column] = { 52 | stateful_stddev_pop(conditionalSelection(column, where)) :: Nil 53 | } 54 | 55 | override def fromAggregationResult(result: Row, offset: Int): Option[StandardDeviationState] = { 56 | 57 | if (result.isNullAt(offset)) { 58 | None 59 | } else { 60 | val row = result.getAs[Row](offset) 61 | val n = row.getDouble(0) 62 | 63 | if (n == 0.0) { 64 | None 65 | } else { 66 | Some(StandardDeviationState(n, row.getDouble(1), row.getDouble(2))) 67 | } 68 | } 69 | } 70 | 71 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 72 | hasColumn(column) :: isNumeric(column) :: Nil 73 | } 74 | 75 | override def filterCondition: Option[String] = where 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/metrics/MetricsTests.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.metrics
18 | 
19 | import com.amazon.deequ.analyzers.DataTypeInstances
20 | import org.scalatest.{Matchers, WordSpec}
21 | 
22 | import scala.util.{Failure, Success}
23 | 
24 | 
25 | class MetricsTests extends WordSpec with Matchers {
26 |   val sampleException = new IllegalArgumentException()
27 |   "Double metric" should {
28 |     "flatten and return itself" in {
29 |       val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name", Success(50))
30 |       assert(metric.flatten() == List(metric))
31 |     }
32 | 
33 |     "flatten in case of an error" in {
34 |       val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name",
35 |         Failure(sampleException))
36 |       assert(metric.flatten() == List(metric))
37 |     }
38 |   }
39 | 
40 |   "Histogram metric" should {
41 |     "flatten matched and unmatched" in {
42 |       val distribution = Distribution(
43 |         Map("a" -> DistributionValue(6, 0.6), "b" -> DistributionValue(4, 0.4)), 2)
44 | 
45 |       val metric = HistogramMetric("instance-name", Success(distribution))
46 | 
47 |       val expected = Seq(
48 |         DoubleMetric(Entity.Column, "Histogram.bins", "instance-name", Success(2)),
49 |         DoubleMetric(Entity.Column, "Histogram.abs.a", "instance-name", Success(6)),
50 |         DoubleMetric(Entity.Column, "Histogram.abs.b", "instance-name", Success(4)),
51 |         DoubleMetric(Entity.Column, "Histogram.ratio.a", "instance-name", Success(0.6)),
52 |         DoubleMetric(Entity.Column, "Histogram.ratio.b", "instance-name", Success(0.4))
53 |       ).toSet
54 |       assert(metric.flatten().toSet == expected)
55 |     }
56 | 
57 |     "flatten matched and unmatched in case of an error" in {
58 |       val metric = HistogramMetric("instance-name", Failure(sampleException))
59 | 
60 |       val expected = Seq(DoubleMetric(Entity.Column, "Histogram.bins", "instance-name",
61 |         Failure(sampleException))).toSet
62 |       assert(metric.flatten().toSet == expected)
63 |     }
64 |   }
65 | 
66 | }
67 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/SparkMonitor.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ
18 | 
19 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart, SparkListenerStageCompleted, StageInfo}
20 | 
21 | /**
22 |  * A class representing statistics about a SparkSession.
23 |  * Currently, only the number of submitted Spark jobs and their stages are tracked.
24 | */ 25 | class SparkSessionStats { 26 | private var numberOfJobsSubmitted = 0 27 | private var stageInfos = Seq[StageInfo]() 28 | 29 | def jobCount: Int = { 30 | numberOfJobsSubmitted 31 | } 32 | 33 | def allExecutedStages: Seq[StageInfo] = { 34 | stageInfos 35 | } 36 | 37 | def recordJobStart(jobStart: SparkListenerJobStart): Unit = { 38 | numberOfJobsSubmitted += 1 39 | } 40 | 41 | def recordStageInfos(stageInfo: StageInfo): Unit = { 42 | stageInfos = stageInfos :+ stageInfo 43 | } 44 | 45 | def reset(): Unit = { 46 | numberOfJobsSubmitted = 0 47 | stageInfos = Seq[StageInfo]() 48 | } 49 | 50 | } 51 | 52 | /** 53 | * A SparkListener implementation to monitor spark jobs submitted 54 | */ 55 | class SparkMonitor extends SparkListener { 56 | val stat = new SparkSessionStats 57 | 58 | override def onJobStart(jobStart: SparkListenerJobStart) { 59 | stat.recordJobStart(jobStart) 60 | println(s"Job started with ${jobStart.stageInfos.size} stages: $jobStart " + 61 | s"details : ${jobStart.stageInfos.map(_.name)}") 62 | 63 | } 64 | 65 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 66 | stat.recordStageInfos(stageCompleted.stageInfo) 67 | println(s"Stage ${stageCompleted.stageInfo.stageId} completed with " + 68 | s"${stageCompleted.stageInfo.numTasks} tasks.") 69 | } 70 | 71 | /** 72 | * @param testFun thunk to run with SparkSessionStats as an argument. 73 | * Provides a monitoring session where the stats are being reset at the beginning 74 | * 75 | */ 76 | def withMonitoringSession(testFun: (SparkSessionStats) => Any): Any = { 77 | stat.reset 78 | testFun(stat) 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.suggestions.rules
18 | 
19 | import com.amazon.deequ.constraints.Constraint.completenessConstraint
20 | import com.amazon.deequ.profiles._
21 | import com.amazon.deequ.suggestions.ConstraintSuggestion
22 | import scala.math.BigDecimal.RoundingMode
23 | 
24 | /**
25 |  * If a column is incomplete in the sample, we model its completeness as a binomial variable,
26 |  * estimate a confidence interval, and use it to define a lower bound for the completeness.
27 |  */
28 | case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] {
29 | 
30 |   override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
31 |     profile.completeness > 0.2 && profile.completeness < 1.0
32 |   }
33 | 
34 |   override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
35 | 
36 |     val p = profile.completeness
37 |     val n = numRecords
38 |     val z = 1.96
39 | 
40 |     // TODO this needs to be more robust for values of p close to 0 or 1
41 |     val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
42 |       .setScale(2, RoundingMode.DOWN).toDouble
43 | 
44 |     val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)
45 | 
46 |     val boundInPercent = ((1.0 - targetCompleteness) * 100).toInt
47 | 
48 |     val description = s"'${profile.column}' has less than $boundInPercent% missing values"
49 | 
50 |     ConstraintSuggestion(
51 |       constraint,
52 |       profile.column,
53 |       "Completeness: " + profile.completeness.toString,
54 |       description,
55 |       this,
56 |       s""".hasCompleteness("${profile.column}", _ >= $targetCompleteness,
57 |          |  Some("It should be above $targetCompleteness!"))"""
58 |         .stripMargin.replaceAll("\n", "")
59 |     )
60 |   }
61 | 
62 |   override val ruleDescription: String = "If a column is incomplete in the sample, " +
63 |     "we model its completeness as a binomial variable, estimate a confidence interval " +
64 |     "and use this to define a lower bound for the completeness"
65 | }
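The lower bound used above is the lower end of a normal-approximation (Wald) confidence interval for a binomial proportion: for observed completeness p on n records, it is p - z * sqrt(p * (1 - p) / n), where z = 1.96 corresponds to a two-sided 95% confidence level. A minimal worked sketch with made-up numbers:

// suppose a column is 80% complete in a sample of 1000 records (hypothetical values)
val p = 0.8
val n = 1000
val z = 1.96 // two-sided 95% confidence level

val lowerBound = p - z * math.sqrt(p * (1 - p) / n)
// = 0.8 - 1.96 * 0.0126 ≈ 0.775, which the rule above rounds down to 0.77,
// so it would suggest .hasCompleteness("column", _ >= 0.77)

66 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.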
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.anomalydetection
18 | 
19 | import org.scalatest.{Matchers, WordSpec}
20 | 
21 | class SimpleThresholdStrategyTest extends WordSpec with Matchers {
22 | 
23 |   "Simple Threshold Strategy" should {
24 | 
25 |     val strategy = SimpleThresholdStrategy(upperBound = 1.0)
26 |     val data = Vector(-1.0, 2.0, 3.0, 0.5)
27 |     val expected = Seq((1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0)))
28 | 
29 |     "detect values above threshold" in {
30 |       val anomalyResult = strategy.detect(data, (0, 4))
31 | 
32 |       assert(anomalyResult == expected)
33 |     }
34 | 
35 |     "detect all values without range specified" in {
36 |       val anomalyResult = strategy.detect(data)
37 | 
38 |       assert(anomalyResult == expected)
39 |     }
40 | 
41 |     "work fine with empty input" in {
42 |       val emptySeries = Vector[Double]()
43 |       val anomalyResult = strategy.detect(emptySeries)
44 | 
45 |       assert(anomalyResult == Seq[(Int, Anomaly)]())
46 |     }
47 | 
48 |     "work with upper and lower threshold" in {
49 |       val tS = SimpleThresholdStrategy(lowerBound = -0.5, upperBound = 1.0)
50 |       val anomalyResult = tS.detect(data)
51 | 
52 |       assert(anomalyResult == Seq((0, Anomaly(Option(-1.0), 1.0)),
53 |         (1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0))))
54 |     }
55 | 
56 |     "throw an error when thresholds are not ordered" in {
57 |       intercept[IllegalArgumentException] {
58 |         val ts = SimpleThresholdStrategy(lowerBound = 2.0, upperBound = 1.0)
59 |       }
60 |     }
61 | 
62 |     "produce error message with correct value and bounds" in {
63 |       val result = strategy.detect(data)
64 | 
65 |       result.foreach { case (_, anom) =>
66 |         val (value, lowerBound, upperBound) =
67 |           AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get)
68 | 
69 |         assert(anom.value.isDefined && value === anom.value.get)
70 |         assert(value < lowerBound || value > upperBound)
71 |       }
72 |     }
73 |   }
74 | }
75 | --------------------------------------------------------------------------------
/docs/key-concepts.md:
--------------------------------------------------------------------------------
1 | # Key Concepts in the Codebase
2 | There are a few key concepts that will help you to understand the codebase.
3 | 
4 | ## Metrics, Analyzers, and State
5 | A Metric represents a measurement of the data that can change over time, for example the number of rows in a
6 | DataFrame.
7 | 
8 | An Analyzer knows how to calculate a Metric based on some input DataFrame.
9 | 
10 | State is an optimization - it represents the state of the data, from which a metric can be calculated. This
11 | intermediate state can then be used to calculate future metrics more quickly. Check out the examples for some
12 | further details.
13 | 
14 | ## Overall flow of running deequ checks
15 | When running checks, a user specifies a DataFrame and a set of checks to run on it. Many checks in Deequ
16 | are based on metrics which describe the data. To perform the checks the user requests, deequ follows this
17 | process:
18 | * First deequ figures out which Analyzers are required
19 | * Metrics are calculated using those Analyzers
20 |   * Metrics are also stored if a MetricsRepository is provided
21 |   * Intermediate state is stored if a StatePersister is provided
22 |   * Intermediate state is used for metric calculations if a StateLoader is provided
23 | * Checks are evaluated using the calculated Metrics
24 | 
25 | This design is motivated by performance: calculating the metrics together gives deequ the opportunity to
26 | compute them in fewer passes over the data.
27 | 
28 | ### Analyzers
29 | Types of analyzers:
30 | * ScanShareableAnalyzer - an analyzer which computes a metric based on a straight scan over the data, without any
31 | grouping being required
32 | * GroupingAnalyzer - an analyzer that requires the data to be grouped by a set of columns before the metric can be
33 | calculated
34 | 
35 | ### Metrics
36 | A metric includes the following key details:
37 | * name - the name for the type of metric
38 | * entity - the type of entity the metric is recorded against, e.g. a column, a dataset, or multiple columns
39 | * instance - information about this instance of the metric. For example, this could be the name of the column
40 | the metric operates on
41 | * value - the value of the metric at a point in time. The type of this value varies between metrics.
42 | 
43 | #### Metrics storage
44 | Metrics can be stored in a metrics repository. An entry in the repository consists of:
45 | * A resultKey, which is a combination of a timestamp and a map of tags. Typically a user may want to record
46 | things like the data source (e.g. the table name) with the tags. The resultKey can be used to look up stored
47 | metrics
48 | * An analyzerContext, which consists of a map of Analyzers to Metrics
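49 | 
50 | A minimal sketch of this storage flow, assuming a DataFrame `df` and a check `someCheck` (the tag and
51 | table name below are made up):
52 | ```scala
53 | import com.amazon.deequ.VerificationSuite
54 | import com.amazon.deequ.repository.ResultKey
55 | import com.amazon.deequ.repository.memory.InMemoryMetricsRepository
56 | 
57 | val repository = new InMemoryMetricsRepository()
58 | 
59 | // the tags identify where these metrics came from
60 | val resultKey = ResultKey(System.currentTimeMillis(), Map("source" -> "orders_table"))
61 | 
62 | VerificationSuite()
63 |   .onData(df)
64 |   .useRepository(repository)
65 |   .saveOrAppendResult(resultKey)
66 |   .addCheck(someCheck)
67 |   .run()
68 | 
69 | // the resultKey can later be used to look up the stored metrics
70 | val storedMetrics = repository.loadByKey(resultKey)
71 | ```
72 | 
73 | ### State
74 | Please consult the examples or the codebase for more details on State.
75 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/KLLCheckExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.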
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import ExampleUtils.{itemsAsDataframe, withSpark}
20 | import com.amazon.deequ.VerificationSuite
21 | import com.amazon.deequ.analyzers.KLLParameters
22 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
23 | import com.amazon.deequ.constraints.ConstraintStatus
24 | import org.apache.spark.sql.types.DoubleType
25 | 
26 | private[examples] object KLLCheckExample extends App {
27 | 
28 |   withSpark { session =>
29 | 
30 |     val data = itemsAsDataframe(session,
31 |       Item(1, "Thingy A", "awesome thing.", "high", 0),
32 |       Item(2, "Thingy B", "available at http://thingb.com", null, 0),
33 |       Item(3, null, null, "low", 5),
34 |       Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
35 |       Item(5, "Thingy E", null, "high", 12))
36 | 
37 |     val newData = data.select(data("numViews").cast(DoubleType).as("numViews"))
38 | 
39 |     val verificationResult = VerificationSuite()
40 |       .onData(newData)
41 |       .addCheck(
42 |         Check(CheckLevel.Error, "integrity checks")
43 |           // we expect 5 records
44 |           .hasSize(_ == 5)
45 |           // we expect the maximum of 'numViews' to be at most 10
46 |           .hasMax("numViews", _ <= 10)
47 |           // we expect the sketch size to be at least 16
48 |           .kllSketchSatisfies("numViews", _.parameters(1) >= 16,
49 |             kllParameters = Option(KLLParameters(2, 0.64, 2))))
50 |       .run()
51 | 
52 |     if (verificationResult.status == CheckStatus.Success) {
53 |       println("The data passed the test, everything is fine!")
54 |     } else {
55 |       println("We found errors in the data, the following constraints were not satisfied:\n")
56 | 
57 |       val resultsForAllConstraints = verificationResult.checkResults
58 |         .flatMap { case (_, checkResult) => checkResult.constraintResults }
59 | 
60 |       resultsForAllConstraints
61 |         .filter { _.status != ConstraintStatus.Success }
62 |         .foreach { result =>
63 |           println(s"${result.constraint} failed: ${result.message.get}")
64 |         }
65 |     }
66 | 
67 |   }
68 | }
69 | 
70 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/runners/MetricCalculationException.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers.runners 18 | 19 | abstract class MetricCalculationException(message: String) extends Exception(message) 20 | 21 | class MetricCalculationRuntimeException(message: String) 22 | extends MetricCalculationException(message) { 23 | 24 | def this(message: String, cause: Throwable) { 25 | this(message) 26 | initCause(cause) 27 | } 28 | 29 | def this(cause: Throwable) { 30 | this(Option(cause).map(_.toString).orNull, cause) 31 | } 32 | } 33 | 34 | class MetricCalculationPreconditionException(message: String) 35 | extends MetricCalculationException(message) 36 | 37 | 38 | class NoSuchColumnException(message: String) 39 | extends MetricCalculationPreconditionException(message) 40 | 41 | class WrongColumnTypeException(message: String) 42 | extends MetricCalculationPreconditionException(message) 43 | 44 | class NoColumnsSpecifiedException(message: String) 45 | extends MetricCalculationPreconditionException(message) 46 | 47 | class NumberOfSpecifiedColumnsException(message: String) 48 | extends MetricCalculationPreconditionException(message) 49 | 50 | class IllegalAnalyzerParameterException( 51 | message: String) 52 | extends MetricCalculationPreconditionException(message) 53 | 54 | class EmptyStateException(message: String) extends MetricCalculationRuntimeException(message) 55 | 56 | 57 | object MetricCalculationException { 58 | 59 | private[deequ] def getApproxQuantileIllegalParamMessage(quantile: Double): String = { 60 | "Quantile parameter must be in the closed interval [0, 1]. " + 61 | s"Currently, the value is: $quantile!" 62 | } 63 | 64 | private[deequ] def getApproxQuantileIllegalErrorParamMessage(relativeError: Double): String = { 65 | "Relative error parameter must be in the closed interval [0, 1]. " + 66 | s"Currently, the value is: $relativeError!" 67 | } 68 | 69 | def wrapIfNecessary(exception: Throwable) 70 | : MetricCalculationException = { 71 | 72 | exception match { 73 | case error: MetricCalculationException => error 74 | case error: Throwable => new MetricCalculationRuntimeException(error) 75 | } 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/IncrementalMetricsExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import ExampleUtils.{itemsAsDataframe, withSpark} 20 | import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, InMemoryStateProvider, Size} 21 | import com.amazon.deequ.analyzers.runners.AnalysisRunner 22 | 23 | private[examples] object IncrementalMetricsExample extends App { 24 | 25 | /* NOTE: Stateful support is still work in progress, and is therefore not yet integrated into 26 | VerificationSuite. 
We showcase however how to incrementally compute metrics on a growing 27 | dataset using the AnalysisRunner. */ 28 | 29 | withSpark { session => 30 | 31 | val data = itemsAsDataframe(session, 32 | Item(1, "Thingy A", "awesome thing.", "high", 0), 33 | Item(2, "Thingy B", "available tomorrow", "low", 0), 34 | Item(3, "Thing C", null, null, 5)) 35 | 36 | val moreData = itemsAsDataframe(session, 37 | Item(4, "Thingy D", null, "low", 10), 38 | Item(5, "Thingy E", null, "high", 12)) 39 | 40 | 41 | val analysis = Analysis() 42 | .addAnalyzer(Size()) 43 | .addAnalyzer(ApproxCountDistinct("id")) 44 | .addAnalyzer(Completeness("productName")) 45 | .addAnalyzer(Completeness("description")) 46 | 47 | val stateStore = InMemoryStateProvider() 48 | 49 | val metricsForData = AnalysisRunner.run( 50 | data = data, 51 | analysis = analysis, 52 | saveStatesWith = Some(stateStore) // persist the internal state of the computation 53 | ) 54 | 55 | // We update the metrics now from the stored states without having to access the previous data! 56 | val metricsAfterAddingMoreData = AnalysisRunner.run( 57 | data = moreData, 58 | analysis = analysis, 59 | aggregateWith = Some(stateStore) // continue from internal state of the computation 60 | ) 61 | 62 | println("Metrics for the first 3 records:\n") 63 | metricsForData.metricMap.foreach { case (analyzer, metric) => 64 | println(s"\t$analyzer: ${metric.value.get}") 65 | } 66 | 67 | println("\nMetrics after adding 2 more records:\n") 68 | metricsAfterAddingMoreData.metricMap.foreach { case (analyzer, metric) => 69 | println(s"\t$analyzer: ${metric.value.get}") 70 | } 71 | 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/io/DfsUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.io
18 | 
19 | import java.io.{BufferedWriter, OutputStreamWriter}
20 | 
21 | import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
22 | import org.apache.spark.sql.SparkSession
23 | 
24 | private[deequ] object DfsUtils {
25 | 
26 |   /* Helper function to read from a binary file on S3 */
27 |   def readFromFileOnDfs[T](session: SparkSession, path: String)
28 |     (readFunc: FSDataInputStream => T): T = {
29 | 
30 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
31 |     val input = fs.open(qualifiedPath)
32 | 
33 |     try {
34 |       readFunc(input)
35 |     } finally {
36 |       if (input != null) {
37 |         input.close()
38 |       }
39 |     }
40 |   }
41 | 
42 |   /* Helper function to write to a binary file on S3 */
43 |   def writeToFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
44 |     (writeFunc: FSDataOutputStream => Unit): Unit = {
45 | 
46 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
47 |     val output = fs.create(qualifiedPath, overwrite)
48 | 
49 |     try {
50 |       writeFunc(output)
51 |     } finally {
52 |       if (output != null) {
53 |         output.close()
54 |       }
55 |     }
56 |   }
57 | 
58 |   /* Helper function to write to a text file on S3 */
59 |   def writeToTextFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
60 |     (writeFunc: BufferedWriter => Unit): Unit = {
61 | 
62 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
63 |     val output = fs.create(qualifiedPath, overwrite)
64 | 
65 |     try {
66 |       val writer = new BufferedWriter(new OutputStreamWriter(output))
67 |       writeFunc(writer)
68 |       writer.close()
69 |     } finally {
70 |       if (output != null) {
71 |         output.close()
72 |       }
73 |     }
74 |   }
75 | 
76 |   /* Make sure we write to the correct filesystem, as EMR clusters also have an internal HDFS */
77 |   def asQualifiedPath(session: SparkSession, path: String): (FileSystem, Path) = {
78 |     val hdfsPath = new Path(path)
79 |     val fs = hdfsPath.getFileSystem(session.sparkContext.hadoopConfiguration)
80 |     val qualifiedPath = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
81 | 
82 |     (fs, qualifiedPath)
83 |   }
84 | 
85 | }
86 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.anomalydetection
18 | 
19 | import breeze.linalg.DenseVector
20 | 
21 | /**
22 |  * Detects anomalies based on the values' rate of change.
23 |  * The order of the difference can be set manually.
24 |  * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]].
25 |  *
26 |  * For example, RelativeRateOfChangeStrategy(Some(0.9), Some(1.1), 1)
27 |  * calculates the first discrete difference,
28 |  * and if some point's value changes by more than 10 percent in one time step,
29 |  * it flags that point as an anomaly.
30 |  *
31 |  * @param maxRateDecrease Lower bound of accepted relative change (as new value / old value).
32 |  * @param maxRateIncrease Upper bound of accepted relative change (as new value / old value).
33 |  * @param order Order of the calculated difference.
34 |  *              Set to 1, it calculates the difference between two consecutive values.
35 |  */
36 | case class RelativeRateOfChangeStrategy(
37 |     maxRateDecrease: Option[Double] = None,
38 |     maxRateIncrease: Option[Double] = None,
39 |     order: Int = 1)
40 |   extends BaseChangeStrategy {
41 | 
42 |   /**
43 |     * Calculates the rate of change with respect to the specified order.
44 |     * If the order is set to 1, the resulting value for a point at index i
45 |     * is equal to dataSeries(i) / dataSeries(i - 1).
46 |     * Note that this difference cannot be calculated for the first [[order]] elements in the vector.
47 |     * The resulting vector is therefore smaller by [[order]] elements.
48 |     *
49 |     * @param dataSeries The values contained in a DenseVector[Double]
50 |     * @param order The order of the derivative.
51 |     * @return A vector with the resulting rates of change for all values
52 |     *         except the first [[order]] elements.
53 |     */
54 |   override def diff(dataSeries: DenseVector[Double], order: Int): DenseVector[Double] = {
55 |     require(order > 0, "Order of diff cannot be zero or negative")
56 |     if (dataSeries.length == 0) {
57 |       dataSeries
58 |     } else {
59 |       val valuesRight = dataSeries.slice(order, dataSeries.length)
60 |       val valuesLeft = dataSeries.slice(0, dataSeries.length - order)
61 |       valuesRight / valuesLeft
62 |     }
63 |   }
64 | }
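A short usage sketch of the strategy above (the series values are made up):

import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy

// accept changes between -10% and +10% from one value to the next
val strategy = RelativeRateOfChangeStrategy(Some(0.9), Some(1.1))

// 12.0 -> 18.0 is a +50% change, so the point at index 2 is flagged
val anomalies = strategy.detect(Vector(11.0, 12.0, 18.0, 18.2))

65 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/KLL/KLLBenchmark.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.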
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL; 18 | 19 | import com.amazon.deequ.analyzers.QuantileNonSample; 20 | import org.apache.datasketches.kll.KllFloatsSketch; 21 | import org.openjdk.jmh.annotations.Benchmark; 22 | import org.openjdk.jmh.annotations.BenchmarkMode; 23 | import org.openjdk.jmh.annotations.Fork; 24 | import org.openjdk.jmh.annotations.Mode; 25 | import org.openjdk.jmh.annotations.OutputTimeUnit; 26 | import org.openjdk.jmh.infra.Blackhole; 27 | import org.openjdk.jmh.runner.Runner; 28 | import org.openjdk.jmh.runner.RunnerException; 29 | import org.openjdk.jmh.runner.options.Options; 30 | import org.openjdk.jmh.runner.options.OptionsBuilder; 31 | 32 | import java.util.Random; 33 | import java.util.concurrent.TimeUnit; 34 | 35 | @BenchmarkMode(Mode.AverageTime) 36 | @OutputTimeUnit(TimeUnit.MILLISECONDS) 37 | @Fork(value = 2, jvmArgs = {"-Xms2G", "-Xmx2G"}) 38 | public class KLLBenchmark { 39 | 40 | private static final int N = 10_000_000; 41 | 42 | private static float[] DATA_FOR_TESTING = createData(); 43 | 44 | public static void main(String[] args) throws RunnerException { 45 | 46 | Options opt = new OptionsBuilder() 47 | .include(KLLBenchmark.class.getSimpleName()) 48 | .forks(1) 49 | .build(); 50 | 51 | new Runner(opt).run(); 52 | } 53 | 54 | private static float[] createData() { 55 | Random prng = new Random(); 56 | float[] numbers = new float[N]; 57 | for (int i = 0; i < N; i++) { 58 | numbers[i] = prng.nextFloat(); 59 | } 60 | return numbers; 61 | } 62 | 63 | @Benchmark 64 | public void sumArray(Blackhole bh) { 65 | float sum = 0.0f; 66 | for (int i = 0; i < N; i++) { 67 | sum += DATA_FOR_TESTING[i]; 68 | } 69 | bh.consume(sum); 70 | } 71 | 72 | @Benchmark 73 | public void sketchArrayWithKLL(Blackhole bh) { 74 | QuantileNonSample sketch = KLLBenchmarkHelper.floatSketch(); 75 | for (int i = 0; i < N; i++) { 76 | sketch.update(DATA_FOR_TESTING[i]); 77 | } 78 | bh.consume(sketch); 79 | } 80 | 81 | @Benchmark 82 | public void sketchArrayWithJavaSketchesKLL(Blackhole bh) { 83 | KllFloatsSketch sketch = new KllFloatsSketch(); 84 | for (int i = 0; i < N; i++) { 85 | sketch.update(DATA_FOR_TESTING[i]); 86 | } 87 | bh.consume(sketch); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/BasicExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import ExampleUtils.{withSpark, itemsAsDataframe} 20 | import com.amazon.deequ.VerificationSuite 21 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus} 22 | import com.amazon.deequ.constraints.ConstraintStatus 23 | 24 | private[examples] object BasicExample extends App { 25 | 26 | withSpark { session => 27 | 28 | val data = itemsAsDataframe(session, 29 | Item(1, "Thingy A", "awesome thing.", "high", 0), 30 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 31 | Item(3, null, null, "low", 5), 32 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 33 | Item(5, "Thingy E", null, "high", 12)) 34 | 35 | val verificationResult = VerificationSuite() 36 | .onData(data) 37 | .addCheck( 38 | Check(CheckLevel.Error, "integrity checks") 39 | // we expect 5 records 40 | .hasSize(_ == 5) 41 | // 'id' should never be NULL 42 | .isComplete("id") 43 | // 'id' should not contain duplicates 44 | .isUnique("id") 45 | // 'productName' should never be NULL 46 | .isComplete("productName") 47 | // 'priority' should only contain the values "high" and "low" 48 | .isContainedIn("priority", Array("high", "low")) 49 | // 'numViews' should not contain negative values 50 | .isNonNegative("numViews")) 51 | .addCheck( 52 | Check(CheckLevel.Warning, "distribution checks") 53 | // at least half of the 'description's should contain a url 54 | .containsURL("description", _ >= 0.5) 55 | // half of the items should have less than 10 'numViews' 56 | .hasApproxQuantile("numViews", 0.5, _ <= 10)) 57 | .run() 58 | 59 | if (verificationResult.status == CheckStatus.Success) { 60 | println("The data passed the test, everything is fine!") 61 | } else { 62 | println("We found errors in the data, the following constraints were not satisfied:\n") 63 | 64 | val resultsForAllConstraints = verificationResult.checkResults 65 | .flatMap { case (_, checkResult) => checkResult.constraintResults } 66 | 67 | resultsForAllConstraints 68 | .filter { _.status != ConstraintStatus.Success } 69 | .foreach { result => 70 | println(s"${result.constraint} failed: ${result.message.get}") 71 | } 72 | } 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/SparkContextSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ
18 | 
19 | import org.apache.spark.SparkContext
20 | import org.apache.spark.sql.{SQLContext, SparkSession}
21 | 
22 | /**
23 |  * To be mixed into tests so that they can use a default Spark context suitable for testing
24 |  */
25 | trait SparkContextSpec {
26 | 
27 |   /**
28 |     * @param testFun thunk to run with SparkSession as an argument
29 |     */
30 |   def withSparkSession(testFun: SparkSession => Any): Unit = {
31 |     val session = setupSparkSession
32 |     try {
33 |       testFun(session)
34 |     } finally {
35 |       /* empty the cache of RDD sizes, as the referenced ids are only valid within a session */
36 |       tearDownSparkSession(session)
37 |     }
38 |   }
39 | 
40 |   /**
41 |     * @param testFun thunk to run with SparkSession and SparkMonitor as an argument for the tests
42 |     *                that would like to get details on spark jobs submitted
43 |     *
44 |     */
45 |   def withMonitorableSparkSession(testFun: (SparkSession, SparkMonitor) => Any): Unit = {
46 |     val monitor = new SparkMonitor
47 |     val session = setupSparkSession
48 |     session.sparkContext.addSparkListener(monitor)
49 |     try {
50 |       testFun(session, monitor)
51 |     } finally {
52 |       tearDownSparkSession(session)
53 |     }
54 |   }
55 | 
56 |   /**
57 |     * @param testFun thunk to run with SparkContext as an argument
58 |     */
59 |   def withSparkContext(testFun: SparkContext => Any) {
60 |     withSparkSession(session => testFun(session.sparkContext))
61 |   }
62 | 
63 |   /**
64 |     * @param testFun thunk to run with SQLContext as an argument
65 |     */
66 |   def withSparkSqlContext(testFun: SQLContext => Any) {
67 |     withSparkSession(session => testFun(session.sqlContext))
68 |   }
69 | 
70 |   /**
71 |     * Sets up a local SparkSession
72 |     *
73 |     * @return sparkSession to be used
74 |     */
75 |   private def setupSparkSession = {
76 |     val session = SparkSession.builder()
77 |       .master("local")
78 |       .appName("test")
79 |       .config("spark.ui.enabled", "false")
80 |       .config("spark.sql.shuffle.partitions", 2.toString)
81 |       .getOrCreate()
82 |     session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir"))
83 |     session
84 |   }
85 | 
86 |   /**
87 |     * Tears down the sparkSession
88 |     *
89 |     * @param session Session to be stopped
90 |     * @return
91 |     */
92 |   private def tearDownSparkSession(session: SparkSession) = {
93 |     session.stop()
94 |     System.clearProperty("spark.driver.port")
95 |   }
96 | 
97 | }
98 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
21 | 
22 | private[examples] object ConstraintSuggestionExample extends App {
23 | 
24 |   withSpark { session =>
25 | 
26 |     // Let's first generate some example data
27 |     val rows = session.sparkContext.parallelize(Seq(
28 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
29 |       RawData("thingA", "5", "DELAYED", "false"),
30 |       RawData("thingB", null, "DELAYED", null),
31 |       RawData("thingC", null, "IN_TRANSIT", "false"),
32 |       RawData("thingD", "1.0", "DELAYED", "true"),
33 |       RawData("thingC", "7.0", "UNKNOWN", null),
34 |       RawData("thingC", "24", "UNKNOWN", null),
35 |       RawData("thingE", "20", "DELAYED", "false"),
36 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
37 |       RawData("thingA", "5", "DELAYED", "false"),
38 |       RawData("thingB", null, "DELAYED", null),
39 |       RawData("thingC", null, "IN_TRANSIT", "false"),
40 |       RawData("thingD", "1.0", "DELAYED", "true"),
41 |       RawData("thingC", "17.0", "UNKNOWN", null),
42 |       RawData("thingC", "22", "UNKNOWN", null),
43 |       RawData("thingE", "23", "DELAYED", "false")
44 |     ))
45 | 
46 |     val data = session.createDataFrame(rows)
47 | 
48 |     // We ask deequ to compute constraint suggestions for us on the data.
49 |     // It will profile the data and then apply a set of rules specified in addConstraintRules()
50 |     // to suggest constraints
51 |     val suggestionResult = ConstraintSuggestionRunner()
52 |       .onData(data)
53 |       .addConstraintRules(Rules.DEFAULT)
54 |       .run()
55 | 
56 |     // We can now investigate the constraints that deequ suggested. We get a textual description
57 |     // and the corresponding scala code for each suggested constraint
58 |     //
59 |     // Note that the constraint suggestion is based on heuristic rules and assumes that the data it
60 |     // is shown is 'static' and correct, which might often not be the case in the real world.
61 |     // Therefore the suggestions should always be manually reviewed before being applied in real
62 |     // deployments.
63 |     suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
64 |       suggestions.foreach { suggestion =>
65 |         println(s"Constraint suggestion for '$column':\t${suggestion.description}\n" +
66 |           s"The corresponding scala code is ${suggestion.codeForConstraint}\n")
67 |       }
68 |     }
69 | 
70 |   }
71 | }
72 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | 20 | import com.amazon.deequ.analyzers.KLLSketch 21 | import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, StatefulApproxQuantile, StatefulHyperloglogPlus} 22 | import org.apache.spark.sql.catalyst.expressions.Literal 23 | 24 | /* Custom aggregation functions used internally by deequ */ 25 | object DeequFunctions { 26 | 27 | private[this] def withAggregateFunction( 28 | func: AggregateFunction, 29 | isDistinct: Boolean = false): Column = { 30 | 31 | Column(func.toAggregateExpression(isDistinct)) 32 | } 33 | 34 | /** Pearson correlation with state */ 35 | def stateful_corr(columnA: String, columnB: String): Column = { 36 | stateful_corr(Column(columnA), Column(columnB)) 37 | } 38 | 39 | /** Pearson correlation with state */ 40 | def stateful_corr(columnA: Column, columnB: Column): Column = withAggregateFunction { 41 | new StatefulCorrelation(columnA.expr, columnB.expr) 42 | } 43 | 44 | /** Standard deviation with state */ 45 | def stateful_stddev_pop(column: String): Column = { 46 | stateful_stddev_pop(Column(column)) 47 | } 48 | 49 | /** Standard deviation with state */ 50 | def stateful_stddev_pop(column: Column): Column = withAggregateFunction { 51 | StatefulStdDevPop(column.expr) 52 | } 53 | 54 | /** Approximate number of distinct values with state via HLL's */ 55 | def stateful_approx_count_distinct(column: String): Column = { 56 | stateful_approx_count_distinct(Column(column)) 57 | } 58 | 59 | /** Approximate number of distinct values with state via HLL's */ 60 | def stateful_approx_count_distinct(column: Column): Column = withAggregateFunction { 61 | StatefulHyperloglogPlus(column.expr) 62 | } 63 | 64 | def stateful_approx_quantile( 65 | column: Column, 66 | relativeError: Double) 67 | : Column = withAggregateFunction { 68 | 69 | StatefulApproxQuantile( 70 | column.expr, 71 | // val relativeError = 1.0D / accuracy inside StatefulApproxQuantile 72 | Literal(1.0 / relativeError), 73 | mutableAggBufferOffset = 0, 74 | inputAggBufferOffset = 0 75 | ) 76 | } 77 | 78 | /** Data type detection with state */ 79 | def stateful_datatype(column: Column): Column = { 80 | val statefulDataType = new StatefulDataType() 81 | statefulDataType(column) 82 | } 83 | 84 | def stateful_kll( 85 | column: Column, 86 | sketchSize: Int, 87 | shrinkingFactor: Double): Column = { 88 | val statefulKLL = new StatefulKLLSketch(sketchSize, shrinkingFactor) 89 | statefulKLL(column) 90 | } 91 | } 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/CategoricalRangeRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.analyzers.{DataTypeInstances, Histogram} 20 | import com.amazon.deequ.checks.Check 21 | import com.amazon.deequ.constraints.Constraint.complianceConstraint 22 | import com.amazon.deequ.profiles.ColumnProfile 23 | import com.amazon.deequ.suggestions.ConstraintSuggestion 24 | import org.apache.commons.lang3.StringEscapeUtils 25 | 26 | /** If we see a categorical range for a column, we suggest an IS IN (...) constraint */ 27 | case class CategoricalRangeRule() extends ConstraintRule[ColumnProfile] { 28 | 29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 30 | val hasHistogram = profile.histogram.isDefined && ( 31 | profile.dataType == DataTypeInstances.String || 32 | profile.dataType == DataTypeInstances.Integral 33 | ) 34 | 35 | if (hasHistogram) { 36 | val entries = profile.histogram.get.values 37 | 38 | val numUniqueElements = entries.count { case (_, value) => value.absolute == 1L } 39 | 40 | val uniqueValueRatio = numUniqueElements.toDouble / entries.size 41 | 42 | // TODO find a principled way to define this threshold... 43 | uniqueValueRatio <= 0.1 44 | } else { 45 | false 46 | } 47 | } 48 | 49 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 50 | 51 | val valuesByPopularity = profile.histogram.get.values.toArray 52 | .filterNot { case (key, _) => key == Histogram.NullFieldReplacement } 53 | .sortBy { case (_, value) => value.absolute } 54 | .reverse 55 | 56 | val categoriesSql = valuesByPopularity 57 | // the character "'" can be contained in category names 58 | .map { case (key, _) => key.replace("'", "''") } 59 | .mkString("'", "', '", "'") 60 | 61 | val categoriesCode = valuesByPopularity 62 | .map { case (key, _) => StringEscapeUtils.escapeJava(key) } 63 | .mkString(""""""", """", """", """"""") 64 | 65 | val description = s"'${profile.column}' has value range $categoriesSql" 66 | val columnCondition = s"`${profile.column}` IN ($categoriesSql)" 67 | val constraint = complianceConstraint(description, columnCondition, Check.IsOne) 68 | 69 | ConstraintSuggestion( 70 | constraint, 71 | profile.column, 72 | "Compliance: 1", 73 | description, 74 | this, 75 | s""".isContainedIn("${profile.column}", Array($categoriesCode))""" 76 | ) 77 | } 78 | 79 | override val ruleDescription: String = "If we see a categorical range for a " + 80 | "column, we suggest an IS IN (...) constraint" 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.profiles.{ColumnProfilerRunner, NumericColumnProfile}
21 | 
22 | case class RawData(productName: String, totalNumber: String, status: String, valuable: String)
23 | 
24 | private[examples] object DataProfilingExample extends App {
25 | 
26 |   withSpark { session =>
27 | 
28 |     /* We profile raw data, mostly in string format (e.g., from a csv file) */
29 |     val rows = session.sparkContext.parallelize(Seq(
30 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
31 |       RawData("thingA", "5", "DELAYED", "false"),
32 |       RawData("thingB", null, "DELAYED", null),
33 |       RawData("thingC", null, "IN_TRANSIT", "false"),
34 |       RawData("thingD", "1.0", "DELAYED", "true"),
35 |       RawData("thingC", "7.0", "UNKNOWN", null),
36 |       RawData("thingC", "20", "UNKNOWN", null),
37 |       RawData("thingE", "20", "DELAYED", "false")
38 |     ))
39 | 
40 |     val rawData = session.createDataFrame(rows)
41 | 
42 |     /* Make deequ profile this data. It will execute three passes over the data and avoid
43 |        any shuffles. */
44 |     val result = ColumnProfilerRunner()
45 |       .onData(rawData)
46 |       .run()
47 | 
48 |     /* We get a profile for each column which allows us to inspect the completeness of the column,
49 |        the approximate number of distinct values and the inferred datatype. */
50 |     result.profiles.foreach { case (columnName, profile) =>
51 | 
52 |       println(s"Column '$columnName':\n " +
53 |         s"\tcompleteness: ${profile.completeness}\n" +
54 |         s"\tapproximate number of distinct values: ${profile.approximateNumDistinctValues}\n" +
55 |         s"\tdatatype: ${profile.dataType}\n")
56 |     }
57 | 
58 |     /* For numeric columns, we get descriptive statistics */
59 |     val totalNumberProfile = result.profiles("totalNumber").asInstanceOf[NumericColumnProfile]
60 | 
61 |     println(s"Statistics of 'totalNumber':\n" +
62 |       s"\tminimum: ${totalNumberProfile.minimum.get}\n" +
63 |       s"\tmaximum: ${totalNumberProfile.maximum.get}\n" +
64 |       s"\tmean: ${totalNumberProfile.mean.get}\n" +
65 |       s"\tstandard deviation: ${totalNumberProfile.stdDev.get}\n")
66 | 
67 |     val statusProfile = result.profiles("status")
68 | 
69 |     /* For columns with a low number of distinct values, we get the full value distribution. */
70 |     println("Value distribution in 'status':")
71 |     statusProfile.histogram.foreach {
72 |       _.values.foreach { case (key, entry) =>
73 |         println(s"\t$key occurred ${entry.absolute} times (ratio is ${entry.ratio})")
74 |       }
75 |     }
76 | 
77 |   }
78 | }
79 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Distance.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | object Distance { 20 | 21 | /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */ 22 | def numericalDistance( 23 | sample1: QuantileNonSample[Double], 24 | sample2: QuantileNonSample[Double], 25 | correctForLowNumberOfSamples: Boolean = false) 26 | : Double = { 27 | val rankMap1 = sample1.getRankMap() 28 | val rankMap2 = sample2.getRankMap() 29 | val combinedKeys = rankMap1.keySet.union(rankMap2.keySet) 30 | val n = rankMap1.valuesIterator.max.toDouble 31 | val m = rankMap2.valuesIterator.max.toDouble 32 | var linfSimple = 0.0 33 | 34 | combinedKeys.foreach { key => 35 | val cdf1 = sample1.getRank(key, rankMap1) / n 36 | val cdf2 = sample2.getRank(key, rankMap2) / m 37 | val cdfDiff = Math.abs(cdf1 - cdf2) 38 | linfSimple = Math.max(linfSimple, cdfDiff) 39 | } 40 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples) 41 | } 42 | 43 | /** Calculate distance of categorical profiles based on L-Infinity Distance */ 44 | def categoricalDistance( 45 | sample1: scala.collection.mutable.Map[String, Long], 46 | sample2: scala.collection.mutable.Map[String, Long], 47 | correctForLowNumberOfSamples: Boolean = false) 48 | : Double = { 49 | 50 | var n = 0.0 51 | var m = 0.0 52 | sample1.keySet.foreach { key => 53 | n += sample1(key) 54 | } 55 | sample2.keySet.foreach { key => 56 | m += sample2(key) 57 | } 58 | val combinedKeys = sample1.keySet.union(sample2.keySet) 59 | var linfSimple = 0.0 60 | 61 | combinedKeys.foreach { key => 62 | val cdf1 = sample1.getOrElse(key, 0L) / n 63 | val cdf2 = sample2.getOrElse(key, 0L) / m 64 | val cdfDiff = Math.abs(cdf1 - cdf2) 65 | linfSimple = Math.max(linfSimple, cdfDiff) 66 | } 67 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples) 68 | } 69 | 70 | /** Select which metrics to compute (linf_simple or linf_robust) 71 | * based on whether samples are enough */ 72 | private[this] def selectMetrics( 73 | linfSimple: Double, 74 | n: Double, 75 | m: Double, 76 | correctForLowNumberOfSamples: Boolean = false) 77 | : Double = { 78 | if (correctForLowNumberOfSamples) { 79 | linfSimple 80 | } else { 81 | // This formula is based on “Two-sample Kolmogorov–Smirnov test" 82 | // Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test 83 | val linfRobust = Math.max(0.0, linfSimple - 1.8 * Math.sqrt((n + m) / (n * m))) 84 | linfRobust 85 | } 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.analyzers.{Distance, QuantileNonSample} 21 | import com.amazon.deequ.utils.FixtureSupport 22 | import org.scalatest.{Matchers, WordSpec} 23 | 24 | class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec 25 | with FixtureSupport { 26 | 27 | "KLL distance calculator should compute correct linf_simple" in { 28 | val sample1 = new QuantileNonSample[Double](4, 0.64) 29 | val sample2 = new QuantileNonSample[Double](4, 0.64) 30 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) 31 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) 32 | assert(Distance.numericalDistance(sample1, sample2, true) == 0.25) 33 | } 34 | 35 | "KLL distance calculator should compute correct linf_robust" in { 36 | val sample1 = new QuantileNonSample[Double](4, 0.64) 37 | val sample2 = new QuantileNonSample[Double](4, 0.64) 38 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) 39 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) 40 | assert(Distance.numericalDistance(sample1, sample2) == 0.0) 41 | } 42 | 43 | "Categorical distance should compute correct linf_simple" in { 44 | val sample1 = scala.collection.mutable.Map( 45 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 46 | val sample2 = scala.collection.mutable.Map( 47 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 48 | assert(Distance.categoricalDistance(sample1, 49 | sample2, true) == 0.06015037593984962) 50 | } 51 | 52 | "Categorical distance should compute correct linf_robust" in { 53 | val sample1 = scala.collection.mutable.Map( 54 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 55 | val sample2 = scala.collection.mutable.Map( 56 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 57 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0) 58 | } 59 | 60 | "Categorical distance should compute correct linf_simple with different bin value" in { 61 | val sample1 = scala.collection.mutable.Map( 62 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 63 | val sample2 = scala.collection.mutable.Map( 64 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 65 | assert(Distance.categoricalDistance(sample1, 66 | sample2, true) == 0.2857142857142857) 67 | } 68 | 69 | "Categorical distance should compute correct linf_robust with different bin value" in { 70 | val sample1 = scala.collection.mutable.Map( 71 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 72 | val sample2 = scala.collection.mutable.Map( 73 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 74 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulDataType.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import com.amazon.deequ.analyzers.DataTypeHistogram 20 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 21 | import org.apache.spark.sql.types._ 22 | 23 | import scala.util.matching.Regex 24 | 25 | 26 | private[sql] class StatefulDataType extends UserDefinedAggregateFunction { 27 | 28 | val SIZE_IN_BYTES = 40 29 | 30 | val NULL_POS = 0 31 | val FRACTIONAL_POS = 1 32 | val INTEGRAL_POS = 2 33 | val BOOLEAN_POS = 3 34 | val STRING_POS = 4 35 | 36 | val FRACTIONAL: Regex = """^(-|\+)? ?\d+((\.\d+)|((?:\.\d+)?[Ee][-+]?\d+))$""".r 37 | val INTEGRAL: Regex = """^(-|\+)? ?\d+$""".r 38 | val BOOLEAN: Regex = """^(true|false)$""".r 39 | 40 | override def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil) 41 | 42 | override def bufferSchema: StructType = StructType(StructField("null", LongType) :: 43 | StructField("fractional", LongType) :: StructField("integral", LongType) :: 44 | StructField("boolean", LongType) :: StructField("string", LongType) :: Nil) 45 | 46 | override def dataType: types.DataType = BinaryType 47 | 48 | override def deterministic: Boolean = true 49 | 50 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 51 | buffer(NULL_POS) = 0L 52 | buffer(FRACTIONAL_POS) = 0L 53 | buffer(INTEGRAL_POS) = 0L 54 | buffer(BOOLEAN_POS) = 0L 55 | buffer(STRING_POS) = 0L 56 | } 57 | 58 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 59 | if (input.isNullAt(0)) { 60 | buffer(NULL_POS) = buffer.getLong(NULL_POS) + 1L 61 | } else { 62 | input.getString(0) match { 63 | case FRACTIONAL(_*) => buffer(FRACTIONAL_POS) = buffer.getLong(FRACTIONAL_POS) + 1L 64 | case INTEGRAL(_*) => buffer(INTEGRAL_POS) = buffer.getLong(INTEGRAL_POS) + 1L 65 | case BOOLEAN(_*) => buffer(BOOLEAN_POS) = buffer.getLong(BOOLEAN_POS) + 1L 66 | case _ => buffer(STRING_POS) = buffer.getLong(STRING_POS) + 1L 67 | } 68 | } 69 | } 70 | 71 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 72 | buffer1(NULL_POS) = buffer1.getLong(NULL_POS) + buffer2.getLong(NULL_POS) 73 | buffer1(FRACTIONAL_POS) = buffer1.getLong(FRACTIONAL_POS) + buffer2.getLong(FRACTIONAL_POS) 74 | buffer1(INTEGRAL_POS) = buffer1.getLong(INTEGRAL_POS) + buffer2.getLong(INTEGRAL_POS) 75 | buffer1(BOOLEAN_POS) = buffer1.getLong(BOOLEAN_POS) + buffer2.getLong(BOOLEAN_POS) 76 | buffer1(STRING_POS) = buffer1.getLong(STRING_POS) + buffer2.getLong(STRING_POS) 77 | } 78 | 79 | override def evaluate(buffer: Row): Any = { 80 | DataTypeHistogram.toBytes(buffer.getLong(NULL_POS), buffer.getLong(FRACTIONAL_POS), 81 | buffer.getLong(INTEGRAL_POS), buffer.getLong(BOOLEAN_POS), buffer.getLong(STRING_POS)) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/analyzers/UniquenessTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. 
This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.analyzers.runners.AnalysisRunner 21 | import com.amazon.deequ.metrics.DoubleMetric 22 | import com.amazon.deequ.utils.FixtureSupport 23 | import org.apache.spark.sql.{DataFrame, SparkSession} 24 | import org.scalatest.matchers.should.Matchers 25 | import org.scalatest.wordspec.AnyWordSpec 26 | 27 | class UniquenessTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { 28 | 29 | def uniquenessSampleData(sparkSession: SparkSession): DataFrame = { 30 | import sparkSession.implicits._ 31 | 32 | // Example from https://github.com/awslabs/deequ/issues/178 33 | Seq( 34 | ("India", "Xavier House, 2nd Floor", "St. Peter Colony, Perry Road", "Bandra (West)"), 35 | ("India", "503 Godavari", "Sir Pochkhanwala Road", "Worli"), 36 | ("India", "4/4 Seema Society", "N Dutta Road, Four Bungalows", "Andheri"), 37 | ("India", "1001D Abhishek Apartments", "Juhu Versova Road", "Andheri"), 38 | ("India", "95, Hill Road", null, null), 39 | ("India", "90 Cuffe Parade", "Taj President Hotel", "Cuffe Parade"), 40 | ("India", "4, Seven PM", "Sir Pochkhanwala Rd", "Worli"), 41 | ("India", "1453 Sahar Road", null, null) 42 | ) 43 | .toDF("Country", "Address Line 1", "Address Line 2", "Address Line 3") 44 | } 45 | 46 | "Uniqueness" should { 47 | 48 | "be correct for multiple fields" in withSparkSession { session => 49 | 50 | val data = uniquenessSampleData(session) 51 | 52 | val stateStore = InMemoryStateProvider() 53 | 54 | val uniquenessA1 = Uniqueness("Address Line 1") 55 | val uniquenessA13 = Uniqueness(Seq("Address Line 1", "Address Line 2", "Address Line 3")) 56 | 57 | val analysis = Analysis(Seq(uniquenessA1, uniquenessA13)) 58 | 59 | val result = AnalysisRunner.run(data, analysis, saveStatesWith = Some(stateStore)) 60 | 61 | assert(result.metric(uniquenessA1).get.asInstanceOf[DoubleMetric].value.get == 1.0) 62 | assert(result.metric(uniquenessA13).get.asInstanceOf[DoubleMetric].value.get == 1.0) 63 | } 64 | } 65 | 66 | "Filtered Uniqueness" in withSparkSession { sparkSession => 67 | import sparkSession.implicits._ 68 | val df = Seq( 69 | ("1", "unique"), 70 | ("2", "unique"), 71 | ("3", "duplicate"), 72 | ("3", "duplicate"), 73 | ("4", "unique") 74 | ).toDF("value", "type") 75 | 76 | val stateStore = InMemoryStateProvider() 77 | 78 | val uniqueness = Uniqueness("value") 79 | val uniquenessWithFilter = Uniqueness(Seq("value"), Some("type = 'unique'")) 80 | 81 | val analysis = Analysis(Seq(uniqueness, uniquenessWithFilter)) 82 | 83 | val result = AnalysisRunner.run(df, analysis, saveStatesWith = Some(stateStore)) 84 | 85 | assert(result.metric(uniqueness).get.asInstanceOf[DoubleMetric].value.get == 0.6) 86 | assert(result.metric(uniquenessWithFilter).get.asInstanceOf[DoubleMetric].value.get == 1.0) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Correlation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). 
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import com.amazon.deequ.metrics.Entity 21 | import org.apache.spark.sql.DeequFunctions.stateful_corr 22 | import org.apache.spark.sql.{Column, Row} 23 | import org.apache.spark.sql.types.StructType 24 | import Analyzers._ 25 | 26 | case class CorrelationState( 27 | n: Double, 28 | xAvg: Double, 29 | yAvg: Double, 30 | ck: Double, 31 | xMk: Double, 32 | yMk: Double) 33 | extends DoubleValuedState[CorrelationState] { 34 | 35 | require(n > 0.0, "Correlation undefined for n = 0.") 36 | 37 | override def sum(other: CorrelationState): CorrelationState = { 38 | val n1 = n 39 | val n2 = other.n 40 | val newN = n1 + n2 41 | val dx = other.xAvg - xAvg 42 | val dxN = if (newN == 0.0) 0.0 else dx / newN 43 | val dy = other.yAvg - yAvg 44 | val dyN = if (newN == 0.0) 0.0 else dy / newN 45 | val newXAvg = xAvg + dxN * n2 46 | val newYAvg = yAvg + dyN * n2 47 | val newCk = ck + other.ck + dx * dyN * n1 * n2 48 | val newXMk = xMk + other.xMk + dx * dxN * n1 * n2 49 | val newYMk = yMk + other.yMk + dy * dyN * n1 * n2 50 | 51 | CorrelationState(newN, newXAvg, newYAvg, newCk, newXMk, newYMk) 52 | } 53 | 54 | override def metricValue(): Double = { 55 | ck / math.sqrt(xMk * yMk) 56 | } 57 | } 58 | 59 | /** 60 | * Computes the Pearson correlation coefficient between the two given columns. 61 | * @param firstColumn First input column for computation 62 | * @param secondColumn Second input column for computation 63 | * @param where Additional filter to apply before the analyzer is run 64 | */ 65 | case class Correlation( 66 | firstColumn: String, 67 | secondColumn: String, 68 | where: Option[String] = None) 69 | extends StandardScanShareableAnalyzer[CorrelationState]("Correlation", 70 | s"$firstColumn,$secondColumn", Entity.Mutlicolumn) 71 | with FilterableAnalyzer { 72 | 73 | override def aggregationFunctions(): Seq[Column] = { 74 | 75 | val firstSelection = conditionalSelection(firstColumn, where) 76 | val secondSelection = conditionalSelection(secondColumn, where) 77 | 78 | stateful_corr(firstSelection, secondSelection) :: Nil 79 | } 80 | 81 | override def fromAggregationResult(result: Row, offset: Int): Option[CorrelationState] = { 82 | 83 | if (result.isNullAt(offset)) { 84 | None 85 | } else { 86 | val row = result.getAs[Row](offset) 87 | val n = row.getDouble(0) 88 | if (n > 0.0) { 89 | Some(CorrelationState( 90 | n, 91 | row.getDouble(1), 92 | row.getDouble(2), 93 | row.getDouble(3), 94 | row.getDouble(4), 95 | row.getDouble(5))) 96 | } else { 97 | None 98 | } 99 | } 100 | } 101 | 102 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 103 | hasColumn(firstColumn) :: isNumeric(firstColumn) :: hasColumn(secondColumn) :: 104 | isNumeric(secondColumn) :: Nil 105 | } 106 | 107 | override def filterCondition: Option[String] = where 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/MetricsRepositoryExample.scala:
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import java.io.File 20 | 21 | import com.amazon.deequ.VerificationSuite 22 | import com.amazon.deequ.analyzers.Completeness 23 | import com.amazon.deequ.checks.{Check, CheckLevel} 24 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark} 25 | import com.amazon.deequ.repository.fs.FileSystemMetricsRepository 26 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey} 27 | import com.google.common.io.Files 28 | 29 | object MetricsRepositoryExample extends App { 30 | 31 | withSpark { session => 32 | 33 | // The toy data on which we will compute metrics 34 | val data = itemsAsDataframe(session, 35 | Item(1, "Thingy A", "awesome thing.", "high", 0), 36 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 37 | Item(3, null, null, "low", 5), 38 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 39 | Item(5, "Thingy E", null, "high", 12)) 40 | 41 | // A json file in which the computed metrics will be stored 42 | val metricsFile = new File(Files.createTempDir(), "metrics.json") 43 | 44 | // The repository which we will use to store and load computed metrics; we use the local disk, 45 | // but it also supports HDFS and S3 46 | val repository: MetricsRepository = 47 | FileSystemMetricsRepository(session, metricsFile.getAbsolutePath) 48 | 49 | // The key under which we store the results; it needs a timestamp and supports arbitrary 50 | // tags in the form of key-value pairs 51 | val resultKey = ResultKey(System.currentTimeMillis(), Map("tag" -> "repositoryExample")) 52 | 53 | VerificationSuite() 54 | .onData(data) 55 | // Some integrity checks 56 | .addCheck(Check(CheckLevel.Error, "integrity checks") 57 | .hasSize(_ == 5) 58 | .isComplete("id") 59 | .isComplete("productName") 60 | .isContainedIn("priority", Array("high", "low")) 61 | .isNonNegative("numViews")) 62 | // We want to store the computed metrics for the checks in our repository 63 | .useRepository(repository) 64 | .saveOrAppendResult(resultKey) 65 | .run() 66 | 67 | // We can now retrieve the metrics from the repository in different ways, e.g.: 68 | 69 | 70 | // We can load the metric for a particular analyzer stored under our result key: 71 | val completenessOfProductName = repository 72 | .loadByKey(resultKey).get 73 | .metric(Completeness("productName")).get 74 | 75 | println(s"The completeness of the productName column is: $completenessOfProductName") 76 | 77 | // We can query the repository for all metrics from the last 10 minutes and get them as json 78 | val json = repository.load() 79 | .after(System.currentTimeMillis() - 10 * 60 * 1000) 80 | .getSuccessMetricsAsJson() 81 | 82 | println(s"Metrics from the last 10 minutes:\n$json") 83 | 84 | // Finally we can also query by tag value and retrieve the result in the form of a
DataFrame 85 | repository.load() 86 | .withTagValues(Map("tag" -> "repositoryExample")) 87 | .getSuccessMetricsAsDataFrame(session) 88 | .show() 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/AnomalyDetectionExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import com.amazon.deequ.VerificationSuite 20 | import com.amazon.deequ.analyzers.Size 21 | import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy 22 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark} 23 | import com.amazon.deequ.repository.ResultKey 24 | import com.amazon.deequ.repository.memory.InMemoryMetricsRepository 25 | import com.amazon.deequ.checks.CheckStatus._ 26 | 27 | private[examples] object AnomalyDetectionExample extends App { 28 | 29 | withSpark { session => 30 | 31 | /* In this simple example, we assume that we compute metrics on a dataset every day and we want 32 | to ensure that they don't change drastically. For the sake of simplicity, we just look at the 33 | size of the data */ 34 | 35 | /* Anomaly detection operates on metrics stored in a metric repository, so let's create one */ 36 | val metricsRepository = new InMemoryMetricsRepository() 37 | 38 | /* This is the key which we use to store the metrics for the dataset from yesterday */ 39 | val yesterdaysKey = ResultKey(System.currentTimeMillis() - 24 * 60 * 60 * 1000) 40 | 41 | /* Yesterday, the data had only two rows */ 42 | val yesterdaysDataset = itemsAsDataframe(session, 43 | Item(1, "Thingy A", "awesome thing.", "high", 0), 44 | Item(2, "Thingy B", "available at http://thingb.com", null, 0)) 45 | 46 | /* We test for anomalies in the size of the data; it should not increase by more than 2x.
Note 47 | that we store the resulting metrics in our repository */ 48 | VerificationSuite() 49 | .onData(yesterdaysDataset) 50 | .useRepository(metricsRepository) 51 | .saveOrAppendResult(yesterdaysKey) 52 | .addAnomalyCheck( 53 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)), 54 | Size() 55 | ) 56 | .run() 57 | 58 | /* Today's data has five rows, so the data size more than doubled and our anomaly check should 59 | catch this */ 60 | val todaysDataset = itemsAsDataframe(session, 61 | Item(1, "Thingy A", "awesome thing.", "high", 0), 62 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 63 | Item(3, null, null, "low", 5), 64 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 65 | Item(5, "Thingy E", null, "high", 12)) 66 | 67 | /* The key for today's result */ 68 | val todaysKey = ResultKey(System.currentTimeMillis()) 69 | 70 | /* Repeat the anomaly check for today's data */ 71 | val verificationResult = VerificationSuite() 72 | .onData(todaysDataset) 73 | .useRepository(metricsRepository) 74 | .saveOrAppendResult(todaysKey) 75 | .addAnomalyCheck( 76 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)), 77 | Size() 78 | ) 79 | .run() 80 | 81 | /* Did we find an anomaly? */ 82 | if (verificationResult.status != Success) { 83 | println("Anomaly detected in the Size() metric!") 84 | 85 | /* Let's have a look at the actual metrics. */ 86 | metricsRepository 87 | .load() 88 | .forAnalyzers(Seq(Size())) 89 | .getSuccessMetricsAsDataFrame(session) 90 | .show() 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.functions.{col, lit, regexp_extract, sum, when} 23 | import org.apache.spark.sql.types.{IntegerType, StructType} 24 | 25 | import scala.util.matching.Regex 26 | 27 | /** 28 | * PatternMatch is a measure of the fraction of rows that comply with a given 29 | * column regex constraint. E.g., if the constraint is Patterns.CREDITCARD and the 30 | * data frame has 5 rows which contain a credit card number in a certain column 31 | * according to the regex and 10 rows that do not, a DoubleMetric would be 32 | * returned with 0.33 as its value 33 | * 34 | * @param column Column to do the pattern match analysis on 35 | * @param pattern The regular expression to check for 36 | * @param where Additional filter to apply before the analyzer is run.
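* * A minimal usage sketch (the column name "email", the filter, and the DataFrame df are illustrative assumptions, not part of the original file; Patterns.EMAIL is defined further below): * {{{ * // yields a DoubleMetric with the fraction of rows whose "email" value matches the pattern * val emailMatchRatio = PatternMatch("email", Patterns.EMAIL, where = Some("country = 'US'")).calculate(df) * }}}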
37 | */ 38 | case class PatternMatch(column: String, pattern: Regex, where: Option[String] = None) 39 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("PatternMatch", column) 40 | with FilterableAnalyzer { 41 | 42 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 43 | ifNoNullsIn(result, offset, howMany = 2) { _ => 44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 45 | } 46 | } 47 | 48 | override def aggregationFunctions(): Seq[Column] = { 49 | 50 | val expression = when(regexp_extract(col(column), pattern.toString(), 0) =!= lit(""), 1) 51 | .otherwise(0) 52 | 53 | val summation = sum(conditionalSelection(expression, where).cast(IntegerType)) 54 | 55 | summation :: conditionalCount(where) :: Nil 56 | } 57 | 58 | override def filterCondition: Option[String] = where 59 | 60 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 61 | hasColumn(column) :: isString(column) :: Nil 62 | } 63 | } 64 | 65 | object Patterns { 66 | 67 | // scalastyle:off 68 | // http://emailregex.com 69 | val EMAIL: Regex = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""".r 70 | 71 | // https://mathiasbynens.be/demo/url-regex stephenhay 72 | val URL: Regex = """(https?|ftp)://[^\s/$.?#].[^\s]*""".r 73 | 74 | val SOCIAL_SECURITY_NUMBER_US: Regex = """((?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}-(?!00)\d{2}-(?!0{4})\d{4})|((?!219 09 9999|078 05 1120)(?!666|000|9\d{2})\d{3} (?!00)\d{2} (?!0{4})\d{4})|((?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4})""".r 75 | 76 | // Visa, MasterCard, AMEX, Diners Club 77 | // http://www.richardsramblings.com/regex/credit-card-numbers/ 78 | val CREDITCARD: Regex = """\b(?:3[47]\d{2}([\ \-]?)\d{6}\1\d|(?:(?:4\d|5[1-5]|65)\d{2}|6011)([\ \-]?)\d{4}\2\d{4}\2)\d{4}\b""".r 79 | // scalastyle:on 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulKLLSketch.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import java.nio.ByteBuffer 20 | 21 | import com.amazon.deequ.analyzers.QuantileNonSample 22 | import com.amazon.deequ.analyzers.catalyst.KLLSketchSerializer 23 | import com.google.common.primitives.Doubles 24 | 25 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 26 | import org.apache.spark.sql.types._ 27 | 28 | 29 | private[sql] class StatefulKLLSketch( 30 | sketchSize: Int, 31 | shrinkingFactor: Double) 32 | extends UserDefinedAggregateFunction { 33 | 34 | val OBJECT_POS = 0 35 | val MIN_POS = 1 36 | val MAX_POS = 2 37 | 38 | override def inputSchema: StructType = StructType(StructField("value", DoubleType) :: Nil) 39 | 40 | override def bufferSchema: StructType = StructType(StructField("data", BinaryType) :: 41 | StructField("minimum", DoubleType) :: StructField("maximum", DoubleType) :: Nil) 42 | 43 | override def dataType: DataType = BinaryType 44 | 45 | override def deterministic: Boolean = true 46 | 47 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 48 | val qsketch = new QuantileNonSample[Double](sketchSize, shrinkingFactor) 49 | buffer(OBJECT_POS) = serialize(qsketch) 50 | buffer(MIN_POS) = Double.MaxValue // neutral element, so any observed value becomes the minimum 51 | buffer(MAX_POS) = Double.MinValue // neutral element, so any observed value becomes the maximum 52 | } 53 | 54 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 55 | if (input.isNullAt(0)) { 56 | return 57 | } 58 | 59 | val value = input.getDouble(0) 60 | val kll = deserialize(buffer.getAs[Array[Byte]](OBJECT_POS)) 61 | kll.update(value) 62 | buffer(OBJECT_POS) = serialize(kll) 63 | buffer(MIN_POS) = Math.min(buffer.getDouble(MIN_POS), value) 64 | buffer(MAX_POS) = Math.max(buffer.getDouble(MAX_POS), value) 65 | } 66 | 67 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 68 | if (buffer2.isNullAt(OBJECT_POS)) { 69 | return 70 | } 71 | 72 | val kllThis = deserialize(buffer1.getAs[Array[Byte]](OBJECT_POS)) 73 | val kllOther = deserialize(buffer2.getAs[Array[Byte]](OBJECT_POS)) 74 | val kllMerged = kllThis.merge(kllOther) 75 | buffer1(OBJECT_POS) = serialize(kllMerged) 76 | buffer1(MIN_POS) = Math.min(buffer1.getDouble(MIN_POS), buffer2.getDouble(MIN_POS)) 77 | buffer1(MAX_POS) = Math.max(buffer1.getDouble(MAX_POS), buffer2.getDouble(MAX_POS)) 78 | } 79 | 80 | override def evaluate(buffer: Row): Any = { 81 | toBytes(buffer.getDouble(MIN_POS), 82 | buffer.getDouble(MAX_POS), 83 | buffer.getAs[Array[Byte]](OBJECT_POS)) 84 | } 85 | 86 | def toBytes(min: Double, max: Double, obj: Array[Byte]): Array[Byte] = { 87 | val buffer2 = ByteBuffer.wrap(new Array(Doubles.BYTES + Doubles.BYTES + obj.length)) 88 | buffer2.putDouble(min) 89 | buffer2.putDouble(max) 90 | buffer2.put(obj) 91 | buffer2.array() 92 | } 93 | 94 | def serialize(obj: QuantileNonSample[Double]): Array[Byte] = { 95 | KLLSketchSerializer.serializer.serialize(obj) 96 | } 97 | 98 | def deserialize(bytes: Array[Byte]): QuantileNonSample[Double] = { 99 | KLLSketchSerializer.serializer.deserialize(bytes) 100 | } 101 | } 102 | 103 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community.
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/deequ/issues), or [recently closed](https://github.com/awslabs/deequ/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Please ensure that your code follows our [code conventions](https://github.com/databricks/scala-style-guide), which we adopted from Apache Spark. 35 | 4. Ensure local tests pass. 36 | 5. Commit to your fork using clear commit messages. 37 | 6. Send us a pull request, answering any default questions in the pull request interface. 38 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 39 | 40 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 41 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 42 | 43 | 44 | ## Finding contributions to work on 45 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/deequ/labels/help%20wanted) issues is a great place to start. 46 | 47 | 48 | ## Understanding the existing codebase 49 | You may find the [documentation on the key concepts](/docs/key-concepts.md) in the codebase helpful. 50 | 51 | ## Code of Conduct 52 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 53 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 54 | opensource-codeofconduct@amazon.com with any additional questions or comments. 55 | 56 | 57 | ## Security issue notifications 58 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
59 | 60 | 61 | ## Licensing 62 | 63 | See the [LICENSE](https://github.com/awslabs/deequ/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 64 | 65 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 66 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import breeze.stats.meanAndVariance 20 | 21 | 22 | /** 23 | * Detects anomalies based on the mean and standard deviation of all available values. 24 | * Assumes that the data is normally distributed. 25 | * 26 | * @param lowerDeviationFactor Detect anomalies if they are 27 | * smaller than mean - lowerDeviationFactor * stdDev 28 | * @param upperDeviationFactor Detect anomalies if they are 29 | * bigger than mean + upperDeviationFactor * stdDev 30 | * @param includeInterval Whether or not values inside the detection interval should be 31 | * included in the calculation of the mean/stdDev 32 | */ 33 | case class BatchNormalStrategy( 34 | lowerDeviationFactor: Option[Double] = Some(3.0), 35 | upperDeviationFactor: Option[Double] = Some(3.0), 36 | includeInterval: Boolean = false) extends AnomalyDetectionStrategy { 37 | 38 | require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined, 39 | "At least one factor has to be specified.") 40 | 41 | require(lowerDeviationFactor.getOrElse(1.0) >= 0 && upperDeviationFactor.getOrElse(1.0) >= 0, 42 | "Factors cannot be smaller than zero.") 43 | 44 | 45 | /** 46 | * Search for anomalies in a series of data points. 47 | * 48 | * @param dataSeries The data contained in a Vector of Doubles 49 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 50 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 51 | */ 52 | override def detect( 53 | dataSeries: Vector[Double], 54 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = { 55 | 56 | val (searchStart, searchEnd) = searchInterval 57 | 58 | require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.") 59 | 60 | require(dataSeries.nonEmpty, "Data series is empty.
Can't calculate mean/stdDev.") 61 | 62 | val searchIntervalLength = searchEnd - searchStart 63 | 64 | require(includeInterval || searchIntervalLength < dataSeries.length, 65 | "Excluding values in searchInterval from calculation but not enough values remain to " + 66 | "calculate mean and stdDev.") 67 | 68 | val mAV = if (includeInterval) { 69 | meanAndVariance(dataSeries) 70 | } else { 71 | val valuesBeforeInterval = dataSeries.slice(0, searchStart) 72 | val valuesAfterInterval = dataSeries.slice(searchEnd, dataSeries.length) 73 | val dataSeriesWithoutInterval = valuesBeforeInterval ++ valuesAfterInterval 74 | 75 | meanAndVariance(dataSeriesWithoutInterval) 76 | } 77 | 78 | val mean = mAV.mean 79 | val stdDev = mAV.stdDev 80 | 81 | val upperBound = mean + upperDeviationFactor.getOrElse(Double.MaxValue) * stdDev 82 | val lowerBound = mean - lowerDeviationFactor.getOrElse(Double.MaxValue) * stdDev 83 | 84 | dataSeries.zipWithIndex 85 | .slice(searchStart, searchEnd) 86 | .filter { case (value, _) => value > upperBound || value < lowerBound } 87 | .map { case (value, index) => 88 | 89 | val detail = Some(s"[BatchNormalStrategy]: Value $value is not in " + 90 | s"bounds [$lowerBound, $upperBound].") 91 | 92 | (index, Anomaly(Option(value), 1.0, detail)) 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MutualInformation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.metrics.{DoubleMetric, Entity} 21 | import org.apache.spark.sql.functions.{col, sum, udf} 22 | import org.apache.spark.sql.types.StructType 23 | import Analyzers.COUNT_COL 24 | import com.amazon.deequ.analyzers.runners.MetricCalculationException 25 | 26 | /** 27 | * Mutual Information describes how much information about one column can be inferred from another 28 | * column. 29 | * 30 | * If two columns are independent of each other, then nothing can be inferred from one column about 31 | * the other, and mutual information is zero. If there is a functional dependency of one column on 32 | * another and vice versa, then all information of the two columns is shared, and mutual 33 | * information is the entropy of each column.
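* * In symbols, with p(x, y) the joint relative frequency of the two columns and p(x), p(y) the marginals (all derived from the frequency counts used below), the computed value is * {{{ * MI(X, Y) = sum over all pairs (x, y) of p(x, y) * ln( p(x, y) / (p(x) * p(y)) ) * }}} * which is exactly the per-pair term evaluated in miUdf below, using the natural logarithm.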
34 | */ 35 | case class MutualInformation(columns: Seq[String], where: Option[String] = None) 36 | extends FrequencyBasedAnalyzer(columns) 37 | with FilterableAnalyzer { 38 | 39 | override def computeMetricFrom(state: Option[FrequenciesAndNumRows]): DoubleMetric = { 40 | 41 | state match { 42 | 43 | case Some(theState) => 44 | val total = theState.numRows 45 | val Seq(col1, col2) = columns 46 | 47 | val freqCol1 = s"__deequ_f1_$col1" 48 | val freqCol2 = s"__deequ_f2_$col2" 49 | 50 | val jointStats = theState.frequencies 51 | 52 | val marginalStats1 = jointStats 53 | .select(col1, COUNT_COL) 54 | .groupBy(col1) 55 | .agg(sum(COUNT_COL).as(freqCol1)) 56 | 57 | val marginalStats2 = jointStats 58 | .select(col2, COUNT_COL) 59 | .groupBy(col2) 60 | .agg(sum(COUNT_COL).as(freqCol2)) 61 | 62 | 63 | val miUdf = udf { 64 | (px: Double, py: Double, pxy: Double) => 65 | (pxy / total) * math.log((pxy / total) / ((px / total) * (py / total))) 66 | } 67 | 68 | val miCol = s"__deequ_mi_${col1}_$col2" 69 | val value = jointStats 70 | .join(marginalStats1, usingColumn = col1) 71 | .join(marginalStats2, usingColumn = col2) 72 | .withColumn(miCol, miUdf(col(freqCol1), col(freqCol2), col(COUNT_COL))) 73 | .agg(sum(miCol)) 74 | 75 | val resultRow = value.head() 76 | 77 | if (resultRow.isNullAt(0)) { 78 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 79 | } else { 80 | metricFromValue(resultRow.getDouble(0), "MutualInformation", columns.mkString(","), 81 | Entity.Mutlicolumn) 82 | } 83 | 84 | case None => 85 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 86 | } 87 | } 88 | 89 | 90 | /** We need exactly two columns, and both must exist */ 91 | override def preconditions: Seq[StructType => Unit] = { 92 | Preconditions.exactlyNColumns(columns, 2) +: super.preconditions 93 | } 94 | 95 | override def toFailureMetric(exception: Exception): DoubleMetric = { 96 | metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 97 | } 98 | 99 | override def filterCondition: Option[String] = where 100 | } 101 | 102 | object MutualInformation { 103 | def apply(columnA: String, columnB: String): MutualInformation = { 104 | new MutualInformation(columnA :: columnB :: Nil) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/KLLMetric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import com.amazon.deequ.analyzers.QuantileNonSample 20 | 21 | import scala.util.{Failure, Success, Try} 22 | import scala.util.control.Breaks._ 23 | 24 | case class BucketValue(lowValue: Double, highValue: Double, count: Long) 25 | 26 | case class BucketDistribution( 27 | buckets: List[BucketValue], 28 | parameters: List[Double], 29 | data: Array[Array[Double]]) { 30 | 31 | def computePercentiles(): Array[Double] = { 32 | 33 | val sketchSize = parameters(0).toInt 34 | val shrinkingFactor = parameters(1) 35 | 36 | val quantileNonSample = new QuantileNonSample[Double](sketchSize, shrinkingFactor) 37 | quantileNonSample.reconstruct(sketchSize, shrinkingFactor, data) 38 | 39 | quantileNonSample.quantiles(100) 40 | } 41 | 42 | /** 43 | * Get the bucket value at the given index. 44 | * @param key index of bucket 45 | * @return The corresponding BucketValue 46 | */ 47 | def apply(key: Int): BucketValue = { 48 | buckets(key) 49 | } 50 | 51 | /** 52 | * Find the index of the bucket that contains the most items. 53 | * @return The index of the bucket that contains the most items. 54 | */ 55 | def argmax: Int = { 56 | var currentMax = 0L 57 | var maxBucket = 0 58 | buckets.foreach { bucket => 59 | if (bucket.count > currentMax) { 60 | currentMax = bucket.count 61 | maxBucket = buckets.indexOf(bucket) 62 | } 63 | } 64 | maxBucket 65 | } 66 | 67 | /** 68 | * Check whether this BucketDistribution is equal to the given object. 69 | * @param obj object to compare 70 | * @return true if equal 71 | */ 72 | override def equals(obj: Any): Boolean = { 73 | obj match { 74 | case that: BucketDistribution => 75 | // the pattern match already guarantees the type, so compare the fields 76 | var check = this.buckets.equals(that.buckets) && 77 | this.parameters.equals(that.parameters) && 78 | this.data.length == that.data.length 79 | breakable { 80 | for (i <- this.data.indices) { 81 | if (!this.data(i).sameElements(that.data(i))) { 82 | check = false 83 | break 84 | } 85 | } 86 | } 87 | check 88 | case _ => false 89 | } 90 | } 91 | 92 | // Hash the same fields that equals() compares; arrays are converted so their contents are hashed 93 | override def hashCode(): Int = (buckets, parameters, data.map(_.toVector).toVector).hashCode() 94 | } 95 | 96 | case class KLLMetric(column: String, value: Try[BucketDistribution]) 97 | extends Metric[BucketDistribution] { 98 | 99 | val entity: Entity.Value = Entity.Column 100 | val instance: String = column 101 | val name = "KLL" 102 | 103 | def flatten(): Seq[DoubleMetric] = { 104 | value 105 | .map { distribution => 106 | val numberOfBuckets = Seq(DoubleMetric(entity, s"$name.buckets", instance, 107 | Success(distribution.buckets.length.toDouble))) 108 | 109 | val details = distribution.buckets 110 | .flatMap { distValue => 111 | DoubleMetric(entity, s"$name.low", instance, Success(distValue.lowValue)) :: 112 | DoubleMetric(entity, s"$name.high", instance, Success(distValue.highValue)) :: 113 | DoubleMetric(entity, s"$name.count", instance, Success(distValue.count.toDouble)) :: Nil 114 | } 115 | numberOfBuckets ++ details 116 | } 117 | .recover { 118 | case e: Exception => Seq(DoubleMetric(entity, s"$name.buckets", instance, Failure(e))) 119 | } 120 | .get 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License").
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.profiles 18 | 19 | import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters} 20 | import com.amazon.deequ.io.DfsUtils 21 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey} 22 | import org.apache.spark.annotation.Experimental 23 | import org.apache.spark.sql.{DataFrame, SparkSession} 24 | 25 | private[profiles] case class ColumnProfilerRunBuilderMetricsRepositoryOptions( 26 | metricsRepository: Option[MetricsRepository], 27 | reuseExistingResultsKey: Option[ResultKey], 28 | failIfResultsForReusingMissing: Boolean, 29 | saveOrAppendResultsKey: Option[ResultKey]) 30 | 31 | private[profiles] case class ColumnProfilerRunBuilderFileOutputOptions( 32 | session: Option[SparkSession], 33 | saveColumnProfilesJsonToPath: Option[String], 34 | overwriteResults: Boolean) 35 | 36 | @Experimental 37 | class ColumnProfilerRunner { 38 | 39 | def onData(data: DataFrame): ColumnProfilerRunBuilder = { 40 | new ColumnProfilerRunBuilder(data) 41 | } 42 | 43 | private[profiles] def run( 44 | data: DataFrame, 45 | restrictToColumns: Option[Seq[String]], 46 | lowCardinalityHistogramThreshold: Int, 47 | printStatusUpdates: Boolean, 48 | cacheInputs: Boolean, 49 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, 50 | metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions, 51 | kllProfiling: Boolean, 52 | kllParameters: Option[KLLParameters], 53 | predefinedTypes: Map[String, DataTypeInstances.Value]) 54 | : ColumnProfiles = { 55 | 56 | if (cacheInputs) { 57 | data.cache() 58 | } 59 | 60 | val columnProfiles = ColumnProfiler 61 | .profile( 62 | data, 63 | restrictToColumns, 64 | printStatusUpdates, 65 | lowCardinalityHistogramThreshold, 66 | metricsRepositoryOptions.metricsRepository, 67 | metricsRepositoryOptions.reuseExistingResultsKey, 68 | metricsRepositoryOptions.failIfResultsForReusingMissing, 69 | metricsRepositoryOptions.saveOrAppendResultsKey, 70 | kllProfiling, 71 | kllParameters, 72 | predefinedTypes 73 | ) 74 | 75 | saveColumnProfilesJsonToFileSystemIfNecessary( 76 | fileOutputOptions, 77 | printStatusUpdates, 78 | columnProfiles 79 | ) 80 | 81 | if (cacheInputs) { 82 | data.unpersist() 83 | } 84 | 85 | columnProfiles 86 | } 87 | 88 | private[this] def saveColumnProfilesJsonToFileSystemIfNecessary( 89 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, 90 | printStatusUpdates: Boolean, 91 | columnProfiles: ColumnProfiles) 92 | : Unit = { 93 | 94 | fileOutputOptions.session.foreach { session => 95 | fileOutputOptions.saveColumnProfilesJsonToPath.foreach { profilesOutput => 96 | if (printStatusUpdates) { 97 | println(s"### WRITING COLUMN PROFILES TO $profilesOutput") 98 | } 99 | 100 | DfsUtils.writeToTextFileOnDfs(session, profilesOutput, 101 | overwrite = fileOutputOptions.overwriteResults) { writer => 102 | writer.append(ColumnProfiles.toJson(columnProfiles.profiles.values.toSeq).toString) 103 | writer.newLine() 104 | } 105 | } 106 | } 107 | } 108 | } 109 | 110 | object ColumnProfilerRunner { 111 | 112 | def apply(): 
ColumnProfilerRunner = { 113 | new ColumnProfilerRunner() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestion.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions 18 | 19 | import com.amazon.deequ.VerificationResult 20 | import com.amazon.deequ.constraints.Constraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.rules.ConstraintRule 23 | import com.google.gson.{GsonBuilder, JsonArray, JsonObject} 24 | 25 | case class ConstraintSuggestion( 26 | constraint: Constraint, 27 | columnName: String, 28 | currentValue: String, 29 | description: String, 30 | suggestingRule: ConstraintRule[ColumnProfile], 31 | codeForConstraint: String 32 | ) 33 | 34 | object ConstraintSuggestions { 35 | 36 | private[this] val CONSTRAINT_SUGGESTIONS_FIELD = "constraint_suggestions" 37 | 38 | private[suggestions] def toJson(constraintSuggestions: Seq[ConstraintSuggestion]): String = { 39 | 40 | val json = new JsonObject() 41 | 42 | val constraintsJson = new JsonArray() 43 | 44 | constraintSuggestions.foreach { constraintSuggestion => 45 | 46 | val constraintJson = new JsonObject() 47 | addSharedProperties(constraintJson, constraintSuggestion) 48 | 49 | constraintsJson.add(constraintJson) 50 | } 51 | 52 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintsJson) 53 | 54 | val gson = new GsonBuilder() 55 | .setPrettyPrinting() 56 | .create() 57 | 58 | gson.toJson(json) 59 | } 60 | 61 | private[suggestions] def evaluationResultsToJson( 62 | constraintSuggestions: Seq[ConstraintSuggestion], 63 | result: VerificationResult) 64 | : String = { 65 | 66 | val constraintResults = result.checkResults 67 | .map { case (_, checkResult) => checkResult } 68 | .headOption.map { checkResult => 69 | checkResult.constraintResults 70 | } 71 | .getOrElse(Seq.empty) 72 | 73 | val json = new JsonObject() 74 | 75 | val constraintEvaluations = new JsonArray() 76 | 77 | val constraintResultsOnTestSet = constraintResults.map { checkResult => 78 | checkResult.status.toString 79 | } 80 | 81 | constraintSuggestions.zipAll(constraintResultsOnTestSet, null, "Unknown") // pad missing evaluation results with "Unknown" 82 | .foreach { case (constraintSuggestion, constraintResult) => 83 | 84 | val constraintEvaluation = new JsonObject() 85 | addSharedProperties(constraintEvaluation, constraintSuggestion) 86 | 87 | constraintEvaluation.addProperty("constraint_result_on_test_set", 88 | constraintResult) 89 | 90 | constraintEvaluations.add(constraintEvaluation) 91 | } 92 | 93 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintEvaluations) 94 | 95 | val gson = new GsonBuilder() 96 | .setPrettyPrinting() 97 | .create() 98 | 99 | gson.toJson(json) 100 | } 101 | 102 | private[this] def addSharedProperties( 103 | jsonObject: JsonObject, 104 |
constraintSuggestion: ConstraintSuggestion) 105 | : Unit = { 106 | 107 | jsonObject.addProperty("constraint_name", constraintSuggestion.constraint.toString) 108 | jsonObject.addProperty("column_name", constraintSuggestion.columnName) 109 | jsonObject.addProperty("current_value", constraintSuggestion.currentValue) 110 | jsonObject.addProperty("description", constraintSuggestion.description) 111 | jsonObject.addProperty("suggesting_rule", constraintSuggestion.suggestingRule.toString) 112 | jsonObject.addProperty("rule_description", constraintSuggestion.suggestingRule.ruleDescription) 113 | jsonObject.addProperty("code_for_constraint", constraintSuggestion.codeForConstraint) 114 | } 115 | } 116 | --------------------------------------------------------------------------------