├── NOTICE
├── .gitignore
├── .travis.yml
├── test-data
│   └── README.md
├── .github
│   └── PULL_REQUEST_TEMPLATE.md
├── CODE_OF_CONDUCT.md
├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── log4j.properties
│   │   │   └── EMRSparkShellTest.scala
│   │   └── scala
│   │       └── com
│   │           └── amazon
│   │               └── deequ
│   │                   ├── KLL
│   │                   │   ├── KLLBenchmarkHelper.scala
│   │                   │   ├── KLLBenchmark.java
│   │                   │   └── KLLDistanceTest.scala
│   │                   ├── utils
│   │                   │   ├── TempFileUtils.scala
│   │                   │   ├── CollectionUtils.scala
│   │                   │   └── AssertionUtils.scala
│   │                   ├── constraints
│   │                   │   └── ConstraintUtils.scala
│   │                   ├── examples
│   │                   │   └── ExamplesTest.scala
│   │                   ├── package.scala
│   │                   ├── checks
│   │                   │   ├── ColumnConditionTest.scala
│   │                   │   └── FilterableCheckTest.scala
│   │                   ├── anomalydetection
│   │                   │   ├── RateOfChangeStrategyTest.scala
│   │                   │   ├── AnomalyDetectionTestUtilsTest.scala
│   │                   │   ├── HistoryUtilsTest.scala
│   │                   │   ├── AnomalyDetectionTestUtils.scala
│   │                   │   └── SimpleThresholdStrategyTest.scala
│   │                   ├── DatatypeSuggestionTest.scala
│   │                   ├── SparkBasicTest.scala
│   │                   ├── analyzers
│   │                   │   ├── StatesTest.scala
│   │                   │   └── UniquenessTest.scala
│   │                   ├── metrics
│   │                   │   └── MetricsTests.scala
│   │                   ├── SparkMonitor.scala
│   │                   └── SparkContextSpec.scala
│   └── main
│       └── scala
│           └── com
│               └── amazon
│                   └── deequ
│                       ├── analyzers
│                       │   ├── FilterableAnalyzer.scala
│                       │   ├── CountDistinct.scala
│                       │   ├── catalyst
│                       │   │   ├── StatefulStdDevPop.scala
│                       │   │   ├── StatefulCorrelation.scala
│                       │   │   ├── DeequFunctions.scala
│                       │   │   ├── StatefulDataType.scala
│                       │   │   └── StatefulKLLSketch.scala
│                       │   ├── Distinctness.scala
│                       │   ├── Entropy.scala
│                       │   ├── Uniqueness.scala
│                       │   ├── Size.scala
│                       │   ├── MaxLength.scala
│                       │   ├── MinLength.scala
│                       │   ├── Sum.scala
│                       │   ├── Completeness.scala
│                       │   ├── UniqueValueRatio.scala
│                       │   ├── Maximum.scala
│                       │   ├── Minimum.scala
│                       │   ├── Mean.scala
│                       │   ├── NonSampleCompactor.scala
│                       │   ├── Compliance.scala
│                       │   ├── ApproxCountDistinct.scala
│                       │   ├── Analysis.scala
│                       │   ├── StandardDeviation.scala
│                       │   ├── runners
│                       │   │   └── MetricCalculationException.scala
│                       │   ├── Distance.scala
│                       │   ├── Correlation.scala
│                       │   ├── PatternMatch.scala
│                       │   └── MutualInformation.scala
│                       ├── examples
│                       │   ├── entities.scala
│                       │   ├── ExampleUtils.scala
│                       │   ├── KLLCheckExample.scala
│                       │   ├── IncrementalMetricsExample.scala
│                       │   ├── BasicExample.scala
│                       │   ├── ConstraintSuggestionExample.scala
│                       │   ├── DataProfilingExample.scala
│                       │   ├── MetricsRepositoryExample.scala
│                       │   └── AnomalyDetectionExample.scala
│                       ├── constraints
│                       │   └── ConstrainableDataTypes.scala
│                       ├── checks
│                       │   ├── ColumnCondition.scala
│                       │   └── CheckWithLastConstraintFilterable.scala
│                       ├── anomalydetection
│                       │   ├── RateOfChangeStrategy.scala
│                       │   ├── AnomalyDetectionStrategy.scala
│                       │   ├── AbsoluteChangeStrategy.scala
│                       │   ├── HistoryUtils.scala
│                       │   ├── DetectionResult.scala
│                       │   ├── SimpleThresholdStrategy.scala
│                       │   ├── RelativeRateOfChangeStrategy.scala
│                       │   └── BatchNormalStrategy.scala
│                       ├── suggestions
│                       │   ├── rules
│                       │   │   ├── ConstraintRule.scala
│                       │   │   ├── CompleteIfCompleteRule.scala
│                       │   │   ├── NonNegativeNumbersRule.scala
│                       │   │   ├── UniqueIfApproximatelyUniqueRule.scala
│                       │   │   ├── RetainTypeRule.scala
│                       │   │   ├── RetainCompletenessRule.scala
│                       │   │   └── CategoricalRangeRule.scala
│                       │   ├── ConstraintSuggestionResult.scala
│                       │   └── ConstraintSuggestion.scala
│                       ├── repository
│                       │   └── MetricsRepository.scala
│                       ├── metrics
│                       │   ├── HistogramMetric.scala
│                       │   ├── Metric.scala
│                       │   └── KLLMetric.scala
│                       ├── io
│                       │   └── DfsUtils.scala
│                       └── profiles
│                           └── ColumnProfilerRunner.scala
├── Makefile
├── settings.xml
├── docs
│   └── key-concepts.md
└── CONTRIBUTING.md
/NOTICE:
--------------------------------------------------------------------------------
1 | Deequ
2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.iml
3 | **/*.iml
4 | target/.travis/public-signing-key.gpg
5 | target/
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | jdk: oraclejdk8
3 | dist: trusty
4 |
5 | script: make build
6 |
--------------------------------------------------------------------------------
/test-data/README.md:
--------------------------------------------------------------------------------
1 | # Dataset used for testing
2 |
3 | * [titanic.csv](https://www.kaggle.com/c/titanic/data)
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 |
3 | *Description of changes:*
4 |
5 |
6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
7 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Change this to set Spark log level
2 | log4j.logger.org.apache.spark=WARN
3 |
4 | # Silence akka remoting
5 | log4j.logger.Remoting=WARN
6 |
7 | # Ignore messages below warning level from Jetty, because it's a bit verbose
8 | log4j.logger.org.eclipse.jetty=WARN
9 |
10 | # INFO log level not required for tests
11 | log4j.logger.org.apache=WARN
12 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # mvn profiles for the different supported
2 | # Spark and Scala versions. Uncomment
3 | # the one that you want to use. You can also
4 | # override the profile on the command line:
5 | # `make MVN_PROFILE=spark-2.4-scala-2.11 build`
6 | MVN_PROFILE := spark-3.0-scala-2.12
7 | # MVN_PROFILE := spark-2.4-scala-2.11
8 | # MVN_PROFILE := spark-2.3-scala-2.11
9 | # MVN_PROFILE := spark-2.2-scala-2.11
10 |
11 | # Build the project for specific Spark and
12 | # Scala versions. You can change the profile
13 | # variable to use a different Scala or Spark
14 | # version (see list above).
15 | # If you need more log output, remove the -q flag.
16 | build:
17 | mvn clean install -q -P $(MVN_PROFILE)
18 |
--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
1 | <settings>
2 |   <servers>
3 |     <server>
4 |       <id>ossrh</id>
5 |       <username>${env.MAVEN_REPO_USERNAME}</username>
6 |       <password>${env.MAVEN_REPO_PASSWORD}</password>
7 |     </server>
8 |   </servers>
9 |   <profiles>
10 |     <profile>
11 |       <id>release</id>
12 |       <activation>
13 |         <activeByDefault>true</activeByDefault>
14 |       </activation>
15 |       <properties>
16 |         <gpg.executable>gpg</gpg.executable>
17 |         <gpg.keyname>72A07B34207DF21F2CD468178D0084713489CE20</gpg.keyname>
18 |         <gpg.passphrase>${env.MAVEN_GPG_PASSPHRASE}</gpg.passphrase>
19 |       </properties>
20 |     </profile>
21 |   </profiles>
22 | </settings>
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/FilterableAnalyzer.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | /**
20 | * Common trait for Analyzers that support dataset filtering
21 | */
22 | trait FilterableAnalyzer {
23 | def filterCondition: Option[String]
24 | }
25 |
--------------------------------------------------------------------------------
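Any analyzer mixing in this trait surfaces the filter it was constructed with. A minimal sketch, assuming the `Completeness(column, where)` analyzer that appears later in this listing (the column name and filter are illustrative):

```scala
import com.amazon.deequ.analyzers.Completeness

// The `where` clause passed at construction is exposed via filterCondition.
val analyzer = Completeness("review_id", Some("marketplace = 'US'"))
assert(analyzer.filterCondition == Some("marketplace = 'US'"))
```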
/src/main/scala/com/amazon/deequ/examples/entities.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | private[deequ] case class Item(
20 | id: Long,
21 | productName: String,
22 | description: String,
23 | priority: String,
24 | numViews: Long
25 | )
26 |
27 | private[deequ] case class Manufacturer(
28 | id: Long,
29 | manufacturerName: String,
30 | countryCode: String
31 | )
32 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/KLL/KLLBenchmarkHelper.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.KLL
18 |
19 | import com.amazon.deequ.analyzers.{KLLSketch, QuantileNonSample}
20 |
21 | object KLLBenchmarkHelper {
22 |
23 | def floatSketch(): QuantileNonSample[java.lang.Float] = {
24 | new QuantileNonSample[java.lang.Float](KLLSketch.DEFAULT_SKETCH_SIZE,
25 | KLLSketch.DEFAULT_SHRINKING_FACTOR)
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/utils/TempFileUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.utils
18 |
19 | import java.nio.file.Files
20 | import java.util.UUID
21 |
22 | object TempFileUtils {
23 | def tempDir(prefix: String = UUID.randomUUID().toString): String = {
24 | val tempDir = Files.createTempDirectory(prefix).toFile
25 | tempDir.deleteOnExit()
26 | tempDir.getAbsolutePath
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/constraints/ConstrainableDataTypes.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.constraints
18 |
19 | object ConstrainableDataTypes extends Enumeration {
20 | val Null: Value = Value(0)
21 | val Fractional: Value = Value(1)
22 | val Integral: Value = Value(2)
23 | val Boolean: Value = Value(3)
24 | val String: Value = Value(4)
25 | val Numeric: Value = Value(5) // Union of integral and fractional
26 | }
27 |
--------------------------------------------------------------------------------
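The enumeration assigns stable integer ids, with Numeric documented as the union of Integral and Fractional. A quick sketch of the mapping:

```scala
import com.amazon.deequ.constraints.ConstrainableDataTypes

// Enumeration values can be looked up by id or iterated in declaration order.
assert(ConstrainableDataTypes(5) == ConstrainableDataTypes.Numeric)
ConstrainableDataTypes.values.foreach(v => println(s"${v.id} -> $v"))
```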
/src/test/scala/com/amazon/deequ/utils/CollectionUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.utils
18 |
19 | object CollectionUtils {
20 |
21 | implicit class SeqExtensions[A](val source: Seq[A]) {
22 | def forEachOrder(f: Seq[A] => Any): Unit = {
23 | source.combinations(source.size)
24 | .flatMap { _.permutations }
25 | .foreach { distinctOrder => f(distinctOrder) }
26 | }
27 |
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
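`forEachOrder` invokes the given function once per permutation of the sequence, which lets tests assert that a result is independent of input order. A small usage sketch:

```scala
import com.amazon.deequ.utils.CollectionUtils.SeqExtensions

// Runs the assertion for all 3! = 6 orderings of the input.
Seq(1, 2, 3).forEachOrder { ordering => assert(ordering.sum == 6) }
```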
/src/main/scala/com/amazon/deequ/checks/ColumnCondition.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.checks
18 |
19 | import org.apache.spark.sql.functions.col
20 |
21 | private[checks] object ColumnCondition {
22 |
23 | def isEachNotNull(cols: Seq[String]): String = {
24 | cols
25 | .map(col(_).isNotNull)
26 | .reduce(_ and _)
27 | .toString()
28 | }
29 |
30 | def isAnyNotNull(cols: Seq[String]): String = {
31 | cols
32 | .map(col(_).isNotNull)
33 | .reduce(_ or _)
34 | .toString()
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.constraints
18 |
19 | import org.apache.spark.sql.DataFrame
20 |
21 | object ConstraintUtils {
22 |
23 | def calculate(constraint: Constraint, df: DataFrame): ConstraintResult = {
24 |
25 | val analysisBasedConstraint = constraint match {
26 | case nc: ConstraintDecorator => nc.inner
27 | case c: Constraint => c
28 | }
29 |
30 | analysisBasedConstraint.asInstanceOf[AnalysisBasedConstraint[_, _, _]].calculateAndEvaluate(df)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/examples/ExamplesTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import org.scalatest.WordSpec
20 |
21 | class ExamplesTest extends WordSpec {
22 |
23 | "all examples" should {
24 | "run without errors" in {
25 | BasicExample.main(Array.empty)
26 | IncrementalMetricsExample.main(Array.empty)
27 | MetricsRepositoryExample.main(Array.empty)
28 | UpdateMetricsOnPartitionedDataExample.main(Array.empty)
29 | DataProfilingExample.main(Array.empty)
30 | AnomalyDetectionExample.main(Array.empty)
31 | ConstraintSuggestionExample.main(Array.empty)
32 | }
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/package.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon
18 |
19 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
20 | import org.apache.spark.sql.types.{StructField, StructType}
21 | import org.apache.spark.sql.types.{ DataType => SparkDT }
22 |
23 | package object deequ {
24 | def dataFrameWithColumn(
25 | name: String,
26 | columnType: SparkDT,
27 | sparkSession: SparkSession,
28 | values: Row*)
29 | : DataFrame = {
30 |
31 | import scala.collection.JavaConverters._
32 | val struct = StructType(StructField(name, columnType) :: Nil)
33 | sparkSession.createDataFrame(values.asJava, struct).toDF(name)
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
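A sketch of how a test might use this helper to build a single-column DataFrame (the session setup is shown only for completeness):

```scala
import com.amazon.deequ.dataFrameWithColumn
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StringType

val session = SparkSession.builder()
  .master("local")
  .appName("test")
  .getOrCreate()

// One string column named "att1" with two rows, one of them null.
val df = dataFrameWithColumn("att1", StringType, session, Row("foo"), Row(null))
df.show()
```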
/src/main/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 |
18 | package com.amazon.deequ.anomalydetection
19 |
20 | /**
21 | * Provided for backwards compatibility.
22 |  * The old [[RateOfChangeStrategy]] actually detects absolute changes,
23 |  * so it has been migrated to [[AbsoluteChangeStrategy]].
24 |  * Use [[RelativeRateOfChangeStrategy]] if you want to
25 |  * detect relative changes to the previous values.
26 | */
27 | @deprecated("use AbsoluteChangeStrategy instead which describes the strategy more accurately")
28 | case class RateOfChangeStrategy(
29 | maxRateDecrease: Option[Double] = None,
30 | maxRateIncrease: Option[Double] = None,
31 | order: Int = 1) extends BaseChangeStrategy
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | /** Interface for all strategies that spot anomalies in a series of data points. */
20 | trait AnomalyDetectionStrategy {
21 |
22 | /**
23 | * Search for anomalies in a series of data points.
24 | *
25 | * @param dataSeries The data contained in a Vector of Doubles
26 | * @param searchInterval The indices between which anomalies should be detected. [a, b).
27 | * @return The indices of all anomalies in the interval and their corresponding wrapper object.
28 | */
29 | def detect(
30 | dataSeries: Vector[Double],
31 | searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)]
32 | }
33 |
--------------------------------------------------------------------------------
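A minimal hypothetical implementation of the trait, flagging every value above a fixed bound; this is not one of deequ's shipped strategies, just an illustration of the contract:

```scala
import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionStrategy}

case class UpperBoundStrategy(bound: Double) extends AnomalyDetectionStrategy {

  override def detect(
      dataSeries: Vector[Double],
      searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] = {

    val (start, end) = searchInterval
    // Respect the half-open search interval [start, end).
    dataSeries.zipWithIndex
      .slice(start, math.min(end, dataSeries.length))
      .collect { case (value, index) if value > bound =>
        index -> Anomaly(Some(value), confidence = 1.0)
      }
  }
}
```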
/src/test/scala/com/amazon/deequ/checks/ColumnConditionTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.checks
18 |
19 |
20 | import org.scalatest.wordspec.AnyWordSpec
21 |
22 | class ColumnConditionTest extends AnyWordSpec {
23 |
24 | "ColumnCondition" should {
25 |
26 | "return the correct isEachNotNull condition" in {
27 | assert(
28 | ColumnCondition.isEachNotNull(Seq("att1", "att2", "att3")) ==
29 | "(((att1 IS NOT NULL) AND (att2 IS NOT NULL)) AND (att3 IS NOT NULL))"
30 | )
31 | }
32 |
33 | "return the correct isAnyNotNull condition" in {
34 | assert(
35 | ColumnCondition.isAnyNotNull(Seq("att1", "att2", "att3")) ==
36 | "(((att1 IS NOT NULL) OR (att2 IS NOT NULL)) OR (att3 IS NOT NULL))"
37 | )
38 | }
39 | }
40 |
41 | }
42 |
43 |
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/CountDistinct.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.metrics.DoubleMetric
20 | import org.apache.spark.sql.{Column, Row}
21 | import org.apache.spark.sql.functions.count
22 | import Analyzers._
23 |
24 | case class CountDistinct(columns: Seq[String])
25 | extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) {
26 |
27 | override def aggregationFunctions(numRows: Long): Seq[Column] = {
28 | count("*") :: Nil
29 | }
30 |
31 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
32 | toSuccessMetric(result.getLong(offset).toDouble)
33 | }
34 | }
35 |
36 | object CountDistinct {
37 | def apply(column: String): CountDistinct = {
38 | new CountDistinct(column :: Nil)
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulStdDevPop.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 | import org.apache.spark.sql.catalyst.expressions._
20 | import org.apache.spark.sql.catalyst.expressions.aggregate.CentralMomentAgg
21 | import org.apache.spark.sql.types._
22 |
23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop */
24 | private[sql] case class StatefulStdDevPop(child: Expression) extends CentralMomentAgg(child) {
25 |
26 | override protected def momentOrder = 2
27 |
28 | override def dataType: DataType = StructType(StructField("n", DoubleType) ::
29 | StructField("avg", DoubleType) :: StructField("m2", DoubleType) :: Nil)
30 |
31 | override val evaluateExpression: Expression = CreateStruct(n :: avg :: m2 :: Nil)
32 |
33 | override def prettyName: String = "stateful_stddev_pop"
34 | }
35 |
--------------------------------------------------------------------------------
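The aggregate deliberately returns the raw central-moment state (n, avg, m2) instead of the finished statistic, so partial states can be merged later; the population standard deviation is then recovered as sqrt(m2 / n). A worked example:

```scala
// For the values 1, 3, 5, 7: n = 4, avg = 4, and m2 (the sum of squared
// deviations from the mean) = 9 + 1 + 1 + 9 = 20.
val (n, m2) = (4.0, 20.0)
println(math.sqrt(m2 / n)) // 2.236... = population standard deviation
```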
/src/main/scala/com/amazon/deequ/suggestions/rules/ConstraintRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.profiles.ColumnProfile
20 | import com.amazon.deequ.suggestions._
21 |
22 | /** Abstract base class for all constraint suggestion rules */
23 | abstract class ConstraintRule[P <: ColumnProfile] {
24 |
25 | val ruleDescription: String
26 |
27 | /**
28 | * Decide whether the rule should be applied to a particular column
29 | *
30 | * @param profile profile of the column
31 | * @param numRecords overall number of records
32 | * @return
33 | */
34 | def shouldBeApplied(profile: P, numRecords: Long): Boolean
35 |
36 | /**
37 |    * Generate a suggested constraint for the column
38 | *
39 | * @param profile profile of the column
40 | * @param numRecords overall number of records
41 | * @return
42 | */
43 | def candidate(profile: P, numRecords: Long): ConstraintSuggestion
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Distinctness.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
20 | import org.apache.spark.sql.functions.{col, sum}
21 | import org.apache.spark.sql.types.DoubleType
22 | import org.apache.spark.sql.Column
23 |
24 | /**
25 | * Distinctness is the fraction of distinct values of a column(s).
26 | *
27 | * @param columns the column(s) for which to compute distinctness
28 | */
29 | case class Distinctness(columns: Seq[String], where: Option[String] = None)
30 | extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
31 | with FilterableAnalyzer {
32 |
33 | override def aggregationFunctions(numRows: Long): Seq[Column] = {
34 | (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
35 | }
36 |
37 | override def filterCondition: Option[String] = where
38 | }
39 |
40 | object Distinctness {
41 | def apply(column: String): Distinctness = {
42 | new Distinctness(column :: Nil)
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
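The aggregation above divides the number of frequency-table entries with a count of at least one by the total number of rows. The same computation by hand on a tiny column:

```scala
// Distinctness of Seq("a", "a", "b", "c"): 3 distinct values over 4 rows.
val values = Seq("a", "a", "b", "c")
println(values.distinct.size.toDouble / values.size) // 0.75
```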
/src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import org.scalatest.{Matchers, WordSpec}
20 |
21 | /**
22 | * The tested class RateOfChangeStrategy is deprecated.
23 | * This test is to ensure backwards compatibility for deequ checks that still rely on this strategy.
24 | */
25 | class RateOfChangeStrategyTest extends WordSpec with Matchers {
26 |
27 | "RateOfChange Strategy" should {
28 |
29 | val strategy = RateOfChangeStrategy(Some(-2.0), Some(2.0))
30 | val data = (for (i <- 0 to 50) yield {
31 | if (i < 20 || i > 30) {
32 | 1.0
33 | } else {
34 | if (i % 2 == 0) i else -i
35 | }
36 | }).toVector
37 |
38 | "detect all anomalies if no interval specified" in {
39 | val anomalyResult = strategy.detect(data)
40 | val expected = for (i <- 20 to 31) yield {
41 | (i, Anomaly(Option(data(i)), 1.0))
42 | }
43 | assert(anomalyResult == expected)
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Entropy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
20 | import org.apache.spark.sql.Column
21 | import org.apache.spark.sql.functions.{col, sum, udf}
22 |
23 | /**
24 | * Entropy is a measure of the level of information contained in a message. Given the probability
25 | * distribution over values in a column, it describes how many bits are required to identify a
26 | * value.
27 | */
28 | case class Entropy(column: String, where: Option[String] = None)
29 | extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
30 | with FilterableAnalyzer {
31 |
32 | override def aggregationFunctions(numRows: Long): Seq[Column] = {
33 | val summands = udf { (count: Double) =>
34 | if (count == 0.0) {
35 | 0.0
36 | } else {
37 | -(count / numRows) * math.log(count / numRows)
38 | }
39 | }
40 |
41 | sum(summands(col(COUNT_COL))) :: Nil
42 | }
43 |
44 | override def filterCondition: Option[String] = where
45 | }
46 |
--------------------------------------------------------------------------------
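Plugging a small distribution into the summand above makes the formula concrete: for two values that each occur twice in four rows, the entropy is ln 2.

```scala
// counts = (2, 2), numRows = 4: -(0.5 * ln 0.5) * 2 = ln 2 ≈ 0.6931.
val counts = Seq(2.0, 2.0)
val numRows = 4.0
val entropy = counts.map { c => -(c / numRows) * math.log(c / numRows) }.sum
println(entropy) // 0.6931...
```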
/src/main/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | /**
20 | * Detects anomalies based on the values' absolute change.
21 | * The order of the difference can be set manually.
22 | * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]].
23 | *
24 |  * AbsoluteChangeStrategy(Some(-10.0), Some(10.0), 1), for example,
25 |  * calculates the first discrete difference and flags any point whose
26 |  * value changes by more than 10.0 in one time step as an anomaly.
27 | *
28 | * @param maxRateDecrease Upper bound of accepted decrease (lower bound of increase).
29 | * @param maxRateIncrease Upper bound of accepted growth.
30 | * @param order Order of the calculated difference.
31 |  *              Set to 1, it calculates the difference between two consecutive values.
32 | */
33 | case class AbsoluteChangeStrategy(
34 | maxRateDecrease: Option[Double] = None,
35 | maxRateIncrease: Option[Double] = None,
36 | order: Int = 1) extends BaseChangeStrategy
37 |
--------------------------------------------------------------------------------
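A by-hand illustration of the first discrete difference described above: with bounds (-10.0, 10.0), both jumps in this series exceed the allowed change and would be flagged.

```scala
// 1 -> 15 changes by +14 and 15 -> 1 by -14; both fall outside (-10, 10).
val series = Vector(1.0, 1.0, 15.0, 1.0)
val firstDifference = series.zip(series.tail).map { case (a, b) => b - a }
println(firstDifference) // Vector(0.0, 14.0, -14.0)
```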
/src/main/scala/com/amazon/deequ/examples/ExampleUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import org.apache.spark.sql.{DataFrame, SparkSession}
20 |
21 | private[deequ] object ExampleUtils {
22 |
23 | def withSpark(func: SparkSession => Unit): Unit = {
24 | val session = SparkSession.builder()
25 | .master("local")
26 | .appName("test")
27 | .config("spark.ui.enabled", "false")
28 | .getOrCreate()
29 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir"))
30 |
31 | try {
32 | func(session)
33 | } finally {
34 | session.stop()
35 | System.clearProperty("spark.driver.port")
36 | }
37 | }
38 |
39 | def itemsAsDataframe(session: SparkSession, items: Item*): DataFrame = {
40 | val rdd = session.sparkContext.parallelize(items)
41 | session.createDataFrame(rdd)
42 | }
43 |
44 | def manufacturersAsDataframe(session: SparkSession, manufacturers: Manufacturer*): DataFrame = {
45 | val rdd = session.sparkContext.parallelize(manufacturers)
46 | session.createDataFrame(rdd)
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
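A sketch of how the examples in this package drive the helper; since ExampleUtils and Item are private[deequ], this would live in the same package (the item values are illustrative):

```scala
package com.amazon.deequ.examples

import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark}

object ExampleUtilsSketch {
  def main(args: Array[String]): Unit = {
    withSpark { session =>
      // Builds a DataFrame from the Item case class defined in entities.scala.
      itemsAsDataframe(session,
        Item(1, "Thingy A", "awesome thing.", "high", 0)).show()
    }
  }
}
```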
/src/main/scala/com/amazon/deequ/analyzers/Uniqueness.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
20 | import org.apache.spark.sql.Column
21 | import org.apache.spark.sql.functions.{col, lit, sum}
22 | import org.apache.spark.sql.types.DoubleType
23 |
24 | /** Uniqueness is the fraction of unique values of a column(s), i.e.,
25 | * values that occur exactly once. */
26 | case class Uniqueness(columns: Seq[String], where: Option[String] = None)
27 | extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
28 | with FilterableAnalyzer {
29 |
30 | override def aggregationFunctions(numRows: Long): Seq[Column] = {
31 | (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
32 | }
33 |
34 | override def filterCondition: Option[String] = where
35 | }
36 |
37 | object Uniqueness {
38 | def apply(column: String): Uniqueness = {
39 | new Uniqueness(column :: Nil)
40 | }
41 |
42 | def apply(column: String, where: Option[String]): Uniqueness = {
43 | new Uniqueness(column :: Nil, where)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
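The same metric by hand: the fraction of values that occur exactly once (contrast with distinctness, which counts values occurring at least once):

```scala
// "b" and "c" occur exactly once, so uniqueness = 2 / 4 = 0.5;
// distinctness of the same column would be 3 / 4 = 0.75.
val values = Seq("a", "a", "b", "c")
val uniqueness = values.groupBy(identity)
  .count { case (_, occurrences) => occurrences.size == 1 }
  .toDouble / values.size
println(uniqueness) // 0.5
```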
/src/main/scala/com/amazon/deequ/analyzers/Size.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.metrics.Entity
20 | import org.apache.spark.sql.{Column, Row}
21 | import Analyzers._
22 |
23 | case class NumMatches(numMatches: Long) extends DoubleValuedState[NumMatches] {
24 |
25 | override def sum(other: NumMatches): NumMatches = {
26 | NumMatches(numMatches + other.numMatches)
27 | }
28 |
29 | override def metricValue(): Double = {
30 | numMatches.toDouble
31 | }
32 |
33 | }
34 |
35 | /** Size is the number of rows in a DataFrame. */
36 | case class Size(where: Option[String] = None)
37 | extends StandardScanShareableAnalyzer[NumMatches]("Size", "*", Entity.Dataset)
38 | with FilterableAnalyzer {
39 |
40 | override def aggregationFunctions(): Seq[Column] = {
41 | conditionalCount(where) :: Nil
42 | }
43 |
44 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatches] = {
45 | ifNoNullsIn(result, offset) { _ =>
46 | NumMatches(result.getLong(offset))
47 | }
48 | }
49 |
50 | override def filterCondition: Option[String] = where
51 | }
52 |
--------------------------------------------------------------------------------
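The DoubleValuedState here is additive, which is what enables deequ's incremental computation: states computed on separate partitions merge via sum. A tiny sketch:

```scala
import com.amazon.deequ.analyzers.NumMatches

// The size of two partitions is the sum of their per-partition states.
val merged = NumMatches(3L).sum(NumMatches(4L))
println(merged.metricValue()) // 7.0
```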
/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers._
20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
21 | import org.apache.spark.sql.functions.{length, max}
22 | import org.apache.spark.sql.types.{DoubleType, StructType}
23 | import org.apache.spark.sql.{Column, Row}
24 |
25 | case class MaxLength(column: String, where: Option[String] = None)
26 | extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column)
27 | with FilterableAnalyzer {
28 |
29 | override def aggregationFunctions(): Seq[Column] = {
30 | max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
31 | }
32 |
33 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {
34 | ifNoNullsIn(result, offset) { _ =>
35 | MaxState(result.getDouble(offset))
36 | }
37 | }
38 |
39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
40 | hasColumn(column):: isString(column) :: Nil
41 | }
42 |
43 | override def filterCondition: Option[String] = where
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/DatatypeSuggestionTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ
18 |
19 | import com.amazon.deequ.profiles.{ColumnProfiler, ColumnProfiles, StandardColumnProfile}
20 | import com.amazon.deequ.utils.FixtureSupport
21 | import org.apache.spark.sql.{DataFrame, SparkSession}
22 | import org.scalamock.scalatest.MockFactory
23 | import org.scalatest.{Matchers, WordSpec}
24 |
25 | class DatatypeSuggestionTest extends WordSpec with Matchers with SparkContextSpec
26 |   with FixtureSupport with MockFactory {
27 |
28 | "Column Profiler" should {
29 | "return the correct datatype(String) in case of profiling empty string columns" in
30 | withSparkSession { sparkSession =>
31 |
32 | val df = getEmptyColumnDataDf(sparkSession = sparkSession)
33 |
34 | val profile = ColumnProfiler
35 | .profile(df, Option(Seq("att1")))
36 | .profiles("att1")
37 |
38 | assert(profile.isInstanceOf[StandardColumnProfile])
39 | assert(profile.isDataTypeInferred && profile.dataType.toString.equalsIgnoreCase("String"))
40 | }
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers._
20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
21 | import org.apache.spark.sql.functions.{length, min}
22 | import org.apache.spark.sql.types.{DoubleType, StructType}
23 | import org.apache.spark.sql.{Column, Row}
24 |
25 | case class MinLength(column: String, where: Option[String] = None)
26 | extends StandardScanShareableAnalyzer[MinState]("MinLength", column)
27 | with FilterableAnalyzer {
28 |
29 | override def aggregationFunctions(): Seq[Column] = {
30 | min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
31 | }
32 |
33 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {
34 | ifNoNullsIn(result, offset) { _ =>
35 | MinState(result.getDouble(offset))
36 | }
37 | }
38 |
39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
40 | hasColumn(column) :: isString(column) :: Nil
41 | }
42 |
43 | override def filterCondition: Option[String] = where
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/HistoryUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import com.amazon.deequ.metrics.Metric
20 |
21 | /**
22 | * Contains utility methods to convert tuples of date and metric to a DataPoint
23 | */
24 | private[deequ] object HistoryUtils {
25 |
26 | /**
27 |    * Given a sequence of dated optional metrics, return a sequence of dated optional metric values.
28 | *
29 | * @param metrics Sequence of dated optional metrics
30 | * @tparam M Type of the metric value
31 | * @return Sequence of dated optional metric values
32 | */
33 | def extractMetricValues[M](metrics: Seq[(Long, Option[Metric[M]])]): Seq[DataPoint[M]] = {
34 | metrics.map { case (date, metric) => DataPoint(date, extractMetricValue[M](metric)) }
35 | }
36 |
37 | /**
38 |    * Given an optional metric, returns an optional metric value.
39 | *
40 | * @param metric Optional metric
41 | * @tparam M Type of the metric value
42 | * @return Optional metric value
43 | */
44 | def extractMetricValue[M](metric: Option[Metric[M]]): Option[M] = {
45 | metric.flatMap(_.value.toOption)
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
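A sketch of the conversion, assuming deequ's DoubleMetric constructor and the DataPoint type referenced above; since HistoryUtils is private[deequ], this would live inside that package:

```scala
package com.amazon.deequ.anomalydetection

import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import scala.util.Success

object HistoryUtilsSketch {
  def main(args: Array[String]): Unit = {
    val metric = DoubleMetric(Entity.Column, "Completeness", "att1", Success(1.0))
    // The absent second metric becomes a DataPoint with value None.
    println(HistoryUtils.extractMetricValues[Double](
      Seq(1L -> Option(metric), 2L -> None)))
  }
}
```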
/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtilsTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import org.scalatest.{Matchers, WordSpec}
20 |
21 | class AnomalyDetectionTestUtilsTest extends WordSpec with Matchers {
22 |
23 | "AnomalyDetectionTestUtilsTest" should {
24 |
25 | "throw an exception if no value found" in {
26 | intercept[IllegalArgumentException] {
27 | AnomalyDetectionTestUtils.firstDoubleFromString("noNumber")
28 | }
29 | intercept[IllegalArgumentException] {
30 | AnomalyDetectionTestUtils.firstThreeDoublesFromString("noNumber")
31 | }
32 | }
33 |
34 | "find first value" in {
35 | val str = "xx3.141yyu4.2"
36 | val value = AnomalyDetectionTestUtils.firstDoubleFromString(str)
37 | assert(value == 3.141)
38 | }
39 |
40 | "find all 3 values" in {
41 | val str = "In this 1 string are 3.000 values, not 42.01"
42 |
43 | val (first, second, third) = AnomalyDetectionTestUtils.firstThreeDoublesFromString(str)
44 | assert(first === 1)
45 | assert(second === 3.0)
46 | assert(third === 42.01)
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/utils/AssertionUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.utils
18 |
19 | import scala.util.{Failure, Success, Try}
20 |
21 | object AssertionUtils {
22 |
23 | implicit class TryUtils[A](something: Try[A]) {
24 | def compare[B](other: Try[B]): Boolean = {
25 | (something, other) match {
26 | case (Success(a), Success(b)) => a == b
27 | case (Failure(a), Failure(b)) => a.getClass == b.getClass && (a.getMessage == b.getMessage)
28 | case (_, _) => false
29 | }
30 | }
31 | def compareFailureTypes[B](other: Try[B]): Boolean = {
32 | (something, other) match {
33 | case (Failure(a), Failure(b)) => a.getClass == b.getClass
34 | case (_, _) => false
35 | }
36 | }
37 | def compareOuterAndInnerFailureTypes[B](other: Try[B]): Boolean = {
38 | (something, other) match {
39 | case (Failure(a: Throwable), Failure(b: Throwable))
40 | if (a.getCause != null) && (b.getCause != null) =>
41 | (a.getClass == b.getClass) && (a.getCause.getClass == b.getCause.getClass)
42 | case (_, _) => false
43 | }
44 | }
45 |
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
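Usage sketch for the Try comparisons above: successes compare by value, failures by exception class (and message for compare; class only for compareFailureTypes):

```scala
import com.amazon.deequ.utils.AssertionUtils.TryUtils

import scala.util.{Failure, Success, Try}

assert(Try(21 * 2).compare(Success(42)))
assert(Failure(new IllegalStateException("a"))
  .compareFailureTypes(Failure(new IllegalStateException("b"))))
```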
/src/main/scala/com/amazon/deequ/suggestions/rules/CompleteIfCompleteRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.checks.Check
20 | import com.amazon.deequ.constraints.Constraint.completenessConstraint
21 | import com.amazon.deequ.profiles.ColumnProfile
22 | import com.amazon.deequ.suggestions.ConstraintSuggestion
23 |
24 | /** If a column is complete in the sample, we suggest a NOT NULL constraint */
25 | case class CompleteIfCompleteRule() extends ConstraintRule[ColumnProfile] {
26 |
27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
28 | profile.completeness == 1.0
29 | }
30 |
31 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
32 |
33 | val constraint = completenessConstraint(profile.column, Check.IsOne)
34 |
35 | ConstraintSuggestion(
36 | constraint,
37 | profile.column,
38 | "Completeness: " + profile.completeness.toString,
39 | s"'${profile.column}' is not null",
40 | this,
41 | s""".isComplete("${profile.column}")"""
42 | )
43 | }
44 |
45 | override val ruleDescription: String = "If a column is complete in the sample, " +
46 | "we suggest a NOT NULL constraint"
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/DetectionResult.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | class Anomaly(
20 | val value: Option[Double],
21 | val confidence: Double,
22 | val detail: Option[String] = None) {
23 |
24 | def canEqual(that: Any): Boolean = {
25 | that.isInstanceOf[Anomaly]
26 | }
27 |
28 | /**
29 | * Tests anomalies for equality. Ignores detailed explanation.
30 | *
31 | * @param obj The object/ anomaly to compare against
32 | * @return true, if and only if the value and confidence are the same
33 | */
34 | override def equals(obj: Any): Boolean = {
35 | obj match {
36 | case anomaly: Anomaly => anomaly.value == value && anomaly.confidence == confidence
37 | case _ => false
38 | }
39 | }
40 |
41 | override def hashCode: Int = {
42 | val prime = 31
43 | var result = 1
44 | result = prime * result + (if (value == null) 0 else value.hashCode)
45 | prime * result + confidence.hashCode
46 | }
47 |
48 | }
49 |
50 | object Anomaly {
51 | def apply(value: Option[Double], confidence: Double, detail: Option[String] = None): Anomaly = {
52 | new Anomaly(value, confidence, detail)
53 | }
54 | }
55 |
56 | case class DetectionResult(anomalies: Seq[(Long, Anomaly)] = Seq.empty)
57 |
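58 | // Illustrative sketch (not part of the original file): anomalies compare on value and
59 | // confidence only, so differing `detail` strings do not affect equality.
60 | //
61 | //   val a = Anomaly(Some(3.0), 1.0, Some("first explanation"))
62 | //   val b = Anomaly(Some(3.0), 1.0, Some("second explanation"))
63 | //   a == b                                  // true, the detail field is ignored
64 | //   DetectionResult(Seq(0L -> a, 7L -> b))  // anomalies keyed by their time index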
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Sum.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import org.apache.spark.sql.functions.sum
21 | import org.apache.spark.sql.types.{DoubleType, StructType}
22 | import org.apache.spark.sql.{Column, Row}
23 | import Analyzers._
24 |
25 | case class SumState(sum: Double) extends DoubleValuedState[SumState] {
26 |
27 | override def sum(other: SumState): SumState = {
28 | SumState(sum + other.sum)
29 | }
30 |
31 | override def metricValue(): Double = {
32 | sum
33 | }
34 | }
35 |
36 | case class Sum(column: String, where: Option[String] = None)
37 | extends StandardScanShareableAnalyzer[SumState]("Sum", column)
38 | with FilterableAnalyzer {
39 |
40 | override def aggregationFunctions(): Seq[Column] = {
41 | sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil
42 | }
43 |
44 | override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = {
45 | ifNoNullsIn(result, offset) { _ =>
46 | SumState(result.getDouble(offset))
47 | }
48 | }
49 |
50 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
51 | hasColumn(column) :: isNumeric(column) :: Nil
52 | }
53 |
54 | override def filterCondition: Option[String] = where
55 | }
56 |
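57 | // Usage sketch (illustrative, not part of the original file; assumes a SparkSession
58 | // and a DataFrame `df` with a numeric `price` column). Sum is typically executed via
59 | // the AnalysisRunner:
60 | //
61 | //   import com.amazon.deequ.analyzers.runners.AnalysisRunner
62 | //
63 | //   val context = AnalysisRunner
64 | //     .onData(df)
65 | //     .addAnalyzer(Sum("price", Some("price > 0")))  // the where-filter is optional
66 | //     .run()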
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested}
20 | import org.apache.spark.sql.functions.sum
21 | import org.apache.spark.sql.types.{IntegerType, StructType}
22 | import Analyzers._
23 | import org.apache.spark.sql.{Column, Row}
24 |
25 | /** Completeness is the fraction of non-null values in a column of a DataFrame. */
26 | case class Completeness(column: String, where: Option[String] = None) extends
27 | StandardScanShareableAnalyzer[NumMatchesAndCount]("Completeness", column) with
28 | FilterableAnalyzer {
29 |
30 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = {
31 |
32 | ifNoNullsIn(result, offset, howMany = 2) { _ =>
33 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1))
34 | }
35 | }
36 |
37 | override def aggregationFunctions(): Seq[Column] = {
38 |
39 | val summation = sum(conditionalSelection(column, where).isNotNull.cast(IntegerType))
40 |
41 | summation :: conditionalCount(where) :: Nil
42 | }
43 |
44 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
45 | hasColumn(column) :: isNotNested(column) :: Nil
46 | }
47 |
48 | override def filterCondition: Option[String] = where
49 | }
50 |
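51 | // Worked example (illustrative numbers, not part of the original file): the two
52 | // aggregation results feed NumMatchesAndCount. For a column with 8 non-null values
53 | // in 10 rows, fromAggregationResult yields NumMatchesAndCount(8, 10), which maps to
54 | // a completeness metric of 0.8.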
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/checks/CheckWithLastConstraintFilterable.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.checks
18 |
19 | import com.amazon.deequ.constraints.Constraint
20 |
21 | /** Allows replacing the last configured constraint in a check with a filtered version */
22 | class CheckWithLastConstraintFilterable(
23 | level: CheckLevel.Value,
24 | description: String,
25 | constraints: Seq[Constraint],
26 | createReplacement: Option[String] => Constraint)
27 | extends Check(level, description, constraints) {
28 |
29 | /**
30 | * Defines a filter to apply before evaluating the previous constraint
31 | *
32 | * @param filter SparkSQL predicate to apply
33 | * @return
34 | */
35 | def where(filter: String): Check = {
36 |
37 | val adjustedConstraints =
38 | constraints.take(constraints.size - 1) :+ createReplacement(Option(filter))
39 |
40 | Check(level, description, adjustedConstraints)
41 | }
42 | }
43 |
44 | object CheckWithLastConstraintFilterable {
45 | def apply(
46 | level: CheckLevel.Value,
47 | description: String,
48 | constraints: Seq[Constraint],
49 | createReplacement: Option[String] => Constraint
50 | ): CheckWithLastConstraintFilterable = {
51 |
52 | new CheckWithLastConstraintFilterable(level, description, constraints, createReplacement)
53 | }
54 | }
55 |
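56 | // Usage sketch (column and filter values are illustrative): `where` replaces only the
57 | // constraint added immediately before it, leaving earlier constraints untouched.
58 | //
59 | //   val check = Check(CheckLevel.Error, "someCheck")
60 | //     .isComplete("col1")                             // unfiltered
61 | //     .isComplete("col2").where("marketplace = 'EU'") // filtered variant for col2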
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/repository/MetricsRepository.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.repository
18 |
19 | import com.amazon.deequ.analyzers.runners.AnalyzerContext
20 |
21 | /**
22 | * Common trait for RepositoryIndexes where deequ runs can be stored.
23 |  * A repository provides methods to store AnalysisResults (metrics) and VerificationResults (if any).
24 | */
25 | trait MetricsRepository {
26 |
27 | /**
28 | * Saves Analysis results (metrics)
29 | *
30 |    * @param resultKey A ResultKey that uniquely identifies an AnalysisResult
31 | * @param analyzerContext The resulting AnalyzerContext of an Analysis
32 | */
33 | def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit
34 |
35 | /**
36 |    * Get an AnalyzerContext saved using exactly the same resultKey, if present
37 | */
38 | def loadByKey(resultKey: ResultKey): Option[AnalyzerContext]
39 |
40 | /** Get a builder class to construct a loading query to get AnalysisResults */
41 | def load(): MetricsRepositoryMultipleResultsLoader
42 |
43 | }
44 |
45 | /**
46 |  * Information that uniquely identifies an AnalysisResult
47 | *
48 | * @param dataSetDate A date related to the AnalysisResult
49 | * @param tags A map with additional annotations
50 | */
51 | case class ResultKey(dataSetDate: Long, tags: Map[String, String] = Map.empty)
52 |
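53 | // Usage sketch (illustrative; assumes some concrete MetricsRepository implementation
54 | // and an `analyzerContext` obtained from a previous analysis run):
55 | //
56 | //   val key = ResultKey(System.currentTimeMillis(), Map("dataset" -> "orders"))
57 | //   repository.save(key, analyzerContext)
58 | //   repository.loadByKey(key)  // Some(analyzerContext) when the exact key is present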
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/SparkBasicTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ
18 |
19 | import org.scalatest.{Matchers, WordSpec}
20 |
21 | class SparkBasicTest extends WordSpec with Matchers with SparkContextSpec {
22 | "check that initializing a spark context and a basic example works" in
23 | withSparkSession { sparkSession =>
24 | val sc = sparkSession.sparkContext
25 | val xs = sc.parallelize(1 to 100)
26 | val res = xs.sum()
27 | res should be(5050)
28 | }
29 |
30 | "check that monitoring spark session works" in
31 | withMonitorableSparkSession { (sparkSession, sparkMonitor) =>
32 | val sc = sparkSession.sparkContext
33 | val xs = sc.parallelize(1 to 100)
34 |
35 |
36 | (1 to 2).foreach { index =>
37 | val res = sparkMonitor.withMonitoringSession { stat =>
38 | val sum = xs.map(_ * index).sum()
39 | // Spark jobs are running in different monitoring sessions
40 | assert(stat.jobCount == 1)
41 | sum
42 | }
43 | res should be(5050 * index)
44 | }
45 |
46 | sparkMonitor.withMonitoringSession { stat =>
47 | (1 to 2).foreach { index =>
48 | xs.map(_ * index).sum()
49 | }
50 | // Spark jobs are running in the same monitoring session
51 | assert(stat.jobCount == 2)
52 | }
53 | }
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/UniqueValueRatio.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
20 | import com.amazon.deequ.metrics.DoubleMetric
21 | import org.apache.spark.sql.{Column, Row}
22 | import org.apache.spark.sql.functions.{col, count, lit, sum}
23 | import org.apache.spark.sql.types.DoubleType
24 |
25 | case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
26 | extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
27 | with FilterableAnalyzer {
28 |
29 | override def aggregationFunctions(numRows: Long): Seq[Column] = {
30 | sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
31 | }
32 |
33 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
34 | val numUniqueValues = result.getDouble(offset)
35 | val numDistinctValues = result.getLong(offset + 1).toDouble
36 |
37 | toSuccessMetric(numUniqueValues / numDistinctValues)
38 | }
39 |
40 | override def filterCondition: Option[String] = where
41 | }
42 |
43 | object UniqueValueRatio {
44 | def apply(column: String): UniqueValueRatio = {
45 | new UniqueValueRatio(column :: Nil)
46 | }
47 |
48 | def apply(column: String, where: Option[String]): UniqueValueRatio = {
49 | new UniqueValueRatio(column :: Nil, where)
50 | }
51 | }
52 |
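53 | // Worked example (illustrative data): for the values A, A, B, C the frequency table
54 | // holds the counts (A -> 2, B -> 1, C -> 1). The unique values (count == 1) are B and
55 | // C, and there are 3 distinct values, so the reported ratio is 2.0 / 3 ≈ 0.67.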
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Maximum.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import org.apache.spark.sql.{Column, Row}
21 | import org.apache.spark.sql.functions.max
22 | import org.apache.spark.sql.types.{DoubleType, StructType}
23 | import Analyzers._
24 |
25 | case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] {
26 |
27 | override def sum(other: MaxState): MaxState = {
28 | MaxState(math.max(maxValue, other.maxValue))
29 | }
30 |
31 | override def metricValue(): Double = {
32 | maxValue
33 | }
34 | }
35 |
36 | case class Maximum(column: String, where: Option[String] = None)
37 | extends StandardScanShareableAnalyzer[MaxState]("Maximum", column)
38 | with FilterableAnalyzer {
39 |
40 | override def aggregationFunctions(): Seq[Column] = {
41 | max(conditionalSelection(column, where)).cast(DoubleType) :: Nil
42 | }
43 |
44 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {
45 |
46 | ifNoNullsIn(result, offset) { _ =>
47 | MaxState(result.getDouble(offset))
48 | }
49 | }
50 |
51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
52 | hasColumn(column) :: isNumeric(column) :: Nil
53 | }
54 |
55 | override def filterCondition: Option[String] = where
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Minimum.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import org.apache.spark.sql.{Column, Row}
21 | import org.apache.spark.sql.functions.min
22 | import org.apache.spark.sql.types.{DoubleType, StructType}
23 | import Analyzers._
24 |
25 | case class MinState(minValue: Double) extends DoubleValuedState[MinState] {
26 |
27 | override def sum(other: MinState): MinState = {
28 | MinState(math.min(minValue, other.minValue))
29 | }
30 |
31 | override def metricValue(): Double = {
32 | minValue
33 | }
34 | }
35 |
36 | case class Minimum(column: String, where: Option[String] = None)
37 | extends StandardScanShareableAnalyzer[MinState]("Minimum", column)
38 | with FilterableAnalyzer {
39 |
40 | override def aggregationFunctions(): Seq[Column] = {
41 | min(conditionalSelection(column, where)).cast(DoubleType) :: Nil
42 | }
43 |
44 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {
45 |
46 | ifNoNullsIn(result, offset) { _ =>
47 | MinState(result.getDouble(offset))
48 | }
49 | }
50 |
51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
52 | hasColumn(column) :: isNumeric(column) :: Nil
53 | }
54 |
55 | override def filterCondition: Option[String] = where
56 | }
57 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/anomalydetection/HistoryUtilsTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import com.amazon.deequ.metrics.{DoubleMetric, Entity}
20 | import org.scalatest.{Matchers, WordSpec}
21 |
22 | import scala.util.{Failure, Success}
23 |
24 | class HistoryUtilsTest extends WordSpec with Matchers {
25 |
26 | "History Utils" should {
27 | val sampleException = new IllegalArgumentException()
28 |
29 | val noneMetric = None
30 | val metricWithNoValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name",
31 | Failure(sampleException)))
32 | val metricWithValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name",
33 | Success(50)))
34 |
35 |     "extract optional metric value" in {
36 | assert(HistoryUtils.extractMetricValue[Double](noneMetric).isEmpty)
37 | assert(HistoryUtils.extractMetricValue[Double](metricWithNoValue).isEmpty)
38 | assert(HistoryUtils.extractMetricValue[Double](metricWithValue).contains(50))
39 |
40 | }
41 |     "extract optional metric values" in {
42 | val metrics = Seq(0L -> noneMetric, 1L -> metricWithNoValue, 2L -> metricWithValue)
43 | assert(HistoryUtils.extractMetricValues[Double](metrics) == Seq(DataPoint[Double](0L, None),
44 | DataPoint[Double](1L, None), DataPoint[Double](2, Some(50))))
45 | }
46 | }
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulCorrelation.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 | import org.apache.spark.sql.catalyst.expressions.aggregate.Corr
20 | import org.apache.spark.sql.catalyst.expressions._
21 | import org.apache.spark.sql.types._
22 |
23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.Corr */
24 | private[sql] class StatefulCorrelation(x: Expression, y: Expression) extends Corr(x, y) {
25 |
26 | override def dataType: org.apache.spark.sql.types.DataType =
27 | StructType(StructField("n", DoubleType) :: StructField("xAvg", DoubleType) ::
28 | StructField("yAvg", DoubleType) :: StructField("ck", DoubleType) ::
29 | StructField("xMk", DoubleType) :: StructField("yMk", DoubleType) :: Nil)
30 |
31 | override val evaluateExpression: Expression = {
32 | CreateStruct(n :: xAvg :: yAvg :: ck :: xMk :: yMk :: Nil)
33 | }
34 |
35 | override def prettyName: String = "stateful_corr"
36 |
37 | override def canEqual(other: Any): Boolean = other.isInstanceOf[StatefulCorrelation]
38 |
39 | override def equals(other: Any): Boolean = other match {
40 | case that: StatefulCorrelation =>
41 | (that canEqual this) && evaluateExpression == that.evaluateExpression
42 | case _ => false
43 | }
44 |
45 | override def hashCode(): Int = {
46 | val state = Seq(super.hashCode(), evaluateExpression)
47 | state.map { _.hashCode() }.foldLeft(0) {(a, b) => 31 * a + b }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Mean.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import org.apache.spark.sql.{Column, Row}
21 | import org.apache.spark.sql.functions.{count, sum}
22 | import org.apache.spark.sql.types.{DoubleType, StructType, LongType}
23 | import Analyzers._
24 |
25 | case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] {
26 |
27 | override def sum(other: MeanState): MeanState = {
28 | MeanState(sum + other.sum, count + other.count)
29 | }
30 |
31 | override def metricValue(): Double = {
32 | if (count == 0L) Double.NaN else sum / count
33 | }
34 | }
35 |
36 | case class Mean(column: String, where: Option[String] = None)
37 | extends StandardScanShareableAnalyzer[MeanState]("Mean", column)
38 | with FilterableAnalyzer {
39 |
40 | override def aggregationFunctions(): Seq[Column] = {
41 | sum(conditionalSelection(column, where)).cast(DoubleType) ::
42 | count(conditionalSelection(column, where)).cast(LongType) :: Nil
43 | }
44 |
45 | override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = {
46 |
47 | ifNoNullsIn(result, offset, howMany = 2) { _ =>
48 | MeanState(result.getDouble(offset), result.getLong(offset + 1))
49 | }
50 | }
51 |
52 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
53 | hasColumn(column) :: isNumeric(column) :: Nil
54 | }
55 |
56 | override def filterCondition: Option[String] = where
57 | }
58 |
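59 | // Worked example (not part of the original file): MeanState merges by adding sums and
60 | // counts, so means can be combined across partitions without rescanning the data.
61 | //
62 | //   MeanState(10.0, 4).sum(MeanState(6.0, 2))  // MeanState(16.0, 6)
63 | //   MeanState(16.0, 6).metricValue()           // 16.0 / 6 ≈ 2.67
64 | //   MeanState(0.0, 0).metricValue()            // Double.NaN for an empty state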
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/metrics/HistogramMetric.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.metrics
18 |
19 | import scala.util.{Failure, Success, Try}
20 |
21 | case class DistributionValue(absolute: Long, ratio: Double)
22 |
23 | case class Distribution(values: Map[String, DistributionValue], numberOfBins: Long) {
24 |
25 | def apply(key: String): DistributionValue = {
26 | values(key)
27 | }
28 |
29 | def argmax: String = {
30 | val (distributionKey, _) = values.toSeq
31 | .maxBy { case (_, distributionValue) => distributionValue.absolute }
32 |
33 | distributionKey
34 | }
35 | }
36 |
37 | case class HistogramMetric(column: String, value: Try[Distribution]) extends Metric[Distribution] {
38 | val entity: Entity.Value = Entity.Column
39 | val instance: String = column
40 | val name = "Histogram"
41 |
42 | def flatten(): Seq[DoubleMetric] = {
43 | value
44 | .map { distribution =>
45 | val numberOfBins = Seq(DoubleMetric(entity, s"$name.bins", instance,
46 | Success(distribution.numberOfBins.toDouble)))
47 |
48 | val details = distribution.values
49 | .flatMap { case (key, distValue) =>
50 | DoubleMetric(entity, s"$name.abs.$key", instance, Success(distValue.absolute)) ::
51 | DoubleMetric(entity, s"$name.ratio.$key", instance, Success(distValue.ratio)) :: Nil
52 | }
53 | numberOfBins ++ details
54 | }
55 | .recover {
56 | case e: Exception => Seq(DoubleMetric(entity, s"$name.bins", instance, Failure(e)))
57 | }
58 | .get
59 | }
60 |
61 | }
62 |
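63 | // Worked example (illustrative values): flatten() expands a successful Distribution
64 | // into one DoubleMetric per bin statistic, plus the number of bins.
65 | //
66 | //   val dist = Distribution(Map("a" -> DistributionValue(3, 0.75),
67 | //                               "b" -> DistributionValue(1, 0.25)), numberOfBins = 2)
68 | //   HistogramMetric("att1", scala.util.Success(dist)).flatten()
69 | //   // Histogram.bins -> 2.0, Histogram.abs.a -> 3.0, Histogram.ratio.a -> 0.75, ...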
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import scala.util.matching.Regex
20 |
21 | /**
22 | * Utilities to test Anomaly Detection methods and related modules
23 | */
24 | object AnomalyDetectionTestUtils {
25 |
26 | private val numericalValueRegex: Regex = """([+-]?([0-9]*[.])?[0-9]+([Ee][0-9]+)?)""".r
27 |
28 | /**
29 | * Finds the first numerical value in a string
30 | *
31 | * @param details The string containing a numerical value
32 | * @throws IllegalArgumentException Thrown if no value could be found
33 | * @return The value itself
34 | */
35 | def firstDoubleFromString(details: String): Double = {
36 | val firstValue = numericalValueRegex.findFirstIn(details)
37 |
38 | require(firstValue.isDefined, "Input string did not contain a numerical value")
39 |
40 | firstValue.get.toString.toDouble
41 | }
42 |
43 | /**
44 | * Finds the first three numerical values in a string
45 | *
46 | * @param details The string containing at least three numerical values
47 |    * @throws IllegalArgumentException Thrown if fewer than 3 values could be found
48 | * @return The values themselves
49 | */
50 | def firstThreeDoublesFromString(details: String): (Double, Double, Double) = {
51 | val values = numericalValueRegex.findAllIn(details).toVector.map(_.toString.toDouble)
52 |
53 | require(values.length >= 3, "Input string did not contain at least 3 numerical values.")
54 |
55 | (values(0), values(1), values(2))
56 | }
57 | }
58 |
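59 | // Illustrative calls (the string contents are made up for the example):
60 | //
61 | //   firstDoubleFromString("Value 3.5 exceeded the threshold")        // 3.5
62 | //   firstThreeDoublesFromString("bounds [-1.0, 2.5], observed 7.0")  // (-1.0, 2.5, 7.0)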
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/metrics/Metric.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.metrics
18 |
19 | import scala.util.{Failure, Success, Try}
20 |
21 | object Entity extends Enumeration {
22 | val Dataset, Column, Mutlicolumn = Value
23 | }
24 |
25 | /** Common trait for all data quality metrics */
26 | trait Metric[T] {
27 | val entity: Entity.Value
28 | val instance: String
29 | val name: String
30 | val value: Try[T]
31 |
32 | /*
33 |    * Composite metric objects, e.g. a histogram, can implement this method to
34 |    * return a flattened view of the internal values in terms of double metrics.
35 | * @see HistogramMetric for sample
36 | */
37 | def flatten(): Seq[DoubleMetric]
38 | }
39 |
40 | /** A data quality metric where the value is a double */
41 | case class DoubleMetric(
42 | entity: Entity.Value,
43 | name: String,
44 | instance: String,
45 | value: Try[Double])
46 | extends Metric[Double] {
47 |
48 | override def flatten(): Seq[DoubleMetric] = Seq(this)
49 | }
50 |
51 | case class KeyedDoubleMetric(
52 | entity: Entity.Value,
53 | name: String,
54 | instance: String,
55 | value: Try[Map[String, Double]])
56 | extends Metric[Map[String, Double]] {
57 |
58 | override def flatten(): Seq[DoubleMetric] = {
59 | if (value.isSuccess) {
60 | value.get.map { case (key, correspondingValue) =>
61 | DoubleMetric(entity, s"$name-$key", instance, Success(correspondingValue))
62 | }
63 | .toSeq
64 | } else {
65 | Seq(DoubleMetric(entity, s"$name", instance, Failure(value.failed.get)))
66 | }
67 | }
68 | }
69 |
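70 | // Worked example (illustrative values): a KeyedDoubleMetric flattens into one
71 | // DoubleMetric per key, suffixing the metric name with the key.
72 | //
73 | //   KeyedDoubleMetric(Entity.Column, "quantiles", "att1",
74 | //     Success(Map("0.5" -> 10.0))).flatten()
75 | //   // Seq(DoubleMetric(Entity.Column, "quantiles-0.5", "att1", Success(10.0)))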
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.SparkContextSpec
20 | import com.amazon.deequ.utils.FixtureSupport
21 | import org.scalatest.matchers.should.Matchers
22 | import org.scalatest.wordspec.AnyWordSpec
23 |
24 | class StatesTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport {
25 |
26 | "FrequenciesAndNumRows" should {
27 | "merge correctly" in withSparkSession { session =>
28 |
29 | import session.implicits._
30 |
31 | val dataA = Seq("A", "A", "B").toDF("att1")
32 | val dataB = Seq("A", "C", "C").toDF("att1")
33 |
34 | val stateA = FrequencyBasedAnalyzer.computeFrequencies(dataA, "att1" :: Nil)
35 | val stateB = FrequencyBasedAnalyzer.computeFrequencies(dataB, "att1" :: Nil)
36 |
37 | val stateAB = stateA.sum(stateB)
38 |
39 | println(stateA.frequencies.schema)
40 | stateA.frequencies.collect().foreach { println }
41 | println()
42 |
43 | println(stateB.frequencies.schema)
44 | stateB.frequencies.collect().foreach { println }
45 | println()
46 |
47 | println(stateAB.frequencies.schema)
48 | stateAB.frequencies.collect().foreach { println }
49 |
50 | val mergedFrequencies = stateAB.frequencies.collect()
51 | .map { row => row.getString(0) -> row.getLong(1) }
52 | .toMap
53 |
54 | assert(mergedFrequencies.size == 3)
55 | assert(mergedFrequencies.get("A").contains(3))
56 | assert(mergedFrequencies.get("B").contains(1))
57 | assert(mergedFrequencies.get("C").contains(2))
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | /**
20 | * A simple anomaly detection method that checks if values are in a specified range.
21 | *
22 | * @param lowerBound Lower bound of accepted range of values
23 | * @param upperBound Upper bound of accepted range of values
24 | */
25 | case class SimpleThresholdStrategy(
26 | lowerBound: Double = Double.MinValue,
27 | upperBound: Double)
28 | extends AnomalyDetectionStrategy {
29 |
30 |   require(lowerBound <= upperBound, "The lower bound must be smaller than or equal to the upper bound.")
31 |
32 | /**
33 | * Search for anomalies in a series of data points.
34 | *
35 | * @param dataSeries The data contained in a Vector of Doubles
36 | * @param searchInterval The indices between which anomalies should be detected. [a, b).
37 | * @return The indices of all anomalies in the interval and their corresponding wrapper object.
38 | */
39 | override def detect(
40 | dataSeries: Vector[Double],
41 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = {
42 |
43 | val (searchStart, searchEnd) = searchInterval
44 |
45 | require (searchStart <= searchEnd, "The start of the interval can't be larger than the end.")
46 |
47 | dataSeries.zipWithIndex
48 | .slice(searchStart, searchEnd)
49 | .filter { case (value, _) => value < lowerBound || value > upperBound }
50 | .map { case (value, index) =>
51 |
52 | val detail = Some(s"[SimpleThresholdStrategy]: Value $value is not in " +
53 | s"bounds [$lowerBound, $upperBound]")
54 |
55 | (index, Anomaly(Option(value), 1.0, detail))
56 | }
57 | }
58 | }
59 |
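60 | // Worked example (illustrative data): values outside [lowerBound, upperBound] within
61 | // the half-open search interval are flagged with confidence 1.0.
62 | //
63 | //   val strategy = SimpleThresholdStrategy(lowerBound = 0.0, upperBound = 5.0)
64 | //   strategy.detect(Vector(-1.0, 2.0, 10.0), (0, 3))
65 | //   // Seq((0, Anomaly(Some(-1.0), 1.0, ...)), (2, Anomaly(Some(10.0), 1.0, ...)))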
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/rules/NonNegativeNumbersRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.checks.Check
20 | import com.amazon.deequ.constraints.Constraint.complianceConstraint
21 | import com.amazon.deequ.profiles.{ColumnProfile, NumericColumnProfile}
22 | import com.amazon.deequ.suggestions.ConstraintSuggestion
23 |
24 | /** If we see only non-negative numbers in a column, we suggest a corresponding constraint */
25 | case class NonNegativeNumbersRule() extends ConstraintRule[ColumnProfile] {
26 |
27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
28 | profile match {
29 | case numericProfile: NumericColumnProfile => numericProfile.minimum.exists(_ >= 0.0)
30 | case _ => false
31 | }
32 | }
33 |
34 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
35 |
36 | val description = s"'${profile.column}' has no negative values"
37 | val constraint = complianceConstraint(description, s"${profile.column} >= 0", Check.IsOne)
38 |
39 | val minimum = profile match {
40 | case numericProfile: NumericColumnProfile
41 | if numericProfile.minimum.isDefined => numericProfile.minimum.get.toString
42 | case _ => "Error while calculating minimum!"
43 | }
44 |
45 | ConstraintSuggestion(
46 | constraint,
47 | profile.column,
48 | "Minimum: " + minimum,
49 | description,
50 | this,
51 | s""".isNonNegative("${profile.column}")"""
52 | )
53 | }
54 |
55 | override val ruleDescription: String = "If we see only non-negative numbers in a " +
56 | "column, we suggest a corresponding constraint"
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/rules/UniqueIfApproximatelyUniqueRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.checks.Check
20 | import com.amazon.deequ.constraints.Constraint.uniquenessConstraint
21 | import com.amazon.deequ.profiles.ColumnProfile
22 | import com.amazon.deequ.suggestions.ConstraintSuggestion
23 |
24 | /**
25 |  * If the ratio of the approximate number of distinct values in a column to the number of
26 |  * records is close to 1.0 (within the error of the HLL sketch), we suggest a UNIQUE constraint
27 | */
28 | case class UniqueIfApproximatelyUniqueRule() extends ConstraintRule[ColumnProfile] {
29 |
30 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
31 |
32 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords
33 |
34 | // TODO This bound depends on the error guarantees of the HLL sketch
35 | profile.completeness == 1.0 && math.abs(1.0 - approximateDistinctness) <= 0.08
36 | }
37 |
38 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
39 |
40 | val constraint = uniquenessConstraint(Seq(profile.column), Check.IsOne)
41 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords
42 |
43 | ConstraintSuggestion(
44 | constraint,
45 | profile.column,
46 | "ApproxDistinctness: " + approximateDistinctness.toString,
47 | s"'${profile.column}' is unique",
48 | this,
49 | s""".isUnique("${profile.column}")"""
50 | )
51 | }
52 |
53 | override val ruleDescription: String = "If the ratio of approximate num distinct values " +
54 | "in a column is close to the number of records (within the error of the HLL sketch), " +
55 | "we suggest a UNIQUE constraint"
56 | }
57 |
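58 | // Worked example (illustrative numbers): with 1000 records and about 950 approximate
59 | // distinct values, approximateDistinctness is 0.95 and |1.0 - 0.95| = 0.05 <= 0.08,
60 | // so the rule fires, provided the column is also fully complete.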
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/NonSampleCompactor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import scala.collection.mutable.ArrayBuffer
20 | import scala.reflect.ClassTag
21 | import scala.util.Random
22 |
23 | /**
24 | * A quantile sketcher whose output is half the size of its input.
25 | *
26 |  * @tparam T type of the items being sketched. There should be an ordering
27 |  *           over this item type
28 | */
29 | class NonSampleCompactor[T]()
30 | (implicit ordering: Ordering[T],
31 | ct: ClassTag[T])
32 | extends Serializable {
33 |
34 | var numOfCompress = 0
35 | var offset = 0
36 | var buffer: ArrayBuffer[T] = ArrayBuffer[T]()
37 |
38 | private def findOdd(items: Int): Option[T] = items % 2 match {
39 | case 1 => Some(buffer(math.max(items - 1, 0)))
40 | case _ => None
41 | }
42 |
43 |   def compact: Array[T] = {
44 | var items = buffer.length
45 | val len = items - (items % 2)
46 | if (numOfCompress % 2 == 1) {
47 | offset = 1 - offset
48 | }
49 | // else {
50 | // offset = if (Random.nextBoolean()) 1 else 0
51 | // }
52 | val sortedBuffer = buffer.toArray.slice(0, len).sorted
53 |
54 | /** Selects half of the items from this level compactor to the next level compactor.
55 | * e.g. if sortedBuffer is Array(1,2,3,4), if offset is 1, output = Array(2,4),
56 | * and if offset is 0, output = Array(1,3), this will be the input to the next level compactor.
57 | */
58 | val output = (offset until len by 2).map(sortedBuffer(_)).toArray
59 | val tail = findOdd(items)
60 | items = items % 2
61 | var newBuffer = ArrayBuffer[T]()
62 | if (tail.isDefined) {
63 | newBuffer = newBuffer :+ tail.get
64 | }
65 | buffer = newBuffer
66 | numOfCompress = numOfCompress + 1
67 | output
68 | }
69 | }
70 |
71 |
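72 | // Worked example (illustrative): with buffer contents (5, 1, 4, 2, 3), the first call
73 | // to compact sorts the even-length prefix (5, 1, 4, 2) into (1, 2, 4, 5), emits every
74 | // second element starting at offset 0, i.e. Array(1, 4), and carries the odd leftover
75 | // 3 over into the buffer for the next round.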
--------------------------------------------------------------------------------
/src/test/resources/EMRSparkShellTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | /*
18 | For testing inside EMR or other flavors of Spark cluster. Run these commands after building the git repo from source.
19 | Add additional test classes as needed
20 | scala 2.12
21 | spark-shell -i /src/test/resources/EMRSparkShellTest.scala \
22 | --packages org.scalatest:scalatest_2.12:3.1.2,org.scalamock:scalamock_2.12:4.4.0,org.scala-lang:scala-compiler:2.12.10,\
23 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \
24 | --jars /target/deequ_2.12-1.1.0-SNAPSHOT.jar,/target/deequ_2.12-1.1.0-SNAPSHOT-tests.jar
25 |
26 | scala 2.11
27 | spark-shell -i /src/test/resources/EMRSparkShellTest.scala \
28 | --packages org.scalatest:scalatest_2.11:3.1.2,org.scalamock:scalamock_2.11:4.4.0,org.scala-lang:scala-compiler:2.11.10,\
29 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \
30 | --jars /target/deequ-1.1.0-SNAPSHOT.jar,/target/spark-deequ-testing/deequ-1.1.0-SNAPSHOT-tests.jar
31 | */
32 |
33 | import com.amazon.deequ.analyzers.{AnalysisTest, AnalyzerTests, IncrementalAnalysisTest}
34 | import com.amazon.deequ.analyzers.runners.{AnalysisRunnerTests, AnalyzerContextTest}
35 | import com.amazon.deequ.{VerificationResultTest, VerificationSuiteTest}
36 |
37 | (new VerificationSuiteTest).execute()
38 | (new VerificationResultTest).execute()
39 | (new AnalysisRunnerTests).execute()
40 | (new AnalyzerContextTest).execute()
41 | (new AnalysisTest).execute()
42 | (new AnalyzerTests).execute()
43 | (new IncrementalAnalysisTest).execute()
44 | // Add additional test classes as needed
45 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/checks/FilterableCheckTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ
18 | package checks
19 |
20 | import com.amazon.deequ.analyzers.{Completeness, Compliance}
21 | import com.amazon.deequ.utils.FixtureSupport
22 | import org.scalatest.matchers.should.Matchers
23 | import org.scalatest.wordspec.AnyWordSpec
24 |
25 |
26 | class FilterableCheckTest extends AnyWordSpec
27 | with Matchers
28 | with SparkContextSpec
29 | with FixtureSupport {
30 |
31 | "Filterable checks" should {
32 | "build correctly" in {
33 |
34 | val check = Check(CheckLevel.Error, "someCheck")
35 | .isComplete("col1")
36 | .isComplete("col2").where("marketplace = 'EU'")
37 | .hasCompleteness("col3", _ >= 0.9).where("marketplace = 'NA'")
38 | .satisfies("someCol > 5", "const1")
39 | .satisfies("someCol > 10", "const2").where("marketplace = 'EU'")
40 |
41 | val completenessAnalyzers =
42 | check.requiredAnalyzers()
43 | .filter { _.isInstanceOf[Completeness] }
44 | .map { _.asInstanceOf[Completeness] }
45 | .toArray
46 | .sortBy { _.column }
47 |
48 | assert(completenessAnalyzers.length == 3)
49 | assert(completenessAnalyzers.head.where.isEmpty)
50 | assert(completenessAnalyzers(1).where.contains("marketplace = 'EU'"))
51 | assert(completenessAnalyzers(2).where.contains("marketplace = 'NA'"))
52 |
53 | val complianceAnalyzers =
54 | check.requiredAnalyzers()
55 | .filter { _.isInstanceOf[Compliance] }
56 | .map { _.asInstanceOf[Compliance] }
57 | .toArray
58 | .sortBy { _.instance }
59 |
60 | assert(complianceAnalyzers.length == 2)
61 | assert(complianceAnalyzers.head.where.isEmpty)
62 | assert(complianceAnalyzers(1).where.contains("marketplace = 'EU'"))
63 | }
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Compliance.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import org.apache.spark.sql.types.IntegerType
20 | import org.apache.spark.sql.{Column, Row}
21 | import org.apache.spark.sql.functions._
22 | import Analyzers._
23 |
24 | /**
25 |  * Compliance measures the fraction of rows that comply with the given column constraint.
26 |  * E.g. if the constraint is "att1 > 3" and the data frame has 5 rows with an att1 value
27 |  * greater than 3 and 10 rows with a value of 3 or less, a DoubleMetric with value 0.33 is returned.
28 |  *
29 |  * @param instance Unlike other column analyzers (e.g. completeness), this analyzer cannot
30 |  *                 infer the metric instance name from the column name.
31 |  *                 Also, the constraint given here can refer to multiple columns,
32 |  *                 so a metric instance name should be provided,
33 |  *                 describing what the analysis is being done for.
34 | * @param predicate SQL-predicate to apply per row
35 | * @param where Additional filter to apply before the analyzer is run.
36 | */
37 | case class Compliance(instance: String, predicate: String, where: Option[String] = None)
38 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("Compliance", instance)
39 | with FilterableAnalyzer {
40 |
41 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = {
42 |
43 | ifNoNullsIn(result, offset, howMany = 2) { _ =>
44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1))
45 | }
46 | }
47 |
48 | override def aggregationFunctions(): Seq[Column] = {
49 |
50 | val summation = sum(conditionalSelection(expr(predicate), where).cast(IntegerType))
51 |
52 | summation :: conditionalCount(where) :: Nil
53 | }
54 |
55 | override def filterCondition: Option[String] = where
56 | }
57 |
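58 | // Usage sketch (instance name and predicate are illustrative): the instance string
59 | // names the resulting metric, while the predicate is evaluated per row.
60 | //
61 | //   Compliance("att1 greater than 3", "att1 > 3")
62 | //   // on 5 complying rows out of 15, yields a DoubleMetric with value ≈ 0.33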
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/rules/RetainTypeRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.analyzers.DataTypeInstances
20 | import com.amazon.deequ.checks.Check
21 | import com.amazon.deequ.constraints.ConstrainableDataTypes
22 | import com.amazon.deequ.constraints.Constraint.dataTypeConstraint
23 | import com.amazon.deequ.profiles.ColumnProfile
24 | import com.amazon.deequ.suggestions.ConstraintSuggestion
25 |
26 | /** If we detect a non-string type, we suggest a type constraint */
27 | case class RetainTypeRule() extends ConstraintRule[ColumnProfile] {
28 |
29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
30 | val testableType = profile.dataType match {
31 | case DataTypeInstances.Integral | DataTypeInstances.Fractional | DataTypeInstances.Boolean =>
32 | true
33 | case _ => false
34 | }
35 |
36 | profile.isDataTypeInferred && testableType
37 | }
38 |
39 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
40 |
41 | val typeToCheck = profile.dataType match {
42 | case DataTypeInstances.Fractional => ConstrainableDataTypes.Fractional
43 | case DataTypeInstances.Integral => ConstrainableDataTypes.Integral
44 | case DataTypeInstances.Boolean => ConstrainableDataTypes.Boolean
45 | }
46 |
47 | val constraint = dataTypeConstraint(profile.column, typeToCheck, Check.IsOne)
48 |
49 | ConstraintSuggestion(
50 | constraint,
51 | profile.column,
52 | "DataType: " + profile.dataType.toString,
53 | s"'${profile.column}' has type ${profile.dataType}",
54 | this,
55 | s""".hasDataType("${profile.column}", ConstrainableDataTypes.${profile.dataType})"""
56 | )
57 | }
58 |
59 | override val ruleDescription: String = "If we detect a non-string type, we suggest a " +
60 | "type constraint"
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResult.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions
18 |
19 | import com.amazon.deequ.VerificationResult
20 | import com.amazon.deequ.checks.CheckStatus
21 | import com.amazon.deequ.profiles.{ColumnProfile, ColumnProfiles}
22 |
23 | /**
24 | * The result returned from the ConstraintSuggestionSuite
25 | *
26 | * @param columnProfiles The column profiles
27 | * @param numRecordsUsedForProfiling The number of records that were used for computing
28 | * the column profiles
29 | * @param constraintSuggestions The suggested constraints
30 | * @param verificationResult The verificationResult in case a train/test split was used
31 | */
32 | case class ConstraintSuggestionResult(
33 | columnProfiles: Map[String, ColumnProfile],
34 | numRecordsUsedForProfiling: Long,
35 | constraintSuggestions: Map[String, Seq[ConstraintSuggestion]],
36 | verificationResult: Option[VerificationResult] = None)
37 |
38 |
39 | object ConstraintSuggestionResult {
40 |
41 | def getColumnProfilesAsJson(constraintSuggestionResult: ConstraintSuggestionResult): String = {
42 |
43 | ColumnProfiles
44 | .toJson(constraintSuggestionResult.columnProfiles.values.toSeq)
45 | }
46 |
47 | def getConstraintSuggestionsAsJson(constraintSuggestionResult: ConstraintSuggestionResult)
48 | : String = {
49 | ConstraintSuggestions
50 | .toJson(constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)( _ ++ _))
51 | }
52 |
53 | def getEvaluationResultsAsJson(constraintSuggestionResult: ConstraintSuggestionResult)
54 | : String = {
55 |
56 | ConstraintSuggestions
57 | .evaluationResultsToJson(
58 | constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)(_ ++ _),
59 | constraintSuggestionResult.verificationResult.getOrElse(
60 | VerificationResult(CheckStatus.Warning, Map.empty, Map.empty)))
61 | }
62 | }
63 |
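/* A hypothetical usage sketch (not part of the original file), assuming `result`
 * was obtained from a ConstraintSuggestionRunner run as in the examples:
 *
 *   val profilesJson = ConstraintSuggestionResult.getColumnProfilesAsJson(result)
 *   val suggestionsJson = ConstraintSuggestionResult.getConstraintSuggestionsAsJson(result)
 */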
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/ApproxCountDistinct.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.hasColumn
20 | import org.apache.spark.sql.DeequFunctions.stateful_approx_count_distinct
21 | import org.apache.spark.sql.catalyst.expressions.aggregate.DeequHyperLogLogPlusPlusUtils
22 | import org.apache.spark.sql.types.StructType
23 | import org.apache.spark.sql.{Column, Row}
24 | import Analyzers._
25 |
26 | case class ApproxCountDistinctState(words: Array[Long])
27 | extends DoubleValuedState[ApproxCountDistinctState] {
28 |
29 | override def sum(other: ApproxCountDistinctState): ApproxCountDistinctState = {
30 | ApproxCountDistinctState(DeequHyperLogLogPlusPlusUtils.merge(words, other.words))
31 | }
32 |
33 | override def metricValue(): Double = {
34 | DeequHyperLogLogPlusPlusUtils.count(words)
35 | }
36 |
37 | override def toString: String = {
38 | s"ApproxCountDistinctState(${words.mkString(",")})"
39 | }
40 | }
41 |
42 | /**
43 | * Compute approximated count distinct with HyperLogLogPlusPlus.
44 | *
45 | * @param column Which column to compute this aggregation on.
46 | */
47 | case class ApproxCountDistinct(column: String, where: Option[String] = None)
48 | extends StandardScanShareableAnalyzer[ApproxCountDistinctState]("ApproxCountDistinct", column)
49 | with FilterableAnalyzer {
50 |
51 | override def aggregationFunctions(): Seq[Column] = {
52 | stateful_approx_count_distinct(conditionalSelection(column, where)) :: Nil
53 | }
54 |
55 | override def fromAggregationResult(result: Row, offset: Int): Option[ApproxCountDistinctState] = {
56 |
57 | ifNoNullsIn(result, offset) { _ =>
58 | DeequHyperLogLogPlusPlusUtils.wordsFromBytes(result.getAs[Array[Byte]](offset))
59 | }
60 | }
61 |
62 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
63 | hasColumn(column) :: Nil
64 | }
65 |
66 | override def filterCondition: Option[String] = where
67 | }
68 |
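/* A hypothetical usage sketch (not part of the original file): approximate the
 * number of distinct 'id' values, optionally restricted by a filter predicate.
 *
 *   val analyzer = ApproxCountDistinct("id", where = Some("numViews > 0"))
 */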
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Analysis.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
20 | import com.amazon.deequ.metrics.Metric
21 | import org.apache.spark.sql.DataFrame
22 | import org.apache.spark.storage.StorageLevel
23 |
24 | /**
25 | * Defines a set of analyzers to run on data.
26 | *
27 | * @param analyzers the analyzers to run on the data
28 | */
29 | case class Analysis(analyzers: Seq[Analyzer[_, Metric[_]]] = Seq.empty) {
30 |
31 | def addAnalyzer(analyzer: Analyzer[_, Metric[_]]): Analysis = {
32 | Analysis(analyzers :+ analyzer)
33 | }
34 |
35 | def addAnalyzers(otherAnalyzers: Seq[Analyzer[_, Metric[_]]]): Analysis = {
36 | Analysis(analyzers ++ otherAnalyzers)
37 | }
38 |
39 | /**
40 | * Compute the metrics from the analyzers configured in the analysis
41 | *
42 | * @param data data on which to operate
43 | * @param aggregateWith load existing states for the configured analyzers and aggregate them
44 | * (optional)
45 | * @param saveStatesWith persist resulting states for the configured analyzers (optional)
46 | * @param storageLevelOfGroupedDataForMultiplePasses caching level for grouped data that must
47 | * be accessed multiple times (use
48 | * StorageLevel.NONE to completely disable
49 | * caching)
50 | * @return an AnalyzerContext holding the computed metrics
51 | */
52 | @deprecated("Use the AnalysisRunner instead (the onData method there)", "24-09-2019")
53 | def run(
54 | data: DataFrame,
55 | aggregateWith: Option[StateLoader] = None,
56 | saveStatesWith: Option[StatePersister] = None,
57 | storageLevelOfGroupedDataForMultiplePasses: StorageLevel = StorageLevel.MEMORY_AND_DISK)
58 | : AnalyzerContext = {
59 |
60 | AnalysisRunner.doAnalysisRun(data, analyzers, aggregateWith = aggregateWith,
61 | saveStatesWith = saveStatesWith)
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/StandardDeviation.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import org.apache.spark.sql.DeequFunctions.stateful_stddev_pop
21 | import org.apache.spark.sql.{Column, Row}
22 | import org.apache.spark.sql.types.StructType
23 | import Analyzers._
24 |
25 | case class StandardDeviationState(
26 | n: Double,
27 | avg: Double,
28 | m2: Double)
29 | extends DoubleValuedState[StandardDeviationState] {
30 |
31 | require(n > 0.0, "Standard deviation is undefined for n = 0.")
32 |
33 | override def metricValue(): Double = {
34 | math.sqrt(m2 / n)
35 | }
36 |
37 | override def sum(other: StandardDeviationState): StandardDeviationState = {
38 | val newN = n + other.n
39 | val delta = other.avg - avg
40 | val deltaN = if (newN == 0.0) 0.0 else delta / newN
41 |
42 | StandardDeviationState(newN, avg + deltaN * other.n,
43 | m2 + other.m2 + delta * deltaN * n * other.n)
44 | }
45 | }
46 |
47 | case class StandardDeviation(column: String, where: Option[String] = None)
48 | extends StandardScanShareableAnalyzer[StandardDeviationState]("StandardDeviation", column)
49 | with FilterableAnalyzer {
50 |
51 | override def aggregationFunctions(): Seq[Column] = {
52 | stateful_stddev_pop(conditionalSelection(column, where)) :: Nil
53 | }
54 |
55 | override def fromAggregationResult(result: Row, offset: Int): Option[StandardDeviationState] = {
56 |
57 | if (result.isNullAt(offset)) {
58 | None
59 | } else {
60 | val row = result.getAs[Row](offset)
61 | val n = row.getDouble(0)
62 |
63 | if (n == 0.0) {
64 | None
65 | } else {
66 | Some(StandardDeviationState(n, row.getDouble(1), row.getDouble(2)))
67 | }
68 | }
69 | }
70 |
71 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
72 | hasColumn(column) :: isNumeric(column) :: Nil
73 | }
74 |
75 | override def filterCondition: Option[String] = where
76 | }
77 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/metrics/MetricsTests.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.metrics
18 |
19 | import com.amazon.deequ.analyzers.DataTypeInstances
20 | import org.scalatest.{Matchers, WordSpec}
21 |
22 | import scala.util.{Failure, Success}
23 |
24 |
25 | class MetricsTests extends WordSpec with Matchers {
26 | val sampleException = new IllegalArgumentException()
27 | "Double metric" should {
28 | "flatten and return itself" in {
29 | val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name", Success(50))
30 | assert(metric.flatten() == List(metric))
31 | }
32 |
33 | "flatten in case of an error" in {
34 | val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name",
35 | Failure(sampleException))
36 | assert(metric.flatten() == List(metric))
37 | }
38 | }
39 |
40 | "Histogram metric" should {
41 | "flatten matched and unmatched" in {
42 | val distribution = Distribution(
43 | Map("a" -> DistributionValue(6, 0.6), "b" -> DistributionValue(4, 0.4)), 2)
44 |
45 | val metric = HistogramMetric("instance-name", Success(distribution))
46 |
47 | val expected = Seq(
48 | DoubleMetric(Entity.Column, "Histogram.bins", "instance-name", Success(2)),
49 | DoubleMetric(Entity.Column, "Histogram.abs.a", "instance-name", Success(6)),
50 | DoubleMetric(Entity.Column, "Histogram.abs.b", "instance-name", Success(4)),
51 | DoubleMetric(Entity.Column, "Histogram.ratio.a", "instance-name", Success(0.6)),
52 | DoubleMetric(Entity.Column, "Histogram.ratio.b", "instance-name", Success(0.4))
53 | ).toSet
54 | assert(metric.flatten().toSet == expected)
55 | }
56 |
57 | "flatten matched and unmatched in case of an error" in {
58 | val metric = HistogramMetric("instance-name", Failure(sampleException))
59 |
60 | val expected = Seq(DoubleMetric(Entity.Column, "Histogram.bins", "instance-name",
61 | Failure(sampleException))).toSet
62 | assert(metric.flatten().toSet == expected)
63 | }
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/SparkMonitor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ
18 |
19 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart, SparkListenerStageCompleted, StageInfo}
20 |
21 | /**
22 | * A class representing statistics about a SparkSession.
23 | * Currently, only the number of Spark jobs submitted and their stages are tracked.
24 | */
25 | class SparkSessionStats {
26 | private var numberOfJobsSubmitted = 0
27 | private var stageInfos = Seq[StageInfo]()
28 |
29 | def jobCount: Int = {
30 | numberOfJobsSubmitted
31 | }
32 |
33 | def allExecutedStages: Seq[StageInfo] = {
34 | stageInfos
35 | }
36 |
37 | def recordJobStart(jobStart: SparkListenerJobStart): Unit = {
38 | numberOfJobsSubmitted += 1
39 | }
40 |
41 | def recordStageInfos(stageInfo: StageInfo): Unit = {
42 | stageInfos = stageInfos :+ stageInfo
43 | }
44 |
45 | def reset(): Unit = {
46 | numberOfJobsSubmitted = 0
47 | stageInfos = Seq[StageInfo]()
48 | }
49 |
50 | }
51 |
52 | /**
53 | * A SparkListener implementation to monitor spark jobs submitted
54 | */
55 | class SparkMonitor extends SparkListener {
56 | val stat = new SparkSessionStats
57 |
58 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
59 | stat.recordJobStart(jobStart)
60 | println(s"Job started with ${jobStart.stageInfos.size} stages: $jobStart " +
61 | s"details : ${jobStart.stageInfos.map(_.name)}")
62 |
63 | }
64 |
65 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
66 | stat.recordStageInfos(stageCompleted.stageInfo)
67 | println(s"Stage ${stageCompleted.stageInfo.stageId} completed with " +
68 | s"${stageCompleted.stageInfo.numTasks} tasks.")
69 | }
70 |
71 | /**
72 | * @param testFun thunk to run with SparkSessionStats as an argument.
73 | * Provides a monitoring session in which the stats are reset at the beginning.
74 | *
75 | */
76 | def withMonitoringSession(testFun: (SparkSessionStats) => Any): Any = {
77 | stat.reset()
78 | testFun(stat)
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.constraints.Constraint.completenessConstraint
20 | import com.amazon.deequ.profiles._
21 | import com.amazon.deequ.suggestions.ConstraintSuggestion
22 | import scala.math.BigDecimal.RoundingMode
23 |
24 | /**
25 | * If a column is incomplete in the sample, we model its completeness as a binomial variable,
26 | * estimate a confidence interval and use this to define a lower bound for the completeness
27 | */
28 | case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] {
29 |
30 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
31 | profile.completeness > 0.2 && profile.completeness < 1.0
32 | }
33 |
34 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
35 |
36 | val p = profile.completeness
37 | val n = numRecords
38 | val z = 1.96
39 |
40 | // TODO this needs to be more robust for p's close to 0 or 1
41 | val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
42 | .setScale(2, RoundingMode.DOWN).toDouble
43 |
44 | val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)
45 |
46 | val boundInPercent = ((1.0 - targetCompleteness) * 100).toInt
47 |
48 | val description = s"'${profile.column}' has less than $boundInPercent% missing values"
49 |
50 | ConstraintSuggestion(
51 | constraint,
52 | profile.column,
53 | "Completeness: " + profile.completeness.toString,
54 | description,
55 | this,
56 | s""".hasCompleteness("${profile.column}", _ >= $targetCompleteness,
57 | | Some("It should be above $targetCompleteness!"))"""
58 | .stripMargin.replaceAll("\n", "")
59 | )
60 | }
61 |
62 | override val ruleDescription: String = "If a column is incomplete in the sample, " +
63 | "we model its completeness as a binomial variable, estimate a confidence interval " +
64 | "and use this to define a lower bound for the completeness"
65 | }
66 |
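/* A worked instance of the interval above (illustrative numbers): with completeness
 * p = 0.9 over n = 1000 records and z = 1.96, the bound is
 * p - z * sqrt(p * (1 - p) / n) = 0.9 - 1.96 * sqrt(0.00009) ~= 0.8814,
 * which setScale(2, DOWN) truncates to a suggested completeness of at least 0.88.
 */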
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import org.scalatest.{Matchers, WordSpec}
20 |
21 | class SimpleThresholdStrategyTest extends WordSpec with Matchers {
22 |
23 | "Simple Threshold Strategy" should {
24 |
25 | val strategy = SimpleThresholdStrategy(upperBound = 1.0)
26 | val data = Vector(-1.0, 2.0, 3.0, 0.5)
27 | val expected = Seq((1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0)))
28 |
29 | "detect values above threshold" in {
30 | val anomalyResult = strategy.detect(data, (0, 4))
31 |
32 | assert(anomalyResult == expected)
33 | }
34 |
35 | "detect all values without range specified" in {
36 | val anomalyResult = strategy.detect(data)
37 |
38 | assert(anomalyResult == expected)
39 | }
40 |
41 | "work fine with empty input" in {
42 | val emptySeries = Vector[Double]()
43 | val anomalyResult = strategy.detect(emptySeries)
44 |
45 | assert(anomalyResult == Seq[(Int, Anomaly)]())
46 | }
47 |
48 | "work with upper and lower threshold" in {
49 | val tS = SimpleThresholdStrategy(lowerBound = -0.5, upperBound = 1.0)
50 | val anomalyResult = tS.detect(data)
51 |
52 | assert(anomalyResult == Seq((0, Anomaly(Option(-1.0), 1.0)),
53 | (1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0))))
54 | }
55 |
56 | "throw an error when thresholds are not ordered " in {
57 | intercept[IllegalArgumentException] {
58 | val ts = SimpleThresholdStrategy(lowerBound = 2.0, upperBound = 1.0)
59 | }
60 | }
61 |
62 | "produce error message with correct value and bounds" in {
63 | val result = strategy.detect(data)
64 |
65 | result.foreach { case (_, anom) =>
66 | val (value, lowerBound, upperBound) =
67 | AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get)
68 |
69 | assert(anom.value.isDefined && value === anom.value.get)
70 | assert(value < lowerBound || value > upperBound)
71 | }
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/docs/key-concepts.md:
--------------------------------------------------------------------------------
1 | # Key Concepts in the Codebase
2 | There are a few key concepts that will help you to understand the codebase.
3 |
4 | ## Metrics, Analyzers, and State
5 | A Metric represents a measurement of the data that can change over time, for example the number of rows in a
6 | DataFrame.
7 |
8 | An Analyzer knows how to calculate a Metric based on some input DataFrame.
9 |
10 | State is an optimization - it represents the state of the data, from which a metric can be calculated. This intermediate
11 | state can then be used to calculate future metrics more quickly. Check out the examples for some further details.
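A minimal sketch of how these pieces fit together (assuming a SparkSession and an existing DataFrame `df`; the analyzers and runner below are the ones in this codebase):

```scala
import com.amazon.deequ.analyzers.{Analysis, Completeness, Size}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

// Analyzers compute Metrics from the data
val analysis = Analysis()
  .addAnalyzer(Size())             // number of rows
  .addAnalyzer(Completeness("id")) // fraction of non-null values in 'id'

val metrics = AnalysisRunner.run(data = df, analysis = analysis)

metrics.metricMap.foreach { case (analyzer, metric) =>
  println(s"$analyzer: ${metric.value}")
}
```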
12 |
13 | ## Overall flow of running deequ checks
14 | When running checks, a user specifies a DataFrame and a number of checks to run on that DataFrame. Many checks in Deequ
15 | are based on metrics which describe the data. In order to perform the checks the user requests, deequ follows this
16 | process:
17 | * First deequ figures out which Analyzers are required
18 | * Metrics are calculated using those Analyzers
19 |   * Metrics are also stored if a MetricsRepository is provided
20 |   * Intermediate state is stored if a StatePersister is provided
21 |   * Intermediate state is used for metric calculations if a StateLoader is provided
22 | * Checks are evaluated using the calculated Metrics
23 |
24 | The reason it works this way is for performance, primarily because calculating metrics at the same time gives the
25 | opportunity to calculate them in fewer passes over the data.
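As a sketch of that flow (hedged: `df` is an assumed DataFrame; the API below mirrors the examples in this repository):

```scala
import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}

val result = VerificationSuite()
  .onData(df)
  .addCheck(
    Check(CheckLevel.Error, "integrity checks")
      .isComplete("id") // deequ derives the required analyzers from the checks,
      .isUnique("id"))  // computes the metrics, then evaluates the constraints
  .run()

if (result.status != CheckStatus.Success) {
  println("Some constraints were not satisfied")
}
```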
26 |
27 | ### Analyzers
28 | Types of analyzers:
29 | * ScanShareableAnalyzer - an analyzer which computes a metric based on a straight scan over the data, without any
30 | grouping being required
31 | * GroupingAnalyzer - an analyzer that requires the data to be grouped by a set of columns before the metric can be
32 | calculated
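For example (a sketch: `Size` and `Completeness` are computed in a single scan, while `Uniqueness` is assumed here to take its grouping columns as a `Seq`):

```scala
import com.amazon.deequ.analyzers.{Completeness, Size, Uniqueness}

val scanShareable = Seq(Size(), Completeness("productName")) // straight scan, no grouping
val grouping = Uniqueness(Seq("id"))                         // data grouped by 'id' first
```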
33 |
34 | ### Metrics
35 | A metric includes the following key details:
36 | * name - the name for the type of metric
37 | * entity - the type of entity the metric is recorded against, e.g. a column, dataset, or multicolumn
38 | * instance - information about this instance of the metric. For example, this could be the column name the metric is
39 | operating on
40 | * value - the value of the metric at a point in time. The type of this value varies between metrics.
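For instance, the `DoubleMetric` constructor used in this repository's tests carries exactly these details:

```scala
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import scala.util.Success

// entity, name, instance, and the value at this point in time
val metric = DoubleMetric(Entity.Column, "Completeness", "productName", Success(1.0))
```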
41 |
42 | #### Metrics storage
43 | Metrics can be stored in a metrics repository. An entry in the repository consists of:
44 | * A resultKey, which is a combination of a timestamp and a map of tags. Typically a user may want to record things
45 | like the data source (e.g. table name) with the tags. The resultKey can be used to look up stored metrics
46 | * An analyzerContext, which consists of a map of Analyzers to Metrics
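A sketch of storing metrics while running checks (hedged: `InMemoryMetricsRepository` and the builder methods below are assumed from the repository package; `df` is an assumed DataFrame):

```scala
import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.checks.{Check, CheckLevel}
import com.amazon.deequ.repository.ResultKey
import com.amazon.deequ.repository.memory.InMemoryMetricsRepository

val repository = new InMemoryMetricsRepository()
// a resultKey: a timestamp plus tags, later usable to look up the stored metrics
val key = ResultKey(System.currentTimeMillis(), Map("source" -> "some_table"))

VerificationSuite()
  .onData(df)
  .useRepository(repository)
  .saveOrAppendResult(key)
  .addCheck(Check(CheckLevel.Error, "checks").isComplete("id"))
  .run()
```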
47 |
48 | ### State
49 | Please consult the examples or the codebase for more details on State.
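As a pointer, here is a condensed sketch of the incremental-state flow from `IncrementalMetricsExample` in this repository (`df` and `moreDf` are assumed DataFrames):

```scala
import com.amazon.deequ.analyzers.{Analysis, InMemoryStateProvider, Size}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

val stateStore = InMemoryStateProvider()
val analysis = Analysis().addAnalyzer(Size())

// first run: compute metrics and persist the internal state
AnalysisRunner.run(data = df, analysis = analysis, saveStatesWith = Some(stateStore))

// later run: aggregate new data with the stored state,
// without re-reading the previously seen data
AnalysisRunner.run(data = moreDf, analysis = analysis, aggregateWith = Some(stateStore))
```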
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/KLLCheckExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import ExampleUtils.{itemsAsDataframe, withSpark}
20 | import com.amazon.deequ.VerificationSuite
21 | import com.amazon.deequ.analyzers.KLLParameters
22 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
23 | import com.amazon.deequ.constraints.ConstraintStatus
24 | import org.apache.spark.sql.types.DoubleType
25 |
26 | private[examples] object KLLCheckExample extends App {
27 |
28 | withSpark { session =>
29 |
30 | val data = itemsAsDataframe(session,
31 | Item(1, "Thingy A", "awesome thing.", "high", 0),
32 | Item(2, "Thingy B", "available at http://thingb.com", null, 0),
33 | Item(3, null, null, "low", 5),
34 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
35 | Item(5, "Thingy E", null, "high", 12))
36 |
37 | val newData = data.select(data("numViews").cast(DoubleType).as("numViews"))
38 |
39 | val verificationResult = VerificationSuite()
40 | .onData(newData)
41 | .addCheck(
42 | Check(CheckLevel.Error, "integrity checks")
43 | // we expect 5 records
44 | .hasSize(_ == 5)
45 | // we expect the maximum of 'numViews' to be at most 10
46 | .hasMax("numViews", _ <= 10)
47 | // we expect the sketch size to be at least 16
48 | .kllSketchSatisfies("numViews", _.parameters(1) >= 16,
49 | kllParameters = Option(KLLParameters(2, 0.64, 2))))
50 | .run()
51 |
52 | if (verificationResult.status == CheckStatus.Success) {
53 | println("The data passed the test, everything is fine!")
54 | } else {
55 | println("We found errors in the data, the following constraints were not satisfied:\n")
56 |
57 | val resultsForAllConstraints = verificationResult.checkResults
58 | .flatMap { case (_, checkResult) => checkResult.constraintResults }
59 |
60 | resultsForAllConstraints
61 | .filter { _.status != ConstraintStatus.Success }
62 | .foreach { result =>
63 | println(s"${result.constraint} failed: ${result.message.get}")
64 | }
65 | }
66 |
67 | }
68 | }
69 |
70 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/runners/MetricCalculationException.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers.runners
18 |
19 | abstract class MetricCalculationException(message: String) extends Exception(message)
20 |
21 | class MetricCalculationRuntimeException(message: String)
22 | extends MetricCalculationException(message) {
23 |
24 | def this(message: String, cause: Throwable) {
25 | this(message)
26 | initCause(cause)
27 | }
28 |
29 | def this(cause: Throwable) {
30 | this(Option(cause).map(_.toString).orNull, cause)
31 | }
32 | }
33 |
34 | class MetricCalculationPreconditionException(message: String)
35 | extends MetricCalculationException(message)
36 |
37 |
38 | class NoSuchColumnException(message: String)
39 | extends MetricCalculationPreconditionException(message)
40 |
41 | class WrongColumnTypeException(message: String)
42 | extends MetricCalculationPreconditionException(message)
43 |
44 | class NoColumnsSpecifiedException(message: String)
45 | extends MetricCalculationPreconditionException(message)
46 |
47 | class NumberOfSpecifiedColumnsException(message: String)
48 | extends MetricCalculationPreconditionException(message)
49 |
50 | class IllegalAnalyzerParameterException(
51 | message: String)
52 | extends MetricCalculationPreconditionException(message)
53 |
54 | class EmptyStateException(message: String) extends MetricCalculationRuntimeException(message)
55 |
56 |
57 | object MetricCalculationException {
58 |
59 | private[deequ] def getApproxQuantileIllegalParamMessage(quantile: Double): String = {
60 | "Quantile parameter must be in the closed interval [0, 1]. " +
61 | s"Currently, the value is: $quantile!"
62 | }
63 |
64 | private[deequ] def getApproxQuantileIllegalErrorParamMessage(relativeError: Double): String = {
65 | "Relative error parameter must be in the closed interval [0, 1]. " +
66 | s"Currently, the value is: $relativeError!"
67 | }
68 |
69 | def wrapIfNecessary(exception: Throwable)
70 | : MetricCalculationException = {
71 |
72 | exception match {
73 | case error: MetricCalculationException => error
74 | case error: Throwable => new MetricCalculationRuntimeException(error)
75 | }
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/IncrementalMetricsExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import ExampleUtils.{itemsAsDataframe, withSpark}
20 | import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, InMemoryStateProvider, Size}
21 | import com.amazon.deequ.analyzers.runners.AnalysisRunner
22 |
23 | private[examples] object IncrementalMetricsExample extends App {
24 |
25 | /* NOTE: Stateful support is still work in progress, and is therefore not yet integrated into
26 | VerificationSuite. We showcase however how to incrementally compute metrics on a growing
27 | dataset using the AnalysisRunner. */
28 |
29 | withSpark { session =>
30 |
31 | val data = itemsAsDataframe(session,
32 | Item(1, "Thingy A", "awesome thing.", "high", 0),
33 | Item(2, "Thingy B", "available tomorrow", "low", 0),
34 | Item(3, "Thing C", null, null, 5))
35 |
36 | val moreData = itemsAsDataframe(session,
37 | Item(4, "Thingy D", null, "low", 10),
38 | Item(5, "Thingy E", null, "high", 12))
39 |
40 |
41 | val analysis = Analysis()
42 | .addAnalyzer(Size())
43 | .addAnalyzer(ApproxCountDistinct("id"))
44 | .addAnalyzer(Completeness("productName"))
45 | .addAnalyzer(Completeness("description"))
46 |
47 | val stateStore = InMemoryStateProvider()
48 |
49 | val metricsForData = AnalysisRunner.run(
50 | data = data,
51 | analysis = analysis,
52 | saveStatesWith = Some(stateStore) // persist the internal state of the computation
53 | )
54 |
55 | // We update the metrics now from the stored states without having to access the previous data!
56 | val metricsAfterAddingMoreData = AnalysisRunner.run(
57 | data = moreData,
58 | analysis = analysis,
59 | aggregateWith = Some(stateStore) // continue from internal state of the computation
60 | )
61 |
62 | println("Metrics for the first 3 records:\n")
63 | metricsForData.metricMap.foreach { case (analyzer, metric) =>
64 | println(s"\t$analyzer: ${metric.value.get}")
65 | }
66 |
67 | println("\nMetrics after adding 2 more records:\n")
68 | metricsAfterAddingMoreData.metricMap.foreach { case (analyzer, metric) =>
69 | println(s"\t$analyzer: ${metric.value.get}")
70 | }
71 |
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/io/DfsUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.io
18 |
19 | import java.io.{BufferedWriter, OutputStreamWriter}
20 |
21 | import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
22 | import org.apache.spark.sql.SparkSession
23 |
24 | private[deequ] object DfsUtils {
25 |
26 | /* Helper function to read from a binary file on S3 */
27 | def readFromFileOnDfs[T](session: SparkSession, path: String)
28 | (readFunc: FSDataInputStream => T): T = {
29 |
30 | val (fs, qualifiedPath) = asQualifiedPath(session, path)
31 | val input = fs.open(qualifiedPath)
32 |
33 | try {
34 | readFunc(input)
35 | } finally {
36 | if (input != null) {
37 | input.close()
38 | }
39 | }
40 | }
41 |
42 | /* Helper function to write to a binary file on S3 */
43 | def writeToFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
44 | (writeFunc: FSDataOutputStream => Unit): Unit = {
45 |
46 | val (fs, qualifiedPath) = asQualifiedPath(session, path)
47 | val output = fs.create(qualifiedPath, overwrite)
48 |
49 | try {
50 | writeFunc(output)
51 | } finally {
52 | if (output != null) {
53 | output.close()
54 | }
55 | }
56 | }
57 |
58 | /* Helper function to write to a text file on S3 */
59 | def writeToTextFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
60 | (writeFunc: BufferedWriter => Unit): Unit = {
61 |
62 | val (fs, qualifiedPath) = asQualifiedPath(session, path)
63 | val output = fs.create(qualifiedPath, overwrite)
64 |
65 | try {
66 | val writer = new BufferedWriter(new OutputStreamWriter(output))
67 | writeFunc(writer)
68 | writer.close()
69 | } finally {
70 | if (output != null) {
71 | output.close()
72 | }
73 | }
74 | }
75 |
76 | /* Make sure we write to the correct filesystem, as EMR clusters also have an internal HDFS */
77 | def asQualifiedPath(session: SparkSession, path: String): (FileSystem, Path) = {
78 | val hdfsPath = new Path(path)
79 | val fs = hdfsPath.getFileSystem(session.sparkContext.hadoopConfiguration)
80 | val qualifiedPath = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
81 |
82 | (fs, qualifiedPath)
83 | }
84 |
85 | }
86 |
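/* A hypothetical usage sketch (not part of the original file; the path is
 * illustrative and `session` is an assumed SparkSession):
 *
 *   DfsUtils.writeToTextFileOnDfs(session, "s3://some-bucket/report.txt") { writer =>
 *     writer.write("some report content")
 *   }
 */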
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import breeze.linalg.DenseVector
20 |
21 | /**
22 | * Detects anomalies based on the values' rate of change.
23 | * The order of the difference can be set manually.
24 | * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]].
25 | *
26 | * RelativeRateOfChangeStrategy(Some(0.9), Some(1.1), 1), for example,
27 | * computes the ratio between consecutive values and flags a point
28 | * as an anomaly if its value changes by more than 10 percent
29 | * in one timestep.
30 | *
31 | * @param maxRateDecrease Lower bound of accepted relative change (as new value / old value).
32 | * @param maxRateIncrease Upper bound of accepted relative change (as new value / old value).
33 | * @param order Order of the calculated difference.
34 | * Set to 1, it compares two consecutive values.
35 | */
36 | case class RelativeRateOfChangeStrategy(
37 | maxRateDecrease: Option[Double] = None,
38 | maxRateIncrease: Option[Double] = None,
39 | order: Int = 1)
40 | extends BaseChangeStrategy {
41 |
42 | /**
43 | * Calculates the rate of change with respect to the specified order.
44 | * If the order is set to 1, the resulting value for a point at index i
45 | * is equal to dataSeries(i) / dataSeries(i - 1).
46 | * Note that this difference cannot be calculated for the first [[order]] elements in the vector.
47 | * The resulting vector is therefore smaller by [[order]] elements.
48 | *
49 | * @param dataSeries The values contained in a DenseVector[Double]
50 | * @param order The order of the derivative.
51 | * @return A vector with the resulting rates of change for all values
52 | * except the first [[order]] elements.
53 | */
54 | override def diff(dataSeries: DenseVector[Double], order: Int): DenseVector[Double] = {
55 | require(order > 0, "Order of diff cannot be zero or negative")
56 | if (dataSeries.length == 0) {
57 | dataSeries
58 | } else {
59 | val valuesRight = dataSeries.slice(order, dataSeries.length)
60 | val valuesLeft = dataSeries.slice(0, dataSeries.length - order)
61 | valuesRight / valuesLeft
62 | }
63 | }
64 | }
65 |
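/* A hypothetical usage sketch (not part of the original file; `detect` is the
 * entry point of AnomalyDetectionStrategy as used in the tests):
 *
 *   val strategy = RelativeRateOfChangeStrategy(maxRateIncrease = Some(1.5))
 *   // ratios between consecutive values: 1.0, 2.0, 1.0 -- the jump from 1.0 to 2.0
 *   // exceeds the allowed increase of 1.5x, so index 2 is flagged
 *   strategy.detect(Vector(1.0, 1.0, 2.0, 2.0))
 */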
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/KLL/KLLBenchmark.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.KLL;
18 |
19 | import com.amazon.deequ.analyzers.QuantileNonSample;
20 | import org.apache.datasketches.kll.KllFloatsSketch;
21 | import org.openjdk.jmh.annotations.Benchmark;
22 | import org.openjdk.jmh.annotations.BenchmarkMode;
23 | import org.openjdk.jmh.annotations.Fork;
24 | import org.openjdk.jmh.annotations.Mode;
25 | import org.openjdk.jmh.annotations.OutputTimeUnit;
26 | import org.openjdk.jmh.infra.Blackhole;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | @BenchmarkMode(Mode.AverageTime)
36 | @OutputTimeUnit(TimeUnit.MILLISECONDS)
37 | @Fork(value = 2, jvmArgs = {"-Xms2G", "-Xmx2G"})
38 | public class KLLBenchmark {
39 |
40 | private static final int N = 10_000_000;
41 |
42 | private static float[] DATA_FOR_TESTING = createData();
43 |
44 | public static void main(String[] args) throws RunnerException {
45 |
46 | Options opt = new OptionsBuilder()
47 | .include(KLLBenchmark.class.getSimpleName())
48 | .forks(1)
49 | .build();
50 |
51 | new Runner(opt).run();
52 | }
53 |
54 | private static float[] createData() {
55 | Random prng = new Random();
56 | float[] numbers = new float[N];
57 | for (int i = 0; i < N; i++) {
58 | numbers[i] = prng.nextFloat();
59 | }
60 | return numbers;
61 | }
62 |
63 | @Benchmark
64 | public void sumArray(Blackhole bh) {
65 | float sum = 0.0f;
66 | for (int i = 0; i < N; i++) {
67 | sum += DATA_FOR_TESTING[i];
68 | }
69 | bh.consume(sum);
70 | }
71 |
72 | @Benchmark
73 | public void sketchArrayWithKLL(Blackhole bh) {
74 | QuantileNonSample sketch = KLLBenchmarkHelper.floatSketch();
75 | for (int i = 0; i < N; i++) {
76 | sketch.update(DATA_FOR_TESTING[i]);
77 | }
78 | bh.consume(sketch);
79 | }
80 |
81 | @Benchmark
82 | public void sketchArrayWithJavaSketchesKLL(Blackhole bh) {
83 | KllFloatsSketch sketch = new KllFloatsSketch();
84 | for (int i = 0; i < N; i++) {
85 | sketch.update(DATA_FOR_TESTING[i]);
86 | }
87 | bh.consume(sketch);
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/BasicExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import ExampleUtils.{withSpark, itemsAsDataframe}
20 | import com.amazon.deequ.VerificationSuite
21 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
22 | import com.amazon.deequ.constraints.ConstraintStatus
23 |
24 | private[examples] object BasicExample extends App {
25 |
26 | withSpark { session =>
27 |
28 | val data = itemsAsDataframe(session,
29 | Item(1, "Thingy A", "awesome thing.", "high", 0),
30 | Item(2, "Thingy B", "available at http://thingb.com", null, 0),
31 | Item(3, null, null, "low", 5),
32 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
33 | Item(5, "Thingy E", null, "high", 12))
34 |
35 | val verificationResult = VerificationSuite()
36 | .onData(data)
37 | .addCheck(
38 | Check(CheckLevel.Error, "integrity checks")
39 | // we expect 5 records
40 | .hasSize(_ == 5)
41 | // 'id' should never be NULL
42 | .isComplete("id")
43 | // 'id' should not contain duplicates
44 | .isUnique("id")
45 | // 'productName' should never be NULL
46 | .isComplete("productName")
47 | // 'priority' should only contain the values "high" and "low"
48 | .isContainedIn("priority", Array("high", "low"))
49 | // 'numViews' should not contain negative values
50 | .isNonNegative("numViews"))
51 | .addCheck(
52 | Check(CheckLevel.Warning, "distribution checks")
53 | // at least half of the 'description's should contain a url
54 | .containsURL("description", _ >= 0.5)
55 | // half of the items should have less than 10 'numViews'
56 | .hasApproxQuantile("numViews", 0.5, _ <= 10))
57 | .run()
58 |
59 | if (verificationResult.status == CheckStatus.Success) {
60 | println("The data passed the test, everything is fine!")
61 | } else {
62 | println("We found errors in the data, the following constraints were not satisfied:\n")
63 |
64 | val resultsForAllConstraints = verificationResult.checkResults
65 | .flatMap { case (_, checkResult) => checkResult.constraintResults }
66 |
67 | resultsForAllConstraints
68 | .filter { _.status != ConstraintStatus.Success }
69 | .foreach { result =>
70 | println(s"${result.constraint} failed: ${result.message.get}")
71 | }
72 | }
73 |
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/SparkContextSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ
18 |
19 | import org.apache.spark.SparkContext
20 | import org.apache.spark.sql.{SQLContext, SparkSession}
21 |
22 | /**
23 | * To be mixed into tests so that they can use a default Spark context suitable for testing
24 | */
25 | trait SparkContextSpec {
26 |
27 | /**
28 | * @param testFun thunk to run with SparkSession as an argument
29 | */
30 | def withSparkSession(testFun: SparkSession => Any): Unit = {
31 | val session = setupSparkSession
32 | try {
33 | testFun(session)
34 | } finally {
35 | /* clear the cached RDD sizes, as the referenced ids are only valid within a session */
36 | tearDownSparkSession(session)
37 | }
38 | }
39 |
40 | /**
41 | * @param testFun thunk to run with SparkSession and SparkMonitor as an argument for the tests
42 | * that would like to get details on spark jobs submitted
43 | *
44 | */
45 | def withMonitorableSparkSession(testFun: (SparkSession, SparkMonitor) => Any): Unit = {
46 | val monitor = new SparkMonitor
47 | val session = setupSparkSession
48 | session.sparkContext.addSparkListener(monitor)
49 | try {
50 | testFun(session, monitor)
51 | } finally {
52 | tearDownSparkSession(session)
53 | }
54 | }
55 |
56 | /**
57 | * @param testFun thunk to run with SparkContext as an argument
58 | */
59 | def withSparkContext(testFun: SparkContext => Any): Unit = {
60 | withSparkSession(session => testFun(session.sparkContext))
61 | }
62 |
63 | /**
64 | * @param testFun thunk to run with SQLContext as an argument
65 | */
66 | def withSparkSqlContext(testFun: SQLContext => Any): Unit = {
67 | withSparkSession(session => testFun(session.sqlContext))
68 | }
69 |
70 | /**
71 | * Sets up a local sparkSession
72 | *
73 | * @return sparkSession to be used
74 | */
75 | private def setupSparkSession = {
76 | val session = SparkSession.builder()
77 | .master("local")
78 | .appName("test")
79 | .config("spark.ui.enabled", "false")
80 | .config("spark.sql.shuffle.partitions", 2.toString)
81 | .getOrCreate()
82 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir"))
83 | session
84 | }
85 |
86 | /**
87 | * Tears down the sparkSession
88 | *
89 | * @param session Session to be stopped
90 | * @return
91 | */
92 | private def tearDownSparkSession(session: SparkSession) = {
93 | session.stop()
94 | System.clearProperty("spark.driver.port")
95 | }
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
21 |
22 | private[examples] object ConstraintSuggestionExample extends App {
23 |
24 | withSpark { session =>
25 |
26 | // Let's first generate some example data
27 | val rows = session.sparkContext.parallelize(Seq(
28 | RawData("thingA", "13.0", "IN_TRANSIT", "true"),
29 | RawData("thingA", "5", "DELAYED", "false"),
30 | RawData("thingB", null, "DELAYED", null),
31 | RawData("thingC", null, "IN_TRANSIT", "false"),
32 | RawData("thingD", "1.0", "DELAYED", "true"),
33 | RawData("thingC", "7.0", "UNKNOWN", null),
34 | RawData("thingC", "24", "UNKNOWN", null),
35 | RawData("thingE", "20", "DELAYED", "false"),
36 | RawData("thingA", "13.0", "IN_TRANSIT", "true"),
37 | RawData("thingA", "5", "DELAYED", "false"),
38 | RawData("thingB", null, "DELAYED", null),
39 | RawData("thingC", null, "IN_TRANSIT", "false"),
40 | RawData("thingD", "1.0", "DELAYED", "true"),
41 | RawData("thingC", "17.0", "UNKNOWN", null),
42 | RawData("thingC", "22", "UNKNOWN", null),
43 | RawData("thingE", "23", "DELAYED", "false")
44 | ))
45 |
46 | val data = session.createDataFrame(rows)
47 |
48 | // We ask deequ to compute constraint suggestions for us on the data
49 | // It will profile the data and then apply a set of rules specified in addConstraintRules()
50 | // to suggest constraints
51 | val suggestionResult = ConstraintSuggestionRunner()
52 | .onData(data)
53 | .addConstraintRules(Rules.DEFAULT)
54 | .run()
55 |
56 | // We can now investigate the constraints that deequ suggested. We get a textual description
57 | // and the corresponding scala code for each suggested constraint
58 | //
59 | // Note that the constraint suggestion is based on heuristic rules and assumes that the data it
60 | // is shown is 'static' and correct, which might often not be the case in the real world.
61 | // Therefore, the suggestions should always be manually reviewed before being applied in real
62 | // deployments.
63 | suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
64 | suggestions.foreach { suggestion =>
65 | println(s"Constraint suggestion for '$column':\t${suggestion.description}\n" +
66 | s"The corresponding scala code is ${suggestion.codeForConstraint}\n")
67 | }
68 | }
69 |
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 |
20 | import com.amazon.deequ.analyzers.KLLSketch
21 | import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, StatefulApproxQuantile, StatefulHyperloglogPlus}
22 | import org.apache.spark.sql.catalyst.expressions.Literal
23 |
24 | /* Custom aggregation functions used internally by deequ */
25 | object DeequFunctions {
26 |
27 | private[this] def withAggregateFunction(
28 | func: AggregateFunction,
29 | isDistinct: Boolean = false): Column = {
30 |
31 | Column(func.toAggregateExpression(isDistinct))
32 | }
33 |
34 | /** Pearson correlation with state */
35 | def stateful_corr(columnA: String, columnB: String): Column = {
36 | stateful_corr(Column(columnA), Column(columnB))
37 | }
38 |
39 | /** Pearson correlation with state */
40 | def stateful_corr(columnA: Column, columnB: Column): Column = withAggregateFunction {
41 | new StatefulCorrelation(columnA.expr, columnB.expr)
42 | }
43 |
44 | /** Standard deviation with state */
45 | def stateful_stddev_pop(column: String): Column = {
46 | stateful_stddev_pop(Column(column))
47 | }
48 |
49 | /** Standard deviation with state */
50 | def stateful_stddev_pop(column: Column): Column = withAggregateFunction {
51 | StatefulStdDevPop(column.expr)
52 | }
53 |
54 | /** Approximate number of distinct values with state via HLL's */
55 | def stateful_approx_count_distinct(column: String): Column = {
56 | stateful_approx_count_distinct(Column(column))
57 | }
58 |
59 | /** Approximate number of distinct values with state via HLL's */
60 | def stateful_approx_count_distinct(column: Column): Column = withAggregateFunction {
61 | StatefulHyperloglogPlus(column.expr)
62 | }
63 |
64 | def stateful_approx_quantile(
65 | column: Column,
66 | relativeError: Double)
67 | : Column = withAggregateFunction {
68 |
69 | StatefulApproxQuantile(
70 | column.expr,
71 | // val relativeError = 1.0D / accuracy inside StatefulApproxQuantile
72 | Literal(1.0 / relativeError),
73 | mutableAggBufferOffset = 0,
74 | inputAggBufferOffset = 0
75 | )
76 | }
77 |
78 | /** Data type detection with state */
79 | def stateful_datatype(column: Column): Column = {
80 | val statefulDataType = new StatefulDataType()
81 | statefulDataType(column)
82 | }
83 |
84 | def stateful_kll(
85 | column: Column,
86 | sketchSize: Int,
87 | shrinkingFactor: Double): Column = {
88 | val statefulKLL = new StatefulKLLSketch(sketchSize, shrinkingFactor)
89 | statefulKLL(column)
90 | }
91 | }
92 |
93 |
94 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/rules/CategoricalRangeRule.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions.rules
18 |
19 | import com.amazon.deequ.analyzers.{DataTypeInstances, Histogram}
20 | import com.amazon.deequ.checks.Check
21 | import com.amazon.deequ.constraints.Constraint.complianceConstraint
22 | import com.amazon.deequ.profiles.ColumnProfile
23 | import com.amazon.deequ.suggestions.ConstraintSuggestion
24 | import org.apache.commons.lang3.StringEscapeUtils
25 |
26 | /** If we see a categorical range for a column, we suggest an IS IN (...) constraint */
27 | case class CategoricalRangeRule() extends ConstraintRule[ColumnProfile] {
28 |
29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
30 | val hasHistogram = profile.histogram.isDefined && (
31 | profile.dataType == DataTypeInstances.String ||
32 | profile.dataType == DataTypeInstances.Integral
33 | )
34 |
35 | if (hasHistogram) {
36 | val entries = profile.histogram.get.values
37 |
38 | val numUniqueElements = entries.count { case (_, value) => value.absolute == 1L }
39 |
40 | val uniqueValueRatio = numUniqueElements.toDouble / entries.size
41 |
42 | // TODO find a principled way to define this threshold...
43 | uniqueValueRatio <= 0.1
44 | } else {
45 | false
46 | }
47 | }
48 |
49 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
50 |
51 | val valuesByPopularity = profile.histogram.get.values.toArray
52 | .filterNot { case (key, _) => key == Histogram.NullFieldReplacement }
53 | .sortBy { case (_, value) => value.absolute }
54 | .reverse
55 |
56 | val categoriesSql = valuesByPopularity
57 | // the character "'" can be contained in category names
58 | .map { case (key, _) => key.replace("'", "''") }
59 | .mkString("'", "', '", "'")
60 |
61 | val categoriesCode = valuesByPopularity
62 | .map { case (key, _) => StringEscapeUtils.escapeJava(key) }
63 | .mkString(""""""", """", """", """"""")
64 |
65 | val description = s"'${profile.column}' has value range $categoriesSql"
66 | val columnCondition = s"`${profile.column}` IN ($categoriesSql)"
67 | val constraint = complianceConstraint(description, columnCondition, Check.IsOne)
68 |
69 | ConstraintSuggestion(
70 | constraint,
71 | profile.column,
72 | "Compliance: 1",
73 | description,
74 | this,
75 | s""".isContainedIn("${profile.column}", Array($categoriesCode))"""
76 | )
77 | }
78 |
79 | override val ruleDescription: String = "If we see a categorical range for a " +
80 | "column, we suggest an IS IN (...) constraint"
81 | }
82 |
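/* An illustrative escaping example (not part of the original file): a histogram
 * with the values a'b and c yields categoriesSql = 'a''b', 'c' (SQL-escaped) and
 * a suggested constraint like .isContainedIn("someColumn", Array("a'b", "c")).
 */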
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.profiles.{ColumnProfilerRunner, NumericColumnProfile}
21 |
22 | case class RawData(productName: String, totalNumber: String, status: String, valuable: String)
23 |
24 | private[examples] object DataProfilingExample extends App {
25 |
26 | withSpark { session =>
27 |
28 |     /* We profile raw data, mostly in string format (e.g., from a CSV file) */
29 | val rows = session.sparkContext.parallelize(Seq(
30 | RawData("thingA", "13.0", "IN_TRANSIT", "true"),
31 | RawData("thingA", "5", "DELAYED", "false"),
32 | RawData("thingB", null, "DELAYED", null),
33 | RawData("thingC", null, "IN_TRANSIT", "false"),
34 | RawData("thingD", "1.0", "DELAYED", "true"),
35 | RawData("thingC", "7.0", "UNKNOWN", null),
36 | RawData("thingC", "20", "UNKNOWN", null),
37 | RawData("thingE", "20", "DELAYED", "false")
38 | ))
39 |
40 | val rawData = session.createDataFrame(rows)
41 |
42 |     /* Make deequ profile this data. It will execute its three passes over the data and avoid
43 |        any shuffles. */
44 | val result = ColumnProfilerRunner()
45 | .onData(rawData)
46 | .run()
47 |
48 |     /* We get a profile for each column, which allows us to inspect the completeness of the
49 |        column, the approximate number of distinct values and the inferred datatype. */
50 |     result.profiles.foreach { case (columnName, profile) =>
51 | 
52 |       println(s"Column '$columnName':\n " +
53 | s"\tcompleteness: ${profile.completeness}\n" +
54 | s"\tapproximate number of distinct values: ${profile.approximateNumDistinctValues}\n" +
55 | s"\tdatatype: ${profile.dataType}\n")
56 | }
57 |
58 | /* For numeric columns, we get descriptive statistics */
59 | val totalNumberProfile = result.profiles("totalNumber").asInstanceOf[NumericColumnProfile]
60 |
61 | println(s"Statistics of 'totalNumber':\n" +
62 | s"\tminimum: ${totalNumberProfile.minimum.get}\n" +
63 | s"\tmaximum: ${totalNumberProfile.maximum.get}\n" +
64 | s"\tmean: ${totalNumberProfile.mean.get}\n" +
65 | s"\tstandard deviation: ${totalNumberProfile.stdDev.get}\n")
66 |
67 | val statusProfile = result.profiles("status")
68 |
69 | /* For columns with a low number of distinct values, we get the full value distribution. */
70 | println("Value distribution in 'stats':")
71 | statusProfile.histogram.foreach {
72 | _.values.foreach { case (key, entry) =>
73 | println(s"\t$key occurred ${entry.absolute} times (ratio is ${entry.ratio})")
74 | }
75 | }
76 |
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Distance.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | object Distance {
20 |
21 | /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
22 | def numericalDistance(
23 | sample1: QuantileNonSample[Double],
24 | sample2: QuantileNonSample[Double],
25 | correctForLowNumberOfSamples: Boolean = false)
26 | : Double = {
27 | val rankMap1 = sample1.getRankMap()
28 | val rankMap2 = sample2.getRankMap()
29 | val combinedKeys = rankMap1.keySet.union(rankMap2.keySet)
30 | val n = rankMap1.valuesIterator.max.toDouble
31 | val m = rankMap2.valuesIterator.max.toDouble
32 | var linfSimple = 0.0
33 |
34 | combinedKeys.foreach { key =>
35 | val cdf1 = sample1.getRank(key, rankMap1) / n
36 | val cdf2 = sample2.getRank(key, rankMap2) / m
37 | val cdfDiff = Math.abs(cdf1 - cdf2)
38 | linfSimple = Math.max(linfSimple, cdfDiff)
39 | }
40 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples)
41 | }
42 |
43 | /** Calculate distance of categorical profiles based on L-Infinity Distance */
44 | def categoricalDistance(
45 | sample1: scala.collection.mutable.Map[String, Long],
46 | sample2: scala.collection.mutable.Map[String, Long],
47 | correctForLowNumberOfSamples: Boolean = false)
48 | : Double = {
49 |
50 | var n = 0.0
51 | var m = 0.0
52 | sample1.keySet.foreach { key =>
53 | n += sample1(key)
54 | }
55 | sample2.keySet.foreach { key =>
56 | m += sample2(key)
57 | }
58 | val combinedKeys = sample1.keySet.union(sample2.keySet)
59 | var linfSimple = 0.0
60 |
61 | combinedKeys.foreach { key =>
62 | val cdf1 = sample1.getOrElse(key, 0L) / n
63 | val cdf2 = sample2.getOrElse(key, 0L) / m
64 | val cdfDiff = Math.abs(cdf1 - cdf2)
65 | linfSimple = Math.max(linfSimple, cdfDiff)
66 | }
67 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples)
68 | }
69 |
70 |   /** Select which metric to return: linf_simple if correctForLowNumberOfSamples is set,
71 |    *  otherwise the robust variant linf_robust */
72 | private[this] def selectMetrics(
73 | linfSimple: Double,
74 | n: Double,
75 | m: Double,
76 | correctForLowNumberOfSamples: Boolean = false)
77 | : Double = {
78 | if (correctForLowNumberOfSamples) {
79 | linfSimple
80 | } else {
81 |       // This formula is based on the "Two-sample Kolmogorov–Smirnov test"
82 | // Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
83 | val linfRobust = Math.max(0.0, linfSimple - 1.8 * Math.sqrt((n + m) / (n * m)))
84 | linfRobust
85 | }
86 | }
87 | }
88 |
89 |
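To make the correction concrete, here is a worked sketch using the same sample sizes as the categorical tests below (n = 70, m = 76): the Kolmogorov–Smirnov term dominates a small observed distance, so linf_robust collapses to zero.

```scala
val n = 70.0
val m = 76.0
val linfSimple = 0.06

// 1.8 * sqrt((70 + 76) / (70 * 76)) ≈ 0.298, which exceeds 0.06
val linfRobust = math.max(0.0, linfSimple - 1.8 * math.sqrt((n + m) / (n * m)))
println(linfRobust) // 0.0 -- too few samples to call this difference significant
```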
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.KLL
18 |
19 | import com.amazon.deequ.SparkContextSpec
20 | import com.amazon.deequ.analyzers.{Distance, QuantileNonSample}
21 | import com.amazon.deequ.utils.FixtureSupport
22 | import org.scalatest.{Matchers, WordSpec}
23 |
24 | class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec
25 |   with FixtureSupport {
26 |
27 | "KLL distance calculator should compute correct linf_simple" in {
28 |     val sample1 = new QuantileNonSample[Double](4, 0.64)
29 |     val sample2 = new QuantileNonSample[Double](4, 0.64)
30 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4)))
31 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5)))
32 | assert(Distance.numericalDistance(sample1, sample2, true) == 0.25)
33 | }
34 |
35 | "KLL distance calculator should compute correct linf_robust" in {
36 |     val sample1 = new QuantileNonSample[Double](4, 0.64)
37 |     val sample2 = new QuantileNonSample[Double](4, 0.64)
38 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4)))
39 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5)))
40 | assert(Distance.numericalDistance(sample1, sample2) == 0.0)
41 | }
42 |
43 | "Categorial distance should compute correct linf_simple" in {
44 | val sample1 = scala.collection.mutable.Map(
45 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L)
46 | val sample2 = scala.collection.mutable.Map(
47 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L)
48 | assert(Distance.categoricalDistance(sample1,
49 | sample2, true) == 0.06015037593984962)
50 | }
51 |
52 | "Categorial distance should compute correct linf_robust" in {
53 | val sample1 = scala.collection.mutable.Map(
54 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L)
55 | val sample2 = scala.collection.mutable.Map(
56 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L)
57 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0)
58 | }
59 |
60 | "Categorial distance should compute correct linf_simple with different bin value" in {
61 | val sample1 = scala.collection.mutable.Map(
62 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L)
63 | val sample2 = scala.collection.mutable.Map(
64 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L)
65 | assert(Distance.categoricalDistance(sample1,
66 | sample2, true) == 0.2857142857142857)
67 | }
68 |
69 | "Categorial distance should compute correct linf_robust with different bin value" in {
70 | val sample1 = scala.collection.mutable.Map(
71 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L)
72 | val sample2 = scala.collection.mutable.Map(
73 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L)
74 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulDataType.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 | import com.amazon.deequ.analyzers.DataTypeHistogram
20 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
21 | import org.apache.spark.sql.types._
22 |
23 | import scala.util.matching.Regex
24 |
25 |
26 | private[sql] class StatefulDataType extends UserDefinedAggregateFunction {
27 |
28 | val SIZE_IN_BYTES = 40
29 |
30 | val NULL_POS = 0
31 | val FRACTIONAL_POS = 1
32 | val INTEGRAL_POS = 2
33 | val BOOLEAN_POS = 3
34 | val STRING_POS = 4
35 |
36 | val FRACTIONAL: Regex = """^(-|\+)? ?\d+((\.\d+)|((?:\.\d+)?[Ee][-+]?\d+))$""".r
37 | val INTEGRAL: Regex = """^(-|\+)? ?\d+$""".r
38 | val BOOLEAN: Regex = """^(true|false)$""".r
39 |
40 | override def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil)
41 |
42 | override def bufferSchema: StructType = StructType(StructField("null", LongType) ::
43 | StructField("fractional", LongType) :: StructField("integral", LongType) ::
44 | StructField("boolean", LongType) :: StructField("string", LongType) :: Nil)
45 |
46 | override def dataType: types.DataType = BinaryType
47 |
48 | override def deterministic: Boolean = true
49 |
50 | override def initialize(buffer: MutableAggregationBuffer): Unit = {
51 | buffer(NULL_POS) = 0L
52 | buffer(FRACTIONAL_POS) = 0L
53 | buffer(INTEGRAL_POS) = 0L
54 | buffer(BOOLEAN_POS) = 0L
55 | buffer(STRING_POS) = 0L
56 | }
57 |
58 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
59 | if (input.isNullAt(0)) {
60 | buffer(NULL_POS) = buffer.getLong(NULL_POS) + 1L
61 | } else {
62 | input.getString(0) match {
63 | case FRACTIONAL(_*) => buffer(FRACTIONAL_POS) = buffer.getLong(FRACTIONAL_POS) + 1L
64 | case INTEGRAL(_*) => buffer(INTEGRAL_POS) = buffer.getLong(INTEGRAL_POS) + 1L
65 | case BOOLEAN(_*) => buffer(BOOLEAN_POS) = buffer.getLong(BOOLEAN_POS) + 1L
66 | case _ => buffer(STRING_POS) = buffer.getLong(STRING_POS) + 1L
67 | }
68 | }
69 | }
70 |
71 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
72 | buffer1(NULL_POS) = buffer1.getLong(NULL_POS) + buffer2.getLong(NULL_POS)
73 | buffer1(FRACTIONAL_POS) = buffer1.getLong(FRACTIONAL_POS) + buffer2.getLong(FRACTIONAL_POS)
74 | buffer1(INTEGRAL_POS) = buffer1.getLong(INTEGRAL_POS) + buffer2.getLong(INTEGRAL_POS)
75 | buffer1(BOOLEAN_POS) = buffer1.getLong(BOOLEAN_POS) + buffer2.getLong(BOOLEAN_POS)
76 | buffer1(STRING_POS) = buffer1.getLong(STRING_POS) + buffer2.getLong(STRING_POS)
77 | }
78 |
79 | override def evaluate(buffer: Row): Any = {
80 | DataTypeHistogram.toBytes(buffer.getLong(NULL_POS), buffer.getLong(FRACTIONAL_POS),
81 | buffer.getLong(INTEGRAL_POS), buffer.getLong(BOOLEAN_POS), buffer.getLong(STRING_POS))
82 | }
83 | }
84 |
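A minimal usage sketch, assuming a SparkSession is in scope; because the class is private[sql], calling code has to live in the org.apache.spark.sql package. Decoding the binary result via `DataTypeHistogram.fromBytes` is an assumption based on the `toBytes` call above.

```scala
package org.apache.spark.sql

import com.amazon.deequ.analyzers.DataTypeHistogram
import org.apache.spark.sql.functions.col

object StatefulDataTypeSketch {
  def inferTypes(session: SparkSession): Unit = {
    import session.implicits._

    val df = Seq("1", "2.5", "true", "hello", null).toDF("value")

    // Aggregate the column into the packed counter histogram
    val bytes = df.agg(new StatefulDataType()(col("value")))
      .head().getAs[Array[Byte]](0)

    // Unpack the null/fractional/integral/boolean/string counts
    println(DataTypeHistogram.fromBytes(bytes))
  }
}
```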
--------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/analyzers/UniquenessTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.SparkContextSpec
20 | import com.amazon.deequ.analyzers.runners.AnalysisRunner
21 | import com.amazon.deequ.metrics.DoubleMetric
22 | import com.amazon.deequ.utils.FixtureSupport
23 | import org.apache.spark.sql.{DataFrame, SparkSession}
24 | import org.scalatest.matchers.should.Matchers
25 | import org.scalatest.wordspec.AnyWordSpec
26 |
27 | class UniquenessTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport {
28 |
29 | def uniquenessSampleData(sparkSession: SparkSession): DataFrame = {
30 | import sparkSession.implicits._
31 |
32 | // Example from https://github.com/awslabs/deequ/issues/178
33 | Seq(
34 | ("India", "Xavier House, 2nd Floor", "St. Peter Colony, Perry Road", "Bandra (West)"),
35 | ("India", "503 Godavari", "Sir Pochkhanwala Road", "Worli"),
36 | ("India", "4/4 Seema Society", "N Dutta Road, Four Bungalows", "Andheri"),
37 | ("India", "1001D Abhishek Apartments", "Juhu Versova Road", "Andheri"),
38 | ("India", "95, Hill Road", null, null),
39 | ("India", "90 Cuffe Parade", "Taj President Hotel", "Cuffe Parade"),
40 | ("India", "4, Seven PM", "Sir Pochkhanwala Rd", "Worli"),
41 | ("India", "1453 Sahar Road", null, null)
42 | )
43 | .toDF("Country", "Address Line 1", "Address Line 2", "Address Line 3")
44 | }
45 |
46 | "Uniqueness" should {
47 |
48 | "be correct for multiple fields" in withSparkSession { session =>
49 |
50 | val data = uniquenessSampleData(session)
51 |
52 | val stateStore = InMemoryStateProvider()
53 |
54 | val uniquenessA1 = Uniqueness("Address Line 1")
55 | val uniquenessA13 = Uniqueness(Seq("Address Line 1", "Address Line 2", "Address Line 3"))
56 |
57 | val analysis = Analysis(Seq(uniquenessA1, uniquenessA13))
58 |
59 | val result = AnalysisRunner.run(data, analysis, saveStatesWith = Some(stateStore))
60 |
61 | assert(result.metric(uniquenessA1).get.asInstanceOf[DoubleMetric].value.get == 1.0)
62 | assert(result.metric(uniquenessA13).get.asInstanceOf[DoubleMetric].value.get == 1.0)
63 | }
64 | }
65 |
66 | "Filtered Uniqueness" in withSparkSession { sparkSession =>
67 | import sparkSession.implicits._
68 | val df = Seq(
69 | ("1", "unique"),
70 | ("2", "unique"),
71 | ("3", "duplicate"),
72 | ("3", "duplicate"),
73 | ("4", "unique")
74 | ).toDF("value", "type")
75 |
76 | val stateStore = InMemoryStateProvider()
77 |
78 | val uniqueness = Uniqueness("value")
79 | val uniquenessWithFilter = Uniqueness(Seq("value"), Some("type = 'unique'"))
80 |
81 | val analysis = Analysis(Seq(uniqueness, uniquenessWithFilter))
82 |
83 | val result = AnalysisRunner.run(df, analysis, saveStatesWith = Some(stateStore))
84 |
85 | assert(result.metric(uniqueness).get.asInstanceOf[DoubleMetric].value.get == 0.6)
86 | assert(result.metric(uniquenessWithFilter).get.asInstanceOf[DoubleMetric].value.get == 1.0)
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Correlation.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
20 | import com.amazon.deequ.metrics.Entity
21 | import org.apache.spark.sql.DeequFunctions.stateful_corr
22 | import org.apache.spark.sql.{Column, Row}
23 | import org.apache.spark.sql.types.StructType
24 | import Analyzers._
25 |
26 | case class CorrelationState(
27 | n: Double,
28 | xAvg: Double,
29 | yAvg: Double,
30 | ck: Double,
31 | xMk: Double,
32 | yMk: Double)
33 | extends DoubleValuedState[CorrelationState] {
34 |
35 | require(n > 0.0, "Correlation undefined for n = 0.")
36 |
37 | override def sum(other: CorrelationState): CorrelationState = {
38 | val n1 = n
39 | val n2 = other.n
40 | val newN = n1 + n2
41 | val dx = other.xAvg - xAvg
42 | val dxN = if (newN == 0.0) 0.0 else dx / newN
43 | val dy = other.yAvg - yAvg
44 | val dyN = if (newN == 0.0) 0.0 else dy / newN
45 | val newXAvg = xAvg + dxN * n2
46 | val newYAvg = yAvg + dyN * n2
47 | val newCk = ck + other.ck + dx * dyN * n1 * n2
48 | val newXMk = xMk + other.xMk + dx * dxN * n1 * n2
49 | val newYMk = yMk + other.yMk + dy * dyN * n1 * n2
50 |
51 | CorrelationState(newN, newXAvg, newYAvg, newCk, newXMk, newYMk)
52 | }
53 |
54 | override def metricValue(): Double = {
55 | ck / math.sqrt(xMk * yMk)
56 | }
57 | }
58 |
59 | /**
60 |  * Computes the Pearson correlation coefficient between the two given columns.
61 | *
62 | * @param firstColumn First input column for computation
63 | * @param secondColumn Second input column for computation
64 | */
65 | case class Correlation(
66 | firstColumn: String,
67 | secondColumn: String,
68 | where: Option[String] = None)
69 | extends StandardScanShareableAnalyzer[CorrelationState]("Correlation",
70 | s"$firstColumn,$secondColumn", Entity.Mutlicolumn)
71 | with FilterableAnalyzer {
72 |
73 | override def aggregationFunctions(): Seq[Column] = {
74 |
75 | val firstSelection = conditionalSelection(firstColumn, where)
76 | val secondSelection = conditionalSelection(secondColumn, where)
77 |
78 | stateful_corr(firstSelection, secondSelection) :: Nil
79 | }
80 |
81 | override def fromAggregationResult(result: Row, offset: Int): Option[CorrelationState] = {
82 |
83 | if (result.isNullAt(offset)) {
84 | None
85 | } else {
86 | val row = result.getAs[Row](offset)
87 | val n = row.getDouble(0)
88 | if (n > 0.0) {
89 | Some(CorrelationState(
90 | n,
91 | row.getDouble(1),
92 | row.getDouble(2),
93 | row.getDouble(3),
94 | row.getDouble(4),
95 | row.getDouble(5)))
96 | } else {
97 | None
98 | }
99 | }
100 | }
101 |
102 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
103 | hasColumn(firstColumn) :: isNumeric(firstColumn) :: hasColumn(secondColumn) ::
104 | isNumeric(secondColumn) :: Nil
105 | }
106 |
107 | override def filterCondition: Option[String] = where
108 | }
109 |
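A worked sketch of the merge logic with hand-picked numbers: one partial state for the points (1, 2) and (2, 4), another for the single point (3, 6). Since y = 2x throughout, the combined state must yield a correlation of exactly 1.

```scala
import com.amazon.deequ.analyzers.CorrelationState

// Partial aggregates: n, the running means, the co-moment (ck) and the two second moments.
// For (1,2) and (2,4): xAvg = 1.5, yAvg = 3, ck = 1.0, xMk = 0.5, yMk = 2.0
val left = CorrelationState(n = 2.0, xAvg = 1.5, yAvg = 3.0, ck = 1.0, xMk = 0.5, yMk = 2.0)
// A single point contributes zero moments
val right = CorrelationState(n = 1.0, xAvg = 3.0, yAvg = 6.0, ck = 0.0, xMk = 0.0, yMk = 0.0)

val combined = left.sum(right)
println(combined.metricValue()) // 1.0 -- ck = 4, xMk = 2, yMk = 8, and 4 / sqrt(16) = 1
```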
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/MetricsRepositoryExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import java.io.File
20 |
21 | import com.amazon.deequ.VerificationSuite
22 | import com.amazon.deequ.analyzers.Completeness
23 | import com.amazon.deequ.checks.{Check, CheckLevel}
24 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark}
25 | import com.amazon.deequ.repository.fs.FileSystemMetricsRepository
26 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey}
27 | import com.google.common.io.Files
28 |
29 | object MetricsRepositoryExample extends App {
30 |
31 | withSpark { session =>
32 |
33 | // The toy data on which we will compute metrics
34 | val data = itemsAsDataframe(session,
35 | Item(1, "Thingy A", "awesome thing.", "high", 0),
36 | Item(2, "Thingy B", "available at http://thingb.com", null, 0),
37 | Item(3, null, null, "low", 5),
38 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
39 | Item(5, "Thingy E", null, "high", 12))
40 |
41 | // A json file in which the computed metrics will be stored
42 | val metricsFile = new File(Files.createTempDir(), "metrics.json")
43 |
44 |     // The repository which we will use to store and load computed metrics; we use the local disk,
45 |     // but it also supports HDFS and S3
46 | val repository: MetricsRepository =
47 | FileSystemMetricsRepository(session, metricsFile.getAbsolutePath)
48 |
49 |     // The key under which we store the results; it needs a timestamp and supports arbitrary
50 |     // tags in the form of key-value pairs
51 | val resultKey = ResultKey(System.currentTimeMillis(), Map("tag" -> "repositoryExample"))
52 |
53 | VerificationSuite()
54 | .onData(data)
55 | // Some integrity checks
56 | .addCheck(Check(CheckLevel.Error, "integrity checks")
57 | .hasSize(_ == 5)
58 | .isComplete("id")
59 | .isComplete("productName")
60 | .isContainedIn("priority", Array("high", "low"))
61 | .isNonNegative("numViews"))
62 | // We want to store the computed metrics for the checks in our repository
63 | .useRepository(repository)
64 | .saveOrAppendResult(resultKey)
65 | .run()
66 |
67 | // We can now retrieve the metrics from the repository in different ways, e.g.:
68 |
69 |
70 | // We can load the metric for a particular analyzer stored under our result key:
71 | val completenessOfProductName = repository
72 | .loadByKey(resultKey).get
73 | .metric(Completeness("productName")).get
74 |
75 | println(s"The completeness of the productName column is: $completenessOfProductName")
76 |
77 | // We can query the repository for all metrics from the last 10 minutes and get them as json
78 | val json = repository.load()
79 |       .after(System.currentTimeMillis() - 10 * 60 * 1000)
80 | .getSuccessMetricsAsJson()
81 |
82 | println(s"Metrics from the last 10 minutes:\n$json")
83 |
84 | // Finally we can also query by tag value and retrieve the result in the form of a dataframe
85 | repository.load()
86 | .withTagValues(Map("tag" -> "repositoryExample"))
87 | .getSuccessMetricsAsDataFrame(session)
88 | .show()
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/AnomalyDetectionExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.examples
18 |
19 | import com.amazon.deequ.VerificationSuite
20 | import com.amazon.deequ.analyzers.Size
21 | import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy
22 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark}
23 | import com.amazon.deequ.repository.ResultKey
24 | import com.amazon.deequ.repository.memory.InMemoryMetricsRepository
25 | import com.amazon.deequ.checks.CheckStatus._
26 |
27 | private[examples] object AnomalyDetectionExample extends App {
28 |
29 | withSpark { session =>
30 |
31 | /* In this simple example, we assume that we compute metrics on a dataset every day and we want
32 |     to ensure that they don't change drastically. For the sake of simplicity, we just look at the
33 | size of the data */
34 |
35 |     /* Anomaly detection operates on metrics stored in a metric repository, so let's create one */
36 | val metricsRepository = new InMemoryMetricsRepository()
37 |
38 | /* This is the key which we use to store the metrics for the dataset from yesterday */
39 |     val yesterdaysKey = ResultKey(System.currentTimeMillis() - 24 * 60 * 60 * 1000)
40 |
41 | /* Yesterday, the data had only two rows */
42 | val yesterdaysDataset = itemsAsDataframe(session,
43 | Item(1, "Thingy A", "awesome thing.", "high", 0),
44 | Item(2, "Thingy B", "available at http://thingb.com", null, 0))
45 |
46 |     /* We test for anomalies in the size of the data; it should not increase by more than 2x. Note
47 | that we store the resulting metrics in our repository */
48 | VerificationSuite()
49 | .onData(yesterdaysDataset)
50 | .useRepository(metricsRepository)
51 | .saveOrAppendResult(yesterdaysKey)
52 | .addAnomalyCheck(
53 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
54 | Size()
55 | )
56 | .run()
57 |
58 |     /* Today's data has five rows, so the data size more than doubled and our anomaly check should
59 | catch this */
60 | val todaysDataset = itemsAsDataframe(session,
61 | Item(1, "Thingy A", "awesome thing.", "high", 0),
62 | Item(2, "Thingy B", "available at http://thingb.com", null, 0),
63 | Item(3, null, null, "low", 5),
64 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
65 | Item(5, "Thingy E", null, "high", 12))
66 |
67 | /* The key for today's result */
68 | val todaysKey = ResultKey(System.currentTimeMillis())
69 |
70 | /* Repeat the anomaly check for today's data */
71 | val verificationResult = VerificationSuite()
72 | .onData(todaysDataset)
73 | .useRepository(metricsRepository)
74 | .saveOrAppendResult(todaysKey)
75 | .addAnomalyCheck(
76 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
77 | Size()
78 | )
79 | .run()
80 |
81 | /* Did we find an anomaly? */
82 | if (verificationResult.status != Success) {
83 | println("Anomaly detected in the Size() metric!")
84 |
85 |       /* Let's have a look at the actual metrics. */
86 | metricsRepository
87 | .load()
88 | .forAnalyzers(Seq(Size()))
89 | .getSuccessMetricsAsDataFrame(session)
90 | .show()
91 | }
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers._
20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
21 | import org.apache.spark.sql.{Column, Row}
22 | import org.apache.spark.sql.functions.{col, lit, regexp_extract, sum, when}
23 | import org.apache.spark.sql.types.{IntegerType, StructType}
24 |
25 | import scala.util.matching.Regex
26 |
27 | /**
28 |  * PatternMatch is a measure of the fraction of rows that comply with a given
29 |  * column regex constraint. E.g., if the constraint is Patterns.CREDITCARD and the
30 |  * data frame has 5 rows which contain a credit card number in a certain column
31 |  * according to the regex and 10 rows that do not, a DoubleMetric would be
32 |  * returned with a value of 0.33
33 | *
34 | * @param column Column to do the pattern match analysis on
35 | * @param pattern The regular expression to check for
36 | * @param where Additional filter to apply before the analyzer is run.
37 | */
38 | case class PatternMatch(column: String, pattern: Regex, where: Option[String] = None)
39 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("PatternMatch", column)
40 | with FilterableAnalyzer {
41 |
42 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = {
43 | ifNoNullsIn(result, offset, howMany = 2) { _ =>
44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1))
45 | }
46 | }
47 |
48 | override def aggregationFunctions(): Seq[Column] = {
49 |
50 | val expression = when(regexp_extract(col(column), pattern.toString(), 0) =!= lit(""), 1)
51 | .otherwise(0)
52 |
53 | val summation = sum(conditionalSelection(expression, where).cast(IntegerType))
54 |
55 | summation :: conditionalCount(where) :: Nil
56 | }
57 |
58 | override def filterCondition: Option[String] = where
59 |
60 | override protected def additionalPreconditions(): Seq[StructType => Unit] = {
61 | hasColumn(column) :: isString(column) :: Nil
62 | }
63 | }
64 |
65 | object Patterns {
66 |
67 | // scalastyle:off
68 | // http://emailregex.com
69 | val EMAIL: Regex = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""".r
70 |
71 | // https://mathiasbynens.be/demo/url-regex stephenhay
72 | val URL: Regex = """(https?|ftp)://[^\s/$.?#].[^\s]*""".r
73 |
74 | val SOCIAL_SECURITY_NUMBER_US: Regex = """((?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}-(?!00)\d{2}-(?!0{4})\d{4})|((?!219 09 9999|078 05 1120)(?!666|000|9\d{2})\d{3} (?!00)\d{2} (?!0{4})\d{4})|((?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4})""".r
75 |
76 | // Visa, MasterCard, AMEX, Diners Club
77 | // http://www.richardsramblings.com/regex/credit-card-numbers/
78 | val CREDITCARD: Regex = """\b(?:3[47]\d{2}([\ \-]?)\d{6}\1\d|(?:(?:4\d|5[1-5]|65)\d{2}|6011)([\ \-]?)\d{4}\2\d{4}\2)\d{4}\b""".r
79 | // scalastyle:on
80 | }
81 |
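A minimal sketch of running this analyzer directly, mirroring the AnalysisRunner usage in the tests above (the SparkSession `session` is assumed to be in scope):

```scala
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.analyzers.{Analysis, PatternMatch, Patterns}
import session.implicits._

val df = Seq("jane@example.com", "not-an-email", "joe@example.org")
  .toDF("email")

val emailMatch = PatternMatch("email", Patterns.EMAIL)
val result = AnalysisRunner.run(df, Analysis(Seq(emailMatch)))

// Two of the three rows match, so the metric value should be ~0.667
println(result.metric(emailMatch))
```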
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulKLLSketch.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 | import java.nio.ByteBuffer
20 |
21 | import com.amazon.deequ.analyzers.QuantileNonSample
22 | import com.amazon.deequ.analyzers.catalyst.KLLSketchSerializer
23 | import com.google.common.primitives.Doubles
24 |
25 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
26 | import org.apache.spark.sql.types._
27 |
28 |
29 | private[sql] class StatefulKLLSketch(
30 |     sketchSize: Int,
31 |     shrinkingFactor: Double)
32 |   extends UserDefinedAggregateFunction {
33 |
34 | val OBJECT_POS = 0
35 | val MIN_POS = 1
36 | val MAX_POS = 2
37 |
38 | override def inputSchema: StructType = StructType(StructField("value", DoubleType) :: Nil)
39 |
40 | override def bufferSchema: StructType = StructType(StructField("data", BinaryType) ::
41 | StructField("minimum", DoubleType) :: StructField("maximum", DoubleType) :: Nil)
42 |
43 | override def dataType: DataType = BinaryType
44 |
45 | override def deterministic: Boolean = true
46 |
47 | override def initialize(buffer: MutableAggregationBuffer): Unit = {
48 | val qsketch = new QuantileNonSample[Double](sketchSize, shrinkingFactor)
49 | buffer(OBJECT_POS) = serialize(qsketch)
50 | buffer(MIN_POS) = Int.MaxValue.toDouble
51 | buffer(MAX_POS) = Int.MinValue.toDouble
52 | }
53 |
54 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
55 | if (input.isNullAt(OBJECT_POS)) {
56 | return
57 | }
58 |
59 | val tmp = input.getDouble(OBJECT_POS)
60 | val kll = deserialize(buffer.getAs[Array[Byte]](OBJECT_POS))
61 | kll.update(tmp)
62 | buffer(OBJECT_POS) = serialize(kll)
63 | buffer(MIN_POS) = Math.min(buffer.getDouble(MIN_POS), tmp)
64 | buffer(MAX_POS) = Math.max(buffer.getDouble(MAX_POS), tmp)
65 | }
66 |
67 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
68 | if (buffer2.isNullAt(OBJECT_POS)) {
69 | return
70 | }
71 |
72 | val kll_this = deserialize(buffer1.getAs[Array[Byte]](OBJECT_POS))
73 | val kll_other = deserialize(buffer2.getAs[Array[Byte]](OBJECT_POS))
74 | val kll_ret = kll_this.merge(kll_other)
75 | buffer1(OBJECT_POS) = serialize(kll_ret)
76 | buffer1(MIN_POS) = Math.min(buffer1.getDouble(MIN_POS), buffer2.getDouble(MIN_POS))
77 | buffer1(MAX_POS) = Math.max(buffer1.getDouble(MAX_POS), buffer2.getDouble(MAX_POS))
78 | }
79 |
80 | override def evaluate(buffer: Row): Any = {
81 | toBytes(buffer.getDouble(MIN_POS),
82 | buffer.getDouble(MAX_POS),
83 | buffer.getAs[Array[Byte]](OBJECT_POS))
84 | }
85 |
86 | def toBytes(min: Double, max: Double, obj: Array[Byte]): Array[Byte] = {
87 | val buffer2 = ByteBuffer.wrap(new Array(Doubles.BYTES + Doubles.BYTES + obj.length))
88 | buffer2.putDouble(min)
89 | buffer2.putDouble(max)
90 | buffer2.put(obj)
91 | buffer2.array()
92 | }
93 |
94 | def serialize(obj: QuantileNonSample[Double]): Array[Byte] = {
95 | KLLSketchSerializer.serializer.serialize(obj)
96 | }
97 |
98 | def deserialize(bytes: Array[Byte]): QuantileNonSample[Double] = {
99 | KLLSketchSerializer.serializer.deserialize(bytes)
100 | }
101 | }
102 |
103 |
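A minimal decoding sketch for the binary layout produced by `toBytes` above (two doubles followed by the serialized sketch). The helper name is illustrative, not part of this code base:

```scala
import java.nio.ByteBuffer

// Hypothetical inverse of toBytes: (min, max, serialized sketch bytes)
def fromBytes(bytes: Array[Byte]): (Double, Double, Array[Byte]) = {
  val buffer = ByteBuffer.wrap(bytes)
  val min = buffer.getDouble       // first 8 bytes
  val max = buffer.getDouble       // next 8 bytes
  val sketch = new Array[Byte](buffer.remaining())
  buffer.get(sketch)               // the rest is the KLL sketch payload
  (min, max, sketch)
}
```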
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check [existing open](https://github.com/awslabs/deequ/issues), or [recently closed](https://github.com/awslabs/deequ/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Please ensure that your code follows our [code conventions](https://github.com/databricks/scala-style-guide), which we adopted from Apache Spark.
35 | 4. Ensure local tests pass.
36 | 5. Commit to your fork using clear commit messages.
37 | 6. Send us a pull request, answering any default questions in the pull request interface.
38 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
39 |
40 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
41 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
42 |
43 |
44 | ## Finding contributions to work on
45 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/deequ/labels/help%20wanted) issues is a great place to start.
46 |
47 |
48 | ## Understanding the existing codebase
49 | You may find the [documentation on the key concepts](/docs/key-concepts.md) in the codebase helpful.
50 |
51 | ## Code of Conduct
52 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
53 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
54 | opensource-codeofconduct@amazon.com with any additional questions or comments.
55 |
56 |
57 | ## Security issue notifications
58 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
59 |
60 |
61 | ## Licensing
62 |
63 | See the [LICENSE](https://github.com/awslabs/deequ/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
64 |
65 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
66 |
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.anomalydetection
18 |
19 | import breeze.stats.meanAndVariance
20 |
21 |
22 | /**
23 | * Detects anomalies based on the mean and standard deviation of all available values.
24 | * Assumes that the data is normally distributed.
25 | *
26 | * @param lowerDeviationFactor Detect anomalies if they are
27 | * smaller than mean - lowerDeviationFactor * stdDev
28 | * @param upperDeviationFactor Detect anomalies if they are
29 | * bigger than mean + upperDeviationFactor * stdDev
30 | * @param includeInterval Whether or not values inside the detection interval should be
31 |  *                             included in the calculation of the mean/stdDev
32 | */
33 | case class BatchNormalStrategy(
34 | lowerDeviationFactor: Option[Double] = Some(3.0),
35 | upperDeviationFactor: Option[Double] = Some(3.0),
36 | includeInterval: Boolean = false) extends AnomalyDetectionStrategy {
37 |
38 | require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined,
39 | "At least one factor has to be specified.")
40 |
41 | require(lowerDeviationFactor.getOrElse(1.0) >= 0 && upperDeviationFactor.getOrElse(1.0) >= 0,
42 | "Factors cannot be smaller than zero.")
43 |
44 |
45 | /**
46 | * Search for anomalies in a series of data points.
47 | *
48 | * @param dataSeries The data contained in a Vector of Doubles
49 | * @param searchInterval The indices between which anomalies should be detected. [a, b).
50 | * @return The indices of all anomalies in the interval and their corresponding wrapper object.
51 | */
52 | override def detect(
53 | dataSeries: Vector[Double],
54 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = {
55 |
56 | val (searchStart, searchEnd) = searchInterval
57 |
58 | require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.")
59 |
60 |     require(dataSeries.nonEmpty, "Data series is empty. Can't calculate mean/stdDev.")
61 |
62 | val searchIntervalLength = searchEnd - searchStart
63 |
64 | require(includeInterval || searchIntervalLength < dataSeries.length,
65 | "Excluding values in searchInterval from calculation but not enough values remain to " +
66 | "calculate mean and stdDev.")
67 |
68 | val mAV = if (includeInterval) {
69 | meanAndVariance(dataSeries)
70 | } else {
71 | val valuesBeforeInterval = dataSeries.slice(0, searchStart)
72 | val valuesAfterInterval = dataSeries.slice(searchEnd, dataSeries.length)
73 | val dataSeriesWithoutInterval = valuesBeforeInterval ++ valuesAfterInterval
74 |
75 | meanAndVariance(dataSeriesWithoutInterval)
76 | }
77 |
78 | val mean = mAV.mean
79 | val stdDev = mAV.stdDev
80 |
81 | val upperBound = mean + upperDeviationFactor.getOrElse(Double.MaxValue) * stdDev
82 | val lowerBound = mean - lowerDeviationFactor.getOrElse(Double.MaxValue) * stdDev
83 |
84 | dataSeries.zipWithIndex
85 | .slice(searchStart, searchEnd)
86 | .filter { case (value, _) => value > upperBound || value < lowerBound }
87 | .map { case (value, index) =>
88 |
89 | val detail = Some(s"[BatchNormalStrategy]: Value $value is not in " +
90 | s"bounds [$lowerBound, $upperBound].")
91 |
92 | (index, Anomaly(Option(value), 1.0, detail))
93 | }
94 | }
95 | }
96 |
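A small usage sketch with made-up numbers: nine well-behaved points establish the mean and stdDev, and the interval [9, 11) is then scanned for outliers (it is excluded from the estimate because includeInterval defaults to false).

```scala
import com.amazon.deequ.anomalydetection.BatchNormalStrategy

val strategy = BatchNormalStrategy(Some(1.0), Some(1.0))
val series = Vector(1.0, 1.1, 0.9, 1.0, 1.0, 1.1, 0.9, 1.0, 1.0, 5.0, 1.0)

// mean ≈ 1.0, stdDev ≈ 0.07 over indices 0-8, so the bounds are roughly [0.93, 1.07]
val anomalies = strategy.detect(series, (9, 11))
anomalies.foreach { case (index, anomaly) => println(s"$index -> $anomaly") }
// expected: only index 9 (value 5.0) is reported
```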
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/MutualInformation.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.analyzers
18 |
19 | import com.amazon.deequ.analyzers.Analyzers._
20 | import com.amazon.deequ.metrics.{DoubleMetric, Entity}
21 | import org.apache.spark.sql.functions.{col, sum, udf}
22 | import org.apache.spark.sql.types.StructType
23 | import Analyzers.COUNT_COL
24 | import com.amazon.deequ.analyzers.runners.MetricCalculationException
25 |
26 | /**
27 | * Mutual Information describes how much information about one column can be inferred from another
28 | * column.
29 | *
30 | * If two columns are independent of each other, then nothing can be inferred from one column about
31 | * the other, and mutual information is zero. If there is a functional dependency of one column to
32 |  * another and vice versa, then all information of the two columns is shared, and mutual
33 | * information is the entropy of each column.
34 | */
35 | case class MutualInformation(columns: Seq[String], where: Option[String] = None)
36 | extends FrequencyBasedAnalyzer(columns)
37 | with FilterableAnalyzer {
38 |
39 | override def computeMetricFrom(state: Option[FrequenciesAndNumRows]): DoubleMetric = {
40 |
41 | state match {
42 |
43 | case Some(theState) =>
44 | val total = theState.numRows
45 | val Seq(col1, col2) = columns
46 |
47 | val freqCol1 = s"__deequ_f1_$col1"
48 | val freqCol2 = s"__deequ_f2_$col2"
49 |
50 | val jointStats = theState.frequencies
51 |
52 | val marginalStats1 = jointStats
53 | .select(col1, COUNT_COL)
54 | .groupBy(col1)
55 | .agg(sum(COUNT_COL).as(freqCol1))
56 |
57 | val marginalStats2 = jointStats
58 | .select(col2, COUNT_COL)
59 | .groupBy(col2)
60 | .agg(sum(COUNT_COL).as(freqCol2))
61 |
62 |
63 | val miUdf = udf {
64 | (px: Double, py: Double, pxy: Double) =>
65 | (pxy / total) * math.log((pxy / total) / ((px / total) * (py / total)))
66 | }
67 |
68 | val miCol = s"__deequ_mi_${col1}_$col2"
69 | val value = jointStats
70 | .join(marginalStats1, usingColumn = col1)
71 | .join(marginalStats2, usingColumn = col2)
72 | .withColumn(miCol, miUdf(col(freqCol1), col(freqCol2), col(COUNT_COL)))
73 | .agg(sum(miCol))
74 |
75 | val resultRow = value.head()
76 |
77 | if (resultRow.isNullAt(0)) {
78 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
79 | } else {
80 | metricFromValue(resultRow.getDouble(0), "MutualInformation", columns.mkString(","),
81 | Entity.Mutlicolumn)
82 | }
83 |
84 | case None =>
85 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
86 | }
87 | }
88 |
89 |
90 | /** We need at least one grouping column, and all specified columns must exist */
91 | override def preconditions: Seq[StructType => Unit] = {
92 | Preconditions.exactlyNColumns(columns, 2) +: super.preconditions
93 | }
94 |
95 | override def toFailureMetric(exception: Exception): DoubleMetric = {
96 | metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
97 | }
98 |
99 | override def filterCondition: Option[String] = where
100 | }
101 |
102 | object MutualInformation {
103 | def apply(columnA: String, columnB: String): MutualInformation = {
104 | new MutualInformation(columnA :: columnB :: Nil)
105 | }
106 | }
107 |
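A minimal sketch, assuming a SparkSession `session` in scope: two perfectly dependent columns, so the mutual information should equal each column's entropy (ln 2 ≈ 0.693 here).

```scala
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.analyzers.{Analysis, MutualInformation}
import session.implicits._

// "right" is fully determined by "left" and vice versa
val df = Seq(("a", "x"), ("a", "x"), ("b", "y"), ("b", "y")).toDF("left", "right")

val mi = MutualInformation("left", "right")
val result = AnalysisRunner.run(df, Analysis(Seq(mi)))
println(result.metric(mi)) // value ≈ 0.693 = ln 2
```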
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/metrics/KLLMetric.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.metrics
18 |
19 | import com.amazon.deequ.analyzers.QuantileNonSample
20 |
21 | import scala.util.{Failure, Success, Try}
22 | import scala.util.control.Breaks._
23 |
24 | case class BucketValue(lowValue: Double, highValue: Double, count: Long)
25 |
26 | case class BucketDistribution(
27 | buckets: List[BucketValue],
28 | parameters: List[Double],
29 | data: Array[Array[Double]]) {
30 |
31 | def computePercentiles(): Array[Double] = {
32 |
33 | val sketchSize = parameters(0).toInt
34 | val shrinkingFactor = parameters(1)
35 |
36 | val quantileNonSample = new QuantileNonSample[Double](sketchSize, shrinkingFactor)
37 | quantileNonSample.reconstruct(sketchSize, shrinkingFactor, data)
38 |
39 | quantileNonSample.quantiles(100)
40 | }
41 |
42 | /**
43 | * Get relevant bucketValue with index of bucket.
44 | * @param key index of bucket
45 | * @return The metrics for the bucket
46 | */
47 | def apply(key: Int): BucketValue = {
48 | buckets(key)
49 | }
50 |
51 | /**
52 | * Find the index of bucket which contains the most items.
53 | * @return The index of bucket which contains the most items.
54 | */
55 | def argmax: Int = {
56 | var currentMax = 0L
57 | var maxBucket = 0
58 | buckets.foreach { bucket =>
59 | if (bucket.count > currentMax) {
60 | currentMax = bucket.count
61 | maxBucket = buckets.indexOf(bucket)
62 | }
63 | }
64 | maxBucket
65 | }
66 |
67 | /**
68 |    * Check whether two BucketDistributions are equal.
69 | * @param obj object to compare
70 | * @return true if equal
71 | */
72 | override def equals(obj: Any): Boolean = {
73 | obj match {
74 | case that: BucketDistribution =>
75 | var check = that.isInstanceOf[BucketDistribution] &&
76 | this.buckets.equals(that.buckets) &&
77 | this.parameters.equals(that.parameters) &&
78 | this.data.length == that.data.length
79 | breakable {
80 | for (i <- this.data.indices) {
81 | if (!this.data(i).sameElements(that.data(i))) {
82 | check = false
83 | break
84 | }
85 | }
86 | }
87 | check
88 | case _ => false
89 | }
90 | }
91 |
92 |   // TODO not sure if that's correct...
93 | override def hashCode(): Int = super.hashCode()
94 | }
95 |
96 | case class KLLMetric(column: String, value: Try[BucketDistribution])
97 | extends Metric[BucketDistribution] {
98 |
99 | val entity: Entity.Value = Entity.Column
100 | val instance: String = column
101 | val name = "KLL"
102 |
103 | def flatten(): Seq[DoubleMetric] = {
104 | value
105 | .map { distribution =>
106 | val numberOfBuckets = Seq(DoubleMetric(entity, s"$name.buckets", instance,
107 | Success(distribution.buckets.length.toDouble)))
108 |
109 | val details = distribution.buckets
110 | .flatMap { distValue =>
111 | DoubleMetric(entity, s"$name.low", instance, Success(distValue.lowValue)) ::
112 | DoubleMetric(entity, s"$name.high", instance, Success(distValue.highValue)) ::
113 | DoubleMetric(entity, s"$name.count", instance, Success(distValue.count)) :: Nil
114 | }
115 | numberOfBuckets ++ details
116 | }
117 | .recover {
118 | case e: Exception => Seq(DoubleMetric(entity, s"$name.buckets", instance, Failure(e)))
119 | }
120 | .get
121 | }
122 |
123 | }
124 |
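A small sketch with made-up buckets showing the accessor and argmax behaviour (the parameter values stand in for a sketch size and shrinking factor; no real sketch data is needed here):

```scala
import com.amazon.deequ.metrics.{BucketDistribution, BucketValue}

val dist = BucketDistribution(
  buckets = List(BucketValue(0.0, 1.0, count = 3L), BucketValue(1.0, 2.0, count = 7L)),
  parameters = List(2048.0, 0.64), // assumed sketchSize and shrinkingFactor
  data = Array(Array.empty[Double]))

println(dist.argmax) // 1 -- the second bucket holds the most items
println(dist(1))     // BucketValue(1.0, 2.0, 7)
```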
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.profiles
18 |
19 | import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters}
20 | import com.amazon.deequ.io.DfsUtils
21 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey}
22 | import org.apache.spark.annotation.Experimental
23 | import org.apache.spark.sql.{DataFrame, SparkSession}
24 |
25 | private[profiles] case class ColumnProfilerRunBuilderMetricsRepositoryOptions(
26 | metricsRepository: Option[MetricsRepository],
27 | reuseExistingResultsKey: Option[ResultKey],
28 | failIfResultsForReusingMissing: Boolean,
29 | saveOrAppendResultsKey: Option[ResultKey])
30 |
31 | private[profiles] case class ColumnProfilerRunBuilderFileOutputOptions(
32 | session: Option[SparkSession],
33 | saveColumnProfilesJsonToPath: Option[String],
34 | overwriteResults: Boolean)
35 |
36 | @Experimental
37 | class ColumnProfilerRunner {
38 |
39 | def onData(data: DataFrame): ColumnProfilerRunBuilder = {
40 | new ColumnProfilerRunBuilder(data)
41 | }
42 |
43 | private[profiles] def run(
44 | data: DataFrame,
45 | restrictToColumns: Option[Seq[String]],
46 | lowCardinalityHistogramThreshold: Int,
47 | printStatusUpdates: Boolean,
48 | cacheInputs: Boolean,
49 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions,
50 | metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions,
51 | kllProfiling: Boolean,
52 | kllParameters: Option[KLLParameters],
53 | predefinedTypes: Map[String, DataTypeInstances.Value])
54 | : ColumnProfiles = {
55 |
56 | if (cacheInputs) {
57 | data.cache()
58 | }
59 |
60 | val columnProfiles = ColumnProfiler
61 | .profile(
62 | data,
63 | restrictToColumns,
64 | printStatusUpdates,
65 | lowCardinalityHistogramThreshold,
66 | metricsRepositoryOptions.metricsRepository,
67 | metricsRepositoryOptions.reuseExistingResultsKey,
68 | metricsRepositoryOptions.failIfResultsForReusingMissing,
69 | metricsRepositoryOptions.saveOrAppendResultsKey,
70 | kllProfiling,
71 | kllParameters,
72 | predefinedTypes
73 | )
74 |
75 | saveColumnProfilesJsonToFileSystemIfNecessary(
76 | fileOutputOptions,
77 | printStatusUpdates,
78 | columnProfiles
79 | )
80 |
81 | if (cacheInputs) {
82 | data.unpersist()
83 | }
84 |
85 | columnProfiles
86 | }
87 |
88 | private[this] def saveColumnProfilesJsonToFileSystemIfNecessary(
89 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions,
90 | printStatusUpdates: Boolean,
91 | columnProfiles: ColumnProfiles)
92 | : Unit = {
93 |
94 | fileOutputOptions.session.foreach { session =>
95 | fileOutputOptions.saveColumnProfilesJsonToPath.foreach { profilesOutput =>
96 | if (printStatusUpdates) {
97 | println(s"### WRITING COLUMN PROFILES TO $profilesOutput")
98 | }
99 |
100 | DfsUtils.writeToTextFileOnDfs(session, profilesOutput,
101 | overwrite = fileOutputOptions.overwriteResults) { writer =>
102 | writer.append(ColumnProfiles.toJson(columnProfiles.profiles.values.toSeq).toString)
103 | writer.newLine()
104 | }
105 | }
106 | }
107 | }
108 | }
109 |
110 | object ColumnProfilerRunner {
111 |
112 | def apply(): ColumnProfilerRunner = {
113 | new ColumnProfilerRunner()
114 | }
115 | }
116 |
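For orientation, this runner is driven through the fluent builder returned by onData. A minimal usage sketch, assuming a SparkSession is active and df is a DataFrame already in scope (calling run() with no further builder calls relies on the defaults wired through ColumnProfilerRunBuilder):

import com.amazon.deequ.profiles.ColumnProfilerRunner

// Profile every column of df with default settings.
val result = ColumnProfilerRunner()
  .onData(df)
  .run()

// Each ColumnProfile carries, among other things, completeness and an
// approximate distinct count for its column.
result.profiles.foreach { case (column, profile) =>
  println(s"$column: completeness=${profile.completeness}, " +
    s"approxDistinct=${profile.approximateNumDistinctValues}")
}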
--------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestion.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 | * use this file except in compliance with the License. A copy of the License
6 | * is located at
7 | *
8 | * http://aws.amazon.com/apache2.0/
9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | *
15 | */
16 |
17 | package com.amazon.deequ.suggestions
18 |
19 | import com.amazon.deequ.VerificationResult
20 | import com.amazon.deequ.constraints.Constraint
21 | import com.amazon.deequ.profiles.ColumnProfile
22 | import com.amazon.deequ.suggestions.rules.ConstraintRule
23 | import com.google.gson.{GsonBuilder, JsonArray, JsonObject}
24 |
25 | case class ConstraintSuggestion(
26 | constraint: Constraint,
27 | columnName: String,
28 | currentValue: String,
29 | description: String,
30 | suggestingRule: ConstraintRule[ColumnProfile],
31 | codeForConstraint: String
32 | )
33 |
34 | object ConstraintSuggestions {
35 |
36 | private[this] val CONSTRAINT_SUGGESTIONS_FIELD = "constraint_suggestions"
37 |
38 | private[suggestions] def toJson(constraintSuggestions: Seq[ConstraintSuggestion]): String = {
39 |
40 | val json = new JsonObject()
41 |
42 | val constraintsJson = new JsonArray()
43 |
44 | constraintSuggestions.foreach { constraintSuggestion =>
45 |
46 | val constraintJson = new JsonObject()
47 | addSharedProperties(constraintJson, constraintSuggestion)
48 |
49 | constraintsJson.add(constraintJson)
50 | }
51 |
52 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintsJson)
53 |
54 | val gson = new GsonBuilder()
55 | .setPrettyPrinting()
56 | .create()
57 |
58 | gson.toJson(json)
59 | }
60 |
61 | private[suggestions] def evaluationResultsToJson(
62 | constraintSuggestions: Seq[ConstraintSuggestion],
63 | result: VerificationResult)
64 | : String = {
65 |
66 | val constraintResults = result.checkResults
67 | .map { case (_, checkResult) => checkResult }
68 | .headOption.map { checkResult =>
69 | checkResult.constraintResults
70 | }
71 | .getOrElse(Seq.empty)
72 |
73 | val json = new JsonObject()
74 |
75 | val constraintEvaluations = new JsonArray()
76 |
77 | val constraintResultsOnTestSet = constraintResults.map { checkResult =>
78 | checkResult.status.toString
79 | }
80 |
81 | constraintSuggestions.zipAll(constraintResultsOnTestSet, null, "Unknown")
82 | .foreach { case (constraintSuggestion, constraintResult) =>
83 |
84 | val constraintEvaluation = new JsonObject()
85 | addSharedProperties(constraintEvaluation, constraintSuggestion)
86 |
87 | constraintEvaluation.addProperty("constraint_result_on_test_set",
88 | constraintResult)
89 |
90 | constraintEvaluations.add(constraintEvaluation)
91 | }
92 |
93 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintEvaluations)
94 |
95 | val gson = new GsonBuilder()
96 | .setPrettyPrinting()
97 | .create()
98 |
99 | gson.toJson(json)
100 | }
101 |
102 | private[this] def addSharedProperties(
103 | jsonObject: JsonObject,
104 | constraintSuggestion: ConstraintSuggestion)
105 | : Unit = {
106 |
107 | jsonObject.addProperty("constraint_name", constraintSuggestion.constraint.toString)
108 | jsonObject.addProperty("column_name", constraintSuggestion.columnName)
109 | jsonObject.addProperty("current_value", constraintSuggestion.currentValue)
110 | jsonObject.addProperty("description", constraintSuggestion.description)
111 | jsonObject.addProperty("suggesting_rule", constraintSuggestion.suggestingRule.toString)
112 | jsonObject.addProperty("rule_description", constraintSuggestion.suggestingRule.ruleDescription)
113 | jsonObject.addProperty("code_for_constraint", constraintSuggestion.codeForConstraint)
114 | }
115 | }
116 |
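Both serializers above emit a top-level constraint_suggestions array whose entries carry the fields written by addSharedProperties (constraint_name, column_name, current_value, description, suggesting_rule, rule_description, code_for_constraint), with evaluationResultsToJson adding constraint_result_on_test_set per entry. In the wider Deequ codebase these suggestions are typically obtained through the suggestion runner rather than constructed by hand; a usage sketch along those lines (ConstraintSuggestionRunner and Rules.DEFAULT come from the suggestions package but are not part of this listing, and df is assumed to be a DataFrame in scope):

import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}

// Derive constraint suggestions from column profiles using the default rule set.
val suggestionResult = ConstraintSuggestionRunner()
  .onData(df)
  .addConstraintRules(Rules.DEFAULT)
  .run()

// suggestionResult.constraintSuggestions maps column names to the
// ConstraintSuggestion case class defined above.
suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
  suggestions.foreach { suggestion =>
    println(s"$column: ${suggestion.description}")
    println(s"  code: ${suggestion.codeForConstraint}")
  }
}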
--------------------------------------------------------------------------------