├── NOTICE ├── .gitignore ├── .travis.yml ├── test-data └── README.md ├── .github └── PULL_REQUEST_TEMPLATE.md ├── CODE_OF_CONDUCT.md ├── src ├── test │ ├── resources │ │ ├── log4j.properties │ │ └── EMRSparkShellTest.scala │ └── scala │ │ └── com │ │ └── amazon │ │ └── deequ │ │ ├── KLL │ │ ├── KLLBenchmarkHelper.scala │ │ ├── KLLBenchmark.java │ │ └── KLLDistanceTest.scala │ │ ├── utils │ │ ├── TempFileUtils.scala │ │ ├── CollectionUtils.scala │ │ └── AssertionUtils.scala │ │ ├── constraints │ │ └── ConstraintUtils.scala │ │ ├── examples │ │ └── ExamplesTest.scala │ │ ├── package.scala │ │ ├── checks │ │ ├── ColumnConditionTest.scala │ │ └── FilterableCheckTest.scala │ │ ├── anomalydetection │ │ ├── RateOfChangeStrategyTest.scala │ │ ├── AnomalyDetectionTestUtilsTest.scala │ │ ├── HistoryUtilsTest.scala │ │ ├── AnomalyDetectionTestUtils.scala │ │ └── SimpleThresholdStrategyTest.scala │ │ ├── DatatypeSuggestionTest.scala │ │ ├── SparkBasicTest.scala │ │ ├── analyzers │ │ ├── StatesTest.scala │ │ └── UniquenessTest.scala │ │ ├── metrics │ │ └── MetricsTests.scala │ │ ├── SparkMonitor.scala │ │ └── SparkContextSpec.scala └── main │ └── scala │ └── com │ └── amazon │ └── deequ │ ├── analyzers │ ├── FilterableAnalyzer.scala │ ├── CountDistinct.scala │ ├── catalyst │ │ ├── StatefulStdDevPop.scala │ │ ├── StatefulCorrelation.scala │ │ ├── DeequFunctions.scala │ │ ├── StatefulDataType.scala │ │ └── StatefulKLLSketch.scala │ ├── Distinctness.scala │ ├── Entropy.scala │ ├── Uniqueness.scala │ ├── Size.scala │ ├── MaxLength.scala │ ├── MinLength.scala │ ├── Sum.scala │ ├── Completeness.scala │ ├── UniqueValueRatio.scala │ ├── Maximum.scala │ ├── Minimum.scala │ ├── Mean.scala │ ├── NonSampleCompactor.scala │ ├── Compliance.scala │ ├── ApproxCountDistinct.scala │ ├── Analysis.scala │ ├── StandardDeviation.scala │ ├── runners │ │ └── MetricCalculationException.scala │ ├── Distance.scala │ ├── Correlation.scala │ ├── PatternMatch.scala │ └── MutualInformation.scala │ ├── examples │ ├── entities.scala │ ├── ExampleUtils.scala │ ├── KLLCheckExample.scala │ ├── IncrementalMetricsExample.scala │ ├── BasicExample.scala │ ├── ConstraintSuggestionExample.scala │ ├── DataProfilingExample.scala │ ├── MetricsRepositoryExample.scala │ └── AnomalyDetectionExample.scala │ ├── constraints │ └── ConstrainableDataTypes.scala │ ├── checks │ ├── ColumnCondition.scala │ └── CheckWithLastConstraintFilterable.scala │ ├── anomalydetection │ ├── RateOfChangeStrategy.scala │ ├── AnomalyDetectionStrategy.scala │ ├── AbsoluteChangeStrategy.scala │ ├── HistoryUtils.scala │ ├── DetectionResult.scala │ ├── SimpleThresholdStrategy.scala │ ├── RelativeRateOfChangeStrategy.scala │ └── BatchNormalStrategy.scala │ ├── suggestions │ ├── rules │ │ ├── ConstraintRule.scala │ │ ├── CompleteIfCompleteRule.scala │ │ ├── NonNegativeNumbersRule.scala │ │ ├── UniqueIfApproximatelyUniqueRule.scala │ │ ├── RetainTypeRule.scala │ │ ├── RetainCompletenessRule.scala │ │ └── CategoricalRangeRule.scala │ ├── ConstraintSuggestionResult.scala │ └── ConstraintSuggestion.scala │ ├── repository │ └── MetricsRepository.scala │ ├── metrics │ ├── HistogramMetric.scala │ ├── Metric.scala │ └── KLLMetric.scala │ ├── io │ └── DfsUtils.scala │ └── profiles │ └── ColumnProfilerRunner.scala ├── Makefile ├── settings.xml ├── docs └── key-concepts.md └── CONTRIBUTING.md /NOTICE: -------------------------------------------------------------------------------- 1 | Deequ 2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | **/*.iml 4 | target/.travis/public-signing-key.gpg 5 | target/ 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: oraclejdk8 3 | dist: trusty 4 | 5 | script: make build 6 | -------------------------------------------------------------------------------- /test-data/README.md: -------------------------------------------------------------------------------- 1 | # Dataset used for testing 2 | 3 | * [titanic.csv](https://www.kaggle.com/c/titanic/data) -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Change this to set Spark log level 2 | log4j.logger.org.apache.spark=WARN 3 | 4 | # Silence akka remoting 5 | log4j.logger.Remoting=WARN 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | 10 | # INFO log level not required for tests 11 | log4j.logger.org.apache=WARN 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # mvn profiles for the different supported 2 | # Spark and Scala versions. Uncomment 3 | # the one that you want to use. You can also 4 | # override the profile on the command line: 5 | # `make MVN_PROFILE=spark-2.4-scala-2.11 build` 6 | MVN_PROFILE := spark-3.0-scala-2.12 7 | # MVN_PROFILE := spark-2.4-scala-2.11 8 | # MVN_PROFILE := spark-2.3-scala-2.11 9 | # MVN_PROFILE := spark-2.2-scala-2.11 10 | 11 | # Build the project for specific Spark and 12 | # Scala versions. You can change the profile 13 | # variable to use a different Scala or Spark 14 | # version (see list above). 15 | # If you need more log output, remove the -q flag.
16 | build: 17 | 	mvn clean install -q -P $(MVN_PROFILE) 18 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | <settings> 2 | <servers> 3 | <server> 4 | <id>ossrh</id> 5 | <username>${env.MAVEN_REPO_USERNAME}</username> 6 | <password>${env.MAVEN_REPO_PASSWORD}</password> 7 | </server> 8 | </servers> 9 | <profiles> 10 | <profile> 11 | <id>release</id> 12 | <activation> 13 | <activeByDefault>true</activeByDefault> 14 | </activation> 15 | <properties> 16 | <gpg.executable>gpg</gpg.executable> 17 | <gpg.keyname>72A07B34207DF21F2CD468178D0084713489CE20</gpg.keyname> 18 | <gpg.passphrase>${env.MAVEN_GPG_PASSPHRASE}</gpg.passphrase> 19 | </properties> 20 | </profile> 21 | </profiles> 22 | </settings> 23 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/FilterableAnalyzer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | /** 20 | * Common trait for Analyzers that support dataset filtering 21 | */ 22 | trait FilterableAnalyzer { 23 | def filterCondition: Option[String] 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/entities.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | private[deequ] case class Item( 20 | id: Long, 21 | productName: String, 22 | description: String, 23 | priority: String, 24 | numViews: Long 25 | ) 26 | 27 | private[deequ] case class Manufacturer( 28 | id: Long, 29 | manufacturerName: String, 30 | countryCode: String 31 | ) 32 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/KLL/KLLBenchmarkHelper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
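The FilterableAnalyzer trait above is the hook through which analyzers expose an optional SparkSQL row filter. A minimal sketch of an implementing analyzer (`MyRowCount` is hypothetical, not part of deequ):

```scala
import com.amazon.deequ.analyzers.FilterableAnalyzer

// Hypothetical analyzer that carries an optional `where` predicate and
// surfaces it through the FilterableAnalyzer trait.
case class MyRowCount(where: Option[String] = None) extends FilterableAnalyzer {
  override def filterCondition: Option[String] = where
}

val analyzer = MyRowCount(Some("status = 'active'"))
// Callers can inspect the filter uniformly across analyzer types.
analyzer.filterCondition.foreach(cond => println(s"filtered by: $cond"))
```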
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL 18 | 19 | import com.amazon.deequ.analyzers.{KLLSketch, QuantileNonSample} 20 | 21 | object KLLBenchmarkHelper { 22 | 23 | def floatSketch(): QuantileNonSample[java.lang.Float] = { 24 | new QuantileNonSample[java.lang.Float](KLLSketch.DEFAULT_SKETCH_SIZE, 25 | KLLSketch.DEFAULT_SHRINKING_FACTOR) 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/TempFileUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | import java.nio.file.Files 20 | import java.util.UUID 21 | 22 | object TempFileUtils { 23 | def tempDir(prefix: String = UUID.randomUUID().toString): String = { 24 | val tempDir = Files.createTempDirectory(prefix).toFile 25 | tempDir.deleteOnExit() 26 | tempDir.getAbsolutePath 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/constraints/ConstrainableDataTypes.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.constraints 18 | 19 | object ConstrainableDataTypes extends Enumeration { 20 | val Null: Value = Value(0) 21 | val Fractional: Value = Value(1) 22 | val Integral: Value = Value(2) 23 | val Boolean: Value = Value(3) 24 | val String: Value = Value(4) 25 | val Numeric: Value = Value(5) // Union of integral and fractional 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/CollectionUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. 
This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | object CollectionUtils { 20 | 21 | implicit class SeqExtensions[A](val source: Seq[A]) { 22 | def forEachOrder(f: Seq[A] => Any): Unit = { 23 | source.combinations(source.size) 24 | .flatMap { _.permutations } 25 | .foreach { distinctOrder => f(distinctOrder) } 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/checks/ColumnCondition.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | import org.apache.spark.sql.functions.{col} 20 | 21 | private[checks] object ColumnCondition { 22 | 23 | def isEachNotNull(cols: Seq[String]): String = { 24 | cols 25 | .map(col(_).isNotNull) 26 | .reduce(_ and _) 27 | .toString() 28 | } 29 | 30 | def isAnyNotNull(cols: Seq[String]): String = { 31 | cols 32 | .map(col(_).isNotNull) 33 | .reduce(_ or _) 34 | .toString() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.constraints 18 | 19 | import org.apache.spark.sql.DataFrame 20 | 21 | object ConstraintUtils { 22 | 23 | def calculate(constraint: Constraint, df: DataFrame): ConstraintResult = { 24 | 25 | val analysisBasedConstraint = constraint match { 26 | case nc: ConstraintDecorator => nc.inner 27 | case c: Constraint => c 28 | } 29 | 30 | analysisBasedConstraint.asInstanceOf[AnalysisBasedConstraint[_, _, _]].calculateAndEvaluate(df) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/examples/ExamplesTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. 
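The `forEachOrder` extension above runs a function against every distinct ordering of a sequence, which lets tests catch accidental order-sensitivity. A usage sketch:

```scala
import com.amazon.deequ.utils.CollectionUtils.SeqExtensions

// The block runs once per ordering of the input:
// (1,2,3), (1,3,2), (2,1,3), (2,3,1), (3,1,2), (3,2,1)
Seq(1, 2, 3).forEachOrder { order =>
  assert(order.sum == 6) // the property under test must not depend on order
}
```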
All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import org.scalatest.WordSpec 20 | 21 | class ExamplesTest extends WordSpec { 22 | 23 | "all examples" should { 24 | "run without errors" in { 25 | BasicExample.main(Array.empty) 26 | IncrementalMetricsExample.main(Array.empty) 27 | MetricsRepositoryExample.main(Array.empty) 28 | UpdateMetricsOnPartitionedDataExample.main(Array.empty) 29 | DataProfilingExample.main(Array.empty) 30 | AnomalyDetectionExample.main(Array.empty) 31 | ConstraintSuggestionExample.main(Array.empty) 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon 18 | 19 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 20 | import org.apache.spark.sql.types.{StructField, StructType} 21 | import org.apache.spark.sql.types.{ DataType => SparkDT } 22 | 23 | package object deequ { 24 | def dataFrameWithColumn( 25 | name: String, 26 | columnType: SparkDT, 27 | sparkSession: SparkSession, 28 | values: Row*) 29 | : DataFrame = { 30 | 31 | import scala.collection.JavaConverters._ 32 | val struct = StructType(StructField(name, columnType) :: Nil) 33 | sparkSession.createDataFrame(values.asJava, struct).toDF(name) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
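The `dataFrameWithColumn` helper above builds single-column test DataFrames. A usage sketch; `session` is assumed to be a SparkSession provided by the surrounding test (e.g. via SparkContextSpec):

```scala
import com.amazon.deequ.dataFrameWithColumn
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StringType

// Three-row DataFrame with a single nullable string column "att1".
val df = dataFrameWithColumn("att1", StringType, session,
  Row("foo"), Row(null), Row("bar"))
```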
14 | * 15 | */ 16 | 17 | 18 | package com.amazon.deequ.anomalydetection 19 | 20 | /** 21 | * Provided for backwards compatibility. 22 | * The old [[RateOfChangeStrategy]] actually detects absolute changes, 23 | * so it has been migrated to [[AbsoluteChangeStrategy]]. 24 | * Use [[RelativeRateOfChangeStrategy]] if you want to 25 | * detect changes relative to the previous values. 26 | */ 27 | @deprecated("use AbsoluteChangeStrategy instead, which describes the strategy more accurately") 28 | case class RateOfChangeStrategy( 29 | maxRateDecrease: Option[Double] = None, 30 | maxRateIncrease: Option[Double] = None, 31 | order: Int = 1) extends BaseChangeStrategy -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** Interface for all strategies that spot anomalies in a series of data points. */ 20 | trait AnomalyDetectionStrategy { 21 | 22 | /** 23 | * Search for anomalies in a series of data points. 24 | * 25 | * @param dataSeries The data contained in a Vector of Doubles 26 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 27 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 28 | */ 29 | def detect( 30 | dataSeries: Vector[Double], 31 | searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/checks/ColumnConditionTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
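To make the `detect` contract concrete, here is a minimal hand-rolled strategy. It is illustrative only; the repository's own SimpleThresholdStrategy is the real counterpart:

```scala
import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionStrategy}

// Flags every data point above a fixed threshold as an anomaly.
case class AboveThresholdStrategy(threshold: Double) extends AnomalyDetectionStrategy {

  override def detect(
      dataSeries: Vector[Double],
      searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] = {

    val (start, end) = searchInterval
    dataSeries.zipWithIndex
      .slice(start, end) // [a, b), as documented on the trait
      .collect { case (value, index) if value > threshold =>
        index -> Anomaly(Option(value), confidence = 1.0)
      }
  }
}
```

A call like `AboveThresholdStrategy(4.0).detect(Vector(1.0, 5.0, 2.0))` would return `Seq((1, Anomaly(Some(5.0), 1.0)))`.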
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | 20 | import org.scalatest.wordspec.AnyWordSpec 21 | 22 | class ColumnConditionTest extends AnyWordSpec { 23 | 24 | "ColumnCondition" should { 25 | 26 | "return the correct isEachNotNull condition" in { 27 | assert( 28 | ColumnCondition.isEachNotNull(Seq("att1", "att2", "att3")) == 29 | "(((att1 IS NOT NULL) AND (att2 IS NOT NULL)) AND (att3 IS NOT NULL))" 30 | ) 31 | } 32 | 33 | "return the correct isAnyNotNull condition" in { 34 | assert( 35 | ColumnCondition.isAnyNotNull(Seq("att1", "att2", "att3")) == 36 | "(((att1 IS NOT NULL) OR (att2 IS NOT NULL)) OR (att3 IS NOT NULL))" 37 | ) 38 | } 39 | } 40 | 41 | } 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/CountDistinct.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.metrics.DoubleMetric 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.count 22 | import Analyzers._ 23 | 24 | case class CountDistinct(columns: Seq[String]) 25 | extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) { 26 | 27 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 28 | count("*") :: Nil 29 | } 30 | 31 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { 32 | toSuccessMetric(result.getLong(offset).toDouble) 33 | } 34 | } 35 | 36 | object CountDistinct { 37 | def apply(column: String): CountDistinct = { 38 | new CountDistinct(column :: Nil) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulStdDevPop.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
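CountDistinct above counts rows in a pre-grouped frequency table; in typical use it is not called directly but scheduled through deequ's analysis runner. A hedged sketch (AnalysisRunner and AnalyzerContext live elsewhere in this repository and are not shown in this excerpt; `df` is an assumed input DataFrame):

```scala
import com.amazon.deequ.analyzers.CountDistinct
import com.amazon.deequ.analyzers.runners.AnalysisRunner

val metrics = AnalysisRunner
  .onData(df)
  .addAnalyzer(CountDistinct("att1"))        // single column via the apply helper
  .addAnalyzer(CountDistinct(Seq("a", "b"))) // distinct combinations of two columns
  .run()

// The resulting context maps each analyzer to its computed Metric.
metrics.metricMap.foreach { case (analyzer, metric) =>
  println(s"$analyzer -> ${metric.value}")
}
```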
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions._ 20 | import org.apache.spark.sql.catalyst.expressions.aggregate.CentralMomentAgg 21 | import org.apache.spark.sql.types._ 22 | 23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop */ 24 | private[sql] case class StatefulStdDevPop(child: Expression) extends CentralMomentAgg(child) { 25 | 26 | override protected def momentOrder = 2 27 | 28 | override def dataType: DataType = StructType(StructField("n", DoubleType) :: 29 | StructField("avg", DoubleType) :: StructField("m2", DoubleType) :: Nil) 30 | 31 | override val evaluateExpression: Expression = CreateStruct(n :: avg :: m2 :: Nil) 32 | 33 | override def prettyName: String = "stateful_stddev_pop" 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/ConstraintRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.profiles.ColumnProfile 20 | import com.amazon.deequ.suggestions._ 21 | 22 | /** Abstract base class for all constraint suggestion rules */ 23 | abstract class ConstraintRule[P <: ColumnProfile] { 24 | 25 | val ruleDescription: String 26 | 27 | /** 28 | * Decides whether the rule should be applied to a particular column 29 | * 30 | * @param profile profile of the column 31 | * @param numRecords overall number of records 32 | * @return 33 | */ 34 | def shouldBeApplied(profile: P, numRecords: Long): Boolean 35 | 36 | /** 37 | * Generates a suggested constraint for the column 38 | * 39 | * @param profile profile of the column 40 | * @param numRecords overall number of records 41 | * @return 42 | */ 43 | def candidate(profile: P, numRecords: Long): ConstraintSuggestion 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Distinctness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
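A rule is a shouldBeApplied/candidate pair; the suggestion engine only asks for a candidate when the predicate holds. A sketch of that driving logic (the real engine lives in the suggestions package, not shown here):

```scala
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.rules.ConstraintRule

// Assumed-shape helper: evaluate one rule against one column profile.
def suggestFor(
    rule: ConstraintRule[ColumnProfile],
    profile: ColumnProfile,
    numRecords: Long): Option[ConstraintSuggestion] = {
  if (rule.shouldBeApplied(profile, numRecords)) {
    Some(rule.candidate(profile, numRecords))
  } else {
    None
  }
}
```

A concrete rule implementation (CompleteIfCompleteRule) appears further down in this listing.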
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.functions.{col, sum} 21 | import org.apache.spark.sql.types.DoubleType 22 | import org.apache.spark.sql.Column 23 | 24 | /** 25 | * Distinctness is the fraction of distinct values in one or more columns. 26 | * 27 | * @param columns the column(s) for which to compute distinctness 28 | */ 29 | case class Distinctness(columns: Seq[String], where: Option[String] = None) 30 | extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns) 31 | with FilterableAnalyzer { 32 | 33 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 34 | (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil 35 | } 36 | 37 | override def filterCondition: Option[String] = where 38 | } 39 | 40 | object Distinctness { 41 | def apply(column: String): Distinctness = { 42 | new Distinctness(column :: Nil) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | /** 22 | * The tested class RateOfChangeStrategy is deprecated. 23 | * This test ensures backwards compatibility for deequ checks that still rely on this strategy. 24 | */ 25 | class RateOfChangeStrategyTest extends WordSpec with Matchers { 26 | 27 | "RateOfChange Strategy" should { 28 | 29 | val strategy = RateOfChangeStrategy(Some(-2.0), Some(2.0)) 30 | val data = (for (i <- 0 to 50) yield { 31 | if (i < 20 || i > 30) { 32 | 1.0 33 | } else { 34 | if (i % 2 == 0) i else -i 35 | } 36 | }).toVector 37 | 38 | "detect all anomalies if no interval specified" in { 39 | val anomalyResult = strategy.detect(data) 40 | val expected = for (i <- 20 to 31) yield { 41 | (i, Anomaly(Option(data(i)), 1.0)) 42 | } 43 | assert(anomalyResult == expected) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Entropy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
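Distinctness counts every value that occurs at least once (`geq(1)`), whereas Uniqueness, shown further down in this listing, counts only values occurring exactly once. As a worked example on a column with rows "a", "a", "b": distinctness is 2/3 (values "a" and "b") while uniqueness is 1/3 (only "b" occurs once). A sketch, assuming deequ's analysis runner and an existing DataFrame `df`:

```scala
import com.amazon.deequ.analyzers.{Distinctness, Uniqueness}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

// For att1 = ["a", "a", "b"]:
//   Distinctness("att1") -> 2.0 / 3.0
//   Uniqueness("att1")   -> 1.0 / 3.0
val result = AnalysisRunner
  .onData(df)
  .addAnalyzer(Distinctness("att1"))
  .addAnalyzer(Uniqueness("att1"))
  .run()
```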
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, sum, udf} 22 | 23 | /** 24 | * Entropy is a measure of the level of information contained in a message. Given the probability 25 | * distribution over values in a column, it describes how many bits are required to identify a 26 | * value. 27 | */ 28 | case class Entropy(column: String, where: Option[String] = None) 29 | extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil) 30 | with FilterableAnalyzer { 31 | 32 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 33 | val summands = udf { (count: Double) => 34 | if (count == 0.0) { 35 | 0.0 36 | } else { 37 | -(count / numRows) * math.log(count / numRows) 38 | } 39 | } 40 | 41 | sum(summands(col(COUNT_COL))) :: Nil 42 | } 43 | 44 | override def filterCondition: Option[String] = where 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** 20 | * Detects anomalies based on the values' absolute change. 21 | * The order of the difference can be set manually. 22 | * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]]. 23 | * 24 | * For example, AbsoluteChangeStrategy(Some(-10.0), Some(10.0), 1) 25 | * calculates the first discrete difference 26 | * and flags a point as an anomaly if its value changes by more than 10.0 in one timestep. 27 | * 28 | * @param maxRateDecrease Upper bound of accepted decrease (lower bound of increase). 29 | * @param maxRateIncrease Upper bound of accepted growth. 30 | * @param order Order of the calculated difference. 31 | * Set to 1, it calculates the difference between two consecutive values. 32 | */ 33 | case class AbsoluteChangeStrategy( 34 | maxRateDecrease: Option[Double] = None, 35 | maxRateIncrease: Option[Double] = None, 36 | order: Int = 1) extends BaseChangeStrategy 37 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/ExampleUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
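The summand UDF above contributes -(c/n) * ln(c/n) per distinct value, so the metric is computed with natural logarithms (nats). A self-contained worked example of the same arithmetic:

```scala
// Column with rows "a", "a", "b", "b": two distinct values, each with count 2.
val counts = Seq(2.0, 2.0)
val numRows = counts.sum

// -(0.5 * ln 0.5) - (0.5 * ln 0.5) = ln 2
val entropy = counts.map { c => -(c / numRows) * math.log(c / numRows) }.sum
// entropy == 0.6931471805599453 (ln 2)
```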
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import org.apache.spark.sql.{DataFrame, SparkSession} 20 | 21 | private[deequ] object ExampleUtils { 22 | 23 | def withSpark(func: SparkSession => Unit): Unit = { 24 | val session = SparkSession.builder() 25 | .master("local") 26 | .appName("test") 27 | .config("spark.ui.enabled", "false") 28 | .getOrCreate() 29 | session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir")) 30 | 31 | try { 32 | func(session) 33 | } finally { 34 | session.stop() 35 | System.clearProperty("spark.driver.port") 36 | } 37 | } 38 | 39 | def itemsAsDataframe(session: SparkSession, items: Item*): DataFrame = { 40 | val rdd = session.sparkContext.parallelize(items) 41 | session.createDataFrame(rdd) 42 | } 43 | 44 | def manufacturersAsDataframe(session: SparkSession, manufacturers: Manufacturer*): DataFrame = { 45 | val rdd = session.sparkContext.parallelize(manufacturers) 46 | session.createDataFrame(rdd) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Uniqueness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, lit, sum} 22 | import org.apache.spark.sql.types.DoubleType 23 | 24 | /** Uniqueness is the fraction of unique values in one or more columns, i.e., 25 | * values that occur exactly once. */ 26 | case class Uniqueness(columns: Seq[String], where: Option[String] = None) 27 | extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns) 28 | with FilterableAnalyzer { 29 | 30 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 31 | (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil 32 | } 33 | 34 | override def filterCondition: Option[String] = where 35 | } 36 | 37 | object Uniqueness { 38 | def apply(column: String): Uniqueness = { 39 | new Uniqueness(column :: Nil) 40 | } 41 | 42 | def apply(column: String, where: Option[String]): Uniqueness = { 43 | new Uniqueness(column :: Nil, where) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Size.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc.
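withSpark above wraps a body in a short-lived local SparkSession and guarantees cleanup. A usage sketch built on the Item entity defined earlier in this listing (the row values are illustrative only):

```scala
import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark}

withSpark { session =>
  val items = itemsAsDataframe(session,
    Item(1, "Thingy A", "awesome thing.", "high", 0),
    Item(2, "Thingy B", "available tomorrow", "low", 0))
  items.show()
} // the session is stopped even if the body throws
```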
or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.metrics.Entity 20 | import org.apache.spark.sql.{Column, Row} 21 | import Analyzers._ 22 | 23 | case class NumMatches(numMatches: Long) extends DoubleValuedState[NumMatches] { 24 | 25 | override def sum(other: NumMatches): NumMatches = { 26 | NumMatches(numMatches + other.numMatches) 27 | } 28 | 29 | override def metricValue(): Double = { 30 | numMatches.toDouble 31 | } 32 | 33 | } 34 | 35 | /** Size is the number of rows in a DataFrame. */ 36 | case class Size(where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[NumMatches]("Size", "*", Entity.Dataset) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | conditionalCount(where) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatches] = { 45 | ifNoNullsIn(result, offset) { _ => 46 | NumMatches(result.getLong(offset)) 47 | } 48 | } 49 | 50 | override def filterCondition: Option[String] = where 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
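Because Size implements FilterableAnalyzer, the same analyzer can count all rows or only rows matching a SparkSQL predicate. A hedged sketch using the analysis runner (`df` is an assumed DataFrame):

```scala
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner

val sizes = AnalysisRunner
  .onData(df)
  .addAnalyzer(Size())                                  // all rows
  .addAnalyzer(Size(where = Some("priority = 'high'"))) // filtered subset
  .run()
```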
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.functions.{length, max} 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | case class MaxLength(column: String, where: Option[String] = None) 26 | extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(): Seq[Column] = { 30 | max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { 34 | ifNoNullsIn(result, offset) { _ => 35 | MaxState(result.getDouble(offset)) 36 | } 37 | } 38 | 39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 40 | hasColumn(column) :: isString(column) :: Nil 41 | } 42 | 43 | override def filterCondition: Option[String] = where 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/DatatypeSuggestionTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | 19 | import com.amazon.deequ.profiles.{ColumnProfiler, ColumnProfiles, StandardColumnProfile} 20 | import com.amazon.deequ.utils.FixtureSupport 21 | import org.apache.spark.sql.{DataFrame, SparkSession} 22 | import org.scalamock.scalatest.MockFactory 23 | import org.scalatest.{Matchers, WordSpec} 24 | 25 | class DatatypeSuggestionTest extends WordSpec with Matchers with SparkContextSpec 26 | with FixtureSupport with MockFactory { 27 | 28 | "Column Profiler" should { 29 | "return the correct datatype (String) when profiling empty string columns" in 30 | withSparkSession { sparkSession => 31 | 32 | val df = getEmptyColumnDataDf(sparkSession = sparkSession) 33 | 34 | val profile = ColumnProfiler 35 | .profile(df, Option(Seq("att1"))) 36 | .profiles("att1") 37 | 38 | assert(profile.isInstanceOf[StandardColumnProfile]) 39 | assert(profile.isDataTypeInferred && profile.dataType.toString.equalsIgnoreCase("String")) 40 | } 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MinLength.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.functions.{length, min} 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | case class MinLength(column: String, where: Option[String] = None) 26 | extends StandardScanShareableAnalyzer[MinState]("MinLength", column) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(): Seq[Column] = { 30 | min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { 34 | ifNoNullsIn(result, offset) { _ => 35 | MinState(result.getDouble(offset)) 36 | } 37 | } 38 | 39 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 40 | hasColumn(column) :: isString(column) :: Nil 41 | } 42 | 43 | override def filterCondition: Option[String] = where 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/HistoryUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import com.amazon.deequ.metrics.Metric 20 | 21 | /** 22 | * Contains utility methods to convert tuples of date and metric to a DataPoint 23 | */ 24 | private[deequ] object HistoryUtils { 25 | 26 | /** 27 | * Given a sequence of dated optional metrics, return sequence of dated optional metric values. 
28 | * 29 | * @param metrics Sequence of dated optional metrics 30 | * @tparam M Type of the metric value 31 | * @return Sequence of dated optional metric values 32 | */ 33 | def extractMetricValues[M](metrics: Seq[(Long, Option[Metric[M]])]): Seq[DataPoint[M]] = { 34 | metrics.map { case (date, metric) => DataPoint(date, extractMetricValue[M](metric)) } 35 | } 36 | 37 | /** 38 | * Given an optional metric, returns the optional metric value 39 | * 40 | * @param metric Optional metric 41 | * @tparam M Type of the metric value 42 | * @return Optional metric value 43 | */ 44 | def extractMetricValue[M](metric: Option[Metric[M]]): Option[M] = { 45 | metric.flatMap(_.value.toOption) 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtilsTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | class AnomalyDetectionTestUtilsTest extends WordSpec with Matchers { 22 | 23 | "AnomalyDetectionTestUtilsTest" should { 24 | 25 | "throw an exception if no value found" in { 26 | intercept[IllegalArgumentException] { 27 | AnomalyDetectionTestUtils.firstDoubleFromString("noNumber") 28 | } 29 | intercept[IllegalArgumentException] { 30 | AnomalyDetectionTestUtils.firstThreeDoublesFromString("noNumber") 31 | } 32 | } 33 | 34 | "find first value" in { 35 | val str = "xx3.141yyu4.2" 36 | val value = AnomalyDetectionTestUtils.firstDoubleFromString(str) 37 | assert(value == 3.141) 38 | } 39 | 40 | "find all 3 values" in { 41 | val str = "In this 1 string are 3.000 values, not 42.01" 42 | 43 | val (first, second, third) = AnomalyDetectionTestUtils.firstThreeDoublesFromString(str) 44 | assert(first === 1) 45 | assert(second === 3.0) 46 | assert(third === 42.01) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/utils/AssertionUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
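A small sketch of extractMetricValues at work. It assumes the DoubleMetric(entity, name, instance, value) constructor from this repository's metrics package (not shown in this excerpt); failed and missing metrics both collapse to None:

```scala
import com.amazon.deequ.anomalydetection.HistoryUtils
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import scala.util.{Failure, Success}

val ok = DoubleMetric(Entity.Column, "Completeness", "att1", Success(0.98))
val failed = DoubleMetric(Entity.Column, "Completeness", "att1",
  Failure(new IllegalStateException("could not be computed")))

val series = Seq(
  (1000L, Some(ok)),     // -> DataPoint(1000, Some(0.98))
  (2000L, None),         // -> DataPoint(2000, None)
  (3000L, Some(failed))  // -> DataPoint(3000, None), a Failure has no value
)

val dataPoints = HistoryUtils.extractMetricValues[Double](series)
```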
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.utils 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | object AssertionUtils { 22 | 23 | implicit class TryUtils[A](something: Try[A]) { 24 | def compare[B](other: Try[B]): Boolean = { 25 | (something, other) match { 26 | case (Success(a), Success(b)) => a == b 27 | case (Failure(a), Failure(b)) => a.getClass == b.getClass && (a.getMessage == b.getMessage) 28 | case (_, _) => false 29 | } 30 | } 31 | def compareFailureTypes[B](other: Try[B]): Boolean = { 32 | (something, other) match { 33 | case (Failure(a), Failure(b)) => a.getClass == b.getClass 34 | case (_, _) => false 35 | } 36 | } 37 | def compareOuterAndInnerFailureTypes[B](other: Try[B]): Boolean = { 38 | (something, other) match { 39 | case (Failure(a: Throwable), Failure(b: Throwable)) 40 | if (a.getCause != null) && (b.getCause != null) => 41 | (a.getClass == b.getClass) && (a.getCause.getClass == b.getCause.getClass) 42 | case (_, _) => false 43 | } 44 | } 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/CompleteIfCompleteRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.completenessConstraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** If a column is complete in the sample, we suggest a NOT NULL constraint */ 25 | case class CompleteIfCompleteRule() extends ConstraintRule[ColumnProfile] { 26 | 27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 28 | profile.completeness == 1.0 29 | } 30 | 31 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 32 | 33 | val constraint = completenessConstraint(profile.column, Check.IsOne) 34 | 35 | ConstraintSuggestion( 36 | constraint, 37 | profile.column, 38 | "Completeness: " + profile.completeness.toString, 39 | s"'${profile.column}' is not null", 40 | this, 41 | s""".isComplete("${profile.column}")""" 42 | ) 43 | } 44 | 45 | override val ruleDescription: String = "If a column is complete in the sample, " + 46 | "we suggest a NOT NULL constraint" 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/DetectionResult.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. 
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | class Anomaly( 20 | val value: Option[Double], 21 | val confidence: Double, 22 | val detail: Option[String] = None) { 23 | 24 | def canEqual(that: Any): Boolean = { 25 | that.isInstanceOf[Anomaly] 26 | } 27 | 28 | /** 29 | * Tests anomalies for equality. Ignores the detail field. 30 | * 31 | * @param obj The object/anomaly to compare against 32 | * @return true, if and only if the value and confidence are the same 33 | */ 34 | override def equals(obj: Any): Boolean = { 35 | obj match { 36 | case anomaly: Anomaly => anomaly.value == value && anomaly.confidence == confidence 37 | case _ => false 38 | } 39 | } 40 | 41 | override def hashCode: Int = { 42 | val prime = 31 43 | var result = 1 44 | result = prime * result + (if (value == null) 0 else value.hashCode) 45 | prime * result + confidence.hashCode 46 | } 47 | 48 | } 49 | 50 | object Anomaly { 51 | def apply(value: Option[Double], confidence: Double, detail: Option[String] = None): Anomaly = { 52 | new Anomaly(value, confidence, detail) 53 | } 54 | } 55 | 56 | case class DetectionResult(anomalies: Seq[(Long, Anomaly)] = Seq.empty) 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Sum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
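The equality contract above deliberately ignores detail, which keeps anomaly comparisons in tests stable regardless of explanatory text:

```scala
import com.amazon.deequ.anomalydetection.Anomaly

val a = Anomaly(Some(4.2), 1.0, Some("value jumped by 4.2"))
val b = Anomaly(Some(4.2), 1.0, None)

assert(a == b)                        // detail is ignored
assert(a != Anomaly(Some(4.2), 0.5))  // confidence is not
assert(a != Anomaly(Some(9.9), 1.0))  // neither is the value
```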
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.functions.sum 21 | import org.apache.spark.sql.types.{DoubleType, StructType} 22 | import org.apache.spark.sql.{Column, Row} 23 | import Analyzers._ 24 | 25 | case class SumState(sum: Double) extends DoubleValuedState[SumState] { 26 | 27 | override def sum(other: SumState): SumState = { 28 | SumState(sum + other.sum) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | sum 33 | } 34 | } 35 | 36 | case class Sum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[SumState]("Sum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = { 45 | ifNoNullsIn(result, offset) { _ => 46 | SumState(result.getDouble(offset)) 47 | } 48 | } 49 | 50 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 51 | hasColumn(column) :: isNumeric(column) :: Nil 52 | } 53 | 54 | override def filterCondition: Option[String] = where 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Completeness.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested} 20 | import org.apache.spark.sql.functions.sum 21 | import org.apache.spark.sql.types.{IntegerType, StructType} 22 | import Analyzers._ 23 | import org.apache.spark.sql.{Column, Row} 24 | 25 | /** Completeness is the fraction of non-null values in a column of a DataFrame. 
*/ 26 | case class Completeness(column: String, where: Option[String] = None) extends 27 | StandardScanShareableAnalyzer[NumMatchesAndCount]("Completeness", column) with 28 | FilterableAnalyzer { 29 | 30 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 31 | 32 | ifNoNullsIn(result, offset, howMany = 2) { _ => 33 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 34 | } 35 | } 36 | 37 | override def aggregationFunctions(): Seq[Column] = { 38 | 39 | val summation = sum(conditionalSelection(column, where).isNotNull.cast(IntegerType)) 40 | 41 | summation :: conditionalCount(where) :: Nil 42 | } 43 | 44 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 45 | hasColumn(column) :: isNotNested(column) :: Nil 46 | } 47 | 48 | override def filterCondition: Option[String] = where 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/checks/CheckWithLastConstraintFilterable.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.checks 18 | 19 | import com.amazon.deequ.constraints.Constraint 20 | 21 | /** Allows replacing the last configured constraint in a check with a filtered version */ 22 | class CheckWithLastConstraintFilterable( 23 | level: CheckLevel.Value, 24 | description: String, 25 | constraints: Seq[Constraint], 26 | createReplacement: Option[String] => Constraint) 27 | extends Check(level, description, constraints) { 28 | 29 | /** 30 | * Defines a filter to apply before evaluating the previous constraint 31 | * 32 | * @param filter SparkSQL predicate to apply 33 | * @return a Check in which the last constraint is replaced with its filtered version 34 | */ 35 | def where(filter: String): Check = { 36 | 37 | val adjustedConstraints = 38 | constraints.take(constraints.size - 1) :+ createReplacement(Option(filter)) 39 | 40 | Check(level, description, adjustedConstraints) 41 | } 42 | } 43 | 44 | object CheckWithLastConstraintFilterable { 45 | def apply( 46 | level: CheckLevel.Value, 47 | description: String, 48 | constraints: Seq[Constraint], 49 | createReplacement: Option[String] => Constraint 50 | ): CheckWithLastConstraintFilterable = { 51 | 52 | new CheckWithLastConstraintFilterable(level, description, constraints, createReplacement) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/repository/MetricsRepository.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License.
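A usage sketch for the where-filtering above (REPL-style; the DataFrame `df` and its columns are assumptions, not part of this file): `where` swaps the last constraint for a filtered version, so the completeness of col2 is only evaluated on EU rows.

import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.checks.{Check, CheckLevel}

val check = Check(CheckLevel.Error, "completeness checks")
  .isComplete("col1")
  .isComplete("col2").where("marketplace = 'EU'")

val result = VerificationSuite().onData(df).addCheck(check).run()
println(result.status)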
A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.repository 18 | 19 | import com.amazon.deequ.analyzers.runners.AnalyzerContext 20 | 21 | /** 22 | * Common trait for RepositoryIndexes where deequ runs can be stored. 23 | * A repository provides methods to store AnalysisResults (metrics) and VerificationResults (if any) 24 | */ 25 | trait MetricsRepository { 26 | 27 | /** 28 | * Saves Analysis results (metrics) 29 | * 30 | * @param resultKey A ResultKey that uniquely identifies an AnalysisResult 31 | * @param analyzerContext The resulting AnalyzerContext of an Analysis 32 | */ 33 | def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit 34 | 35 | /** 36 | * Get an AnalyzerContext saved using exactly the same resultKey, if present 37 | */ 38 | def loadByKey(resultKey: ResultKey): Option[AnalyzerContext] 39 | 40 | /** Get a builder class to construct a loading query to get AnalysisResults */ 41 | def load(): MetricsRepositoryMultipleResultsLoader 42 | 43 | } 44 | 45 | /** 46 | * Information that uniquely identifies an AnalysisResult 47 | * 48 | * @param dataSetDate A date related to the AnalysisResult 49 | * @param tags A map with additional annotations 50 | */ 51 | case class ResultKey(dataSetDate: Long, tags: Map[String, String] = Map.empty) 52 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/SparkBasicTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
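A sketch of storing and re-loading metrics through this trait, using the InMemoryMetricsRepository implementation that ships with deequ (REPL-style; the DataFrame `df` is assumed):

import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.repository.ResultKey
import com.amazon.deequ.repository.memory.InMemoryMetricsRepository

val repository = new InMemoryMetricsRepository()
val key = ResultKey(System.currentTimeMillis(), Map("dataset" -> "orders"))

AnalysisRunner.onData(df)
  .addAnalyzer(Size())
  .useRepository(repository)
  .saveOrAppendResult(key)
  .run()

// loadByKey returns the stored AnalyzerContext, if present
println(repository.loadByKey(key).isDefined)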
14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | 19 | import org.scalatest.{Matchers, WordSpec} 20 | 21 | class SparkBasicTest extends WordSpec with Matchers with SparkContextSpec { 22 | "check that initializing a spark context and a basic example works" in 23 | withSparkSession { sparkSession => 24 | val sc = sparkSession.sparkContext 25 | val xs = sc.parallelize(1 to 100) 26 | val res = xs.sum() 27 | res should be(5050) 28 | } 29 | 30 | "check that monitoring spark session works" in 31 | withMonitorableSparkSession { (sparkSession, sparkMonitor) => 32 | val sc = sparkSession.sparkContext 33 | val xs = sc.parallelize(1 to 100) 34 | 35 | 36 | (1 to 2).foreach { index => 37 | val res = sparkMonitor.withMonitoringSession { stat => 38 | val sum = xs.map(_ * index).sum() 39 | // Spark jobs are running in different monitoring sessions 40 | assert(stat.jobCount == 1) 41 | sum 42 | } 43 | res should be(5050 * index) 44 | } 45 | 46 | sparkMonitor.withMonitoringSession { stat => 47 | (1 to 2).foreach { index => 48 | xs.map(_ * index).sum() 49 | } 50 | // Spark jobs are running in the same monitoring session 51 | assert(stat.jobCount == 2) 52 | } 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/UniqueValueRatio.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers.COUNT_COL 20 | import com.amazon.deequ.metrics.DoubleMetric 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.functions.{col, count, lit, sum} 23 | import org.apache.spark.sql.types.DoubleType 24 | 25 | case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) 26 | extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) 27 | with FilterableAnalyzer { 28 | 29 | override def aggregationFunctions(numRows: Long): Seq[Column] = { 30 | sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil 31 | } 32 | 33 | override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { 34 | val numUniqueValues = result.getDouble(offset) 35 | val numDistinctValues = result.getLong(offset + 1).toDouble 36 | 37 | toSuccessMetric(numUniqueValues / numDistinctValues) 38 | } 39 | 40 | override def filterCondition: Option[String] = where 41 | } 42 | 43 | object UniqueValueRatio { 44 | def apply(column: String): UniqueValueRatio = { 45 | new UniqueValueRatio(column :: Nil) 46 | } 47 | 48 | def apply(column: String, where: Option[String]): UniqueValueRatio = { 49 | new UniqueValueRatio(column :: Nil, where) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Maximum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
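A worked sketch of the ratio computed above: with values A, A, B, C, two of the three distinct values (B and C) occur exactly once, so the metric is 2/3. REPL-style, assuming a SparkSession `spark`:

import com.amazon.deequ.analyzers.UniqueValueRatio
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val data = Seq("A", "A", "B", "C").toDF("att1")

val context = AnalysisRunner.onData(data)
  .addAnalyzer(UniqueValueRatio(Seq("att1")))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// UniqueValueRatio of att1 ~ 0.6666...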
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.max 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import Analyzers._ 24 | 25 | case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] { 26 | 27 | override def sum(other: MaxState): MaxState = { 28 | MaxState(math.max(maxValue, other.maxValue)) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | maxValue 33 | } 34 | } 35 | 36 | case class Maximum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MaxState]("Maximum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | max(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { 45 | 46 | ifNoNullsIn(result, offset) { _ => 47 | MaxState(result.getDouble(offset)) 48 | } 49 | } 50 | 51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 52 | hasColumn(column) :: isNumeric(column) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Minimum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
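A small sketch of how the state above merges: partial maxima from different partitions combine by taking the larger value (REPL-style, deequ on the classpath):

import com.amazon.deequ.analyzers.MaxState

val partitionA = MaxState(3.0)
val partitionB = MaxState(5.0)
partitionA.sum(partitionB).metricValue() // 5.0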
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.min 22 | import org.apache.spark.sql.types.{DoubleType, StructType} 23 | import Analyzers._ 24 | 25 | case class MinState(minValue: Double) extends DoubleValuedState[MinState] { 26 | 27 | override def sum(other: MinState): MinState = { 28 | MinState(math.min(minValue, other.minValue)) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | minValue 33 | } 34 | } 35 | 36 | case class Minimum(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MinState]("Minimum", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | min(conditionalSelection(column, where)).cast(DoubleType) :: Nil 42 | } 43 | 44 | override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { 45 | 46 | ifNoNullsIn(result, offset) { _ => 47 | MinState(result.getDouble(offset)) 48 | } 49 | } 50 | 51 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 52 | hasColumn(column) :: isNumeric(column) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/HistoryUtilsTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
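A sketch computing Minimum and Maximum together in a single pass (REPL-style, assuming a SparkSession `spark`):

import com.amazon.deequ.analyzers.{Maximum, Minimum}
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val numbers = Seq(1.0, 4.0, -2.0, 9.0).toDF("att1")

val context = AnalysisRunner.onData(numbers)
  .addAnalyzer(Minimum("att1"))
  .addAnalyzer(Maximum("att1"))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// Minimum -> -2.0, Maximum -> 9.0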
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import com.amazon.deequ.metrics.{DoubleMetric, Entity} 20 | import org.scalatest.{Matchers, WordSpec} 21 | 22 | import scala.util.{Failure, Success} 23 | 24 | class HistoryUtilsTest extends WordSpec with Matchers { 25 | 26 | "History Utils" should { 27 | val sampleException = new IllegalArgumentException() 28 | 29 | val noneMetric = None 30 | val metricWithNoValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name", 31 | Failure(sampleException))) 32 | val metricWithValue = Some(DoubleMetric(Entity.Column, "metric-name", "instance-name", 33 | Success(50))) 34 | 35 | "extract optional metric value" in { 36 | assert(HistoryUtils.extractMetricValue[Double](noneMetric).isEmpty) 37 | assert(HistoryUtils.extractMetricValue[Double](metricWithNoValue).isEmpty) 38 | assert(HistoryUtils.extractMetricValue[Double](metricWithValue).contains(50)) 39 | 40 | } 41 | "extract optional metric values" in { 42 | val metrics = Seq(0L -> noneMetric, 1L -> metricWithNoValue, 2L -> metricWithValue) 43 | assert(HistoryUtils.extractMetricValues[Double](metrics) == Seq(DataPoint[Double](0L, None), 44 | DataPoint[Double](1L, None), DataPoint[Double](2, Some(50)))) 45 | } 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulCorrelation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.aggregate.Corr 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.types._ 22 | 23 | /** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.Corr */ 24 | private[sql] class StatefulCorrelation(x: Expression, y: Expression) extends Corr(x, y) { 25 | 26 | override def dataType: org.apache.spark.sql.types.DataType = 27 | StructType(StructField("n", DoubleType) :: StructField("xAvg", DoubleType) :: 28 | StructField("yAvg", DoubleType) :: StructField("ck", DoubleType) :: 29 | StructField("xMk", DoubleType) :: StructField("yMk", DoubleType) :: Nil) 30 | 31 | override val evaluateExpression: Expression = { 32 | CreateStruct(n :: xAvg :: yAvg :: ck :: xMk :: yMk :: Nil) 33 | } 34 | 35 | override def prettyName: String = "stateful_corr" 36 | 37 | override def canEqual(other: Any): Boolean = other.isInstanceOf[StatefulCorrelation] 38 | 39 | override def equals(other: Any): Boolean = other match { 40 | case that: StatefulCorrelation => 41 | (that canEqual this) && evaluateExpression == that.evaluateExpression 42 | case _ => false 43 | } 44 | 45 | override def hashCode(): Int = { 46 | val state = Seq(super.hashCode(), evaluateExpression) 47 | state.map { _.hashCode() }.foldLeft(0) {(a, b) => 31 * a + b } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Mean.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions.{count, sum} 22 | import org.apache.spark.sql.types.{DoubleType, StructType, LongType} 23 | import Analyzers._ 24 | 25 | case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] { 26 | 27 | override def sum(other: MeanState): MeanState = { 28 | MeanState(sum + other.sum, count + other.count) 29 | } 30 | 31 | override def metricValue(): Double = { 32 | if (count == 0L) Double.NaN else sum / count 33 | } 34 | } 35 | 36 | case class Mean(column: String, where: Option[String] = None) 37 | extends StandardScanShareableAnalyzer[MeanState]("Mean", column) 38 | with FilterableAnalyzer { 39 | 40 | override def aggregationFunctions(): Seq[Column] = { 41 | sum(conditionalSelection(column, where)).cast(DoubleType) :: 42 | count(conditionalSelection(column, where)).cast(LongType) :: Nil 43 | } 44 | 45 | override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = { 46 | 47 | ifNoNullsIn(result, offset, howMany = 2) { _ => 48 | MeanState(result.getDouble(offset), result.getLong(offset + 1)) 49 | } 50 | } 51 | 52 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 53 | hasColumn(column) :: isNumeric(column) :: Nil 54 | } 55 | 56 | override def filterCondition: Option[String] = where 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/HistogramMetric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
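A worked sketch of why MeanState keeps sum and count separately: merged partial means combine as (10 + 2) / (4 + 1) = 2.4, which differs from naively averaging the two partial means (REPL-style):

import com.amazon.deequ.analyzers.MeanState

val left = MeanState(sum = 10.0, count = 4L)
val right = MeanState(sum = 2.0, count = 1L)
left.sum(right).metricValue() // 2.4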
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | case class DistributionValue(absolute: Long, ratio: Double) 22 | 23 | case class Distribution(values: Map[String, DistributionValue], numberOfBins: Long) { 24 | 25 | def apply(key: String): DistributionValue = { 26 | values(key) 27 | } 28 | 29 | def argmax: String = { 30 | val (distributionKey, _) = values.toSeq 31 | .maxBy { case (_, distributionValue) => distributionValue.absolute } 32 | 33 | distributionKey 34 | } 35 | } 36 | 37 | case class HistogramMetric(column: String, value: Try[Distribution]) extends Metric[Distribution] { 38 | val entity: Entity.Value = Entity.Column 39 | val instance: String = column 40 | val name = "Histogram" 41 | 42 | def flatten(): Seq[DoubleMetric] = { 43 | value 44 | .map { distribution => 45 | val numberOfBins = Seq(DoubleMetric(entity, s"$name.bins", instance, 46 | Success(distribution.numberOfBins.toDouble))) 47 | 48 | val details = distribution.values 49 | .flatMap { case (key, distValue) => 50 | DoubleMetric(entity, s"$name.abs.$key", instance, Success(distValue.absolute)) :: 51 | DoubleMetric(entity, s"$name.ratio.$key", instance, Success(distValue.ratio)) :: Nil 52 | } 53 | numberOfBins ++ details 54 | } 55 | .recover { 56 | case e: Exception => Seq(DoubleMetric(entity, s"$name.bins", instance, Failure(e))) 57 | } 58 | .get 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionTestUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
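A sketch of flattening a HistogramMetric into per-bin DoubleMetrics, as implemented above (REPL-style; the distribution values are made up):

import com.amazon.deequ.metrics.{Distribution, DistributionValue, HistogramMetric}
import scala.util.Success

val distribution = Distribution(
  Map("US" -> DistributionValue(6, 0.75), "EU" -> DistributionValue(2, 0.25)),
  numberOfBins = 2)

val metric = HistogramMetric("marketplace", Success(distribution))

distribution.argmax          // "US": the bin with the largest absolute count
metric.flatten().foreach(println)
// yields Histogram.bins, Histogram.abs.US, Histogram.ratio.US, Histogram.abs.EU, ...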
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import scala.util.matching.Regex 20 | 21 | /** 22 | * Utilities to test Anomaly Detection methods and related modules 23 | */ 24 | object AnomalyDetectionTestUtils { 25 | 26 | private val numericalValueRegex: Regex = """([+-]?([0-9]*[.])?[0-9]+([Ee][0-9]+)?)""".r 27 | 28 | /** 29 | * Finds the first numerical value in a string 30 | * 31 | * @param details The string containing a numerical value 32 | * @throws IllegalArgumentException Thrown if no value could be found 33 | * @return The value itself 34 | */ 35 | def firstDoubleFromString(details: String): Double = { 36 | val firstValue = numericalValueRegex.findFirstIn(details) 37 | 38 | require(firstValue.isDefined, "Input string did not contain a numerical value") 39 | 40 | firstValue.get.toString.toDouble 41 | } 42 | 43 | /** 44 | * Finds the first three numerical values in a string 45 | * 46 | * @param details The string containing at least three numerical values 47 | * @throws IllegalArgumentException Thrown if fewer than 3 values could be found 48 | * @return The values themselves 49 | */ 50 | def firstThreeDoublesFromString(details: String): (Double, Double, Double) = { 51 | val values = numericalValueRegex.findAllIn(details).toVector.map(_.toString.toDouble) 52 | 53 | require(values.length >= 3, "Input string did not contain at least 3 numerical values.") 54 | 55 | (values(0), values(1), values(2)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/Metric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import scala.util.{Failure, Success, Try} 20 | 21 | object Entity extends Enumeration { 22 | val Dataset, Column, Mutlicolumn = Value 23 | } 24 | 25 | /** Common trait for all data quality metrics */ 26 | trait Metric[T] { 27 | val entity: Entity.Value 28 | val instance: String 29 | val name: String 30 | val value: Try[T] 31 | 32 | /* 33 | * Composite metric objects, e.g. histograms, can implement this method to 34 | * return a flattened view of the internal values in terms of double metrics.
35 | * @see HistogramMetric for a sample 36 | */ 37 | def flatten(): Seq[DoubleMetric] 38 | } 39 | 40 | /** A data quality metric whose value is a double */ 41 | case class DoubleMetric( 42 | entity: Entity.Value, 43 | name: String, 44 | instance: String, 45 | value: Try[Double]) 46 | extends Metric[Double] { 47 | 48 | override def flatten(): Seq[DoubleMetric] = Seq(this) 49 | } 50 | 51 | case class KeyedDoubleMetric( 52 | entity: Entity.Value, 53 | name: String, 54 | instance: String, 55 | value: Try[Map[String, Double]]) 56 | extends Metric[Map[String, Double]] { 57 | 58 | override def flatten(): Seq[DoubleMetric] = { 59 | if (value.isSuccess) { 60 | value.get.map { case (key, correspondingValue) => 61 | DoubleMetric(entity, s"$name-$key", instance, Success(correspondingValue)) 62 | } 63 | .toSeq 64 | } else { 65 | Seq(DoubleMetric(entity, s"$name", instance, Failure(value.failed.get))) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
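A sketch of KeyedDoubleMetric.flatten as defined above: each key becomes its own DoubleMetric with the key appended to the metric name (REPL-style; the quantile values are made up):

import com.amazon.deequ.metrics.{Entity, KeyedDoubleMetric}
import scala.util.Success

val keyed = KeyedDoubleMetric(Entity.Column, "quantiles", "att1",
  Success(Map("0.5" -> 10.0, "0.9" -> 42.0)))

keyed.flatten().foreach(println)
// DoubleMetric(Column, quantiles-0.5, att1, Success(10.0)), ...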
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.utils.FixtureSupport 21 | import org.scalatest.matchers.should.Matchers 22 | import org.scalatest.wordspec.AnyWordSpec 23 | 24 | class StatesTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { 25 | 26 | "FrequenciesAndNumRows" should { 27 | "merge correctly" in withSparkSession { session => 28 | 29 | import session.implicits._ 30 | 31 | val dataA = Seq("A", "A", "B").toDF("att1") 32 | val dataB = Seq("A", "C", "C").toDF("att1") 33 | 34 | val stateA = FrequencyBasedAnalyzer.computeFrequencies(dataA, "att1" :: Nil) 35 | val stateB = FrequencyBasedAnalyzer.computeFrequencies(dataB, "att1" :: Nil) 36 | 37 | val stateAB = stateA.sum(stateB) 38 | 39 | println(stateA.frequencies.schema) 40 | stateA.frequencies.collect().foreach { println } 41 | println() 42 | 43 | println(stateB.frequencies.schema) 44 | stateB.frequencies.collect().foreach { println } 45 | println() 46 | 47 | println(stateAB.frequencies.schema) 48 | stateAB.frequencies.collect().foreach { println } 49 | 50 | val mergedFrequencies = stateAB.frequencies.collect() 51 | .map { row => row.getString(0) -> row.getLong(1) } 52 | .toMap 53 | 54 | assert(mergedFrequencies.size == 3) 55 | assert(mergedFrequencies.get("A").contains(3)) 56 | assert(mergedFrequencies.get("B").contains(1)) 57 | assert(mergedFrequencies.get("C").contains(2)) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | /** 20 | * A simple anomaly detection method that checks if values are in a specified range. 21 | * 22 | * @param lowerBound Lower bound of accepted range of values 23 | * @param upperBound Upper bound of accepted range of values 24 | */ 25 | case class SimpleThresholdStrategy( 26 | lowerBound: Double = Double.MinValue, 27 | upperBound: Double) 28 | extends AnomalyDetectionStrategy { 29 | 30 | require(lowerBound <= upperBound, "The lower bound must be smaller or equal to the upper bound.") 31 | 32 | /** 33 | * Search for anomalies in a series of data points. 34 | * 35 | * @param dataSeries The data contained in a Vector of Doubles 36 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 37 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 
38 | */ 39 | override def detect( 40 | dataSeries: Vector[Double], 41 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = { 42 | 43 | val (searchStart, searchEnd) = searchInterval 44 | 45 | require (searchStart <= searchEnd, "The start of the interval can't be larger than the end.") 46 | 47 | dataSeries.zipWithIndex 48 | .slice(searchStart, searchEnd) 49 | .filter { case (value, _) => value < lowerBound || value > upperBound } 50 | .map { case (value, index) => 51 | 52 | val detail = Some(s"[SimpleThresholdStrategy]: Value $value is not in " + 53 | s"bounds [$lowerBound, $upperBound]") 54 | 55 | (index, Anomaly(Option(value), 1.0, detail)) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/NonNegativeNumbersRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.complianceConstraint 21 | import com.amazon.deequ.profiles.{ColumnProfile, NumericColumnProfile} 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** If we see only non-negative numbers in a column, we suggest a corresponding constraint */ 25 | case class NonNegativeNumbersRule() extends ConstraintRule[ColumnProfile] { 26 | 27 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 28 | profile match { 29 | case numericProfile: NumericColumnProfile => numericProfile.minimum.exists(_ >= 0.0) 30 | case _ => false 31 | } 32 | } 33 | 34 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 35 | 36 | val description = s"'${profile.column}' has no negative values" 37 | val constraint = complianceConstraint(description, s"${profile.column} >= 0", Check.IsOne) 38 | 39 | val minimum = profile match { 40 | case numericProfile: NumericColumnProfile 41 | if numericProfile.minimum.isDefined => numericProfile.minimum.get.toString 42 | case _ => "Error while calculating minimum!" 43 | } 44 | 45 | ConstraintSuggestion( 46 | constraint, 47 | profile.column, 48 | "Minimum: " + minimum, 49 | description, 50 | this, 51 | s""".isNonNegative("${profile.column}")""" 52 | ) 53 | } 54 | 55 | override val ruleDescription: String = "If we see only non-negative numbers in a " + 56 | "column, we suggest a corresponding constraint" 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/UniqueIfApproximatelyUniqueRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). 
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.constraints.Constraint.uniquenessConstraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.ConstraintSuggestion 23 | 24 | /** 25 | * If the number of approximate distinct values in a column is close to the number of records 26 | * (within the error of the HLL sketch), we suggest a UNIQUE constraint 27 | */ 28 | case class UniqueIfApproximatelyUniqueRule() extends ConstraintRule[ColumnProfile] { 29 | 30 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 31 | 32 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords 33 | 34 | // TODO This bound depends on the error guarantees of the HLL sketch 35 | profile.completeness == 1.0 && math.abs(1.0 - approximateDistinctness) <= 0.08 36 | } 37 | 38 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 39 | 40 | val constraint = uniquenessConstraint(Seq(profile.column), Check.IsOne) 41 | val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords 42 | 43 | ConstraintSuggestion( 44 | constraint, 45 | profile.column, 46 | "ApproxDistinctness: " + approximateDistinctness.toString, 47 | s"'${profile.column}' is unique", 48 | this, 49 | s""".isUnique("${profile.column}")""" 50 | ) 51 | } 52 | 53 | override val ruleDescription: String = "If the number of approximate distinct values " + 54 | "in a column is close to the number of records (within the error of the HLL sketch), " + 55 | "we suggest a UNIQUE constraint" 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/NonSampleCompactor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | import scala.reflect.ClassTag 21 | import scala.util.Random 22 | 23 | /** 24 | * A quantile sketcher whose output is half the size of its input. 25 | * 26 | * @tparam T type of the items being sketched.
There should be an ordering 27 | * over this item type. 28 | */ 29 | class NonSampleCompactor[T]() 30 | (implicit ordering: Ordering[T], 31 | ct: ClassTag[T]) 32 | extends Serializable { 33 | 34 | var numOfCompress = 0 35 | var offset = 0 36 | var buffer: ArrayBuffer[T] = ArrayBuffer[T]() 37 | 38 | private def findOdd(items: Int): Option[T] = items % 2 match { 39 | case 1 => Some(buffer(math.max(items - 1, 0))) 40 | case _ => None 41 | } 42 | 43 | def compact : Array[T] = { 44 | var items = buffer.length 45 | val len = items - (items % 2) 46 | if (numOfCompress % 2 == 1) { 47 | offset = 1 - offset 48 | } 49 | // else { 50 | // offset = if (Random.nextBoolean()) 1 else 0 51 | // } 52 | val sortedBuffer = buffer.toArray.slice(0, len).sorted 53 | 54 | /** Selects half of the items from this level compactor to the next level compactor. 55 | * e.g. if sortedBuffer is Array(1,2,3,4), if offset is 1, output = Array(2,4), 56 | * and if offset is 0, output = Array(1,3), this will be the input to the next level compactor. 57 | */ 58 | val output = (offset until len by 2).map(sortedBuffer(_)).toArray 59 | val tail = findOdd(items) 60 | items = items % 2 61 | var newBuffer = ArrayBuffer[T]() 62 | if (tail.isDefined) { 63 | newBuffer = newBuffer :+ tail.get 64 | } 65 | buffer = newBuffer 66 | numOfCompress = numOfCompress + 1 67 | output 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/test/resources/EMRSparkShellTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | /* 18 | For testing inside EMR or other flavors of Spark cluster. Run the commands after building the git repo from source.
19 | Add additional test classes as needed 20 | scala 2.12 21 | spark-shell -i /src/test/resources/EMRSparkShellTest.txt \ 22 | --packages org.scalatest:scalatest_2.12:3.1.2,org.scalamock:scalamock_2.12:4.4.0,org.scala-lang:scala-compiler:2.12.10,\ 23 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \ 24 | --jars /target/deequ_2.12-1.1.0-SNAPSHOT.jar,/target/deequ_2.12-1.1.0-SNAPSHOT-tests.jar 25 | 26 | scala 2.11 27 | spark-shell -i /src/test/resources/EMRSparkShellTest.txt \ 28 | --packages org.scalatest:scalatest_2.11:3.1.2,org.scalamock:scalamock_2.11:4.4.0,org.scala-lang:scala-compiler:2.11.10,\ 29 | org.mockito:mockito-core:2.28.2,org.openjdk.jmh:jmh-core:1.23,org.openjdk.jmh:jmh-generator-annprocess:1.23,org.apache.datasketches:datasketches-java:1.3.0-incubating \ 30 | --jars /target/deequ-1.1.0-SNAPSHOT.jar,/target/spark-deequ-testing/deequ-1.1.0-SNAPSHOT-tests.jar 31 | */ 32 | 33 | import com.amazon.deequ.analyzers.{AnalysisTest, AnalyzerTests, IncrementalAnalysisTest} 34 | import com.amazon.deequ.analyzers.runners.{AnalysisRunnerTests, AnalyzerContextTest} 35 | import com.amazon.deequ.{VerificationResultTest, VerificationSuiteTest} 36 | 37 | (new VerificationSuiteTest).execute() 38 | (new VerificationResultTest).execute() 39 | (new AnalysisRunnerTests).execute() 40 | (new AnalyzerContextTest).execute() 41 | (new AnalysisTest).execute() 42 | (new AnalyzerTests).execute() 43 | (new IncrementalAnalysisTest).execute() 44 | //Add additional test classes as needed 45 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/checks/FilterableCheckTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ 18 | package checks 19 | 20 | import com.amazon.deequ.analyzers.{Completeness, Compliance} 21 | import com.amazon.deequ.utils.FixtureSupport 22 | import org.scalatest.matchers.should.Matchers 23 | import org.scalatest.wordspec.AnyWordSpec 24 | 25 | 26 | class FilterableCheckTest extends AnyWordSpec 27 | with Matchers 28 | with SparkContextSpec 29 | with FixtureSupport { 30 | 31 | "Filterable checks" should { 32 | "build correctly" in { 33 | 34 | val check = Check(CheckLevel.Error, "someCheck") 35 | .isComplete("col1") 36 | .isComplete("col2").where("marketplace = 'EU'") 37 | .hasCompleteness("col3", _ >= 0.9).where("marketplace = 'NA'") 38 | .satisfies("someCol > 5", "const1") 39 | .satisfies("someCol > 10", "const2").where("marketplace = 'EU'") 40 | 41 | val completenessAnalyzers = 42 | check.requiredAnalyzers() 43 | .filter { _.isInstanceOf[Completeness] } 44 | .map { _.asInstanceOf[Completeness] } 45 | .toArray 46 | .sortBy { _.column } 47 | 48 | assert(completenessAnalyzers.length == 3) 49 | assert(completenessAnalyzers.head.where.isEmpty) 50 | assert(completenessAnalyzers(1).where.contains("marketplace = 'EU'")) 51 | assert(completenessAnalyzers(2).where.contains("marketplace = 'NA'")) 52 | 53 | val complianceAnalyzers = 54 | check.requiredAnalyzers() 55 | .filter { _.isInstanceOf[Compliance] } 56 | .map { _.asInstanceOf[Compliance] } 57 | .toArray 58 | .sortBy { _.instance } 59 | 60 | assert(complianceAnalyzers.length == 2) 61 | assert(complianceAnalyzers.head.where.isEmpty) 62 | assert(complianceAnalyzers(1).where.contains("marketplace = 'EU'")) 63 | } 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Compliance.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import org.apache.spark.sql.types.IntegerType 20 | import org.apache.spark.sql.{Column, Row} 21 | import org.apache.spark.sql.functions._ 22 | import Analyzers._ 23 | 24 | /** 25 | * Compliance is a measure of the fraction of rows that comply with the given column constraint. 26 | * E.g. if the constraint is "att1>3" and the data frame has 5 rows with an att1 value greater than 27 | * 3 and 10 rows at or below 3, a DoubleMetric with value 0.33 would be returned. 28 | * 29 | * @param instance Unlike other column analyzers (e.g. completeness), this analyzer cannot 30 | * infer the metric instance name from the column name. 31 | * Also, the constraint given here can refer to multiple columns, 32 | * so a metric instance name should be provided, 33 | * describing what the analysis is being done for. 34 | * @param predicate SQL-predicate to apply per row 35 | * @param where Additional filter to apply before the analyzer is run.
36 | */ 37 | case class Compliance(instance: String, predicate: String, where: Option[String] = None) 38 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("Compliance", instance) 39 | with FilterableAnalyzer { 40 | 41 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 42 | 43 | ifNoNullsIn(result, offset, howMany = 2) { _ => 44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 45 | } 46 | } 47 | 48 | override def aggregationFunctions(): Seq[Column] = { 49 | 50 | val summation = sum(conditionalSelection(expr(predicate), where).cast(IntegerType)) 51 | 52 | summation :: conditionalCount(where) :: Nil 53 | } 54 | 55 | override def filterCondition: Option[String] = where 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/RetainTypeRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.analyzers.DataTypeInstances 20 | import com.amazon.deequ.checks.Check 21 | import com.amazon.deequ.constraints.ConstrainableDataTypes 22 | import com.amazon.deequ.constraints.Constraint.dataTypeConstraint 23 | import com.amazon.deequ.profiles.ColumnProfile 24 | import com.amazon.deequ.suggestions.ConstraintSuggestion 25 | 26 | /** If we detect a non-string type, we suggest a type constraint */ 27 | case class RetainTypeRule() extends ConstraintRule[ColumnProfile] { 28 | 29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 30 | val testableType = profile.dataType match { 31 | case DataTypeInstances.Integral | DataTypeInstances.Fractional | DataTypeInstances.Boolean => 32 | true 33 | case _ => false 34 | } 35 | 36 | profile.isDataTypeInferred && testableType 37 | } 38 | 39 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 40 | 41 | val typeToCheck = profile.dataType match { 42 | case DataTypeInstances.Fractional => ConstrainableDataTypes.Fractional 43 | case DataTypeInstances.Integral => ConstrainableDataTypes.Integral 44 | case DataTypeInstances.Boolean => ConstrainableDataTypes.Boolean 45 | } 46 | 47 | val constraint = dataTypeConstraint(profile.column, typeToCheck, Check.IsOne) 48 | 49 | ConstraintSuggestion( 50 | constraint, 51 | profile.column, 52 | "DataType: " + profile.dataType.toString, 53 | s"'${profile.column}' has type ${profile.dataType}", 54 | this, 55 | s""".hasDataType("${profile.column}", ConstrainableDataTypes.${profile.dataType})""" 56 | ) 57 | } 58 | 59 | override val ruleDescription: String = "If we detect a non-string type, we suggest a " + 60 | "type constraint" 61 | } 62 | -------------------------------------------------------------------------------- 
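A sketch of the Compliance analyzer documented above, computing the fraction of rows where att1 > 3 (REPL-style, assuming a SparkSession `spark`):

import com.amazon.deequ.analyzers.Compliance
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import spark.implicits._

val data = Seq(1, 2, 4, 5, 6).toDF("att1")

val context = AnalysisRunner.onData(data)
  .addAnalyzer(Compliance("att1 greater than 3", "att1 > 3"))
  .run()

AnalyzerContext.successMetricsAsDataFrame(spark, context).show()
// value = 0.6, since 3 of the 5 rows comply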
/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResult.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions 18 | 19 | import com.amazon.deequ.VerificationResult 20 | import com.amazon.deequ.checks.CheckStatus 21 | import com.amazon.deequ.profiles.{ColumnProfile, ColumnProfiles} 22 | 23 | /** 24 | * The result returned from the ConstraintSuggestionSuite 25 | * 26 | * @param columnProfiles The column profiles 27 | * @param numRecordsUsedForProfiling The number of records that were used for computing 28 | * the column profiles 29 | * @param constraintSuggestions The suggested constraints 30 | * @param verificationResult The verificationResult in case a train/test split was used 31 | */ 32 | case class ConstraintSuggestionResult( 33 | columnProfiles: Map[String, ColumnProfile], 34 | numRecordsUsedForProfiling: Long, 35 | constraintSuggestions: Map[String, Seq[ConstraintSuggestion]], 36 | verificationResult: Option[VerificationResult] = None) 37 | 38 | 39 | object ConstraintSuggestionResult { 40 | 41 | def getColumnProfilesAsJson(constraintSuggestionResult: ConstraintSuggestionResult): String = { 42 | 43 | ColumnProfiles 44 | .toJson(constraintSuggestionResult.columnProfiles.values.toSeq) 45 | } 46 | 47 | def getConstraintSuggestionsAsJson(constraintSuggestionResult: ConstraintSuggestionResult) 48 | : String = { 49 | ConstraintSuggestions 50 | .toJson(constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)( _ ++ _)) 51 | } 52 | 53 | def getEvaluationResultsAsJson(constraintSuggestionResult: ConstraintSuggestionResult) 54 | : String = { 55 | 56 | ConstraintSuggestions 57 | .evaluationResultsToJson( 58 | constraintSuggestionResult.constraintSuggestions.values.fold(Seq.empty)( _ ++ _), 59 | constraintSuggestionResult.verificationResult.getOrElse( 60 | VerificationResult(CheckStatus.Warning, Map.empty, Map.empty))) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/ApproxCountDistinct.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
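A sketch of producing a ConstraintSuggestionResult with deequ's suggestion runner and default rule set, then serializing it with the helpers above (REPL-style; the DataFrame `df` is assumed):

import com.amazon.deequ.suggestions.{ConstraintSuggestionResult, ConstraintSuggestionRunner, Rules}

val suggestionResult = ConstraintSuggestionRunner()
  .onData(df)
  .addConstraintRules(Rules.DEFAULT)
  .run()

println(ConstraintSuggestionResult.getConstraintSuggestionsAsJson(suggestionResult))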
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.hasColumn 20 | import org.apache.spark.sql.DeequFunctions.stateful_approx_count_distinct 21 | import org.apache.spark.sql.catalyst.expressions.aggregate.DeequHyperLogLogPlusPlusUtils 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.{Column, Row} 24 | import Analyzers._ 25 | 26 | case class ApproxCountDistinctState(words: Array[Long]) 27 | extends DoubleValuedState[ApproxCountDistinctState] { 28 | 29 | override def sum(other: ApproxCountDistinctState): ApproxCountDistinctState = { 30 | ApproxCountDistinctState(DeequHyperLogLogPlusPlusUtils.merge(words, other.words)) 31 | } 32 | 33 | override def metricValue(): Double = { 34 | DeequHyperLogLogPlusPlusUtils.count(words) 35 | } 36 | 37 | override def toString: String = { 38 | s"ApproxCountDistinctState(${words.mkString(",")})" 39 | } 40 | } 41 | 42 | /** 43 | * Compute approximated count distinct with HyperLogLogPlusPlus. 44 | * 45 | * @param column Which column to compute this aggregation on. 46 | */ 47 | case class ApproxCountDistinct(column: String, where: Option[String] = None) 48 | extends StandardScanShareableAnalyzer[ApproxCountDistinctState]("ApproxCountDistinct", column) 49 | with FilterableAnalyzer { 50 | 51 | override def aggregationFunctions(): Seq[Column] = { 52 | stateful_approx_count_distinct(conditionalSelection(column, where)) :: Nil 53 | } 54 | 55 | override def fromAggregationResult(result: Row, offset: Int): Option[ApproxCountDistinctState] = { 56 | 57 | ifNoNullsIn(result, offset) { _ => 58 | DeequHyperLogLogPlusPlusUtils.wordsFromBytes(result.getAs[Array[Byte]](offset)) 59 | } 60 | } 61 | 62 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 63 | hasColumn(column) :: Nil 64 | } 65 | 66 | override def filterCondition: Option[String] = where 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Analysis.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext} 20 | import com.amazon.deequ.metrics.Metric 21 | import org.apache.spark.sql.DataFrame 22 | import org.apache.spark.storage.StorageLevel 23 | 24 | /** 25 | * Defines a set of analyzers to run on data. 
26 |  *
27 |  * @param analyzers the analyzers to apply to the data
28 |  */
29 | case class Analysis(analyzers: Seq[Analyzer[_, Metric[_]]] = Seq.empty) {
30 | 
31 |   def addAnalyzer(analyzer: Analyzer[_, Metric[_]]): Analysis = {
32 |     Analysis(analyzers :+ analyzer)
33 |   }
34 | 
35 |   def addAnalyzers(otherAnalyzers: Seq[Analyzer[_, Metric[_]]]): Analysis = {
36 |     Analysis(analyzers ++ otherAnalyzers)
37 |   }
38 | 
39 |   /**
40 |     * Compute the metrics from the analyzers configured in the analysis
41 |     *
42 |     * @param data data on which to operate
43 |     * @param aggregateWith load existing states for the configured analyzers and aggregate them
44 |     *                      (optional)
45 |     * @param saveStatesWith persist resulting states for the configured analyzers (optional)
46 |     * @param storageLevelOfGroupedDataForMultiplePasses caching level for grouped data that must
47 |     *                                                   be accessed multiple times (use
48 |     *                                                   StorageLevel.NONE to completely disable
49 |     *                                                   caching)
50 |     * @return the AnalyzerContext holding the computed metrics
51 |     */
52 |   @deprecated("Use the AnalysisRunner instead (the onData method there)", "24-09-2019")
53 |   def run(
54 |       data: DataFrame,
55 |       aggregateWith: Option[StateLoader] = None,
56 |       saveStatesWith: Option[StatePersister] = None,
57 |       storageLevelOfGroupedDataForMultiplePasses: StorageLevel = StorageLevel.MEMORY_AND_DISK)
58 |     : AnalyzerContext = {
59 | 
60 |     AnalysisRunner.doAnalysisRun(data, analyzers, aggregateWith = aggregateWith,
61 |       saveStatesWith = saveStatesWith)
62 |   }
63 | }
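A minimal sketch of how this API composes, assuming an existing DataFrame named `data` (the AnalysisRunner entry point shown here is the one the deprecation note above points to):

import com.amazon.deequ.analyzers.{Analysis, Completeness, Size}
import com.amazon.deequ.analyzers.runners.AnalysisRunner

// compose several analyzers so that their metrics are computed together,
// in as few passes over the data as possible
val analysis = Analysis()
  .addAnalyzer(Size())
  .addAnalyzer(Completeness("productName"))

val analyzerContext = AnalysisRunner.run(data = data, analysis = analysis)

analyzerContext.metricMap.foreach { case (analyzer, metric) =>
  println(s"$analyzer: ${metric.value}")
}

64 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/StandardDeviation.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.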
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import org.apache.spark.sql.DeequFunctions.stateful_stddev_pop 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.types.StructType 23 | import Analyzers._ 24 | 25 | case class StandardDeviationState( 26 | n: Double, 27 | avg: Double, 28 | m2: Double) 29 | extends DoubleValuedState[StandardDeviationState] { 30 | 31 | require(n > 0.0, "Standard deviation is undefined for n = 0.") 32 | 33 | override def metricValue(): Double = { 34 | math.sqrt(m2 / n) 35 | } 36 | 37 | override def sum(other: StandardDeviationState): StandardDeviationState = { 38 | val newN = n + other.n 39 | val delta = other.avg - avg 40 | val deltaN = if (newN == 0.0) 0.0 else delta / newN 41 | 42 | StandardDeviationState(newN, avg + deltaN * other.n, 43 | m2 + other.m2 + delta * deltaN * n * other.n) 44 | } 45 | } 46 | 47 | case class StandardDeviation(column: String, where: Option[String] = None) 48 | extends StandardScanShareableAnalyzer[StandardDeviationState]("StandardDeviation", column) 49 | with FilterableAnalyzer { 50 | 51 | override def aggregationFunctions(): Seq[Column] = { 52 | stateful_stddev_pop(conditionalSelection(column, where)) :: Nil 53 | } 54 | 55 | override def fromAggregationResult(result: Row, offset: Int): Option[StandardDeviationState] = { 56 | 57 | if (result.isNullAt(offset)) { 58 | None 59 | } else { 60 | val row = result.getAs[Row](offset) 61 | val n = row.getDouble(0) 62 | 63 | if (n == 0.0) { 64 | None 65 | } else { 66 | Some(StandardDeviationState(n, row.getDouble(1), row.getDouble(2))) 67 | } 68 | } 69 | } 70 | 71 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 72 | hasColumn(column) :: isNumeric(column) :: Nil 73 | } 74 | 75 | override def filterCondition: Option[String] = where 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/metrics/MetricsTests.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.metrics
18 | 
19 | import com.amazon.deequ.analyzers.DataTypeInstances
20 | import org.scalatest.{Matchers, WordSpec}
21 | 
22 | import scala.util.{Failure, Success}
23 | 
24 | 
25 | class MetricsTests extends WordSpec with Matchers {
26 |   val sampleException = new IllegalArgumentException()
27 |   "Double metric" should {
28 |     "flatten and return itself" in {
29 |       val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name", Success(50))
30 |       assert(metric.flatten() == List(metric))
31 |     }
32 | 
33 |     "flatten in case of an error" in {
34 |       val metric = DoubleMetric(Entity.Column, "metric-name", "instance-name",
35 |         Failure(sampleException))
36 |       assert(metric.flatten() == List(metric))
37 |     }
38 |   }
39 | 
40 |   "Histogram metric" should {
41 |     "flatten matched and unmatched" in {
42 |       val distribution = Distribution(
43 |         Map("a" -> DistributionValue(6, 0.6), "b" -> DistributionValue(4, 0.4)), 2)
44 | 
45 |       val metric = HistogramMetric("instance-name", Success(distribution))
46 | 
47 |       val expected = Seq(
48 |         DoubleMetric(Entity.Column, "Histogram.bins", "instance-name", Success(2)),
49 |         DoubleMetric(Entity.Column, "Histogram.abs.a", "instance-name", Success(6)),
50 |         DoubleMetric(Entity.Column, "Histogram.abs.b", "instance-name", Success(4)),
51 |         DoubleMetric(Entity.Column, "Histogram.ratio.a", "instance-name", Success(0.6)),
52 |         DoubleMetric(Entity.Column, "Histogram.ratio.b", "instance-name", Success(0.4))
53 |       ).toSet
54 |       assert(metric.flatten().toSet == expected)
55 |     }
56 | 
57 |     "flatten matched and unmatched in case of an error" in {
58 |       val metric = HistogramMetric("instance-name", Failure(sampleException))
59 | 
60 |       val expected = Seq(DoubleMetric(Entity.Column, "Histogram.bins", "instance-name",
61 |         Failure(sampleException))).toSet
62 |       assert(metric.flatten().toSet == expected)
63 |     }
64 |   }
65 | 
66 | }
67 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/SparkMonitor.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ
18 | 
19 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart, SparkListenerStageCompleted, StageInfo}
20 | 
21 | /**
22 |  * A class representing statistics about a SparkSession.
23 |  * Currently, only the number of submitted Spark jobs and their stages are tracked.
24 | */ 25 | class SparkSessionStats { 26 | private var numberOfJobsSubmitted = 0 27 | private var stageInfos = Seq[StageInfo]() 28 | 29 | def jobCount: Int = { 30 | numberOfJobsSubmitted 31 | } 32 | 33 | def allExecutedStages: Seq[StageInfo] = { 34 | stageInfos 35 | } 36 | 37 | def recordJobStart(jobStart: SparkListenerJobStart): Unit = { 38 | numberOfJobsSubmitted += 1 39 | } 40 | 41 | def recordStageInfos(stageInfo: StageInfo): Unit = { 42 | stageInfos = stageInfos :+ stageInfo 43 | } 44 | 45 | def reset(): Unit = { 46 | numberOfJobsSubmitted = 0 47 | stageInfos = Seq[StageInfo]() 48 | } 49 | 50 | } 51 | 52 | /** 53 | * A SparkListener implementation to monitor spark jobs submitted 54 | */ 55 | class SparkMonitor extends SparkListener { 56 | val stat = new SparkSessionStats 57 | 58 | override def onJobStart(jobStart: SparkListenerJobStart) { 59 | stat.recordJobStart(jobStart) 60 | println(s"Job started with ${jobStart.stageInfos.size} stages: $jobStart " + 61 | s"details : ${jobStart.stageInfos.map(_.name)}") 62 | 63 | } 64 | 65 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 66 | stat.recordStageInfos(stageCompleted.stageInfo) 67 | println(s"Stage ${stageCompleted.stageInfo.stageId} completed with " + 68 | s"${stageCompleted.stageInfo.numTasks} tasks.") 69 | } 70 | 71 | /** 72 | * @param testFun thunk to run with SparkSessionStats as an argument. 73 | * Provides a monitoring session where the stats are being reset at the beginning 74 | * 75 | */ 76 | def withMonitoringSession(testFun: (SparkSessionStats) => Any): Any = { 77 | stat.reset 78 | testFun(stat) 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.suggestions.rules
18 | 
19 | import com.amazon.deequ.constraints.Constraint.completenessConstraint
20 | import com.amazon.deequ.profiles._
21 | import com.amazon.deequ.suggestions.ConstraintSuggestion
22 | import scala.math.BigDecimal.RoundingMode
23 | 
24 | /**
25 |  * If a column is incomplete in the sample, we model its completeness as a binomial variable,
26 |  * estimate a confidence interval, and use it to define a lower bound for the completeness.
27 |  */
28 | case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] {
29 | 
30 |   override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
31 |     profile.completeness > 0.2 && profile.completeness < 1.0
32 |   }
33 | 
34 |   override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
35 | 
36 |     val p = profile.completeness
37 |     val n = numRecords
38 |     val z = 1.96
39 | 
40 |     // TODO this needs to be more robust for values of p close to 0 or 1
41 |     val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
42 |       .setScale(2, RoundingMode.DOWN).toDouble
43 | 
44 |     val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)
45 | 
46 |     val boundInPercent = ((1.0 - targetCompleteness) * 100).toInt
47 | 
48 |     val description = s"'${profile.column}' has less than $boundInPercent% missing values"
49 | 
50 |     ConstraintSuggestion(
51 |       constraint,
52 |       profile.column,
53 |       "Completeness: " + profile.completeness.toString,
54 |       description,
55 |       this,
56 |       s""".hasCompleteness("${profile.column}", _ >= $targetCompleteness,
57 |          |  Some("It should be above $targetCompleteness!"))"""
58 |         .stripMargin.replaceAll("\n", "")
59 |     )
60 |   }
61 | 
62 |   override val ruleDescription: String = "If a column is incomplete in the sample, " +
63 |     "we model its completeness as a binomial variable, estimate a confidence interval " +
64 |     "and use this to define a lower bound for the completeness"
65 | }
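The lower bound used above is the lower end of a normal-approximation (Wald) confidence interval for a binomial proportion: for observed completeness p on n records, it is p - z * sqrt(p * (1 - p) / n), where z = 1.96 corresponds to a two-sided 95% confidence level. A minimal worked sketch with made-up numbers:

// suppose a column is 80% complete in a sample of 1000 records (hypothetical values)
val p = 0.8
val n = 1000
val z = 1.96 // two-sided 95% confidence level

val lowerBound = p - z * math.sqrt(p * (1 - p) / n)
// = 0.8 - 1.96 * 0.0126 ≈ 0.775, which the rule above rounds down to 0.77,
// so it would suggest .hasCompleteness("column", _ >= 0.77)

66 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.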
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.anomalydetection
18 | 
19 | import org.scalatest.{Matchers, WordSpec}
20 | 
21 | class SimpleThresholdStrategyTest extends WordSpec with Matchers {
22 | 
23 |   "Simple Threshold Strategy" should {
24 | 
25 |     val strategy = SimpleThresholdStrategy(upperBound = 1.0)
26 |     val data = Vector(-1.0, 2.0, 3.0, 0.5)
27 |     val expected = Seq((1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0)))
28 | 
29 |     "detect values above threshold" in {
30 |       val anomalyResult = strategy.detect(data, (0, 4))
31 | 
32 |       assert(anomalyResult == expected)
33 |     }
34 | 
35 |     "detect all values without range specified" in {
36 |       val anomalyResult = strategy.detect(data)
37 | 
38 |       assert(anomalyResult == expected)
39 |     }
40 | 
41 |     "work fine with empty input" in {
42 |       val emptySeries = Vector[Double]()
43 |       val anomalyResult = strategy.detect(emptySeries)
44 | 
45 |       assert(anomalyResult == Seq[(Int, Anomaly)]())
46 |     }
47 | 
48 |     "work with upper and lower threshold" in {
49 |       val tS = SimpleThresholdStrategy(lowerBound = -0.5, upperBound = 1.0)
50 |       val anomalyResult = tS.detect(data)
51 | 
52 |       assert(anomalyResult == Seq((0, Anomaly(Option(-1.0), 1.0)),
53 |         (1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0))))
54 |     }
55 | 
56 |     "throw an error when thresholds are not ordered" in {
57 |       intercept[IllegalArgumentException] {
58 |         val ts = SimpleThresholdStrategy(lowerBound = 2.0, upperBound = 1.0)
59 |       }
60 |     }
61 | 
62 |     "produce error message with correct value and bounds" in {
63 |       val result = strategy.detect(data)
64 | 
65 |       result.foreach { case (_, anom) =>
66 |         val (value, lowerBound, upperBound) =
67 |           AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get)
68 | 
69 |         assert(anom.value.isDefined && value === anom.value.get)
70 |         assert(value < lowerBound || value > upperBound)
71 |       }
72 |     }
73 |   }
74 | }
75 | --------------------------------------------------------------------------------
/docs/key-concepts.md:
--------------------------------------------------------------------------------
1 | # Key Concepts in the Codebase
2 | There are a few key concepts that will help you to understand the codebase.
3 | 
4 | ## Metrics, Analyzers, and State
5 | A Metric represents a measurement of the data that can change over time, for example the number of rows in a
6 | DataFrame.
7 | 
8 | An Analyzer knows how to calculate a Metric based on some input DataFrame.
9 | 
10 | State is an optimization - it represents the state of the data, from which a metric can be calculated. This
11 | intermediate state can then be used to calculate future metrics more quickly. Check out the examples for some
12 | further details.
13 | 
14 | ## Overall flow of running deequ checks
15 | When running checks, a user specifies a DataFrame and a set of checks to run on it. Many checks in Deequ
16 | are based on metrics which describe the data. To perform the checks the user requests, deequ follows this
17 | process:
18 | * First deequ figures out which Analyzers are required
19 | * Metrics are calculated using those Analyzers
20 |   * Metrics are also stored if a MetricsRepository is provided
21 |   * Intermediate state is stored if a StatePersister is provided
22 |   * Intermediate state is used for metric calculations if a StateLoader is provided
23 | * Checks are evaluated using the calculated Metrics
24 | 
25 | This design is motivated by performance: calculating the metrics together gives deequ the opportunity to
26 | compute them in fewer passes over the data.
27 | 
28 | ### Analyzers
29 | Types of analyzers:
30 | * ScanShareableAnalyzer - an analyzer which computes a metric based on a straight scan over the data, without any
31 | grouping being required
32 | * GroupingAnalyzer - an analyzer that requires the data to be grouped by a set of columns before the metric can be
33 | calculated
34 | 
35 | ### Metrics
36 | A metric includes the following key details:
37 | * name - the name for the type of metric
38 | * entity - the type of entity the metric is recorded against, e.g. a column, a dataset, or multiple columns
39 | * instance - information about this instance of the metric. For example, this could be the name of the column
40 | the metric operates on
41 | * value - the value of the metric at a point in time. The type of this value varies between metrics.
42 | 
43 | #### Metrics storage
44 | Metrics can be stored in a metrics repository. An entry in the repository consists of:
45 | * A resultKey, which is a combination of a timestamp and a map of tags. Typically a user may want to record
46 | things like the data source (e.g. the table name) with the tags. The resultKey can be used to look up stored
47 | metrics
48 | * An analyzerContext, which consists of a map of Analyzers to Metrics
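49 | 
50 | A minimal sketch of this storage flow, assuming a DataFrame `df` and a check `someCheck` (the tag and
51 | table name below are made up):
52 | ```scala
53 | import com.amazon.deequ.VerificationSuite
54 | import com.amazon.deequ.repository.ResultKey
55 | import com.amazon.deequ.repository.memory.InMemoryMetricsRepository
56 | 
57 | val repository = new InMemoryMetricsRepository()
58 | 
59 | // the tags identify where these metrics came from
60 | val resultKey = ResultKey(System.currentTimeMillis(), Map("source" -> "orders_table"))
61 | 
62 | VerificationSuite()
63 |   .onData(df)
64 |   .useRepository(repository)
65 |   .saveOrAppendResult(resultKey)
66 |   .addCheck(someCheck)
67 |   .run()
68 | 
69 | // the resultKey can later be used to look up the stored metrics
70 | val storedMetrics = repository.loadByKey(resultKey)
71 | ```
72 | 
73 | ### State
74 | Please consult the examples or the codebase for more details on State.
75 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/KLLCheckExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.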
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import ExampleUtils.{itemsAsDataframe, withSpark}
20 | import com.amazon.deequ.VerificationSuite
21 | import com.amazon.deequ.analyzers.KLLParameters
22 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
23 | import com.amazon.deequ.constraints.ConstraintStatus
24 | import org.apache.spark.sql.types.DoubleType
25 | 
26 | private[examples] object KLLCheckExample extends App {
27 | 
28 |   withSpark { session =>
29 | 
30 |     val data = itemsAsDataframe(session,
31 |       Item(1, "Thingy A", "awesome thing.", "high", 0),
32 |       Item(2, "Thingy B", "available at http://thingb.com", null, 0),
33 |       Item(3, null, null, "low", 5),
34 |       Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
35 |       Item(5, "Thingy E", null, "high", 12))
36 | 
37 |     val newData = data.select(data("numViews").cast(DoubleType).as("numViews"))
38 | 
39 |     val verificationResult = VerificationSuite()
40 |       .onData(newData)
41 |       .addCheck(
42 |         Check(CheckLevel.Error, "integrity checks")
43 |           // we expect 5 records
44 |           .hasSize(_ == 5)
45 |           // we expect the maximum of 'numViews' to be at most 10
46 |           .hasMax("numViews", _ <= 10)
47 |           // we expect the sketch size to be at least 16
48 |           .kllSketchSatisfies("numViews", _.parameters(1) >= 16,
49 |             kllParameters = Option(KLLParameters(2, 0.64, 2))))
50 |       .run()
51 | 
52 |     if (verificationResult.status == CheckStatus.Success) {
53 |       println("The data passed the test, everything is fine!")
54 |     } else {
55 |       println("We found errors in the data, the following constraints were not satisfied:\n")
56 | 
57 |       val resultsForAllConstraints = verificationResult.checkResults
58 |         .flatMap { case (_, checkResult) => checkResult.constraintResults }
59 | 
60 |       resultsForAllConstraints
61 |         .filter { _.status != ConstraintStatus.Success }
62 |         .foreach { result =>
63 |           println(s"${result.constraint} failed: ${result.message.get}")
64 |         }
65 |     }
66 | 
67 |   }
68 | }
69 | 
70 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/runners/MetricCalculationException.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers.runners 18 | 19 | abstract class MetricCalculationException(message: String) extends Exception(message) 20 | 21 | class MetricCalculationRuntimeException(message: String) 22 | extends MetricCalculationException(message) { 23 | 24 | def this(message: String, cause: Throwable) { 25 | this(message) 26 | initCause(cause) 27 | } 28 | 29 | def this(cause: Throwable) { 30 | this(Option(cause).map(_.toString).orNull, cause) 31 | } 32 | } 33 | 34 | class MetricCalculationPreconditionException(message: String) 35 | extends MetricCalculationException(message) 36 | 37 | 38 | class NoSuchColumnException(message: String) 39 | extends MetricCalculationPreconditionException(message) 40 | 41 | class WrongColumnTypeException(message: String) 42 | extends MetricCalculationPreconditionException(message) 43 | 44 | class NoColumnsSpecifiedException(message: String) 45 | extends MetricCalculationPreconditionException(message) 46 | 47 | class NumberOfSpecifiedColumnsException(message: String) 48 | extends MetricCalculationPreconditionException(message) 49 | 50 | class IllegalAnalyzerParameterException( 51 | message: String) 52 | extends MetricCalculationPreconditionException(message) 53 | 54 | class EmptyStateException(message: String) extends MetricCalculationRuntimeException(message) 55 | 56 | 57 | object MetricCalculationException { 58 | 59 | private[deequ] def getApproxQuantileIllegalParamMessage(quantile: Double): String = { 60 | "Quantile parameter must be in the closed interval [0, 1]. " + 61 | s"Currently, the value is: $quantile!" 62 | } 63 | 64 | private[deequ] def getApproxQuantileIllegalErrorParamMessage(relativeError: Double): String = { 65 | "Relative error parameter must be in the closed interval [0, 1]. " + 66 | s"Currently, the value is: $relativeError!" 67 | } 68 | 69 | def wrapIfNecessary(exception: Throwable) 70 | : MetricCalculationException = { 71 | 72 | exception match { 73 | case error: MetricCalculationException => error 74 | case error: Throwable => new MetricCalculationRuntimeException(error) 75 | } 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/IncrementalMetricsExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import ExampleUtils.{itemsAsDataframe, withSpark} 20 | import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, InMemoryStateProvider, Size} 21 | import com.amazon.deequ.analyzers.runners.AnalysisRunner 22 | 23 | private[examples] object IncrementalMetricsExample extends App { 24 | 25 | /* NOTE: Stateful support is still work in progress, and is therefore not yet integrated into 26 | VerificationSuite. 
We showcase however how to incrementally compute metrics on a growing 27 | dataset using the AnalysisRunner. */ 28 | 29 | withSpark { session => 30 | 31 | val data = itemsAsDataframe(session, 32 | Item(1, "Thingy A", "awesome thing.", "high", 0), 33 | Item(2, "Thingy B", "available tomorrow", "low", 0), 34 | Item(3, "Thing C", null, null, 5)) 35 | 36 | val moreData = itemsAsDataframe(session, 37 | Item(4, "Thingy D", null, "low", 10), 38 | Item(5, "Thingy E", null, "high", 12)) 39 | 40 | 41 | val analysis = Analysis() 42 | .addAnalyzer(Size()) 43 | .addAnalyzer(ApproxCountDistinct("id")) 44 | .addAnalyzer(Completeness("productName")) 45 | .addAnalyzer(Completeness("description")) 46 | 47 | val stateStore = InMemoryStateProvider() 48 | 49 | val metricsForData = AnalysisRunner.run( 50 | data = data, 51 | analysis = analysis, 52 | saveStatesWith = Some(stateStore) // persist the internal state of the computation 53 | ) 54 | 55 | // We update the metrics now from the stored states without having to access the previous data! 56 | val metricsAfterAddingMoreData = AnalysisRunner.run( 57 | data = moreData, 58 | analysis = analysis, 59 | aggregateWith = Some(stateStore) // continue from internal state of the computation 60 | ) 61 | 62 | println("Metrics for the first 3 records:\n") 63 | metricsForData.metricMap.foreach { case (analyzer, metric) => 64 | println(s"\t$analyzer: ${metric.value.get}") 65 | } 66 | 67 | println("\nMetrics after adding 2 more records:\n") 68 | metricsAfterAddingMoreData.metricMap.foreach { case (analyzer, metric) => 69 | println(s"\t$analyzer: ${metric.value.get}") 70 | } 71 | 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/io/DfsUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.io
18 | 
19 | import java.io.{BufferedWriter, OutputStreamWriter}
20 | 
21 | import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
22 | import org.apache.spark.sql.SparkSession
23 | 
24 | private[deequ] object DfsUtils {
25 | 
26 |   /* Helper function to read from a binary file on S3 */
27 |   def readFromFileOnDfs[T](session: SparkSession, path: String)
28 |     (readFunc: FSDataInputStream => T): T = {
29 | 
30 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
31 |     val input = fs.open(qualifiedPath)
32 | 
33 |     try {
34 |       readFunc(input)
35 |     } finally {
36 |       if (input != null) {
37 |         input.close()
38 |       }
39 |     }
40 |   }
41 | 
42 |   /* Helper function to write to a binary file on S3 */
43 |   def writeToFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
44 |     (writeFunc: FSDataOutputStream => Unit): Unit = {
45 | 
46 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
47 |     val output = fs.create(qualifiedPath, overwrite)
48 | 
49 |     try {
50 |       writeFunc(output)
51 |     } finally {
52 |       if (output != null) {
53 |         output.close()
54 |       }
55 |     }
56 |   }
57 | 
58 |   /* Helper function to write to a text file on S3 */
59 |   def writeToTextFileOnDfs(session: SparkSession, path: String, overwrite: Boolean = false)
60 |     (writeFunc: BufferedWriter => Unit): Unit = {
61 | 
62 |     val (fs, qualifiedPath) = asQualifiedPath(session, path)
63 |     val output = fs.create(qualifiedPath, overwrite)
64 | 
65 |     try {
66 |       val writer = new BufferedWriter(new OutputStreamWriter(output))
67 |       writeFunc(writer)
68 |       writer.close()
69 |     } finally {
70 |       if (output != null) {
71 |         output.close()
72 |       }
73 |     }
74 |   }
75 | 
76 |   /* Make sure we write to the correct filesystem, as EMR clusters also have an internal HDFS */
77 |   def asQualifiedPath(session: SparkSession, path: String): (FileSystem, Path) = {
78 |     val hdfsPath = new Path(path)
79 |     val fs = hdfsPath.getFileSystem(session.sparkContext.hadoopConfiguration)
80 |     val qualifiedPath = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
81 | 
82 |     (fs, qualifiedPath)
83 |   }
84 | 
85 | }
86 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategy.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.anomalydetection
18 | 
19 | import breeze.linalg.DenseVector
20 | 
21 | /**
22 |  * Detects anomalies based on the values' rate of change.
23 |  * The order of the difference can be set manually.
24 |  * If it is set to 0, this strategy acts like the [[SimpleThresholdStrategy]].
25 |  *
26 |  * For example, RelativeRateOfChangeStrategy(Some(0.9), Some(1.1), 1)
27 |  * calculates the first discrete difference,
28 |  * and if some point's value changes by more than 10 percent in one time step,
29 |  * it flags that point as an anomaly.
30 |  *
31 |  * @param maxRateDecrease Lower bound of accepted relative change (as new value / old value).
32 |  * @param maxRateIncrease Upper bound of accepted relative change (as new value / old value).
33 |  * @param order Order of the calculated difference.
34 |  *              Set to 1, it calculates the difference between two consecutive values.
35 |  */
36 | case class RelativeRateOfChangeStrategy(
37 |     maxRateDecrease: Option[Double] = None,
38 |     maxRateIncrease: Option[Double] = None,
39 |     order: Int = 1)
40 |   extends BaseChangeStrategy {
41 | 
42 |   /**
43 |     * Calculates the rate of change with respect to the specified order.
44 |     * If the order is set to 1, the resulting value for a point at index i
45 |     * is equal to dataSeries(i) / dataSeries(i - 1).
46 |     * Note that this difference cannot be calculated for the first [[order]] elements in the vector.
47 |     * The resulting vector is therefore smaller by [[order]] elements.
48 |     *
49 |     * @param dataSeries The values contained in a DenseVector[Double]
50 |     * @param order The order of the derivative.
51 |     * @return A vector with the resulting rates of change for all values
52 |     *         except the first [[order]] elements.
53 |     */
54 |   override def diff(dataSeries: DenseVector[Double], order: Int): DenseVector[Double] = {
55 |     require(order > 0, "Order of diff cannot be zero or negative")
56 |     if (dataSeries.length == 0) {
57 |       dataSeries
58 |     } else {
59 |       val valuesRight = dataSeries.slice(order, dataSeries.length)
60 |       val valuesLeft = dataSeries.slice(0, dataSeries.length - order)
61 |       valuesRight / valuesLeft
62 |     }
63 |   }
64 | }
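A short usage sketch of the strategy above (the series values are made up):

import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy

// accept changes between -10% and +10% from one value to the next
val strategy = RelativeRateOfChangeStrategy(Some(0.9), Some(1.1))

// 12.0 -> 18.0 is a +50% change, so the point at index 2 is flagged
val anomalies = strategy.detect(Vector(11.0, 12.0, 18.0, 18.2))

65 | --------------------------------------------------------------------------------
/src/test/scala/com/amazon/deequ/KLL/KLLBenchmark.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.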
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL; 18 | 19 | import com.amazon.deequ.analyzers.QuantileNonSample; 20 | import org.apache.datasketches.kll.KllFloatsSketch; 21 | import org.openjdk.jmh.annotations.Benchmark; 22 | import org.openjdk.jmh.annotations.BenchmarkMode; 23 | import org.openjdk.jmh.annotations.Fork; 24 | import org.openjdk.jmh.annotations.Mode; 25 | import org.openjdk.jmh.annotations.OutputTimeUnit; 26 | import org.openjdk.jmh.infra.Blackhole; 27 | import org.openjdk.jmh.runner.Runner; 28 | import org.openjdk.jmh.runner.RunnerException; 29 | import org.openjdk.jmh.runner.options.Options; 30 | import org.openjdk.jmh.runner.options.OptionsBuilder; 31 | 32 | import java.util.Random; 33 | import java.util.concurrent.TimeUnit; 34 | 35 | @BenchmarkMode(Mode.AverageTime) 36 | @OutputTimeUnit(TimeUnit.MILLISECONDS) 37 | @Fork(value = 2, jvmArgs = {"-Xms2G", "-Xmx2G"}) 38 | public class KLLBenchmark { 39 | 40 | private static final int N = 10_000_000; 41 | 42 | private static float[] DATA_FOR_TESTING = createData(); 43 | 44 | public static void main(String[] args) throws RunnerException { 45 | 46 | Options opt = new OptionsBuilder() 47 | .include(KLLBenchmark.class.getSimpleName()) 48 | .forks(1) 49 | .build(); 50 | 51 | new Runner(opt).run(); 52 | } 53 | 54 | private static float[] createData() { 55 | Random prng = new Random(); 56 | float[] numbers = new float[N]; 57 | for (int i = 0; i < N; i++) { 58 | numbers[i] = prng.nextFloat(); 59 | } 60 | return numbers; 61 | } 62 | 63 | @Benchmark 64 | public void sumArray(Blackhole bh) { 65 | float sum = 0.0f; 66 | for (int i = 0; i < N; i++) { 67 | sum += DATA_FOR_TESTING[i]; 68 | } 69 | bh.consume(sum); 70 | } 71 | 72 | @Benchmark 73 | public void sketchArrayWithKLL(Blackhole bh) { 74 | QuantileNonSample sketch = KLLBenchmarkHelper.floatSketch(); 75 | for (int i = 0; i < N; i++) { 76 | sketch.update(DATA_FOR_TESTING[i]); 77 | } 78 | bh.consume(sketch); 79 | } 80 | 81 | @Benchmark 82 | public void sketchArrayWithJavaSketchesKLL(Blackhole bh) { 83 | KllFloatsSketch sketch = new KllFloatsSketch(); 84 | for (int i = 0; i < N; i++) { 85 | sketch.update(DATA_FOR_TESTING[i]); 86 | } 87 | bh.consume(sketch); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/BasicExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import ExampleUtils.{withSpark, itemsAsDataframe} 20 | import com.amazon.deequ.VerificationSuite 21 | import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus} 22 | import com.amazon.deequ.constraints.ConstraintStatus 23 | 24 | private[examples] object BasicExample extends App { 25 | 26 | withSpark { session => 27 | 28 | val data = itemsAsDataframe(session, 29 | Item(1, "Thingy A", "awesome thing.", "high", 0), 30 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 31 | Item(3, null, null, "low", 5), 32 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 33 | Item(5, "Thingy E", null, "high", 12)) 34 | 35 | val verificationResult = VerificationSuite() 36 | .onData(data) 37 | .addCheck( 38 | Check(CheckLevel.Error, "integrity checks") 39 | // we expect 5 records 40 | .hasSize(_ == 5) 41 | // 'id' should never be NULL 42 | .isComplete("id") 43 | // 'id' should not contain duplicates 44 | .isUnique("id") 45 | // 'productName' should never be NULL 46 | .isComplete("productName") 47 | // 'priority' should only contain the values "high" and "low" 48 | .isContainedIn("priority", Array("high", "low")) 49 | // 'numViews' should not contain negative values 50 | .isNonNegative("numViews")) 51 | .addCheck( 52 | Check(CheckLevel.Warning, "distribution checks") 53 | // at least half of the 'description's should contain a url 54 | .containsURL("description", _ >= 0.5) 55 | // half of the items should have less than 10 'numViews' 56 | .hasApproxQuantile("numViews", 0.5, _ <= 10)) 57 | .run() 58 | 59 | if (verificationResult.status == CheckStatus.Success) { 60 | println("The data passed the test, everything is fine!") 61 | } else { 62 | println("We found errors in the data, the following constraints were not satisfied:\n") 63 | 64 | val resultsForAllConstraints = verificationResult.checkResults 65 | .flatMap { case (_, checkResult) => checkResult.constraintResults } 66 | 67 | resultsForAllConstraints 68 | .filter { _.status != ConstraintStatus.Success } 69 | .foreach { result => 70 | println(s"${result.constraint} failed: ${result.message.get}") 71 | } 72 | } 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/SparkContextSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ
18 | 
19 | import org.apache.spark.SparkContext
20 | import org.apache.spark.sql.{SQLContext, SparkSession}
21 | 
22 | /**
23 |  * To be mixed into tests so that they can use a default Spark context suitable for testing
24 |  */
25 | trait SparkContextSpec {
26 | 
27 |   /**
28 |     * @param testFun thunk to run with SparkSession as an argument
29 |     */
30 |   def withSparkSession(testFun: SparkSession => Any): Unit = {
31 |     val session = setupSparkSession
32 |     try {
33 |       testFun(session)
34 |     } finally {
35 |       /* empty the cache of RDD sizes, as the referenced ids are only valid within a session */
36 |       tearDownSparkSession(session)
37 |     }
38 |   }
39 | 
40 |   /**
41 |     * @param testFun thunk to run with SparkSession and SparkMonitor as an argument for the tests
42 |     *                that would like to get details on spark jobs submitted
43 |     *
44 |     */
45 |   def withMonitorableSparkSession(testFun: (SparkSession, SparkMonitor) => Any): Unit = {
46 |     val monitor = new SparkMonitor
47 |     val session = setupSparkSession
48 |     session.sparkContext.addSparkListener(monitor)
49 |     try {
50 |       testFun(session, monitor)
51 |     } finally {
52 |       tearDownSparkSession(session)
53 |     }
54 |   }
55 | 
56 |   /**
57 |     * @param testFun thunk to run with SparkContext as an argument
58 |     */
59 |   def withSparkContext(testFun: SparkContext => Any) {
60 |     withSparkSession(session => testFun(session.sparkContext))
61 |   }
62 | 
63 |   /**
64 |     * @param testFun thunk to run with SQLContext as an argument
65 |     */
66 |   def withSparkSqlContext(testFun: SQLContext => Any) {
67 |     withSparkSession(session => testFun(session.sqlContext))
68 |   }
69 | 
70 |   /**
71 |     * Sets up a local SparkSession
72 |     *
73 |     * @return sparkSession to be used
74 |     */
75 |   private def setupSparkSession = {
76 |     val session = SparkSession.builder()
77 |       .master("local")
78 |       .appName("test")
79 |       .config("spark.ui.enabled", "false")
80 |       .config("spark.sql.shuffle.partitions", 2.toString)
81 |       .getOrCreate()
82 |     session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir"))
83 |     session
84 |   }
85 | 
86 |   /**
87 |     * Tears down the sparkSession
88 |     *
89 |     * @param session Session to be stopped
90 |     * @return
91 |     */
92 |   private def tearDownSparkSession(session: SparkSession) = {
93 |     session.stop()
94 |     System.clearProperty("spark.driver.port")
95 |   }
96 | 
97 | }
98 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
21 | 
22 | private[examples] object ConstraintSuggestionExample extends App {
23 | 
24 |   withSpark { session =>
25 | 
26 |     // Let's first generate some example data
27 |     val rows = session.sparkContext.parallelize(Seq(
28 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
29 |       RawData("thingA", "5", "DELAYED", "false"),
30 |       RawData("thingB", null, "DELAYED", null),
31 |       RawData("thingC", null, "IN_TRANSIT", "false"),
32 |       RawData("thingD", "1.0", "DELAYED", "true"),
33 |       RawData("thingC", "7.0", "UNKNOWN", null),
34 |       RawData("thingC", "24", "UNKNOWN", null),
35 |       RawData("thingE", "20", "DELAYED", "false"),
36 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
37 |       RawData("thingA", "5", "DELAYED", "false"),
38 |       RawData("thingB", null, "DELAYED", null),
39 |       RawData("thingC", null, "IN_TRANSIT", "false"),
40 |       RawData("thingD", "1.0", "DELAYED", "true"),
41 |       RawData("thingC", "17.0", "UNKNOWN", null),
42 |       RawData("thingC", "22", "UNKNOWN", null),
43 |       RawData("thingE", "23", "DELAYED", "false")
44 |     ))
45 | 
46 |     val data = session.createDataFrame(rows)
47 | 
48 |     // We ask deequ to compute constraint suggestions for us on the data.
49 |     // It will profile the data and then apply a set of rules specified in addConstraintRules()
50 |     // to suggest constraints
51 |     val suggestionResult = ConstraintSuggestionRunner()
52 |       .onData(data)
53 |       .addConstraintRules(Rules.DEFAULT)
54 |       .run()
55 | 
56 |     // We can now investigate the constraints that deequ suggested. We get a textual description
57 |     // and the corresponding scala code for each suggested constraint
58 |     //
59 |     // Note that the constraint suggestion is based on heuristic rules and assumes that the data it
60 |     // is shown is 'static' and correct, which might often not be the case in the real world.
61 |     // Therefore the suggestions should always be manually reviewed before being applied in real
62 |     // deployments.
63 |     suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
64 |       suggestions.foreach { suggestion =>
65 |         println(s"Constraint suggestion for '$column':\t${suggestion.description}\n" +
66 |           s"The corresponding scala code is ${suggestion.codeForConstraint}\n")
67 |       }
68 |     }
69 | 
70 |   }
71 | }
72 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | 20 | import com.amazon.deequ.analyzers.KLLSketch 21 | import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, StatefulApproxQuantile, StatefulHyperloglogPlus} 22 | import org.apache.spark.sql.catalyst.expressions.Literal 23 | 24 | /* Custom aggregation functions used internally by deequ */ 25 | object DeequFunctions { 26 | 27 | private[this] def withAggregateFunction( 28 | func: AggregateFunction, 29 | isDistinct: Boolean = false): Column = { 30 | 31 | Column(func.toAggregateExpression(isDistinct)) 32 | } 33 | 34 | /** Pearson correlation with state */ 35 | def stateful_corr(columnA: String, columnB: String): Column = { 36 | stateful_corr(Column(columnA), Column(columnB)) 37 | } 38 | 39 | /** Pearson correlation with state */ 40 | def stateful_corr(columnA: Column, columnB: Column): Column = withAggregateFunction { 41 | new StatefulCorrelation(columnA.expr, columnB.expr) 42 | } 43 | 44 | /** Standard deviation with state */ 45 | def stateful_stddev_pop(column: String): Column = { 46 | stateful_stddev_pop(Column(column)) 47 | } 48 | 49 | /** Standard deviation with state */ 50 | def stateful_stddev_pop(column: Column): Column = withAggregateFunction { 51 | StatefulStdDevPop(column.expr) 52 | } 53 | 54 | /** Approximate number of distinct values with state via HLL's */ 55 | def stateful_approx_count_distinct(column: String): Column = { 56 | stateful_approx_count_distinct(Column(column)) 57 | } 58 | 59 | /** Approximate number of distinct values with state via HLL's */ 60 | def stateful_approx_count_distinct(column: Column): Column = withAggregateFunction { 61 | StatefulHyperloglogPlus(column.expr) 62 | } 63 | 64 | def stateful_approx_quantile( 65 | column: Column, 66 | relativeError: Double) 67 | : Column = withAggregateFunction { 68 | 69 | StatefulApproxQuantile( 70 | column.expr, 71 | // val relativeError = 1.0D / accuracy inside StatefulApproxQuantile 72 | Literal(1.0 / relativeError), 73 | mutableAggBufferOffset = 0, 74 | inputAggBufferOffset = 0 75 | ) 76 | } 77 | 78 | /** Data type detection with state */ 79 | def stateful_datatype(column: Column): Column = { 80 | val statefulDataType = new StatefulDataType() 81 | statefulDataType(column) 82 | } 83 | 84 | def stateful_kll( 85 | column: Column, 86 | sketchSize: Int, 87 | shrinkingFactor: Double): Column = { 88 | val statefulKLL = new StatefulKLLSketch(sketchSize, shrinkingFactor) 89 | statefulKLL(column) 90 | } 91 | } 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/rules/CategoricalRangeRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions.rules 18 | 19 | import com.amazon.deequ.analyzers.{DataTypeInstances, Histogram} 20 | import com.amazon.deequ.checks.Check 21 | import com.amazon.deequ.constraints.Constraint.complianceConstraint 22 | import com.amazon.deequ.profiles.ColumnProfile 23 | import com.amazon.deequ.suggestions.ConstraintSuggestion 24 | import org.apache.commons.lang3.StringEscapeUtils 25 | 26 | /** If we see a categorical range for a column, we suggest an IS IN (...) constraint */ 27 | case class CategoricalRangeRule() extends ConstraintRule[ColumnProfile] { 28 | 29 | override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { 30 | val hasHistogram = profile.histogram.isDefined && ( 31 | profile.dataType == DataTypeInstances.String || 32 | profile.dataType == DataTypeInstances.Integral 33 | ) 34 | 35 | if (hasHistogram) { 36 | val entries = profile.histogram.get.values 37 | 38 | val numUniqueElements = entries.count { case (_, value) => value.absolute == 1L } 39 | 40 | val uniqueValueRatio = numUniqueElements.toDouble / entries.size 41 | 42 | // TODO find a principled way to define this threshold... 43 | uniqueValueRatio <= 0.1 44 | } else { 45 | false 46 | } 47 | } 48 | 49 | override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { 50 | 51 | val valuesByPopularity = profile.histogram.get.values.toArray 52 | .filterNot { case (key, _) => key == Histogram.NullFieldReplacement } 53 | .sortBy { case (_, value) => value.absolute } 54 | .reverse 55 | 56 | val categoriesSql = valuesByPopularity 57 | // the character "'" can be contained in category names 58 | .map { case (key, _) => key.replace("'", "''") } 59 | .mkString("'", "', '", "'") 60 | 61 | val categoriesCode = valuesByPopularity 62 | .map { case (key, _) => StringEscapeUtils.escapeJava(key) } 63 | .mkString(""""""", """", """", """"""") 64 | 65 | val description = s"'${profile.column}' has value range $categoriesSql" 66 | val columnCondition = s"`${profile.column}` IN ($categoriesSql)" 67 | val constraint = complianceConstraint(description, columnCondition, Check.IsOne) 68 | 69 | ConstraintSuggestion( 70 | constraint, 71 | profile.column, 72 | "Compliance: 1", 73 | description, 74 | this, 75 | s""".isContainedIn("${profile.column}", Array($categoriesCode))""" 76 | ) 77 | } 78 | 79 | override val ruleDescription: String = "If we see a categorical range for a " + 80 | "column, we suggest an IS IN (...) constraint" 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 |  *
15 |  */
16 | 
17 | package com.amazon.deequ.examples
18 | 
19 | import com.amazon.deequ.examples.ExampleUtils.withSpark
20 | import com.amazon.deequ.profiles.{ColumnProfilerRunner, NumericColumnProfile}
21 | 
22 | case class RawData(productName: String, totalNumber: String, status: String, valuable: String)
23 | 
24 | private[examples] object DataProfilingExample extends App {
25 | 
26 |   withSpark { session =>
27 | 
28 |     /* We profile raw data, mostly in string format (e.g., from a csv file) */
29 |     val rows = session.sparkContext.parallelize(Seq(
30 |       RawData("thingA", "13.0", "IN_TRANSIT", "true"),
31 |       RawData("thingA", "5", "DELAYED", "false"),
32 |       RawData("thingB", null, "DELAYED", null),
33 |       RawData("thingC", null, "IN_TRANSIT", "false"),
34 |       RawData("thingD", "1.0", "DELAYED", "true"),
35 |       RawData("thingC", "7.0", "UNKNOWN", null),
36 |       RawData("thingC", "20", "UNKNOWN", null),
37 |       RawData("thingE", "20", "DELAYED", "false")
38 |     ))
39 | 
40 |     val rawData = session.createDataFrame(rows)
41 | 
42 |     /* Make deequ profile this data. It will execute three passes over the data and avoid
43 |        any shuffles. */
44 |     val result = ColumnProfilerRunner()
45 |       .onData(rawData)
46 |       .run()
47 | 
48 |     /* We get a profile for each column which allows us to inspect the completeness of the column,
49 |        the approximate number of distinct values and the inferred datatype. */
50 |     result.profiles.foreach { case (columnName, profile) =>
51 | 
52 |       println(s"Column '$columnName':\n " +
53 |         s"\tcompleteness: ${profile.completeness}\n" +
54 |         s"\tapproximate number of distinct values: ${profile.approximateNumDistinctValues}\n" +
55 |         s"\tdatatype: ${profile.dataType}\n")
56 |     }
57 | 
58 |     /* For numeric columns, we get descriptive statistics */
59 |     val totalNumberProfile = result.profiles("totalNumber").asInstanceOf[NumericColumnProfile]
60 | 
61 |     println(s"Statistics of 'totalNumber':\n" +
62 |       s"\tminimum: ${totalNumberProfile.minimum.get}\n" +
63 |       s"\tmaximum: ${totalNumberProfile.maximum.get}\n" +
64 |       s"\tmean: ${totalNumberProfile.mean.get}\n" +
65 |       s"\tstandard deviation: ${totalNumberProfile.stdDev.get}\n")
66 | 
67 |     val statusProfile = result.profiles("status")
68 | 
69 |     /* For columns with a low number of distinct values, we get the full value distribution. */
70 |     println("Value distribution in 'status':")
71 |     statusProfile.histogram.foreach {
72 |       _.values.foreach { case (key, entry) =>
73 |         println(s"\t$key occurred ${entry.absolute} times (ratio is ${entry.ratio})")
74 |       }
75 |     }
76 | 
77 |   }
78 | }
79 | --------------------------------------------------------------------------------
/src/main/scala/com/amazon/deequ/analyzers/Distance.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
5 |  * use this file except in compliance with the License. A copy of the License
6 |  * is located at
7 |  *
8 |  *     http://aws.amazon.com/apache2.0/
9 |  *
10 |  * or in the "license" file accompanying this file. This file is distributed on
11 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 |  * express or implied. See the License for the specific language governing
13 |  * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | object Distance { 20 | 21 | /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */ 22 | def numericalDistance( 23 | sample1: QuantileNonSample[Double], 24 | sample2: QuantileNonSample[Double], 25 | correctForLowNumberOfSamples: Boolean = false) 26 | : Double = { 27 | val rankMap1 = sample1.getRankMap() 28 | val rankMap2 = sample2.getRankMap() 29 | val combinedKeys = rankMap1.keySet.union(rankMap2.keySet) 30 | val n = rankMap1.valuesIterator.max.toDouble 31 | val m = rankMap2.valuesIterator.max.toDouble 32 | var linfSimple = 0.0 33 | 34 | combinedKeys.foreach { key => 35 | val cdf1 = sample1.getRank(key, rankMap1) / n 36 | val cdf2 = sample2.getRank(key, rankMap2) / m 37 | val cdfDiff = Math.abs(cdf1 - cdf2) 38 | linfSimple = Math.max(linfSimple, cdfDiff) 39 | } 40 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples) 41 | } 42 | 43 | /** Calculate distance of categorical profiles based on L-Infinity Distance */ 44 | def categoricalDistance( 45 | sample1: scala.collection.mutable.Map[String, Long], 46 | sample2: scala.collection.mutable.Map[String, Long], 47 | correctForLowNumberOfSamples: Boolean = false) 48 | : Double = { 49 | 50 | var n = 0.0 51 | var m = 0.0 52 | sample1.keySet.foreach { key => 53 | n += sample1(key) 54 | } 55 | sample2.keySet.foreach { key => 56 | m += sample2(key) 57 | } 58 | val combinedKeys = sample1.keySet.union(sample2.keySet) 59 | var linfSimple = 0.0 60 | 61 | combinedKeys.foreach { key => 62 | val cdf1 = sample1.getOrElse(key, 0L) / n 63 | val cdf2 = sample2.getOrElse(key, 0L) / m 64 | val cdfDiff = Math.abs(cdf1 - cdf2) 65 | linfSimple = Math.max(linfSimple, cdfDiff) 66 | } 67 | selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples) 68 | } 69 | 70 | /** Select which metrics to compute (linf_simple or linf_robust) 71 | * based on whether samples are enough */ 72 | private[this] def selectMetrics( 73 | linfSimple: Double, 74 | n: Double, 75 | m: Double, 76 | correctForLowNumberOfSamples: Boolean = false) 77 | : Double = { 78 | if (correctForLowNumberOfSamples) { 79 | linfSimple 80 | } else { 81 | // This formula is based on “Two-sample Kolmogorov–Smirnov test" 82 | // Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test 83 | val linfRobust = Math.max(0.0, linfSimple - 1.8 * Math.sqrt((n + m) / (n * m))) 84 | linfRobust 85 | } 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.KLL 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.analyzers.{Distance, QuantileNonSample} 21 | import com.amazon.deequ.utils.FixtureSupport 22 | import org.scalatest.{Matchers, WordSpec} 23 | 24 | class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec 25 | with FixtureSupport { 26 | 27 | "KLL distance calculator should compute correct linf_simple" in { 28 | val sample1 = new QuantileNonSample[Double](4, 0.64) 29 | val sample2 = new QuantileNonSample[Double](4, 0.64) 30 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) 31 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) 32 | assert(Distance.numericalDistance(sample1, sample2, true) == 0.25) 33 | } 34 | 35 | "KLL distance calculator should compute correct linf_robust" in { 36 | val sample1 = new QuantileNonSample[Double](4, 0.64) 37 | val sample2 = new QuantileNonSample[Double](4, 0.64) 38 | sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) 39 | sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) 40 | assert(Distance.numericalDistance(sample1, sample2) == 0.0) 41 | } 42 | 43 | "Categorical distance should compute correct linf_simple" in { 44 | val sample1 = scala.collection.mutable.Map( 45 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 46 | val sample2 = scala.collection.mutable.Map( 47 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 48 | assert(Distance.categoricalDistance(sample1, 49 | sample2, true) == 0.06015037593984962) 50 | } 51 | 52 | "Categorical distance should compute correct linf_robust" in { 53 | val sample1 = scala.collection.mutable.Map( 54 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 55 | val sample2 = scala.collection.mutable.Map( 56 | "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 57 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0) 58 | } 59 | 60 | "Categorical distance should compute correct linf_simple with different bin value" in { 61 | val sample1 = scala.collection.mutable.Map( 62 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 63 | val sample2 = scala.collection.mutable.Map( 64 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 65 | assert(Distance.categoricalDistance(sample1, 66 | sample2, true) == 0.2857142857142857) 67 | } 68 | 69 | "Categorical distance should compute correct linf_robust with different bin value" in { 70 | val sample1 = scala.collection.mutable.Map( 71 | "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) 72 | val sample2 = scala.collection.mutable.Map( 73 | "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 74 | assert(Distance.categoricalDistance(sample1, sample2) == 0.0) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulDataType.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied.
See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import com.amazon.deequ.analyzers.DataTypeHistogram 20 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 21 | import org.apache.spark.sql.types._ 22 | 23 | import scala.util.matching.Regex 24 | 25 | 26 | private[sql] class StatefulDataType extends UserDefinedAggregateFunction { 27 | 28 | val SIZE_IN_BYTES = 40 29 | 30 | val NULL_POS = 0 31 | val FRACTIONAL_POS = 1 32 | val INTEGRAL_POS = 2 33 | val BOOLEAN_POS = 3 34 | val STRING_POS = 4 35 | 36 | val FRACTIONAL: Regex = """^(-|\+)? ?\d+((\.\d+)|((?:\.\d+)?[Ee][-+]?\d+))$""".r 37 | val INTEGRAL: Regex = """^(-|\+)? ?\d+$""".r 38 | val BOOLEAN: Regex = """^(true|false)$""".r 39 | 40 | override def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil) 41 | 42 | override def bufferSchema: StructType = StructType(StructField("null", LongType) :: 43 | StructField("fractional", LongType) :: StructField("integral", LongType) :: 44 | StructField("boolean", LongType) :: StructField("string", LongType) :: Nil) 45 | 46 | override def dataType: types.DataType = BinaryType 47 | 48 | override def deterministic: Boolean = true 49 | 50 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 51 | buffer(NULL_POS) = 0L 52 | buffer(FRACTIONAL_POS) = 0L 53 | buffer(INTEGRAL_POS) = 0L 54 | buffer(BOOLEAN_POS) = 0L 55 | buffer(STRING_POS) = 0L 56 | } 57 | 58 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 59 | if (input.isNullAt(0)) { 60 | buffer(NULL_POS) = buffer.getLong(NULL_POS) + 1L 61 | } else { 62 | input.getString(0) match { 63 | case FRACTIONAL(_*) => buffer(FRACTIONAL_POS) = buffer.getLong(FRACTIONAL_POS) + 1L 64 | case INTEGRAL(_*) => buffer(INTEGRAL_POS) = buffer.getLong(INTEGRAL_POS) + 1L 65 | case BOOLEAN(_*) => buffer(BOOLEAN_POS) = buffer.getLong(BOOLEAN_POS) + 1L 66 | case _ => buffer(STRING_POS) = buffer.getLong(STRING_POS) + 1L 67 | } 68 | } 69 | } 70 | 71 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 72 | buffer1(NULL_POS) = buffer1.getLong(NULL_POS) + buffer2.getLong(NULL_POS) 73 | buffer1(FRACTIONAL_POS) = buffer1.getLong(FRACTIONAL_POS) + buffer2.getLong(FRACTIONAL_POS) 74 | buffer1(INTEGRAL_POS) = buffer1.getLong(INTEGRAL_POS) + buffer2.getLong(INTEGRAL_POS) 75 | buffer1(BOOLEAN_POS) = buffer1.getLong(BOOLEAN_POS) + buffer2.getLong(BOOLEAN_POS) 76 | buffer1(STRING_POS) = buffer1.getLong(STRING_POS) + buffer2.getLong(STRING_POS) 77 | } 78 | 79 | override def evaluate(buffer: Row): Any = { 80 | DataTypeHistogram.toBytes(buffer.getLong(NULL_POS), buffer.getLong(FRACTIONAL_POS), 81 | buffer.getLong(INTEGRAL_POS), buffer.getLong(BOOLEAN_POS), buffer.getLong(STRING_POS)) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/com/amazon/deequ/analyzers/UniquenessTest.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. 
This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.SparkContextSpec 20 | import com.amazon.deequ.analyzers.runners.AnalysisRunner 21 | import com.amazon.deequ.metrics.DoubleMetric 22 | import com.amazon.deequ.utils.FixtureSupport 23 | import org.apache.spark.sql.{DataFrame, SparkSession} 24 | import org.scalatest.matchers.should.Matchers 25 | import org.scalatest.wordspec.AnyWordSpec 26 | 27 | class UniquenessTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { 28 | 29 | def uniquenessSampleData(sparkSession: SparkSession): DataFrame = { 30 | import sparkSession.implicits._ 31 | 32 | // Example from https://github.com/awslabs/deequ/issues/178 33 | Seq( 34 | ("India", "Xavier House, 2nd Floor", "St. Peter Colony, Perry Road", "Bandra (West)"), 35 | ("India", "503 Godavari", "Sir Pochkhanwala Road", "Worli"), 36 | ("India", "4/4 Seema Society", "N Dutta Road, Four Bungalows", "Andheri"), 37 | ("India", "1001D Abhishek Apartments", "Juhu Versova Road", "Andheri"), 38 | ("India", "95, Hill Road", null, null), 39 | ("India", "90 Cuffe Parade", "Taj President Hotel", "Cuffe Parade"), 40 | ("India", "4, Seven PM", "Sir Pochkhanwala Rd", "Worli"), 41 | ("India", "1453 Sahar Road", null, null) 42 | ) 43 | .toDF("Country", "Address Line 1", "Address Line 2", "Address Line 3") 44 | } 45 | 46 | "Uniqueness" should { 47 | 48 | "be correct for multiple fields" in withSparkSession { session => 49 | 50 | val data = uniquenessSampleData(session) 51 | 52 | val stateStore = InMemoryStateProvider() 53 | 54 | val uniquenessA1 = Uniqueness("Address Line 1") 55 | val uniquenessA13 = Uniqueness(Seq("Address Line 1", "Address Line 2", "Address Line 3")) 56 | 57 | val analysis = Analysis(Seq(uniquenessA1, uniquenessA13)) 58 | 59 | val result = AnalysisRunner.run(data, analysis, saveStatesWith = Some(stateStore)) 60 | 61 | assert(result.metric(uniquenessA1).get.asInstanceOf[DoubleMetric].value.get == 1.0) 62 | assert(result.metric(uniquenessA13).get.asInstanceOf[DoubleMetric].value.get == 1.0) 63 | } 64 | } 65 | 66 | "Filtered Uniqueness" in withSparkSession { sparkSession => 67 | import sparkSession.implicits._ 68 | val df = Seq( 69 | ("1", "unique"), 70 | ("2", "unique"), 71 | ("3", "duplicate"), 72 | ("3", "duplicate"), 73 | ("4", "unique") 74 | ).toDF("value", "type") 75 | 76 | val stateStore = InMemoryStateProvider() 77 | 78 | val uniqueness = Uniqueness("value") 79 | val uniquenessWithFilter = Uniqueness(Seq("value"), Some("type = 'unique'")) 80 | 81 | val analysis = Analysis(Seq(uniqueness, uniquenessWithFilter)) 82 | 83 | val result = AnalysisRunner.run(df, analysis, saveStatesWith = Some(stateStore)) 84 | 85 | assert(result.metric(uniqueness).get.asInstanceOf[DoubleMetric].value.get == 0.6) 86 | assert(result.metric(uniquenessWithFilter).get.asInstanceOf[DoubleMetric].value.get == 1.0) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/Correlation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). 
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} 20 | import com.amazon.deequ.metrics.Entity 21 | import org.apache.spark.sql.DeequFunctions.stateful_corr 22 | import org.apache.spark.sql.{Column, Row} 23 | import org.apache.spark.sql.types.StructType 24 | import Analyzers._ 25 | 26 | case class CorrelationState( 27 | n: Double, 28 | xAvg: Double, 29 | yAvg: Double, 30 | ck: Double, 31 | xMk: Double, 32 | yMk: Double) 33 | extends DoubleValuedState[CorrelationState] { 34 | 35 | require(n > 0.0, "Correlation undefined for n = 0.") 36 | 37 | override def sum(other: CorrelationState): CorrelationState = { 38 | val n1 = n 39 | val n2 = other.n 40 | val newN = n1 + n2 41 | val dx = other.xAvg - xAvg 42 | val dxN = if (newN == 0.0) 0.0 else dx / newN 43 | val dy = other.yAvg - yAvg 44 | val dyN = if (newN == 0.0) 0.0 else dy / newN 45 | val newXAvg = xAvg + dxN * n2 46 | val newYAvg = yAvg + dyN * n2 47 | val newCk = ck + other.ck + dx * dyN * n1 * n2 48 | val newXMk = xMk + other.xMk + dx * dxN * n1 * n2 49 | val newYMk = yMk + other.yMk + dy * dyN * n1 * n2 50 | 51 | CorrelationState(newN, newXAvg, newYAvg, newCk, newXMk, newYMk) 52 | } 53 | 54 | override def metricValue(): Double = { 55 | ck / math.sqrt(xMk * yMk) 56 | } 57 | } 58 | 59 | /** 60 | * Computes the Pearson correlation coefficient between the two given columns. 61 | * @param firstColumn First input column for computation 62 | * @param secondColumn Second input column for computation 63 | * @param where Additional filter to apply before the analyzer is run 64 | */ 65 | case class Correlation( 66 | firstColumn: String, 67 | secondColumn: String, 68 | where: Option[String] = None) 69 | extends StandardScanShareableAnalyzer[CorrelationState]("Correlation", 70 | s"$firstColumn,$secondColumn", Entity.Mutlicolumn) 71 | with FilterableAnalyzer { 72 | 73 | override def aggregationFunctions(): Seq[Column] = { 74 | 75 | val firstSelection = conditionalSelection(firstColumn, where) 76 | val secondSelection = conditionalSelection(secondColumn, where) 77 | 78 | stateful_corr(firstSelection, secondSelection) :: Nil 79 | } 80 | 81 | override def fromAggregationResult(result: Row, offset: Int): Option[CorrelationState] = { 82 | 83 | if (result.isNullAt(offset)) { 84 | None 85 | } else { 86 | val row = result.getAs[Row](offset) 87 | val n = row.getDouble(0) 88 | if (n > 0.0) { 89 | Some(CorrelationState( 90 | n, 91 | row.getDouble(1), 92 | row.getDouble(2), 93 | row.getDouble(3), 94 | row.getDouble(4), 95 | row.getDouble(5))) 96 | } else { 97 | None 98 | } 99 | } 100 | } 101 | 102 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 103 | hasColumn(firstColumn) :: isNumeric(firstColumn) :: hasColumn(secondColumn) :: 104 | isNumeric(secondColumn) :: Nil 105 | } 106 | 107 | override def filterCondition: Option[String] = where 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/MetricsRepositoryExample.scala:
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import java.io.File 20 | 21 | import com.amazon.deequ.VerificationSuite 22 | import com.amazon.deequ.analyzers.Completeness 23 | import com.amazon.deequ.checks.{Check, CheckLevel} 24 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark} 25 | import com.amazon.deequ.repository.fs.FileSystemMetricsRepository 26 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey} 27 | import com.google.common.io.Files 28 | 29 | object MetricsRepositoryExample extends App { 30 | 31 | withSpark { session => 32 | 33 | // The toy data on which we will compute metrics 34 | val data = itemsAsDataframe(session, 35 | Item(1, "Thingy A", "awesome thing.", "high", 0), 36 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 37 | Item(3, null, null, "low", 5), 38 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 39 | Item(5, "Thingy E", null, "high", 12)) 40 | 41 | // A json file in which the computed metrics will be stored 42 | val metricsFile = new File(Files.createTempDir(), "metrics.json") 43 | 44 | // The repository which we will use to store and load computed metrics; we use the local disk, 45 | // but it also supports HDFS and S3 46 | val repository: MetricsRepository = 47 | FileSystemMetricsRepository(session, metricsFile.getAbsolutePath) 48 | 49 | // The key under which we store the results; it needs a timestamp and supports arbitrary 50 | // tags in the form of key-value pairs 51 | val resultKey = ResultKey(System.currentTimeMillis(), Map("tag" -> "repositoryExample")) 52 | 53 | VerificationSuite() 54 | .onData(data) 55 | // Some integrity checks 56 | .addCheck(Check(CheckLevel.Error, "integrity checks") 57 | .hasSize(_ == 5) 58 | .isComplete("id") 59 | .isComplete("productName") 60 | .isContainedIn("priority", Array("high", "low")) 61 | .isNonNegative("numViews")) 62 | // We want to store the computed metrics for the checks in our repository 63 | .useRepository(repository) 64 | .saveOrAppendResult(resultKey) 65 | .run() 66 | 67 | // We can now retrieve the metrics from the repository in different ways, e.g.: 68 | 69 | 70 | // We can load the metric for a particular analyzer stored under our result key: 71 | val completenessOfProductName = repository 72 | .loadByKey(resultKey).get 73 | .metric(Completeness("productName")).get 74 | 75 | println(s"The completeness of the productName column is: $completenessOfProductName") 76 | 77 | // We can query the repository for all metrics from the last 10 minutes and get them as json 78 | val json = repository.load() 79 | .after(System.currentTimeMillis() - 10 * 60 * 1000) 80 | .getSuccessMetricsAsJson() 81 | 82 | println(s"Metrics from the last 10 minutes:\n$json") 83 | 84 | // Finally we can also query by tag value and retrieve the result in the form of a
DataFrame 85 | repository.load() 86 | .withTagValues(Map("tag" -> "repositoryExample")) 87 | .getSuccessMetricsAsDataFrame(session) 88 | .show() 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/examples/AnomalyDetectionExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.examples 18 | 19 | import com.amazon.deequ.VerificationSuite 20 | import com.amazon.deequ.analyzers.Size 21 | import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy 22 | import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark} 23 | import com.amazon.deequ.repository.ResultKey 24 | import com.amazon.deequ.repository.memory.InMemoryMetricsRepository 25 | import com.amazon.deequ.checks.CheckStatus._ 26 | 27 | private[examples] object AnomalyDetectionExample extends App { 28 | 29 | withSpark { session => 30 | 31 | /* In this simple example, we assume that we compute metrics on a dataset every day and we want 32 | to ensure that they don't change drastically. For the sake of simplicity, we just look at the 33 | size of the data */ 34 | 35 | /* Anomaly detection operates on metrics stored in a metric repository, so let's create one */ 36 | val metricsRepository = new InMemoryMetricsRepository() 37 | 38 | /* This is the key which we use to store the metrics for the dataset from yesterday */ 39 | val yesterdaysKey = ResultKey(System.currentTimeMillis() - 24 * 60 * 60 * 1000) 40 | 41 | /* Yesterday, the data had only two rows */ 42 | val yesterdaysDataset = itemsAsDataframe(session, 43 | Item(1, "Thingy A", "awesome thing.", "high", 0), 44 | Item(2, "Thingy B", "available at http://thingb.com", null, 0)) 45 | 46 | /* We test for anomalies in the size of the data; it should not increase by more than 2x.
Note 47 | that we store the resulting metrics in our repository */ 48 | VerificationSuite() 49 | .onData(yesterdaysDataset) 50 | .useRepository(metricsRepository) 51 | .saveOrAppendResult(yesterdaysKey) 52 | .addAnomalyCheck( 53 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)), 54 | Size() 55 | ) 56 | .run() 57 | 58 | /* Today's data has five rows, so the data size more than doubled and our anomaly check should 59 | catch this */ 60 | val todaysDataset = itemsAsDataframe(session, 61 | Item(1, "Thingy A", "awesome thing.", "high", 0), 62 | Item(2, "Thingy B", "available at http://thingb.com", null, 0), 63 | Item(3, null, null, "low", 5), 64 | Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), 65 | Item(5, "Thingy E", null, "high", 12)) 66 | 67 | /* The key for today's result */ 68 | val todaysKey = ResultKey(System.currentTimeMillis()) 69 | 70 | /* Repeat the anomaly check for today's data */ 71 | val verificationResult = VerificationSuite() 72 | .onData(todaysDataset) 73 | .useRepository(metricsRepository) 74 | .saveOrAppendResult(todaysKey) 75 | .addAnomalyCheck( 76 | RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)), 77 | Size() 78 | ) 79 | .run() 80 | 81 | /* Did we find an anomaly? */ 82 | if (verificationResult.status != Success) { 83 | println("Anomaly detected in the Size() metric!") 84 | 85 | /* Let's have a look at the actual metrics. */ 86 | metricsRepository 87 | .load() 88 | .forAnalyzers(Seq(Size())) 89 | .getSuccessMetricsAsDataFrame(session) 90 | .show() 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} 21 | import org.apache.spark.sql.{Column, Row} 22 | import org.apache.spark.sql.functions.{col, lit, regexp_extract, sum, when} 23 | import org.apache.spark.sql.types.{IntegerType, StructType} 24 | 25 | import scala.util.matching.Regex 26 | 27 | /** 28 | * PatternMatch is a measure of the fraction of rows that comply with a given 29 | * column regex constraint. E.g., if the constraint is Patterns.CREDITCARD and the 30 | * data frame has 5 rows which contain a credit card number in a certain column 31 | * according to the regex and 10 rows that do not, a DoubleMetric would be 32 | * returned with 0.33 as its value 33 | * 34 | * @param column Column to do the pattern match analysis on 35 | * @param pattern The regular expression to check for 36 | * @param where Additional filter to apply before the analyzer is run.
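* * A minimal usage sketch (the column name "email", the filter, and the DataFrame df are illustrative assumptions, not part of the original file; Patterns.EMAIL is defined further below): * {{{ * // yields a DoubleMetric with the fraction of rows whose "email" value matches the pattern * val emailMatchRatio = PatternMatch("email", Patterns.EMAIL, where = Some("country = 'US'")).calculate(df) * }}}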
37 | */ 38 | case class PatternMatch(column: String, pattern: Regex, where: Option[String] = None) 39 | extends StandardScanShareableAnalyzer[NumMatchesAndCount]("PatternMatch", column) 40 | with FilterableAnalyzer { 41 | 42 | override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { 43 | ifNoNullsIn(result, offset, howMany = 2) { _ => 44 | NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1)) 45 | } 46 | } 47 | 48 | override def aggregationFunctions(): Seq[Column] = { 49 | 50 | val expression = when(regexp_extract(col(column), pattern.toString(), 0) =!= lit(""), 1) 51 | .otherwise(0) 52 | 53 | val summation = sum(conditionalSelection(expression, where).cast(IntegerType)) 54 | 55 | summation :: conditionalCount(where) :: Nil 56 | } 57 | 58 | override def filterCondition: Option[String] = where 59 | 60 | override protected def additionalPreconditions(): Seq[StructType => Unit] = { 61 | hasColumn(column) :: isString(column) :: Nil 62 | } 63 | } 64 | 65 | object Patterns { 66 | 67 | // scalastyle:off 68 | // http://emailregex.com 69 | val EMAIL: Regex = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""".r 70 | 71 | // https://mathiasbynens.be/demo/url-regex stephenhay 72 | val URL: Regex = """(https?|ftp)://[^\s/$.?#].[^\s]*""".r 73 | 74 | val SOCIAL_SECURITY_NUMBER_US: Regex = """((?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}-(?!00)\d{2}-(?!0{4})\d{4})|((?!219 09 9999|078 05 1120)(?!666|000|9\d{2})\d{3} (?!00)\d{2} (?!0{4})\d{4})|((?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4})""".r 75 | 76 | // Visa, MasterCard, AMEX, Diners Club 77 | // http://www.richardsramblings.com/regex/credit-card-numbers/ 78 | val CREDITCARD: Regex = """\b(?:3[47]\d{2}([\ \-]?)\d{6}\1\d|(?:(?:4\d|5[1-5]|65)\d{2}|6011)([\ \-]?)\d{4}\2\d{4}\2)\d{4}\b""".r 79 | // scalastyle:on 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/catalyst/StatefulKLLSketch.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import java.nio.ByteBuffer 20 | 21 | import com.amazon.deequ.analyzers.QuantileNonSample 22 | import com.amazon.deequ.analyzers.catalyst.KLLSketchSerializer 23 | import com.google.common.primitives.Doubles 24 | 25 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 26 | import org.apache.spark.sql.types._ 27 | 28 | 29 | private[sql] class StatefulKLLSketch( 30 | sketchSize: Int, 31 | shrinkingFactor: Double) 32 | extends UserDefinedAggregateFunction { 33 | 34 | val OBJECT_POS = 0 35 | val MIN_POS = 1 36 | val MAX_POS = 2 37 | 38 | override def inputSchema: StructType = StructType(StructField("value", DoubleType) :: Nil) 39 | 40 | override def bufferSchema: StructType = StructType(StructField("data", BinaryType) :: 41 | StructField("minimum", DoubleType) :: StructField("maximum", DoubleType) :: Nil) 42 | 43 | override def dataType: DataType = BinaryType 44 | 45 | override def deterministic: Boolean = true 46 | 47 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 48 | val qsketch = new QuantileNonSample[Double](sketchSize, shrinkingFactor) 49 | buffer(OBJECT_POS) = serialize(qsketch) 50 | buffer(MIN_POS) = Double.MaxValue // neutral element, so any observed value becomes the minimum 51 | buffer(MAX_POS) = Double.MinValue // neutral element, so any observed value becomes the maximum 52 | } 53 | 54 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 55 | if (input.isNullAt(0)) { 56 | return 57 | } 58 | 59 | val value = input.getDouble(0) 60 | val kll = deserialize(buffer.getAs[Array[Byte]](OBJECT_POS)) 61 | kll.update(value) 62 | buffer(OBJECT_POS) = serialize(kll) 63 | buffer(MIN_POS) = Math.min(buffer.getDouble(MIN_POS), value) 64 | buffer(MAX_POS) = Math.max(buffer.getDouble(MAX_POS), value) 65 | } 66 | 67 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 68 | if (buffer2.isNullAt(OBJECT_POS)) { 69 | return 70 | } 71 | 72 | val kllThis = deserialize(buffer1.getAs[Array[Byte]](OBJECT_POS)) 73 | val kllOther = deserialize(buffer2.getAs[Array[Byte]](OBJECT_POS)) 74 | val kllMerged = kllThis.merge(kllOther) 75 | buffer1(OBJECT_POS) = serialize(kllMerged) 76 | buffer1(MIN_POS) = Math.min(buffer1.getDouble(MIN_POS), buffer2.getDouble(MIN_POS)) 77 | buffer1(MAX_POS) = Math.max(buffer1.getDouble(MAX_POS), buffer2.getDouble(MAX_POS)) 78 | } 79 | 80 | override def evaluate(buffer: Row): Any = { 81 | toBytes(buffer.getDouble(MIN_POS), 82 | buffer.getDouble(MAX_POS), 83 | buffer.getAs[Array[Byte]](OBJECT_POS)) 84 | } 85 | 86 | def toBytes(min: Double, max: Double, obj: Array[Byte]): Array[Byte] = { 87 | val buffer2 = ByteBuffer.wrap(new Array(Doubles.BYTES + Doubles.BYTES + obj.length)) 88 | buffer2.putDouble(min) 89 | buffer2.putDouble(max) 90 | buffer2.put(obj) 91 | buffer2.array() 92 | } 93 | 94 | def serialize(obj: QuantileNonSample[Double]): Array[Byte] = { 95 | KLLSketchSerializer.serializer.serialize(obj) 96 | } 97 | 98 | def deserialize(bytes: Array[Byte]): QuantileNonSample[Double] = { 99 | KLLSketchSerializer.serializer.deserialize(bytes) 100 | } 101 | } 102 | 103 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community.
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/deequ/issues), or [recently closed](https://github.com/awslabs/deequ/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Please ensure that your code follows our [code conventions](https://github.com/databricks/scala-style-guide), which we adopted from Apache Spark. 35 | 4. Ensure local tests pass. 36 | 5. Commit to your fork using clear commit messages. 37 | 6. Send us a pull request, answering any default questions in the pull request interface. 38 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 39 | 40 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 41 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 42 | 43 | 44 | ## Finding contributions to work on 45 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/deequ/labels/help%20wanted) issues is a great place to start. 46 | 47 | 48 | ## Understanding the existing codebase 49 | You may find the [documentation on the key concepts](/docs/key-concepts.md) in the codebase helpful. 50 | 51 | ## Code of Conduct 52 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 53 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 54 | opensource-codeofconduct@amazon.com with any additional questions or comments. 55 | 56 | 57 | ## Security issue notifications 58 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
59 | 60 | 61 | ## Licensing 62 | 63 | See the [LICENSE](https://github.com/awslabs/deequ/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 64 | 65 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 66 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.anomalydetection 18 | 19 | import breeze.stats.meanAndVariance 20 | 21 | 22 | /** 23 | * Detects anomalies based on the mean and standard deviation of all available values. 24 | * Assumes that the data is normally distributed. 25 | * 26 | * @param lowerDeviationFactor Detect anomalies if they are 27 | * smaller than mean - lowerDeviationFactor * stdDev 28 | * @param upperDeviationFactor Detect anomalies if they are 29 | * bigger than mean + upperDeviationFactor * stdDev 30 | * @param includeInterval Whether or not values inside the detection interval should be 31 | * included in the calculation of the mean/stdDev 32 | */ 33 | case class BatchNormalStrategy( 34 | lowerDeviationFactor: Option[Double] = Some(3.0), 35 | upperDeviationFactor: Option[Double] = Some(3.0), 36 | includeInterval: Boolean = false) extends AnomalyDetectionStrategy { 37 | 38 | require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined, 39 | "At least one factor has to be specified.") 40 | 41 | require(lowerDeviationFactor.getOrElse(1.0) >= 0 && upperDeviationFactor.getOrElse(1.0) >= 0, 42 | "Factors cannot be smaller than zero.") 43 | 44 | 45 | /** 46 | * Search for anomalies in a series of data points. 47 | * 48 | * @param dataSeries The data contained in a Vector of Doubles 49 | * @param searchInterval The indices between which anomalies should be detected. [a, b). 50 | * @return The indices of all anomalies in the interval and their corresponding wrapper object. 51 | */ 52 | override def detect( 53 | dataSeries: Vector[Double], 54 | searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = { 55 | 56 | val (searchStart, searchEnd) = searchInterval 57 | 58 | require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.") 59 | 60 | require(dataSeries.nonEmpty, "Data series is empty.
Can't calculate mean/stdDev.") 61 | 62 | val searchIntervalLength = searchEnd - searchStart 63 | 64 | require(includeInterval || searchIntervalLength < dataSeries.length, 65 | "Excluding values in searchInterval from calculation but not enough values remain to " + 66 | "calculate mean and stdDev.") 67 | 68 | val mAV = if (includeInterval) { 69 | meanAndVariance(dataSeries) 70 | } else { 71 | val valuesBeforeInterval = dataSeries.slice(0, searchStart) 72 | val valuesAfterInterval = dataSeries.slice(searchEnd, dataSeries.length) 73 | val dataSeriesWithoutInterval = valuesBeforeInterval ++ valuesAfterInterval 74 | 75 | meanAndVariance(dataSeriesWithoutInterval) 76 | } 77 | 78 | val mean = mAV.mean 79 | val stdDev = mAV.stdDev 80 | 81 | val upperBound = mean + upperDeviationFactor.getOrElse(Double.MaxValue) * stdDev 82 | val lowerBound = mean - lowerDeviationFactor.getOrElse(Double.MaxValue) * stdDev 83 | 84 | dataSeries.zipWithIndex 85 | .slice(searchStart, searchEnd) 86 | .filter { case (value, _) => value > upperBound || value < lowerBound } 87 | .map { case (value, index) => 88 | 89 | val detail = Some(s"[BatchNormalStrategy]: Value $value is not in " + 90 | s"bounds [$lowerBound, $upperBound].") 91 | 92 | (index, Anomaly(Option(value), 1.0, detail)) 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/analyzers/MutualInformation.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.analyzers 18 | 19 | import com.amazon.deequ.analyzers.Analyzers._ 20 | import com.amazon.deequ.metrics.{DoubleMetric, Entity} 21 | import org.apache.spark.sql.functions.{col, sum, udf} 22 | import org.apache.spark.sql.types.StructType 23 | import Analyzers.COUNT_COL 24 | import com.amazon.deequ.analyzers.runners.MetricCalculationException 25 | 26 | /** 27 | * Mutual Information describes how much information about one column can be inferred from another 28 | * column. 29 | * 30 | * If two columns are independent of each other, then nothing can be inferred from one column about 31 | * the other, and mutual information is zero. If there is a functional dependency of one column on 32 | * another and vice versa, then all information of the two columns is shared, and mutual 33 | * information is the entropy of each column.
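* * In symbols, with p(x, y) the joint relative frequency of the two columns and p(x), p(y) the marginals (all derived from the frequency counts used below), the computed value is * {{{ * MI(X, Y) = sum over all pairs (x, y) of p(x, y) * ln( p(x, y) / (p(x) * p(y)) ) * }}} * which is exactly the per-pair term evaluated in miUdf below, using the natural logarithm.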
34 | */ 35 | case class MutualInformation(columns: Seq[String], where: Option[String] = None) 36 | extends FrequencyBasedAnalyzer(columns) 37 | with FilterableAnalyzer { 38 | 39 | override def computeMetricFrom(state: Option[FrequenciesAndNumRows]): DoubleMetric = { 40 | 41 | state match { 42 | 43 | case Some(theState) => 44 | val total = theState.numRows 45 | val Seq(col1, col2) = columns 46 | 47 | val freqCol1 = s"__deequ_f1_$col1" 48 | val freqCol2 = s"__deequ_f2_$col2" 49 | 50 | val jointStats = theState.frequencies 51 | 52 | val marginalStats1 = jointStats 53 | .select(col1, COUNT_COL) 54 | .groupBy(col1) 55 | .agg(sum(COUNT_COL).as(freqCol1)) 56 | 57 | val marginalStats2 = jointStats 58 | .select(col2, COUNT_COL) 59 | .groupBy(col2) 60 | .agg(sum(COUNT_COL).as(freqCol2)) 61 | 62 | 63 | val miUdf = udf { 64 | (px: Double, py: Double, pxy: Double) => 65 | (pxy / total) * math.log((pxy / total) / ((px / total) * (py / total))) 66 | } 67 | 68 | val miCol = s"__deequ_mi_${col1}_$col2" 69 | val value = jointStats 70 | .join(marginalStats1, usingColumn = col1) 71 | .join(marginalStats2, usingColumn = col2) 72 | .withColumn(miCol, miUdf(col(freqCol1), col(freqCol2), col(COUNT_COL))) 73 | .agg(sum(miCol)) 74 | 75 | val resultRow = value.head() 76 | 77 | if (resultRow.isNullAt(0)) { 78 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 79 | } else { 80 | metricFromValue(resultRow.getDouble(0), "MutualInformation", columns.mkString(","), 81 | Entity.Mutlicolumn) 82 | } 83 | 84 | case None => 85 | metricFromEmpty(this, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 86 | } 87 | } 88 | 89 | 90 | /** We need exactly two columns, and both must exist */ 91 | override def preconditions: Seq[StructType => Unit] = { 92 | Preconditions.exactlyNColumns(columns, 2) +: super.preconditions 93 | } 94 | 95 | override def toFailureMetric(exception: Exception): DoubleMetric = { 96 | metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) 97 | } 98 | 99 | override def filterCondition: Option[String] = where 100 | } 101 | 102 | object MutualInformation { 103 | def apply(columnA: String, columnB: String): MutualInformation = { 104 | new MutualInformation(columnA :: columnB :: Nil) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/metrics/KLLMetric.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License.
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.metrics 18 | 19 | import com.amazon.deequ.analyzers.QuantileNonSample 20 | 21 | import scala.util.{Failure, Success, Try} 22 | import scala.util.control.Breaks._ 23 | 24 | case class BucketValue(lowValue: Double, highValue: Double, count: Long) 25 | 26 | case class BucketDistribution( 27 | buckets: List[BucketValue], 28 | parameters: List[Double], 29 | data: Array[Array[Double]]) { 30 | 31 | def computePercentiles(): Array[Double] = { 32 | 33 | val sketchSize = parameters(0).toInt 34 | val shrinkingFactor = parameters(1) 35 | 36 | val quantileNonSample = new QuantileNonSample[Double](sketchSize, shrinkingFactor) 37 | quantileNonSample.reconstruct(sketchSize, shrinkingFactor, data) 38 | 39 | quantileNonSample.quantiles(100) 40 | } 41 | 42 | /** 43 | * Get the bucket value at the given index. 44 | * @param key index of bucket 45 | * @return The corresponding BucketValue 46 | */ 47 | def apply(key: Int): BucketValue = { 48 | buckets(key) 49 | } 50 | 51 | /** 52 | * Find the index of the bucket that contains the most items. 53 | * @return The index of the bucket that contains the most items. 54 | */ 55 | def argmax: Int = { 56 | var currentMax = 0L 57 | var maxBucket = 0 58 | buckets.foreach { bucket => 59 | if (bucket.count > currentMax) { 60 | currentMax = bucket.count 61 | maxBucket = buckets.indexOf(bucket) 62 | } 63 | } 64 | maxBucket 65 | } 66 | 67 | /** 68 | * Check whether this BucketDistribution is equal to the given object. 69 | * @param obj object to compare 70 | * @return true if equal 71 | */ 72 | override def equals(obj: Any): Boolean = { 73 | obj match { 74 | case that: BucketDistribution => 75 | // the pattern match already guarantees the type, so compare the fields 76 | var check = this.buckets.equals(that.buckets) && 77 | this.parameters.equals(that.parameters) && 78 | this.data.length == that.data.length 79 | breakable { 80 | for (i <- this.data.indices) { 81 | if (!this.data(i).sameElements(that.data(i))) { 82 | check = false 83 | break 84 | } 85 | } 86 | } 87 | check 88 | case _ => false 89 | } 90 | } 91 | 92 | // Hash the same fields that equals() compares; arrays are converted so their contents are hashed 93 | override def hashCode(): Int = (buckets, parameters, data.map(_.toVector).toVector).hashCode() 94 | } 95 | 96 | case class KLLMetric(column: String, value: Try[BucketDistribution]) 97 | extends Metric[BucketDistribution] { 98 | 99 | val entity: Entity.Value = Entity.Column 100 | val instance: String = column 101 | val name = "KLL" 102 | 103 | def flatten(): Seq[DoubleMetric] = { 104 | value 105 | .map { distribution => 106 | val numberOfBuckets = Seq(DoubleMetric(entity, s"$name.buckets", instance, 107 | Success(distribution.buckets.length.toDouble))) 108 | 109 | val details = distribution.buckets 110 | .flatMap { distValue => 111 | DoubleMetric(entity, s"$name.low", instance, Success(distValue.lowValue)) :: 112 | DoubleMetric(entity, s"$name.high", instance, Success(distValue.highValue)) :: 113 | DoubleMetric(entity, s"$name.count", instance, Success(distValue.count.toDouble)) :: Nil 114 | } 115 | numberOfBuckets ++ details 116 | } 117 | .recover { 118 | case e: Exception => Seq(DoubleMetric(entity, s"$name.buckets", instance, Failure(e))) 119 | } 120 | .get 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License").
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.profiles 18 | 19 | import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters} 20 | import com.amazon.deequ.io.DfsUtils 21 | import com.amazon.deequ.repository.{MetricsRepository, ResultKey} 22 | import org.apache.spark.annotation.Experimental 23 | import org.apache.spark.sql.{DataFrame, SparkSession} 24 | 25 | private[profiles] case class ColumnProfilerRunBuilderMetricsRepositoryOptions( 26 | metricsRepository: Option[MetricsRepository], 27 | reuseExistingResultsKey: Option[ResultKey], 28 | failIfResultsForReusingMissing: Boolean, 29 | saveOrAppendResultsKey: Option[ResultKey]) 30 | 31 | private[profiles] case class ColumnProfilerRunBuilderFileOutputOptions( 32 | session: Option[SparkSession], 33 | saveColumnProfilesJsonToPath: Option[String], 34 | overwriteResults: Boolean) 35 | 36 | @Experimental 37 | class ColumnProfilerRunner { 38 | 39 | def onData(data: DataFrame): ColumnProfilerRunBuilder = { 40 | new ColumnProfilerRunBuilder(data) 41 | } 42 | 43 | private[profiles] def run( 44 | data: DataFrame, 45 | restrictToColumns: Option[Seq[String]], 46 | lowCardinalityHistogramThreshold: Int, 47 | printStatusUpdates: Boolean, 48 | cacheInputs: Boolean, 49 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, 50 | metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions, 51 | kllProfiling: Boolean, 52 | kllParameters: Option[KLLParameters], 53 | predefinedTypes: Map[String, DataTypeInstances.Value]) 54 | : ColumnProfiles = { 55 | 56 | if (cacheInputs) { 57 | data.cache() 58 | } 59 | 60 | val columnProfiles = ColumnProfiler 61 | .profile( 62 | data, 63 | restrictToColumns, 64 | printStatusUpdates, 65 | lowCardinalityHistogramThreshold, 66 | metricsRepositoryOptions.metricsRepository, 67 | metricsRepositoryOptions.reuseExistingResultsKey, 68 | metricsRepositoryOptions.failIfResultsForReusingMissing, 69 | metricsRepositoryOptions.saveOrAppendResultsKey, 70 | kllProfiling, 71 | kllParameters, 72 | predefinedTypes 73 | ) 74 | 75 | saveColumnProfilesJsonToFileSystemIfNecessary( 76 | fileOutputOptions, 77 | printStatusUpdates, 78 | columnProfiles 79 | ) 80 | 81 | if (cacheInputs) { 82 | data.unpersist() 83 | } 84 | 85 | columnProfiles 86 | } 87 | 88 | private[this] def saveColumnProfilesJsonToFileSystemIfNecessary( 89 | fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, 90 | printStatusUpdates: Boolean, 91 | columnProfiles: ColumnProfiles) 92 | : Unit = { 93 | 94 | fileOutputOptions.session.foreach { session => 95 | fileOutputOptions.saveColumnProfilesJsonToPath.foreach { profilesOutput => 96 | if (printStatusUpdates) { 97 | println(s"### WRITING COLUMN PROFILES TO $profilesOutput") 98 | } 99 | 100 | DfsUtils.writeToTextFileOnDfs(session, profilesOutput, 101 | overwrite = fileOutputOptions.overwriteResults) { writer => 102 | writer.append(ColumnProfiles.toJson(columnProfiles.profiles.values.toSeq).toString) 103 | writer.newLine() 104 | } 105 | } 106 | } 107 | } 108 | } 109 | 110 | object ColumnProfilerRunner { 111 | 112 | def apply(): 
ColumnProfilerRunner = { 113 | new ColumnProfilerRunner() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestion.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.suggestions 18 | 19 | import com.amazon.deequ.VerificationResult 20 | import com.amazon.deequ.constraints.Constraint 21 | import com.amazon.deequ.profiles.ColumnProfile 22 | import com.amazon.deequ.suggestions.rules.ConstraintRule 23 | import com.google.gson.{GsonBuilder, JsonArray, JsonObject} 24 | 25 | case class ConstraintSuggestion( 26 | constraint: Constraint, 27 | columnName: String, 28 | currentValue: String, 29 | description: String, 30 | suggestingRule: ConstraintRule[ColumnProfile], 31 | codeForConstraint: String 32 | ) 33 | 34 | object ConstraintSuggestions { 35 | 36 | private[this] val CONSTRAINT_SUGGESTIONS_FIELD = "constraint_suggestions" 37 | 38 | private[suggestions] def toJson(constraintSuggestions: Seq[ConstraintSuggestion]): String = { 39 | 40 | val json = new JsonObject() 41 | 42 | val constraintsJson = new JsonArray() 43 | 44 | constraintSuggestions.foreach { constraintSuggestion => 45 | 46 | val constraintJson = new JsonObject() 47 | addSharedProperties(constraintJson, constraintSuggestion) 48 | 49 | constraintsJson.add(constraintJson) 50 | } 51 | 52 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintsJson) 53 | 54 | val gson = new GsonBuilder() 55 | .setPrettyPrinting() 56 | .create() 57 | 58 | gson.toJson(json) 59 | } 60 | 61 | private[suggestions] def evaluationResultsToJson( 62 | constraintSuggestions: Seq[ConstraintSuggestion], 63 | result: VerificationResult) 64 | : String = { 65 | 66 | val constraintResults = result.checkResults 67 | .map { case (_, checkResult) => checkResult } 68 | .headOption.map { checkResult => 69 | checkResult.constraintResults 70 | } 71 | .getOrElse(Seq.empty) 72 | 73 | val json = new JsonObject() 74 | 75 | val constraintEvaluations = new JsonArray() 76 | 77 | val constraintResultsOnTestSet = constraintResults.map { checkResult => 78 | checkResult.status.toString 79 | } 80 | 81 | constraintSuggestions.zipAll(constraintResultsOnTestSet, null, "Unknown") // pad missing evaluation results with "Unknown" 82 | .foreach { case (constraintSuggestion, constraintResult) => 83 | 84 | val constraintEvaluation = new JsonObject() 85 | addSharedProperties(constraintEvaluation, constraintSuggestion) 86 | 87 | constraintEvaluation.addProperty("constraint_result_on_test_set", 88 | constraintResult) 89 | 90 | constraintEvaluations.add(constraintEvaluation) 91 | } 92 | 93 | json.add(CONSTRAINT_SUGGESTIONS_FIELD, constraintEvaluations) 94 | 95 | val gson = new GsonBuilder() 96 | .setPrettyPrinting() 97 | .create() 98 | 99 | gson.toJson(json) 100 | } 101 | 102 | private[this] def addSharedProperties( 103 | jsonObject: JsonObject, 104 |
constraintSuggestion: ConstraintSuggestion) 105 | : Unit = { 106 | 107 | jsonObject.addProperty("constraint_name", constraintSuggestion.constraint.toString) 108 | jsonObject.addProperty("column_name", constraintSuggestion.columnName) 109 | jsonObject.addProperty("current_value", constraintSuggestion.currentValue) 110 | jsonObject.addProperty("description", constraintSuggestion.description) 111 | jsonObject.addProperty("suggesting_rule", constraintSuggestion.suggestingRule.toString) 112 | jsonObject.addProperty("rule_description", constraintSuggestion.suggestingRule.ruleDescription) 113 | jsonObject.addProperty("code_for_constraint", constraintSuggestion.codeForConstraint) 114 | } 115 | } 116 | --------------------------------------------------------------------------------