├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── feature_request.md
│ └── general-inquiry.md
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── maven.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── NOTICE
├── README.md
├── deequ-scalastyle.xml
├── docs
└── key-concepts.md
├── pom.xml
├── settings.xml
├── src
├── main
│ └── scala
│ │ └── com
│ │ └── amazon
│ │ └── deequ
│ │ ├── VerificationResult.scala
│ │ ├── VerificationRunBuilder.scala
│ │ ├── VerificationSuite.scala
│ │ ├── analyzers
│ │ ├── Analysis.scala
│ │ ├── Analyzer.scala
│ │ ├── ApproxCountDistinct.scala
│ │ ├── ApproxQuantile.scala
│ │ ├── ApproxQuantiles.scala
│ │ ├── ColumnCount.scala
│ │ ├── ColumnExists.scala
│ │ ├── Completeness.scala
│ │ ├── Compliance.scala
│ │ ├── Correlation.scala
│ │ ├── CountDistinct.scala
│ │ ├── CustomAggregator.scala
│ │ ├── CustomSql.scala
│ │ ├── DataType.scala
│ │ ├── DatasetMatchAnalyzer.scala
│ │ ├── DatasetMatchState.scala
│ │ ├── Distance.scala
│ │ ├── Distinctness.scala
│ │ ├── Entropy.scala
│ │ ├── ExactQuantile.scala
│ │ ├── FilterableAnalyzer.scala
│ │ ├── GroupingAnalyzers.scala
│ │ ├── Histogram.scala
│ │ ├── HistogramBase.scala
│ │ ├── HistogramBinned.scala
│ │ ├── KLLSketch.scala
│ │ ├── MaxLength.scala
│ │ ├── Maximum.scala
│ │ ├── Mean.scala
│ │ ├── MinLength.scala
│ │ ├── Minimum.scala
│ │ ├── MutualInformation.scala
│ │ ├── NonSampleCompactor.scala
│ │ ├── PatternMatch.scala
│ │ ├── QuantileNonSample.scala
│ │ ├── RatioOfSums.scala
│ │ ├── Size.scala
│ │ ├── StandardDeviation.scala
│ │ ├── StateProvider.scala
│ │ ├── Sum.scala
│ │ ├── UniqueValueRatio.scala
│ │ ├── Uniqueness.scala
│ │ ├── applicability
│ │ │ └── Applicability.scala
│ │ ├── catalyst
│ │ │ ├── AttributeReferenceCreation.java
│ │ │ ├── DeequFunctions.scala
│ │ │ ├── HLLConstants.scala
│ │ │ ├── KLLSketchSerializer.scala
│ │ │ ├── StatefulApproxQuantile.scala
│ │ │ ├── StatefulCorrelation.scala
│ │ │ ├── StatefulDataType.scala
│ │ │ ├── StatefulHyperloglogPlus.scala
│ │ │ ├── StatefulKLLSketch.scala
│ │ │ └── StatefulStdDevPop.scala
│ │ └── runners
│ │ │ ├── AnalysisRunBuilder.scala
│ │ │ ├── AnalysisRunner.scala
│ │ │ ├── AnalyzerContext.scala
│ │ │ ├── KLLRunner.scala
│ │ │ └── MetricCalculationException.scala
│ │ ├── anomalydetection
│ │ ├── AbsoluteChangeStrategy.scala
│ │ ├── AnomalyDetectionStrategy.scala
│ │ ├── AnomalyDetector.scala
│ │ ├── BaseChangeStrategy.scala
│ │ ├── BatchNormalStrategy.scala
│ │ ├── DetectionResult.scala
│ │ ├── HistoryUtils.scala
│ │ ├── OnlineNormalStrategy.scala
│ │ ├── RateOfChangeStrategy.scala
│ │ ├── RelativeRateOfChangeStrategy.scala
│ │ ├── SimpleThresholdStrategy.scala
│ │ └── seasonal
│ │ │ └── HoltWinters.scala
│ │ ├── checks
│ │ ├── Check.scala
│ │ ├── CheckWithLastConstraintFilterable.scala
│ │ └── ColumnCondition.scala
│ │ ├── comparison
│ │ ├── ComparisonBase.scala
│ │ ├── ComparisonResult.scala
│ │ ├── DataSynchronization.scala
│ │ └── ReferentialIntegrity.scala
│ │ ├── constraints
│ │ ├── AnalysisBasedConstraint.scala
│ │ ├── ConstrainableDataTypes.scala
│ │ └── Constraint.scala
│ │ ├── dqdl
│ │ ├── EvaluateDataQuality.scala
│ │ ├── execution
│ │ │ ├── DQDLExecutor.scala
│ │ │ ├── DefaultOperandEvaluator.scala
│ │ │ └── executors
│ │ │ │ ├── DeequRulesExecutor.scala
│ │ │ │ └── UnsupportedRulesExecutor.scala
│ │ ├── model
│ │ │ ├── ExecutableRule.scala
│ │ │ └── RuleOutcome.scala
│ │ ├── translation
│ │ │ ├── DQDLRuleConverter.scala
│ │ │ ├── DQDLRuleTranslator.scala
│ │ │ ├── DeequOutcomeTranslator.scala
│ │ │ └── rules
│ │ │ │ ├── ColumnCorrelationRule.scala
│ │ │ │ ├── ColumnExistsRule.scala
│ │ │ │ ├── ColumnLengthRule.scala
│ │ │ │ ├── CompletenessRule.scala
│ │ │ │ ├── CustomSqlRule.scala
│ │ │ │ ├── DistinctValuesCountRule.scala
│ │ │ │ ├── EntropyRule.scala
│ │ │ │ ├── IsCompleteRule.scala
│ │ │ │ ├── IsPrimaryKeyRule.scala
│ │ │ │ ├── IsUniqueRule.scala
│ │ │ │ ├── MeanRule.scala
│ │ │ │ ├── RowCountRule.scala
│ │ │ │ ├── StandardDeviationRule.scala
│ │ │ │ ├── SumRule.scala
│ │ │ │ ├── UniqueValueRatioRule.scala
│ │ │ │ └── UniquenessRule.scala
│ │ └── util
│ │ │ ├── DQDLUtility.scala
│ │ │ └── DefaultDQDLParser.scala
│ │ ├── examples
│ │ ├── AnomalyDetectionExample.scala
│ │ ├── BasicExample.scala
│ │ ├── ConstraintSuggestionExample.scala
│ │ ├── DataProfilingExample.scala
│ │ ├── ExampleUtils.scala
│ │ ├── IncrementalMetricsExample.scala
│ │ ├── KLLCheckExample.scala
│ │ ├── KLLExample.scala
│ │ ├── MetricsRepositoryExample.scala
│ │ ├── UpdateMetricsOnPartitionedDataExample.scala
│ │ ├── algebraic_states_example.md
│ │ ├── anomaly_detection_example.md
│ │ ├── constraint_suggestion_example.md
│ │ ├── data_profiling_example.md
│ │ ├── entities.scala
│ │ └── metrics_repository_example.md
│ │ ├── io
│ │ └── DfsUtils.scala
│ │ ├── metrics
│ │ ├── HistogramBinnedMetric.scala
│ │ ├── HistogramMetric.scala
│ │ ├── KLLMetric.scala
│ │ └── Metric.scala
│ │ ├── profiles
│ │ ├── ColumnProfile.scala
│ │ ├── ColumnProfiler.scala
│ │ ├── ColumnProfilerRunBuilder.scala
│ │ └── ColumnProfilerRunner.scala
│ │ ├── repository
│ │ ├── AnalysisResult.scala
│ │ ├── AnalysisResultSerde.scala
│ │ ├── MetricsRepository.scala
│ │ ├── MetricsRepositoryMultipleResultsLoader.scala
│ │ ├── fs
│ │ │ └── FileSystemMetricsRepository.scala
│ │ ├── memory
│ │ │ └── InMemoryMetricsRepository.scala
│ │ └── sparktable
│ │ │ └── SparkMetricsRepository.scala
│ │ ├── schema
│ │ └── RowLevelSchemaValidator.scala
│ │ ├── suggestions
│ │ ├── ConstraintSuggestion.scala
│ │ ├── ConstraintSuggestionResult.scala
│ │ ├── ConstraintSuggestionRunBuilder.scala
│ │ ├── ConstraintSuggestionRunner.scala
│ │ └── rules
│ │ │ ├── CategoricalRangeRule.scala
│ │ │ ├── CompleteIfCompleteRule.scala
│ │ │ ├── ConstraintRule.scala
│ │ │ ├── FractionalCategoricalRangeRule.scala
│ │ │ ├── HasMax.scala
│ │ │ ├── HasMaxLength.scala
│ │ │ ├── HasMean.scala
│ │ │ ├── HasMin.scala
│ │ │ ├── HasMinLength.scala
│ │ │ ├── HasStandardDeviation.scala
│ │ │ ├── NonNegativeNumbersRule.scala
│ │ │ ├── RetainCompletenessRule.scala
│ │ │ ├── RetainTypeRule.scala
│ │ │ ├── UniqueIfApproximatelyUniqueRule.scala
│ │ │ └── interval
│ │ │ ├── ConfidenceIntervalStrategy.scala
│ │ │ ├── WaldIntervalStrategy.scala
│ │ │ └── WilsonScoreIntervalStrategy.scala
│ │ └── utilities
│ │ └── ColumnUtil.scala
└── test
│ ├── resources
│ ├── EMRSparkShellTest.scala
│ └── log4j.properties
│ └── scala
│ └── com
│ └── amazon
│ └── deequ
│ ├── DatatypeSuggestionTest.scala
│ ├── KLL
│ ├── KLLBenchmark.java
│ ├── KLLBenchmarkHelper.scala
│ ├── KLLDistanceTest.scala
│ ├── KLLProbTest.scala
│ └── KLLProfileTest.scala
│ ├── SparkBasicTest.scala
│ ├── SparkContextSpec.scala
│ ├── SparkMonitor.scala
│ ├── SuggestionAndVerificationIntegrationTest.scala
│ ├── VerificationResultTest.scala
│ ├── VerificationSuiteTest.scala
│ ├── analyzers
│ ├── AnalysisTest.scala
│ ├── AnalyzerTests.scala
│ ├── ColumnCountTest.scala
│ ├── CompletenessTest.scala
│ ├── ComplianceTest.scala
│ ├── CustomAggregatorTest.scala
│ ├── CustomSqlTest.scala
│ ├── DistinctnessTest.scala
│ ├── HistogramBinnedTest.scala
│ ├── HistogramTest.scala
│ ├── IncrementalAnalysisTest.scala
│ ├── IncrementalAnalyzerTest.scala
│ ├── MaxLengthTest.scala
│ ├── MaximumTest.scala
│ ├── MinLengthTest.scala
│ ├── MinimumTest.scala
│ ├── NullHandlingTests.scala
│ ├── PartitionedTableIntegrationTest.scala
│ ├── PatternMatchTest.scala
│ ├── StateAggregationIntegrationTest.scala
│ ├── StateAggregationTests.scala
│ ├── StateProviderTest.scala
│ ├── StatesTest.scala
│ ├── UniquenessTest.scala
│ └── runners
│ │ ├── AnalysisRunnerTests.scala
│ │ └── AnalyzerContextTest.scala
│ ├── anomalydetection
│ ├── AbsoluteChangeStrategyTest.scala
│ ├── AnomalyDetectionTestUtils.scala
│ ├── AnomalyDetectionTestUtilsTest.scala
│ ├── AnomalyDetectorTest.scala
│ ├── BatchNormalStrategyTest.scala
│ ├── HistoryUtilsTest.scala
│ ├── OnlineNormalStrategyTest.scala
│ ├── RateOfChangeStrategyTest.scala
│ ├── RelativeRateOfChangeStrategyTest.scala
│ ├── SimpleThresholdStrategyTest.scala
│ └── seasonal
│ │ └── HoltWintersTest.scala
│ ├── checks
│ ├── ApplicabilityTest.scala
│ ├── CheckTest.scala
│ ├── ColumnConditionTest.scala
│ ├── CustomSqlCheckTest.scala
│ └── FilterableCheckTest.scala
│ ├── comparison
│ ├── DataSynchronizationTest.scala
│ └── ReferentialIntegrityTest.scala
│ ├── constraints
│ ├── AnalysisBasedConstraintTest.scala
│ ├── ConstraintUtils.scala
│ └── ConstraintsTest.scala
│ ├── dqdl
│ ├── DefaultDQDLParserTest.scala
│ ├── EvaluateDataQualitySpec.scala
│ ├── execution
│ │ └── DQDLExecutorSpec.scala
│ └── translation
│ │ ├── DQDLRuleTranslatorSpec.scala
│ │ └── rules
│ │ └── ColumnLengthRuleSpec.scala
│ ├── examples
│ ├── ExamplesTest.scala
│ ├── JavaDQDLExample.java
│ └── ScalaDQDLExample.scala
│ ├── metrics
│ └── MetricsTests.scala
│ ├── package.scala
│ ├── profiles
│ ├── ColumnProfilerRunnerTest.scala
│ └── ColumnProfilerTest.scala
│ ├── repository
│ ├── AnalysisResultSerdeTest.scala
│ ├── AnalysisResultTest.scala
│ ├── MetricsRepositoryAnomalyDetectionIntegrationTest.scala
│ ├── MetricsRepositoryMultipleResultsLoaderTest.scala
│ ├── fs
│ │ └── FileSystemMetricsRepositoryTest.scala
│ ├── memory
│ │ └── InMemoryMetricsRepositoryTest.scala
│ └── sparktable
│ │ └── SparkTableMetricsRepositoryTest.scala
│ ├── schema
│ └── RowLevelSchemaValidatorTest.scala
│ ├── suggestions
│ ├── ConstraintSuggestionResultTest.scala
│ ├── ConstraintSuggestionRunnerTest.scala
│ ├── ConstraintSuggestionsIntegrationTest.scala
│ └── rules
│ │ ├── ConstraintRulesTest.scala
│ │ └── interval
│ │ └── IntervalStrategyTest.scala
│ └── utils
│ ├── AssertionUtils.scala
│ ├── CollectionUtils.scala
│ ├── ConditionUtils.scala
│ ├── FixtureSupport.scala
│ └── TempFileUtils.scala
└── test-data
├── README.md
└── titanic.csv
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE]"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general-inquiry.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: General Inquiry
3 | about: Ask general questions about this project
4 | title: ''
5 | labels: question
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Ask questions that don't apply to the other templates (Bug report, Feature request)**
11 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 |
3 | *Description of changes:*
4 |
5 |
6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
7 |
--------------------------------------------------------------------------------
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
1 | name: Java CI with Maven
2 |
3 | on:
4 | push:
5 | branches: [ "master" ]
6 | pull_request:
7 | branches: [ "master" ]
8 |
9 | jobs:
10 | build:
11 |
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 | - name: Set up JDK 8
17 | uses: actions/setup-java@v3
18 | with:
19 | java-version: '8'
20 | distribution: 'corretto'
21 | cache: maven
22 | - name: Build with Maven
23 | run: mvn clean verify
24 |
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.iml
3 | **/*.iml
4 | target/
5 | .metals/
6 | .vscode/
7 | .bloop/
8 | .DS_Store
9 | .scalafmt.conf
10 | *.log
11 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | build:
2 | mvn clean install
3 | compile:
4 | mvn clean compile
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Deequ
2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |
--------------------------------------------------------------------------------
/docs/key-concepts.md:
--------------------------------------------------------------------------------
1 | # Key Concepts in the Codebase
2 | There are a few key concepts that will help you to understand the codebase.
3 |
4 | ## Metrics, Analyzers, and State
5 | A Metric is a measurable property of the data that can change over time, for example the number of rows in a
6 | DataFrame.
7 |
8 | An Analyzer knows how to calculate a Metric based on some input DataFrame.
9 |
10 | State is an optimization - it represents the state of the data, from which a metric can be calculated. This intermediate
11 | state can then be used to calculate future metrics more quickly. Check out the examples for some further details.
12 |
13 | ## Overall flow of running deequ checks
14 | When running checks a user specifies a DataFrame and a number of checks to do on that DataFrame. Many checks in Deequ
15 | are based on metrics which describe the data. To perform the checks the user requests, deequ follows this
16 | process:
17 | * First deequ figures out which Analyzers are required
18 | * Metrics are calculated using those Analyzers:
19 | * Metrics are also stored if a MetricsRepository is provided
20 | * Intermediate state is stored if a StatePersister is provided
21 | * Intermediate state is used for metric calculations if a StateLoader is provided
22 | * Checks are evaluated using the calculated Metrics
23 |
24 | The reason it works this way is for performance, primarily because calculating metrics at the same time gives the
25 | opportunity to calculate them in fewer passes over the data.
26 |
27 | ### Analyzers
28 | Types of analyzers:
29 | * ScanShareableAnalyzer - an analyzer which computes a metric based on a straight scan over the data, without any
30 | grouping being required
31 | * GroupingAnalyzer - an analyzer that requires the data to be grouped by a set of columns before the metric can be
32 | calculated
33 |
34 | ### Metrics
35 | A metric includes the following key details:
36 | * name - the name for the type of metric
37 | * entity - the type of entity the metric is recorded against, e.g. a column, dataset, or multicolumn
38 | * instance - information about this instance of the metric. For example this could be the column name the metric is
39 | operating on
40 | * value - the value of the metric at a point in time. The type of this value varies between metrics.
41 |
42 | #### Metrics storage
43 | Metrics can be stored in a metrics repository. An entry in the repository consists of:
44 | * A resultKey, which is a combination of a timestamp and a map of tags. Typically a user may want to record things
45 | like the data source (e.g. table name) with the tags. The resultKey can be used to look up stored metrics
46 | * An analyzerContext, which consists of a map of Analyzers to Metrics
47 |
48 | ### State
49 | Please consult the examples or the codebase for more details on State.
50 |
--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
1 |
This method applies the specified data quality ruleset to the input DataFrame and returns a new 40 | * DataFrame summarizing the overall quality results, including any issues detected during the analysis.
41 | * 42 | * @param df the Spark DataFrame to analyze. 43 | * @param rulesetDefinition the data quality ruleset (defined in DQDL string format) to apply to the DataFrame. 44 | * @return a Spark DataFrame containing the aggregated data quality results. 45 | */ 46 | def process(df: DataFrame, rulesetDefinition: String): DataFrame = { 47 | // 1. Parse the ruleset 48 | val ruleset: DQRuleset = DefaultDQDLParser.parse(rulesetDefinition) 49 | 50 | // 2. Translate the dqRuleset into a corresponding ExecutableRules. 51 | val executableRules: Seq[ExecutableRule] = DQDLRuleTranslator.toExecutableRules(ruleset) 52 | 53 | // 3. Execute the rules against the DataFrame. 54 | val executedRulesResult = DQDLExecutor.executeRules(executableRules, df) 55 | 56 | // 4. Translate the results into a Spark DataFrame. 57 | DeequOutcomeTranslator.translate(executedRulesResult, df) 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/execution/DQDLExecutor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.execution 18 | 19 | import com.amazon.deequ.dqdl.execution.executors.{DeequRulesExecutor, UnsupportedRulesExecutor} 20 | import com.amazon.deequ.dqdl.model.{DeequExecutableRule, ExecutableRule, Failed, RuleOutcome, UnsupportedExecutableRule} 21 | import org.apache.spark.sql.DataFrame 22 | import software.amazon.glue.dqdl.model.DQRule 23 | 24 | 25 | /** 26 | * Executes DQDL rules on a Spark DataFrame. 27 | */ 28 | object DQDLExecutor { 29 | 30 | trait RuleExecutor[T <: ExecutableRule] { 31 | def executeRules(rules: Seq[T], df: DataFrame): Map[DQRule, RuleOutcome] 32 | } 33 | 34 | // Map from rule class to its executor 35 | private val executors = Map[Class[_ <: ExecutableRule], RuleExecutor[_ <: ExecutableRule]]( 36 | classOf[DeequExecutableRule] -> DeequRulesExecutor, 37 | classOf[UnsupportedExecutableRule] -> UnsupportedRulesExecutor 38 | ) 39 | 40 | def executeRules(rules: Seq[ExecutableRule], df: DataFrame): Map[DQRule, RuleOutcome] = { 41 | // Group rules to execute each group with the corresponding executor 42 | val rulesByType = rules.groupBy(_.getClass) 43 | 44 | rulesByType.flatMap { 45 | case (ruleClass, rules) => 46 | executors.get(ruleClass) match { 47 | case Some(executor) => executor.asInstanceOf[RuleExecutor[ExecutableRule]].executeRules(rules, df) 48 | case None => handleError(rules) 49 | } 50 | } 51 | } 52 | 53 | private def handleError(rules: Seq[ExecutableRule]) = { 54 | rules.map { rule => 55 | rule.dqRule -> RuleOutcome( 56 | rule.dqRule, 57 | Failed, 58 | Some(s"No executor found for rule type: ${rule.dqRule.getRuleType}") 59 | ) 60 | } 61 | 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/execution/executors/UnsupportedRulesExecutor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.execution.executors 18 | 19 | import com.amazon.deequ.dqdl.execution.DQDLExecutor 20 | import com.amazon.deequ.dqdl.model.{Failed, RuleOutcome, UnsupportedExecutableRule} 21 | import org.apache.spark.sql.DataFrame 22 | import software.amazon.glue.dqdl.model.DQRule 23 | 24 | object UnsupportedRulesExecutor extends DQDLExecutor.RuleExecutor[UnsupportedExecutableRule] { 25 | 26 | override def executeRules(rules: Seq[UnsupportedExecutableRule], df: DataFrame): Map[DQRule, RuleOutcome] = 27 | rules.map { r => 28 | val failureReason = "Rule (or nested rule) not supported" + r.reason.map(re => s" due to: $re").getOrElse("") 29 | r.dqRule -> RuleOutcome(r.dqRule, Failed, Some(failureReason)) 30 | }.toMap 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/model/ExecutableRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. 
This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.model 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.dqdl.util.DQDLUtility.convertWhereClauseForMetric 21 | import software.amazon.glue.dqdl.model.DQRule 22 | 23 | trait ExecutableRule { 24 | val dqRule: DQRule 25 | val evaluatedMetricName: Option[String] 26 | } 27 | 28 | case class UnsupportedExecutableRule(dqRule: DQRule, reason: Option[String] = None) extends ExecutableRule { 29 | override val evaluatedMetricName: Option[String] = None 30 | } 31 | 32 | case class DeequExecutableRule(dqRule: DQRule, 33 | check: Check, 34 | deequMetricMappings: Seq[DeequMetricMapping] = Seq.empty) extends ExecutableRule { 35 | 36 | private val Delim = "." 37 | val evaluatedMetricName: Option[String] = deequMetricMappings match { 38 | case mappings if mappings.size == 1 => 39 | Some(s"${mappings.head.entity}$Delim${mappings.head.instance}$Delim${mappings.head.name}") 40 | case _ => None // Multiple metrics reported for rule; cannot determine name for general case 41 | } 42 | } 43 | 44 | case class DeequMetricMapping(entity: String, 45 | instance: String, 46 | name: String, 47 | deequName: String, 48 | deequInstance: Option[String] = None, 49 | disambiguator: Option[String] = None) 50 | 51 | object DeequMetricMapping { 52 | def apply(entity: String, 53 | instance: String, 54 | name: String, 55 | deequName: String, 56 | deequInstance: Option[String], 57 | rule: DQRule): DeequMetricMapping = { 58 | new DeequMetricMapping( 59 | entity, 60 | instance, 61 | name, 62 | deequName, 63 | deequInstance, 64 | convertWhereClauseForMetric(rule.getWhereClause) 65 | ) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- 
/src/main/scala/com/amazon/deequ/dqdl/model/RuleOutcome.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.model 18 | 19 | import com.amazon.deequ.checks.{CheckResult, CheckStatus} 20 | import software.amazon.glue.dqdl.model.DQRule 21 | 22 | sealed trait OutcomeStatus { 23 | def asString: String 24 | 25 | def &&(other: OutcomeStatus): OutcomeStatus = this match { 26 | case Passed if (other == Passed) => Passed 27 | case _ => Failed 28 | } 29 | 30 | def ||(other: OutcomeStatus): OutcomeStatus = this match { 31 | case Passed => Passed 32 | case _ if other == Passed => Passed 33 | case _ => Failed 34 | } 35 | } 36 | 37 | 38 | case object Passed extends OutcomeStatus { 39 | def asString: String = "Passed" 40 | } 41 | 42 | case object Failed extends OutcomeStatus { 43 | def asString: String = "Failed" 44 | } 45 | 46 | case class RuleOutcome(rule: DQRule, 47 | outcome: OutcomeStatus, 48 | failureReason: Option[String], 49 | evaluatedMetrics: Map[String, Double] = Map.empty, 50 | evaluatedRule: Option[DQRule] = None) { 51 | } 52 | 53 | object RuleOutcome { 54 | def apply(r: DQRule, checkResult: CheckResult): RuleOutcome = { 55 | val messages: Seq[String] = checkResult.constraintResults.flatMap { constraintResult => 56 | constraintResult.message.map { message => 57 | val 
metricName = constraintResult.metric.map(_.name).getOrElse("") 58 | if (metricName.equals("Completeness") && r.getRuleType.equals("ColumnValues")) { 59 | "Value: NULL does not meet the constraint requirement!" 60 | } else message 61 | } 62 | } 63 | val optionalMessage = messages.size match { 64 | case 0 => None 65 | case _ => Option(messages.mkString(System.lineSeparator())) 66 | } 67 | val checkPassed = checkResult.status == CheckStatus.Success 68 | RuleOutcome(r, if (checkPassed) Passed else Failed, optionalMessage) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/translation/DQDLRuleConverter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.translation 18 | 19 | import com.amazon.deequ.checks.Check 20 | import com.amazon.deequ.dqdl.execution.DefaultOperandEvaluator 21 | import com.amazon.deequ.dqdl.model.DeequMetricMapping 22 | import software.amazon.glue.dqdl.model.DQRule 23 | import software.amazon.glue.dqdl.model.condition.number.NumberBasedCondition 24 | 25 | 26 | trait DQDLRuleConverter { 27 | def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] 28 | 29 | def assertionAsScala(dqRule: DQRule, e: NumberBasedCondition): Double => Boolean = { 30 | val evaluator = DefaultOperandEvaluator 31 | (d: Double) => e.evaluate(d, dqRule, evaluator) 32 | } 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/translation/DeequOutcomeTranslator.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 14 | * 15 | */ 16 | 17 | package com.amazon.deequ.dqdl.translation 18 | 19 | import com.amazon.deequ.dqdl.model.{Failed, RuleOutcome} 20 | import org.apache.spark.sql.{DataFrame, SparkSession} 21 | import software.amazon.glue.dqdl.model.{DQRule, DQRuleLogicalOperator} 22 | 23 | /** 24 | * Translates the outcome of Deequ checks (RuleOutcome) 25 | * into a Spark DataFrame containing the results. 
26 | */ 27 | object DeequOutcomeTranslator { 28 | 29 | def translate(executedRulesResult: Map[DQRule, RuleOutcome], df: DataFrame): DataFrame = { 30 | 31 | // Reuse SparkSession from the existing DataFrame 32 | val spark = df.sparkSession 33 | import spark.implicits._ 34 | 35 | executedRulesResult.values.toSeq.map { r => 36 | ( 37 | r.rule.toString, 38 | r.outcome.asString, 39 | r.failureReason.orNull, 40 | r.evaluatedMetrics, 41 | r.evaluatedRule.map(_.toString).orNull 42 | ) 43 | }.toDF("Rule", "Outcome", "FailureReason", "EvaluatedMetrics", "EvaluatedRule") 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/amazon/deequ/dqdl/translation/rules/ColumnCorrelationRule.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"). You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | * 8 | * http://aws.amazon.com/apache2.0/ 9 | * 10 | * or in the "license" file accompanying this file. This file is distributed on 11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 12 | * express or implied. See the License for the specific language governing 13 | * permissions and limitations under the License. 
/**
 * Converts a DQDL rule into a Deequ correlation check between the columns given by the
 * "TargetColumn1" and "TargetColumn2" parameters.
 */
case class ColumnCorrelationRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying both target columns and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Correlation" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val firstColumn = rule.getParameters.asScala("TargetColumn1")
    val secondColumn = rule.getParameters.asScala("TargetColumn2")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasCorrelation(firstColumn, secondColumn, assertion))
    val metricMapping = DeequMetricMapping(
      "Multicolumn", s"$firstColumn,$secondColumn", "ColumnCorrelation", "Correlation", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ check that the column named by the
 * "TargetColumn" parameter exists.
 */
case class ColumnExistsRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the "TargetColumn" parameter
   * @return `Right` with the check and a single "ColumnExists" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val existenceCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString).hasColumn(targetColumn)
    val metricMapping = DeequMetricMapping("Dataset", targetColumn, "ColumnExists", "ColumnExists", None, rule = rule)
    Right((existenceCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ completeness check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class CompletenessRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Completeness" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasCompleteness(targetColumn, assertion, None, None))
    val metricMapping = DeequMetricMapping("Column", targetColumn, "Completeness", "Completeness", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ custom SQL check built from the
 * "CustomSqlStatement" parameter and the rule's number-based condition.
 */
case class CustomSqlRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the SQL statement and a number-based condition
   * @return `Right` with the check and a single dataset-level "CustomSQL" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val sqlStatement = rule.getParameters.asScala("CustomSqlStatement")
    val sqlCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
      .customSql(sqlStatement, sqlResult => assertion(sqlResult.toDouble))
    val metricMapping = DeequMetricMapping("Dataset", "*", "CustomSQL", "CustomSQL", None, rule = rule)
    Right((sqlCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ check on the number of distinct values of the
 * column named by the "TargetColumn" parameter.
 */
case class DistinctValuesCountRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Histogram.bins" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    // The count is widened to Double because the DQDL condition is evaluated on Doubles
    val distinctCountCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
      .hasNumberOfDistinctValues(targetColumn, distinctCount => assertion(distinctCount.toDouble))
    val filteredCheck = addWhereClause(rule, distinctCountCheck)
    val metricMapping =
      DeequMetricMapping("Column", targetColumn, "DistinctValuesCount", "Histogram.bins", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ entropy check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class EntropyRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Entropy" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasEntropy(targetColumn, assertion))
    val metricMapping = DeequMetricMapping("Column", targetColumn, "Entropy", "Entropy", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ `isComplete` check on the column named by the
 * "TargetColumn" parameter.
 */
case class IsCompleteRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the "TargetColumn" parameter
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Completeness" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val completeCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString).isComplete(targetColumn)
    val filteredCheck = addWhereClause(rule, completeCheck)
    val metricMapping = DeequMetricMapping("Column", targetColumn, "Completeness", "Completeness", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ primary-key check: the target columns must be
 * unique (`isPrimaryKey`) and every target column must be complete (`isComplete`).
 *
 * Target columns are all rule parameters whose key starts with "TargetColumn".
 */
case class IsPrimaryKeyRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying one or more "TargetColumn*" parameters
   * @return `Left` when no target column is present; otherwise `Right` with the check
   *         (each constraint filtered by the rule's where clause when present) and one
   *         "Uniqueness" mapping plus one "Completeness" mapping per column
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val columns = rule.getParameters.asScala.collect { case (k, v) if k.startsWith("TargetColumn") => v }.toSeq
    val check = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    columns match {
      case Nil => Left("Required parameters not found")

      case Seq(singleCol) =>
        val singleColCheck = addWhereClause(rule, check.isPrimaryKey(singleCol, None)).isComplete(singleCol, None)
        Right((
          addWhereClause(rule, singleColCheck),
          Seq(
            DeequMetricMapping("Column", singleCol, "Uniqueness", "Uniqueness", None, rule = rule),
            DeequMetricMapping("Column", singleCol, "Completeness", "Completeness", None, rule = rule)
          )
        ))

      case cols@(head +: tail) =>
        val uniquenessCheck = addWhereClause(rule, check.isPrimaryKey(head, None, tail: _*))
        // The accumulated result of the fold must be the check that is returned. Previously
        // the fold result was discarded and only the uniqueness check was returned, so the
        // per-column completeness constraints never reached the caller (the builder calls
        // return a Check rather than being used for side effects here).
        val checkWithCompleteness = cols.foldLeft(uniquenessCheck) {
          (accumulated, col) => addWhereClause(rule, accumulated.isComplete(col))
        }
        val completenessMetricMappings = cols.map(
          col => DeequMetricMapping("Column", col, "Completeness", "Completeness", None, rule = rule)
        )
        Right((
          checkWithCompleteness,
          Seq(DeequMetricMapping("Multicolumn", cols.mkString(","), "Uniqueness", "Uniqueness", None, rule = rule))
            ++ completenessMetricMappings
        ))
    }
  }
}
/**
 * Converts a DQDL rule into a Deequ uniqueness check over the columns given by the
 * rule parameters whose key starts with "TargetColumn".
 */
case class IsUniqueRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying one or more "TargetColumn*" parameters
   * @return `Left` when no target column is present; otherwise `Right` with the check
   *         (filtered by the rule's where clause when present) and one "Uniqueness" mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumns = rule.getParameters.asScala
      .collect { case (key, value) if key.startsWith("TargetColumn") => value }
      .toSeq
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)

    targetColumns match {
      case Nil =>
        Left("Required parameters not found")

      case Seq(onlyColumn) =>
        val filteredCheck = addWhereClause(rule, baseCheck.isUnique(onlyColumn))
        val metricMapping = DeequMetricMapping("Column", onlyColumn, "Uniqueness", "Uniqueness", None, rule = rule)
        Right((filteredCheck, Seq(metricMapping)))

      // two or more columns: checked as a combination
      case _ =>
        val filteredCheck = addWhereClause(rule, baseCheck.areUnique(targetColumns))
        val metricMapping = DeequMetricMapping(
          "Multicolumn", targetColumns.mkString(","), "Uniqueness", "Uniqueness", None, rule = rule)
        Right((filteredCheck, Seq(metricMapping)))
    }
  }
}
/**
 * Converts a DQDL rule into a Deequ mean check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class MeanRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Mean" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasMean(targetColumn, assertion))
    val metricMapping = DeequMetricMapping("Column", targetColumn, "Mean", "Mean", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ size check asserted with the rule's
 * number-based condition.
 */
case class RowCountRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single dataset-level "Size" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    // The row count is widened to Double because the DQDL condition is evaluated on Doubles
    val sizeCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
      .hasSize(rowCount => assertion(rowCount.toDouble))
    val filteredCheck = addWhereClause(rule, sizeCheck)
    val metricMapping = DeequMetricMapping("Dataset", "*", "RowCount", "Size", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ standard deviation check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class StandardDeviationRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "StandardDeviation" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasStandardDeviation(targetColumn, assertion))
    val metricMapping =
      DeequMetricMapping("Column", targetColumn, "StandardDeviation", "StandardDeviation", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ sum check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class SumRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "Sum" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasSum(targetColumn, assertion))
    val metricMapping = DeequMetricMapping("Column", targetColumn, "Sum", "Sum", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ unique value ratio check on the column named by the
 * "TargetColumn" parameter, asserted with the rule's number-based condition.
 */
case class UniqueValueRatioRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying the target column and a number-based condition
   * @return `Right` with the check (filtered by the rule's where clause when present)
   *         and a single "UniqueValueRatio" metric mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumn = rule.getParameters.asScala("TargetColumn")
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)
    val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
    val filteredCheck = addWhereClause(rule, baseCheck.hasUniqueValueRatio(Seq(targetColumn), assertion))
    val metricMapping =
      DeequMetricMapping("Column", targetColumn, "UniqueValueRatio", "UniqueValueRatio", None, rule = rule)
    Right((filteredCheck, Seq(metricMapping)))
  }
}
/**
 * Converts a DQDL rule into a Deequ uniqueness check over the columns given by the
 * rule parameters whose key starts with "TargetColumn", asserted with the rule's
 * number-based condition.
 */
case class UniquenessRule() extends DQDLRuleConverter {

  /**
   * @param rule rule carrying one or more "TargetColumn*" parameters and a number-based condition
   * @return `Left` when no target column is present; otherwise `Right` with the check
   *         (filtered by the rule's where clause when present) and one "Uniqueness" mapping
   */
  override def convert(rule: DQRule): Either[String, (Check, Seq[DeequMetricMapping])] = {
    val targetColumns: Seq[String] = rule.getParameters.asScala
      .collect { case (key, value) if key.startsWith("TargetColumn") => value }
      .toSeq
    val baseCheck = Check(CheckLevel.Error, java.util.UUID.randomUUID.toString)

    targetColumns match {
      case Nil =>
        Left("Required parameters not found")

      case Seq(onlyColumn) =>
        // The condition is only cast once a target column is known to exist
        val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
        val filteredCheck = addWhereClause(rule, baseCheck.hasUniqueness(onlyColumn, assertion))
        val metricMapping = DeequMetricMapping("Column", onlyColumn, "Uniqueness", "Uniqueness", None, rule = rule)
        Right((filteredCheck, Seq(metricMapping)))

      // two or more columns: checked as a combination
      case _ =>
        val assertion = assertionAsScala(rule, rule.getCondition.asInstanceOf[NumberBasedCondition])
        val filteredCheck = addWhereClause(rule, baseCheck.hasUniqueness(targetColumns, assertion))
        val metricMapping = DeequMetricMapping(
          "Multicolumn", targetColumns.mkString(","), "Uniqueness", "Uniqueness", None, rule = rule)
        Right((filteredCheck, Seq(metricMapping)))
    }
  }
}
/**
 * Small helpers shared by the DQDL rule converters.
 */
object DQDLUtility {

  /**
   * Formats a where clause for use in a metric description.
   *
   * @param whereClause the filter expression, possibly null
   * @return `Some("(where: <clause>)")`, or `None` when `whereClause` is null
   */
  def convertWhereClauseForMetric(whereClause: String): Option[String] =
    if (whereClause == null) None else Some(s"(where: $whereClause)")

  /** Returns true when the rule's where clause is not null. */
  def isWhereClausePresent(rule: DQRule): Boolean =
    Option(rule.getWhereClause).isDefined

  /**
   * Applies the rule's where clause to the last constraint of `check`, if the rule has one.
   *
   * @param rule  rule that may carry a where clause
   * @param check check whose last constraint can be filtered
   * @return the filtered check, or `check` itself when the rule has no where clause
   */
  def addWhereClause(rule: DQRule, check: CheckWithLastConstraintFilterable): Check =
    if (!isWhereClausePresent(rule)) check
    else check.where(rule.getWhereClause)

  /**
   * Decides whether an identifier needs quoting.
   *
   * @param s the identifier to inspect
   * @return false when `s` starts and ends with a backtick; otherwise true exactly when `s`
   *         contains a character outside `a-z`, `A-Z` and `0-9`
   */
  def requiresToBeQuoted(s: String): Boolean = {
    val wrappedInBackticks = s.startsWith("`") && s.endsWith("`")
    !wrappedInBackticks && """[^a-zA-Z0-9]""".r.findFirstMatchIn(s).isDefined
  }

}
/**
 * Abstraction over parsing a DQDL ruleset string into a [[DQRuleset]].
 */
trait DQDLParserTrait {

  /**
   * @param ruleset the DQDL ruleset text
   * @return the parsed ruleset
   */
  def parse(ruleset: String): DQRuleset
}

/**
 * Parser backed by [[DQDLParser]].
 */
object DefaultDQDLParser extends DQDLParserTrait {

  /**
   * Parses `ruleset` with a fresh [[DQDLParser]].
   *
   * @param ruleset the DQDL ruleset text
   * @return the parsed ruleset
   * @throws IllegalArgumentException when the parser reports an
   *         [[InvalidDataQualityRulesetException]]; the message is carried over and the
   *         original exception is attached as the cause
   */
  override def parse(ruleset: String): DQRuleset = {
    val dqdlParser: DQDLParser = new DQDLParser()
    Try(dqdlParser.parse(ruleset)) match {
      case Success(parsedRuleset) => parsedRuleset
      case Failure(ex: InvalidDataQualityRulesetException) =>
        // Chain the parser exception so its stack trace is not lost in translation
        throw new IllegalArgumentException(ex.getMessage, ex)
      // Any other failure is propagated unchanged
      case Failure(ex) => throw ex
    }
  }
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.examples

import org.apache.spark.sql.{DataFrame, SparkSession}

/** Spark session and sample-data helpers shared by the examples. */
private[deequ] object ExampleUtils {

  /**
   * Runs `func` against a local Spark session and always stops the session
   * afterwards, also clearing the `spark.driver.port` system property.
   */
  def withSpark(func: SparkSession => Unit): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("test")
      .config("spark.ui.enabled", "false")
      .getOrCreate()

    // Checkpoints go to the JVM's temp directory.
    val checkpointDir = System.getProperty("java.io.tmpdir")
    spark.sparkContext.setCheckpointDir(checkpointDir)

    try func(spark)
    finally {
      spark.stop()
      System.clearProperty("spark.driver.port")
    }
  }

  /** Builds a DataFrame from the given items. */
  def itemsAsDataframe(session: SparkSession, items: Item*): DataFrame = {
    val itemRdd = session.sparkContext.parallelize(items)
    session.createDataFrame(itemRdd)
  }

  /** Builds a DataFrame from the given manufacturers. */
  def manufacturersAsDataframe(session: SparkSession, manufacturers: Manufacturer*): DataFrame = {
    val manufacturerRdd = session.sparkContext.parallelize(manufacturers)
    session.createDataFrame(manufacturerRdd)
  }
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.examples

import ExampleUtils.{itemsAsDataframe, withSpark}
import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.analyzers.KLLParameters
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.constraints.ConstraintStatus
import org.apache.spark.sql.types.DoubleType

/** Runs a size, maximum and KLL sketch check on the `numViews` column of sample items. */
private[examples] object KLLCheckExample extends App {

  withSpark { session =>

    val items = itemsAsDataframe(session,
      Item(1, "Thingy A", "awesome thing.", "high", 0),
      Item(2, "Thingy B", "available at http://thingb.com", null, 0),
      Item(3, null, null, "low", 5),
      Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
      Item(5, "Thingy E", null, "high", 12))

    // Only numViews is kept, cast to double.
    val numViewsOnly = items.select(items("numViews").cast(DoubleType).as("numViews"))

    val integrityChecks = Check(CheckLevel.Error, "integrity checks")
      // we expect 5 records
      .hasSize(_ == 5)
      // we expect the maximum of numViews to be not more than 10
      .hasMax("numViews", _ <= 10)
      // we expect the sketch size to be at least 16
      .kllSketchSatisfies("numViews", _.parameters(1) >= 16,
        kllParameters = Option(KLLParameters(2, 0.64, 2)))

    val verificationResult = VerificationSuite()
      .onData(numViewsOnly)
      .addCheck(integrityChecks)
      .run()

    if (verificationResult.status != CheckStatus.Success) {
      println("We found errors in the data, the following constraints were not satisfied:\n")

      // Flatten the constraint results of all checks and report the unsuccessful ones.
      val allConstraintResults = verificationResult.checkResults
        .flatMap { case (_, checkResult) => checkResult.constraintResults }

      allConstraintResults
        .filterNot { _.status == ConstraintStatus.Success }
        .foreach { failed =>
          println(s"${failed.constraint} failed: ${failed.message.get}")
        }
    } else {
      println("The data passed the test, everything is fine!")
    }

  }
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.examples

/** Sample item record used as input data by the examples. */
private[deequ] case class Item(
  id: Long,
  productName: String,
  description: String,
  priority: String,
  numViews: Long
)

/** Sample manufacturer record used as input data by the examples. */
private[deequ] case class Manufacturer(
  id: Long,
  manufacturerName: String,
  countryCode: String
)
/**
 * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.metrics

import scala.util.Failure
import scala.util.Success
import scala.util.Try

/** One histogram bin: its bounds, absolute frequency and ratio. */
case class BinData(binStart: Double, binEnd: Double, frequency: Long, ratio: Double)

/** The bins of a binned histogram together with the reported number of bins. */
case class DistributionBinned(
  bins: Vector[BinData],
  numberOfBins: Long
) {

  /** Returns the bin at `index`. */
  def apply(index: Int): BinData = bins(index)

  /**
   * Formats the bounds of the bin at `index` with two decimals. The last bin is
   * rendered as a closed interval `[a, b]`, every other bin as `[a, b)`.
   */
  def getInterval(index: Int): String = {
    val selected = bins(index)
    val isLastBin = index == bins.length - 1
    if (!isLastBin) {
      f"[${selected.binStart}%.2f, ${selected.binEnd}%.2f)"
    } else {
      f"[${selected.binStart}%.2f, ${selected.binEnd}%.2f]"
    }
  }
}

/** Metric holding the binned distribution of a column. */
case class HistogramBinnedMetric(column: String, value: Try[DistributionBinned]) extends Metric[DistributionBinned] {

  val entity = Entity.Column
  val instance = column
  val name = "HistogramBinned"

  /**
   * Flattens the distribution into one `bins` metric followed by one
   * `abs.bin<i>` metric per bin. A failed value (with an Exception) yields a
   * single failed `bins` metric instead.
   */
  def flatten(): Seq[DoubleMetric] = {
    val flattened = value.map { binned =>
      val binCountMetric = DoubleMetric(entity, s"$name.bins", instance,
        Success(binned.numberOfBins))

      val perBinMetrics = binned.bins.zipWithIndex.map { case (bin, position) =>
        DoubleMetric(entity, s"$name.abs.bin$position", instance, Success(bin.frequency))
      }
      binCountMetric +: perBinMetrics
    }

    flattened match {
      case Success(metrics) => metrics
      case Failure(e: Exception) => Seq(DoubleMetric(entity, s"$name.bins", instance, Failure(e)))
      case Failure(other) => throw other
    }
  }
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.metrics

import scala.util.{Failure, Success, Try}

/** Absolute count and ratio of one histogram value. */
case class DistributionValue(absolute: Long, ratio: Double)

/** Histogram values keyed by their string representation, plus the reported number of bins. */
case class Distribution(values: Map[String, DistributionValue], numberOfBins: Long) {

  /** Looks up the entry for `key`. */
  def apply(key: String): DistributionValue = values(key)

  /** Key of the entry with the largest absolute count. */
  def argmax: String = {
    val entries = values.toSeq
    val largest = entries.maxBy { entry => entry._2.absolute }
    largest._1
  }
}

/** Metric holding the value distribution of a column. */
case class HistogramMetric(column: String, value: Try[Distribution]) extends Metric[Distribution] {
  val entity: Entity.Value = Entity.Column
  val instance: String = column
  val name = "Histogram"

  /**
   * Flattens the distribution into a `bins` metric followed by an `abs.<key>`
   * and a `ratio.<key>` metric per value. A failed value (with an Exception)
   * yields a single failed `bins` metric instead.
   */
  def flatten(): Seq[DoubleMetric] = {
    val flattened = value.map { dist =>
      val binCount = Seq(DoubleMetric(entity, s"$name.bins", instance,
        Success(dist.numberOfBins.toDouble)))

      val perValue = dist.values.flatMap { case (key, entry) =>
        List(
          DoubleMetric(entity, s"$name.abs.$key", instance, Success(entry.absolute)),
          DoubleMetric(entity, s"$name.ratio.$key", instance, Success(entry.ratio)))
      }
      binCount ++ perValue
    }

    flattened match {
      case Success(metrics) => metrics
      case Failure(e: Exception) => Seq(DoubleMetric(entity, s"$name.bins", instance, Failure(e)))
      case Failure(other) => throw other
    }
  }

}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.repository

import com.amazon.deequ.analyzers.runners.AnalyzerContext

/**
  * Common trait for repositories in which deequ runs can be stored.
  * A repository provides methods to store AnalysisResults (metrics) and
  * VerificationResults (if any).
  */
trait MetricsRepository {

  /**
    * Saves Analysis results (metrics)
    *
    * @param resultKey       A ResultKey that uniquely identifies an AnalysisResult
    * @param analyzerContext The resulting AnalyzerContext of an Analysis
    */
  def save(resultKey: ResultKey, analyzerContext: AnalyzerContext): Unit

  /**
    * Gets the AnalyzerContext that was saved using exactly the same resultKey, if present
    *
    * @param resultKey The key the AnalyzerContext was saved under
    * @return The stored AnalyzerContext, or None if nothing is stored for that key
    */
  def loadByKey(resultKey: ResultKey): Option[AnalyzerContext]

  /** Gets a builder class to construct a loading query to get AnalysisResults */
  def load(): MetricsRepositoryMultipleResultsLoader

}

/**
  * Information that uniquely identifies an AnalysisResult
  *
  * @param dataSetDate A date related to the AnalysisResult
  * @param tags        A map with additional annotations (empty by default)
  */
case class ResultKey(dataSetDate: Long, tags: Map[String, String] = Map.empty)
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions

import com.amazon.deequ.VerificationResult
import com.amazon.deequ.checks.CheckStatus
import com.amazon.deequ.profiles.{ColumnProfile, ColumnProfiles}

/**
  * The result returned from the ConstraintSuggestionSuite
  *
  * @param columnProfiles The column profiles
  * @param numRecordsUsedForProfiling The number of records that were used for computing
  *                                   the column profiles
  * @param constraintSuggestions The suggested constraints
  * @param verificationResult The verificationResult in case a train/test split was used
  */
case class ConstraintSuggestionResult(
    columnProfiles: Map[String, ColumnProfile],
    numRecordsUsedForProfiling: Long,
    constraintSuggestions: Map[String, Seq[ConstraintSuggestion]],
    verificationResult: Option[VerificationResult] = None)


/** JSON export helpers for a [[ConstraintSuggestionResult]]. */
object ConstraintSuggestionResult {

  /** Serializes all column profiles of the result to JSON. */
  def getColumnProfilesAsJson(constraintSuggestionResult: ConstraintSuggestionResult): String = {
    val profiles = constraintSuggestionResult.columnProfiles.values.toSeq
    ColumnProfiles.toJson(profiles)
  }

  /** Serializes the suggestions of all columns to JSON. */
  def getConstraintSuggestionsAsJson(constraintSuggestionResult: ConstraintSuggestionResult)
    : String = {
    ConstraintSuggestions.toJson(suggestionsOfAllColumns(constraintSuggestionResult))
  }

  /**
    * Serializes the suggestions together with their evaluation results to JSON.
    * When no verification result is present, an empty one with status Warning is used.
    */
  def getEvaluationResultsAsJson(constraintSuggestionResult: ConstraintSuggestionResult)
    : String = {
    val suggestions = suggestionsOfAllColumns(constraintSuggestionResult)
    val verification: VerificationResult = constraintSuggestionResult.verificationResult
      .getOrElse(VerificationResult(CheckStatus.Warning, Map.empty, Map.empty))

    ConstraintSuggestions.evaluationResultsToJson(suggestions, verification)
  }

  /** Concatenates the per-column suggestion lists into a single sequence. */
  private def suggestionsOfAllColumns(result: ConstraintSuggestionResult)
    : Seq[ConstraintSuggestion] = {
    result.constraintSuggestions.values
      .foldLeft(Seq.empty[ConstraintSuggestion])(_ ++ _)
  }
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.completenessConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** Suggests a NOT NULL constraint for columns that are complete in the sample. */
case class CompleteIfCompleteRule() extends ConstraintRule[ColumnProfile] {

  /** Applies when the profiled completeness is exactly 1.0. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean =
    profile.completeness == 1.0

  /** Builds the completeness suggestion for the profiled column. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val column = profile.column
    val currentValue = "Completeness: " + profile.completeness.toString

    CommonConstraintSuggestion(
      completenessConstraint(column, Check.IsOne),
      column,
      currentValue,
      s"'${column}' is not null",
      this,
      s""".isComplete("${column}")"""
    )
  }

  override val ruleDescription: String = "If a column is complete in the sample, " +
    "we suggest a NOT NULL constraint"
}
/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion

/**
  * Abstract base class for all constraint suggestion rules
  *
  * @tparam P the kind of column profile the rule works on
  */
abstract class ConstraintRule[P <: ColumnProfile] {

  /** Human-readable description of the rule */
  val ruleDescription: String

  /**
    * Decide whether the rule should be applied to a particular column
    *
    * @param profile profile of the column
    * @param numRecords overall number of records
    * @return true if the rule should be applied to the column
    */
  def shouldBeApplied(profile: P, numRecords: Long): Boolean

  /**
    * Generates a suggested constraint for the column
    *
    * @param profile profile of the column
    * @param numRecords overall number of records
    * @return the suggested constraint
    */
  def candidate(profile: P, numRecords: Long): ConstraintSuggestion
}
/**
 * Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.maxConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a numeric column profile reports a maximum, we suggest a corresponding
  * maximum value constraint
  */
case class HasMax() extends ConstraintRule[ColumnProfile] {

  /** Applies only to numeric profiles for which a maximum is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = profile match {
    case numeric: NumericColumnProfile => numeric.maximum.isDefined
    case _ => false
  }

  /** Builds the suggestion; expects a numeric profile with a defined maximum. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val maximum: Double = profile match { case numeric: NumericColumnProfile => numeric.maximum.get }
    val column = profile.column

    // NOTE(review): the description reads "<=" while the constraint and the
    // generated code snippet assert equality - confirm which one is intended.
    CommonConstraintSuggestion(
      maxConstraint(column, _ == maximum),
      column,
      s"Maximum: $maximum",
      s"'${column}' <= $maximum",
      this,
      s""".hasMax("${column}", _ == $maximum)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding Maximum value constraint"
}
/**
 * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.maxLengthConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.StringColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a string column profile reports a maximum length, we suggest a corresponding
  * maximum length constraint
  */
case class HasMaxLength() extends ConstraintRule[ColumnProfile] {

  /** Applies only to string profiles for which a maximum length is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      case profile: StringColumnProfile => profile.maxLength.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion; expects a string profile with a defined maximum length. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val maxLength: Double = profile.asInstanceOf[StringColumnProfile].maxLength.get

    val constraint = maxLengthConstraint(profile.column, _ <= maxLength)

    CommonConstraintSuggestion(
      constraint,
      profile.column,
      // Report the observed maximum length (previously the column's completeness
      // was printed under the "MaxLength" label).
      "MaxLength: " + maxLength,
      s"The length of '${profile.column}' <= $maxLength",
      this,
      s""".hasMaxLength("${profile.column}", _ <= $maxLength)"""
    )
  }

  override val ruleDescription: String = "If we see a string column, " +
    "we suggest a corresponding Maximum length constraint"
}
/**
 * Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.meanConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a numeric column profile reports a mean, we suggest a corresponding
  * mean value constraint
  */
case class HasMean() extends ConstraintRule[ColumnProfile] {

  /** Applies only to numeric profiles for which a mean is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      case np: NumericColumnProfile => np.mean.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion; expects a numeric profile with a defined mean. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    // Read the mean - the value shouldBeApplied guards on - not the maximum.
    val mean: Double = profile match { case np: NumericColumnProfile => np.mean.get }

    // NOTE(review): the description reads "<=" while the constraint and the
    // generated code snippet assert equality - confirm which one is intended.
    val description = s"'${profile.column}' <= $mean"
    val constraint = meanConstraint(profile.column, _ == mean)

    CommonConstraintSuggestion(
      constraint,
      profile.column,
      s"Mean: $mean",
      description,
      this,
      s""".hasMean("${profile.column}", _ == $mean)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding Mean value constraint"
}

/**
 * Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.minConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a numeric column profile reports a minimum, we suggest a corresponding
  * minimum value constraint
  */
case class HasMin() extends ConstraintRule[ColumnProfile] {

  /** Applies only to numeric profiles for which a minimum is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      case np: NumericColumnProfile => np.minimum.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion; expects a numeric profile with a defined minimum. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val minimum: Double = profile match { case np: NumericColumnProfile => np.minimum.get }

    // NOTE(review): the description reads ">=" while the constraint and the
    // generated code snippet assert equality - confirm which one is intended.
    val description = s"'${profile.column}' >= $minimum"
    val constraint = minConstraint(profile.column, _ == minimum)

    CommonConstraintSuggestion(
      constraint,
      profile.column,
      s"Minimum: $minimum",
      description,
      this,
      s""".hasMin("${profile.column}", _ == $minimum)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding Minimum value constraint"
}
/**
 * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.minLengthConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.StringColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a string column profile reports a minimum length, we suggest a corresponding
  * minimum length constraint
  */
case class HasMinLength() extends ConstraintRule[ColumnProfile] {

  /** Applies only to string profiles for which a minimum length is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = profile match {
    case stringProfile: StringColumnProfile => stringProfile.minLength.isDefined
    case _ => false
  }

  /** Builds the suggestion; expects a string profile with a defined minimum length. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val minLength: Double = profile.asInstanceOf[StringColumnProfile].minLength.get
    val column = profile.column

    CommonConstraintSuggestion(
      minLengthConstraint(column, _ >= minLength),
      column,
      "MinLength: " + minLength,
      s"The length of '${column}' >= $minLength",
      this,
      s""".hasMinLength("${column}", _ >= $minLength)"""
    )
  }

  override val ruleDescription: String = "If we see a string column, " +
    "we suggest a corresponding Minimum length constraint"
}
/**
 * Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.standardDeviationConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a numeric column profile reports a standard deviation, we suggest a
  * corresponding standard deviation constraint
  */
case class HasStandardDeviation() extends ConstraintRule[ColumnProfile] {

  /** Applies only to numeric profiles for which a standard deviation is available. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      // Guard on stdDev, the value candidate() unwraps; guarding on the mean
      // would let candidate() fail on a profile without a standard deviation.
      case np: NumericColumnProfile => np.stdDev.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion; expects a numeric profile with a defined standard deviation. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val stdDev: Double = profile match { case np: NumericColumnProfile => np.stdDev.get }

    // NOTE(review): the description reads "<=" while the constraint and the
    // generated code snippet assert equality - confirm which one is intended.
    val description = s"'${profile.column}' <= $stdDev"
    val constraint = standardDeviationConstraint(profile.column, _ == stdDev)

    CommonConstraintSuggestion(
      constraint,
      profile.column,
      s"stdDev: $stdDev",
      description,
      this,
      s""".hasStandardDeviation("${profile.column}", _ == $stdDev)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding standard deviation value constraint"
}
/** Suggests a non-negativity constraint when a numeric column's minimum is at least zero. */
case class NonNegativeNumbersRule() extends ConstraintRule[ColumnProfile] {

  /** Applies to numeric profiles whose minimum is present and not below zero. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean =
    profile match {
      case numeric: NumericColumnProfile => numeric.minimum.exists(min => min >= 0.0)
      case _ => false
    }

  /** Builds the compliance-based suggestion that every value of the column is `>= 0`. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val columnName = profile.column
    val description = s"'$columnName' has no negative values"

    val constraint = complianceConstraint(
      description,
      s"$columnName >= 0",
      Check.IsOne,
      columns = List(columnName))

    // Rendered into the "current value" text; falls back to a fixed message when the
    // profile is not numeric or carries no minimum.
    val fallbackText = "Error while calculating minimum!"
    val minimumText = profile match {
      case numeric: NumericColumnProfile =>
        numeric.minimum.map(_.toString).getOrElse(fallbackText)
      case _ => fallbackText
    }

    CommonConstraintSuggestion(
      constraint,
      columnName,
      "Minimum: " + minimumText,
      description,
      this,
      s""".isNonNegative("$columnName")"""
    )
  }

  override val ruleDescription: String =
    "If we see only non-negative numbers in a column, we suggest a corresponding constraint"
}
/** Suggests a data type constraint when an integral, fractional or boolean type was inferred. */
case class RetainTypeRule() extends ConstraintRule[ColumnProfile] {

  /**
   * Applies when the profile's data type was inferred and is one of the types this rule
   * can turn into a constraint.
   */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    val constrainableTypes = Seq(
      DataTypeInstances.Integral,
      DataTypeInstances.Fractional,
      DataTypeInstances.Boolean)

    profile.isDataTypeInferred && constrainableTypes.contains(profile.dataType)
  }

  /**
   * Builds the suggestion that the column keeps its detected data type.
   *
   * The type mapping is partial on purpose; it covers exactly the types accepted by
   * `shouldBeApplied`.
   */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val columnName = profile.column
    val detectedType = profile.dataType

    val typeToCheck = detectedType match {
      case DataTypeInstances.Fractional => ConstrainableDataTypes.Fractional
      case DataTypeInstances.Integral => ConstrainableDataTypes.Integral
      case DataTypeInstances.Boolean => ConstrainableDataTypes.Boolean
    }

    CommonConstraintSuggestion(
      dataTypeConstraint(columnName, typeToCheck, Check.IsOne),
      columnName,
      s"DataType: $detectedType",
      s"'$columnName' has type $detectedType",
      this,
      s""".hasDataType("$columnName", ConstrainableDataTypes.$detectedType)"""
    )
  }

  override val ruleDescription: String =
    "If we detect a non-string type, we suggest a type constraint"
}
/**
 * If the approximate number of distinct values in a complete column is close to the
 * number of records (within a fixed tolerance), we suggest a UNIQUE constraint.
 */
case class UniqueIfApproximatelyUniqueRule() extends ConstraintRule[ColumnProfile] {

  /** Ratio of the approximate distinct count to the number of records. */
  private def approximateDistinctnessOf(profile: ColumnProfile, numRecords: Long): Double =
    profile.approximateNumDistinctValues.toDouble / numRecords

  /**
   * Applies when the column is fully complete and its approximate distinctness is within
   * 0.08 of 1.0.
   */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    val distanceFromUnique = math.abs(1.0 - approximateDistinctnessOf(profile, numRecords))

    // TODO This bound depends on the error guarantees of the HLL sketch
    profile.completeness == 1.0 && distanceFromUnique <= 0.08
  }

  /** Builds the suggestion that the column is unique. */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val columnName = profile.column
    val distinctness = approximateDistinctnessOf(profile, numRecords)

    CommonConstraintSuggestion(
      uniquenessConstraint(Seq(columnName), Check.IsOne),
      columnName,
      s"ApproxDistinctness: $distinctness",
      s"'$columnName' is unique",
      this,
      s""".isUnique("$columnName")"""
    )
  }

  override val ruleDescription: String =
    "If the ratio of approximate num distinct values in a column is close to the number " +
      "of records (within the error of the HLL sketch), we suggest a UNIQUE constraint"
}
/**
 * Strategy for calculating a confidence interval.
 */
trait ConfidenceIntervalStrategy {

  /**
   * Calculates a confidence interval around an observed proportion.
   *
   * @param pHat       proportion of the sample that shares a trait
   * @param numRecords overall number of records
   * @param confidence confidence level of the method used to estimate the interval
   * @return the lower and upper bound of the interval
   */
  def calculateTargetConfidenceInterval(
    pHat: Double,
    numRecords: Long,
    confidence: Double = defaultConfidence
  ): ConfidenceInterval

  /**
   * Checks that both `pHat` and `confidence` lie within [0.0, 1.0].
   *
   * @throws IllegalArgumentException if either value is outside that range
   */
  def validateInput(pHat: Double, confidence: Double): Unit = {
    val pHatInRange = pHat >= 0.0 && pHat <= 1.0
    require(pHatInRange, "pHat must be between 0.0 and 1.0")

    val confidenceInRange = confidence >= 0.0 && confidence <= 1.0
    require(confidenceInRange, "confidence must be between 0.0 and 1.0")
  }

  /**
   * Two-sided z-score for the given confidence level, taken from the inverse CDF of the
   * standard normal distribution.
   */
  def calculateZScore(confidence: Double): Double = {
    val standardNormal = Gaussian(0, 1)(Rand)
    // Probability mass left in each tail outside the confidence level.
    val tailMass = (1.0 - confidence) / 2.0
    standardNormal.inverseCdf(1 - tailMass)
  }
}

/** Defaults and the result type shared by all confidence interval strategies. */
object ConfidenceIntervalStrategy {
  val defaultConfidence = 0.95
  val defaultIntervalStrategy: ConfidenceIntervalStrategy = WaldIntervalStrategy()

  /** Lower and upper bound of a confidence interval. */
  case class ConfidenceInterval(lowerBound: Double, upperBound: Double)
}
/**
 * Implements the Wald interval method for creating a binomial proportion confidence
 * interval. Provided for backwards compatibility. Using [[WaldIntervalStrategy]] to
 * calculate a confidence interval can be problematic when dealing with small sample sizes
 * or proportions close to 0 or 1. It also has poorer coverage and might produce confidence
 * limits outside the range of [0,1].
 *
 * @see Normal approximation interval (Wikipedia, "Binomial proportion confidence interval")
 */
@deprecated("WilsonScoreIntervalStrategy is recommended for calculating confidence interval")
case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy {

  /**
   * Calculates `pHat` plus/minus the z-score times the standard error. The lower bound is
   * rounded down and the upper bound rounded up, both to two decimal places.
   *
   * NOTE(review): for numRecords == 0 the margin is not finite, which BigDecimal is
   * expected to reject with an exception -- confirm callers never pass 0.
   *
   * @param pHat       proportion of the sample that shares a trait
   * @param numRecords overall number of records
   * @param confidence confidence level used to derive the z-score
   */
  def calculateTargetConfidenceInterval(
    pHat: Double,
    numRecords: Long,
    confidence: Double = defaultConfidence
  ): ConfidenceInterval = {
    validateInput(pHat, confidence)

    val standardError = math.sqrt(pHat * (1 - pHat) / numRecords)
    val margin = BigDecimal(calculateZScore(confidence) * standardError)
    val center = BigDecimal(pHat)

    // Round outwards so the reported interval never gets narrower than the computed one.
    val lower = (center - margin).setScale(2, RoundingMode.DOWN)
    val upper = (center + margin).setScale(2, RoundingMode.UP)

    ConfidenceInterval(lower.toDouble, upper.toDouble)
  }
}
/**
 * Uses the Wilson score method for creating a binomial proportion confidence interval.
 *
 * @see Wilson score interval (Wikipedia, "Binomial proportion confidence interval")
 */
case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy {

  /**
   * Calculates the Wilson score interval for `pHat`. The lower bound is rounded down and
   * the upper bound rounded up, both to two decimal places.
   *
   * @param pHat       proportion of the sample that shares a trait
   * @param numRecords overall number of records
   * @param confidence confidence level used to derive the z-score
   */
  def calculateTargetConfidenceInterval(
    pHat: Double, numRecords: Long,
    confidence: Double = defaultConfidence
  ): ConfidenceInterval = {
    validateInput(pHat, confidence)

    val z = calculateZScore(confidence)
    val zSquaredPerRecord = math.pow(z, 2) / numRecords

    // Shrinkage factor applied to both the shifted center and the margin.
    val shrinkage = 1.0 / (1 + zSquaredPerRecord)
    val shiftedCenter = pHat + zSquaredPerRecord/2
    val margin = z * math.sqrt(pHat * (1 - pHat)/numRecords + zSquaredPerRecord/(4 * numRecords))

    // Round outwards so the reported interval never gets narrower than the computed one.
    def rounded(value: Double, mode: RoundingMode.Value): Double =
      BigDecimal(value).setScale(2, mode).toDouble

    ConfidenceInterval(
      rounded(shrinkage * (shiftedCenter - margin), RoundingMode.DOWN),
      rounded(shrinkage * (shiftedCenter + margin), RoundingMode.UP)
    )
  }
}
/** Helpers for adding and stripping backtick escaping on column names. */
object ColumnUtil {

  /**
   * Strips one pair of surrounding backticks from a column name.
   *
   * @param column column name, possibly wrapped in backticks
   * @return the name without its outer backticks, or `column` unchanged when it is not
   *         wrapped in a pair of backticks
   */
  def removeEscapeColumn(column: String): String = {
    // The length guard matters: for the one-character string "`" both startsWith and
    // endsWith hold, and substring(1, 0) would throw StringIndexOutOfBoundsException.
    if (column.length >= 2 && column.startsWith("`") && column.endsWith("`")) {
      column.substring(1, column.length - 1)
    } else {
      column
    }
  }

  /**
   * Wraps a column name in backticks when it contains a dot.
   *
   * @param column column name to escape
   * @return the backtick-wrapped name if it contains ".", otherwise `column` unchanged
   */
  def escapeColumn(column: String): String = {
    if (column.contains(".")) {
      "`" + column + "`"
    } else {
      column
    }
  }

}
You may not 5 | * use this file except in compliance with the License. A copy of the License 6 | * is located at 7 | *
8 | * http://aws.amazon.com/apache2.0/ 9 | *
10 | * or in the "license" file accompanying this file. This file is distributed on
11 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12 | * express or implied. See the License for the specific language governing
13 | * permissions and limitations under the License.
14 | */
15 |
16 | package com.amazon.deequ.examples;
17 |
18 | import org.apache.spark.sql.Dataset;
19 | import org.apache.spark.sql.Row;
20 | import org.apache.spark.sql.SparkSession;
21 | import com.amazon.deequ.dqdl.EvaluateDataQuality;
22 |
23 |
24 | public class JavaDQDLExample {
25 |
26 | /**
27 | * Main method demonstrating the data quality evaluation
28 | */
29 | public static void main(String[] args) {
30 | // Initialize Spark session
31 | SparkSession sparkSession = SparkSession.builder()
32 | .appName("JavaDataQualityExample")
33 | .master("local[*]")
34 | .getOrCreate();
35 |
36 | try {
37 |
38 | // Create sample data
39 | Dataset