├── .git-blame-ignore-revs ├── .github ├── .scala-steward.conf ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── new_check.md │ └── other-feature-request.md ├── PULL_REQUEST_TEMPLATE │ └── new_check.md ├── dependabot.yml ├── stale.yaml └── workflows │ ├── ci.yaml │ ├── release.yaml │ └── scala-steward.yaml ├── .gitignore ├── .scalafmt.conf ├── Brewfile ├── CONTRIBUTING.md ├── COPYRIGHT ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── bin └── sbt ├── build.sbt ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── scalastyle-config.xml └── src ├── main ├── resources │ └── log4j-dv-spark.properties └── scala │ └── com │ └── target │ └── data_validator │ ├── CliOptionParser.scala │ ├── ConfigParser.scala │ ├── ConfigVar.scala │ ├── Emailer.scala │ ├── EnvironmentVariables.scala │ ├── EventGenerator.scala │ ├── EventLog.scala │ ├── ExpressionUtils.scala │ ├── GenTestData.scala │ ├── HTMLBits.scala │ ├── IO.scala │ ├── JsonEncoders.scala │ ├── JsonUtils.scala │ ├── Main.scala │ ├── Reports.scala │ ├── Substitutable.scala │ ├── ValidatorConfig.scala │ ├── ValidatorEvent.scala │ ├── ValidatorOutput.scala │ ├── ValidatorTable.scala │ ├── VarSubstitution.scala │ ├── stats │ ├── Bin.scala │ ├── CompleteStats.scala │ ├── FirstPassStats.scala │ ├── FirstPassStatsAggregator.scala │ ├── Histogram.scala │ ├── SecondPassStats.scala │ └── SecondPassStatsAggregator.scala │ └── validator │ ├── ColStats.scala │ ├── ColumnBased.scala │ ├── ColumnSumCheck.scala │ ├── JsonDecoders.scala │ ├── NegativeCheck.scala │ ├── NullCheck.scala │ ├── RangeCheck.scala │ ├── RowBased.scala │ ├── StringLengthCheck.scala │ ├── StringRegexCheck.scala │ ├── TwoPassCheapCheck.scala │ ├── UniqueCheck.scala │ └── ValidatorBase.scala └── test ├── resources ├── format_test.jsonl └── test_config.yaml └── scala └── com └── target ├── TestingSparkSession.scala └── data_validator ├── CliOptionParserSpec.scala ├── ConfigParserSpec.scala ├── ConfigVarSubSpec.scala ├── EmailerSpec.scala ├── EnvironmentVariablesSpec.scala ├── ExpressionUtilsSpec.scala ├── IOSpec.scala ├── JsonUtilsSpec.scala ├── TestHelpers.scala ├── ValidatorBaseSpec.scala ├── ValidatorOutputSpec.scala ├── ValidatorSpecifiedFormatLoaderSpec.scala ├── ValidatorTableSpec.scala ├── VarSubstitutionSpec.scala ├── stats ├── FirstPassStatsAggregatorSpec.scala ├── NumericData.scala └── SecondPassStatsAggregatorSpec.scala └── validator ├── ColStatsSpec.scala ├── ColumnBasedSpec.scala ├── ColumnSumCheckSpec.scala ├── ConfigVarSpec.scala ├── Mocker.scala ├── NegativeCheckSpec.scala ├── NullCheckSpec.scala ├── RangeCheckSpec.scala ├── RowBasedSpec.scala ├── StringLengthCheckSpec.scala ├── StringRegexCheckSpec.scala ├── TestHelpersSpec.scala └── UniqueCheckSpec.scala /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Scala Steward: Reformat with scalafmt 3.5.9 2 | 4a3a718ed6d94e89d7a478aa040831b85b5c5580 3 | -------------------------------------------------------------------------------- /.github/.scala-steward.conf: -------------------------------------------------------------------------------- 1 | # pullRequests.frequency allows to control how often or when Scala Steward 2 | # is allowed to create pull requests. 3 | # 4 | # Possible values: 5 | # @asap 6 | # PRs are created without delay. 7 | # 8 | # 9 | # PRs are created only again after the given timespan since the last PR 10 | # has passed. Example values are "36 hours", "1 day", or "14 days". 
11 | 12 | # 13 | # PRs are created roughly according to the given CRON expression. 14 | # 15 | # CRON expressions consist of five fields: 16 | # minutes, hour of day, day of month, month, and day of week. 17 | # 18 | # See https://www.alonsodomin.me/cron4s/userguide/index.html#parsing for 19 | # more information about the CRON expressions that are supported. 20 | # 21 | # Note that the date parts of the CRON expression are matched exactly 22 | # while the time parts are only used to abide to the frequency of 23 | # the given expression. 24 | # 25 | # Default: @asap 26 | # 27 | #pullRequests.frequency = "0 0 ? * 3" # every thursday on midnight 28 | pullRequests.frequency = "7 days" 29 | 30 | # Only these dependencies which match the given patterns are updated. 31 | # 32 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 33 | # Defaults to empty `[]` which mean Scala Steward will update all dependencies. 34 | # updates.allow = [ { groupId = "com.example" } ] 35 | 36 | # The dependencies which match the given version pattern are updated. 37 | # Dependencies that are not listed will be updated. 38 | # 39 | # Each pattern must have `groupId`, `version` and optional `artifactId`. 40 | # Defaults to empty `[]` which mean Scala Steward will update all dependencies. 41 | updates.pin = [ 42 | { groupId = "org.apache.spark", artifactId = "spark-sql", version = "2.3.4" } 43 | ] 44 | 45 | # The dependencies which match the given pattern are NOT updated. 46 | # 47 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 48 | # Defaults to empty `[]` which mean Scala Steward will not ignore dependencies. 49 | # TODO: multi-version build coming soon 50 | updates.ignore = [ { groupId = "org.scala-lang", artifactId = "scala-library" } ] 51 | 52 | # If set, Scala Steward will only create or update `n` PRs each time it runs (see `pullRequests.frequency` above). 53 | # Useful if running frequently and/or CI build are costly 54 | # Default: None 55 | # updates.limit = 5 56 | 57 | # The extensions of files that should be updated. 58 | # Default: [".scala", ".sbt", ".sbt.shared", ".sc", ".yml", "pom.xml"] 59 | # updates.fileExtensions = [".scala", ".sbt", ".sbt.shared", ".sc", ".yml", ".md", ".markdown", ".txt"] 60 | 61 | # If "on-conflicts", Scala Steward will update the PR it created to resolve conflicts as 62 | # long as you don't change it yourself. 63 | # If "always", Scala Steward will always update the PR it created as long as 64 | # you don't change it yourself. 65 | # If "never", Scala Steward will never update the PR 66 | # Default: "on-conflicts" 67 | # updatePullRequests = "always" | "on-conflicts" | "never" 68 | 69 | # If set, Scala Steward will use this message template for the commit messages and PR titles. 70 | # Supported variables: ${artifactName}, ${currentVersion}, ${nextVersion} and ${default} 71 | # Default: "${default}" which is equivalent to "Update ${artifactName} to ${nextVersion}" 72 | commits.message = "Update ${artifactName} from ${currentVersion} to ${nextVersion}" 73 | 74 | # If true and when upgrading version in .scalafmt.conf, Scala Steward will perform scalafmt 75 | # and add a separate commit when format changed. So you don't need reformat manually and can merge PR. 76 | # If false, Scala Steward will not perform scalafmt, so your CI may abort when reformat needed. 77 | # Default: true 78 | scalafmt.runAfterUpgrading = false 79 | 80 | # It is possible to have multiple scala projects in a single repository. 
In that case the folders containing the projects (build.sbt folders) 81 | # are specified using the buildRoots property. Note that the paths used there are relative and if the repo directory itself also contains a build.sbt the dot can be used to specify it. 82 | # Default: ["."] 83 | # buildRoots = [ ".", "subfolder/projectA" ] 84 | 85 | # Define commands that are executed after an update via a hook. 86 | # A groupId and/or artifactId can be defined to only execute after certain dependencies are updated. If neither is defined, the hook runs for every update. 87 | # postUpdateHooks = [{ 88 | # command = ["sbt", "protobufGenerate"], 89 | # commitMessage = "Regenerated protobuf files", 90 | # groupId = "com.github.sbt", 91 | # artifactId = "sbt-protobuf" 92 | # }] 93 | 94 | # You can override some config options for dependencies that matches the given pattern. 95 | # Currently, "pullRequests" can be overridden. 96 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 97 | # First-matched entry is used. 98 | # More-specific entry should be placed before less-specific entry. 99 | # 100 | # Default: empty `[]` 101 | # dependencyOverrides = [ 102 | # { 103 | # dependency = { groupId = "com.example", artifactId = "foo", version = "2." }, 104 | # pullRequests = { frequency = "1 day" }, 105 | # }, 106 | # { 107 | # dependency = { groupId = "com.example", artifactId = "foo" }, 108 | # pullRequests = { frequency = "30 day" }, 109 | # }, 110 | # { 111 | # dependency = { groupId = "com.example" }, 112 | # pullRequests = { frequency = "14 day" }, 113 | # } 114 | # ] 115 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @target/data-validator-members @c-horn 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | 20 | 21 | **Expected behavior** 22 | 23 | 24 | **Log output** 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_check.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New check 3 | about: Suggest a new check 4 | title: 'New check:' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | # What would you like to check? 11 | 12 | 13 | 14 | # What does the configuration look like? 15 | 16 | 20 | 21 | # Are you going to work on it, or are you asking for it? 22 | 23 | - [ ] Asking 24 | - [ ] Working 25 | 26 | ## If _working_ on it, when do you think you'll have a PR ready? 27 | 28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/other-feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Other feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/new_check.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New check 3 | about: Submit a new check 4 | title: 'New check:' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # What issue is this PR solving? 11 | 12 | 13 | 14 | # What does the configuration look like? 15 | 16 | 24 | 25 | # Have you completed all of these? 26 | 27 | - [ ] Add configuration documentation to the Validators section of the README. You should be able to copy this from the previous section. 28 | - [ ] Pass the style checker requirements without warnings or errors (`sbt test` will not work without compliance!) 29 | - [ ] Does not modify any of the other validators. Please review the section Refactoring in the CONTRIBUTING.md. 30 | - [ ] Include tests. Submissions without tests will not be considered. Test the following things: 31 | - [ ] Configuration parsing 32 | - [ ] Configuration sanity checking 33 | - [ ] Variable substitution 34 | - [ ] Actual check functionality 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Check for updates to GitHub Actions every weekday 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | 9 | # Until Dependabot provides Scala updates, they are handled by the Scala Steward action. 10 | # https://github.com/target/data-validator/blob/master/.github/workflows/scala-steward.yaml 11 | # Check periodically for updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem 12 | -------------------------------------------------------------------------------- /.github/stale.yaml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 28 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - hold 8 | - blocked 9 | - security 10 | # Label to use when marking an issue as stale 11 | staleLabel: stale 12 | # Comment to post when marking an issue as stale. Set to `false` to disable 13 | markComment: > 14 | This issue has been automatically marked as stale because it has not had 15 | recent activity. It will be closed if no further activity occurs. Thank you 16 | for your contributions. @target/data-validator-members, please take a look. 17 | # Comment to post when closing a stale issue. Set to `false` to disable 18 | closeComment: > 19 | This issue was closed because it did not see activity within five weeks. 20 | @target/data-validator-members, please reopen and reexamine at your earliest 21 | convenience if this ticket should not be lost to the ether.
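The pull request template above asks every new check to ship with tests for configuration parsing, configuration sanity checking, variable substitution, and the check behavior itself. As a rough, hypothetical sketch only (MyNewCheck and MyNewCheckSpec are placeholder names, not part of this repository), such a spec might start from a skeleton that mirrors the existing *Spec suites listed under src/test/scala in the tree above:

// Hypothetical skeleton for a new check's tests; class and test names are placeholders.
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class MyNewCheckSpec extends AnyFunSpec with Matchers {
  describe("MyNewCheck") {
    it("parses its YAML configuration")(pending)
    it("rejects nonsensical configuration values")(pending)
    it("applies variable substitution to its arguments")(pending)
    it("flags data that violates the check")(pending)
  }
}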
22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | workflow_call: 5 | merge_group: 6 | branches: ['*'] 7 | pull_request: 8 | branches: ['*'] 9 | push: 10 | branches: ['*'] 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | 19 | jobs: 20 | build: 21 | name: Build and Test 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout current branch (full) 25 | uses: actions/checkout@v4 26 | - uses: coursier/cache-action@v6 27 | with: 28 | extraKey: ${{ runner.os }} 29 | - uses: coursier/setup-action@v1 30 | with: 31 | jvm: adopt:1.8 32 | - name: Build, test, and package project on Spark 3.5 33 | run: bin/sbt clean compile test package makePom -DsparkVersion=3.5.1 34 | - name: Build and package project on "legacy" Spark 35 | run: bin/sbt clean compile package makePom 36 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | #push: 7 | # tags: "[1-9]+.[0-9]+.[0-9]+" 8 | 9 | env: 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | 12 | jobs: 13 | deploy: 14 | name: Release 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout current branch (full) 18 | uses: actions/checkout@v4 19 | - uses: coursier/cache-action@v6 20 | with: 21 | extraKey: ${{ runner.os }} 22 | - uses: coursier/setup-action@v1 23 | with: 24 | jvm: adopt:1.8 25 | # uses sbt-github-packages, see build.sbt 26 | - name: Publish with SBT 27 | run: bin/sbt publish 28 | - name: Publish with SBT 29 | run: bin/sbt publish -DsparkVersion=3.5.1 30 | -------------------------------------------------------------------------------- /.github/workflows/scala-steward.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will launch at 00:00 every Sunday 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # At 09:00 on day-of-month 1 and 15 6 | # https://crontab.guru/#0_9_1,15_*_* 7 | - cron: '0 9 1,15 * *' 8 | 9 | name: Launch Scala Steward 10 | 11 | env: 12 | # This is required because SBT is configured to look at env:GITHUB_TOKEN or git:github.token or env:SHELL 13 | # to get a token for publishing even when not executing the publish task. Most of the time, SHELL is set 14 | # but apparently not inside GitHub Actions runners. Setting _something invalid_ satisfies the 15 | # GitHub Packages plugin safely and allows the operation to proceed. 
16 | SHELL: "/bin/bash" 17 | 18 | jobs: 19 | scala-steward: 20 | runs-on: ubuntu-latest 21 | name: Launch Scala Steward 22 | steps: 23 | - name: Install JDK for Scala Steward use 24 | uses: actions/setup-java@v4 25 | with: 26 | distribution: 'temurin' 27 | java-version: '11' 28 | - name: Launch Scala Steward 29 | uses: scala-steward-org/scala-steward-action@v2 30 | with: 31 | github-token: ${{ secrets.REPO_GITHUB_TOKEN }} 32 | author-email: "41898282+github-actions[bot]@users.noreply.github.com" 33 | author-name: "github-actions[bot]" 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generated artifacts 2 | project/target 3 | project/project/target 4 | target 5 | !src/main/scala/com/target 6 | !src/test/scala/com/target 7 | 8 | # build server stuff 9 | .bsp 10 | # autogenerated test data 11 | testData.orc 12 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.8.6 2 | runner.dialect = scala211 3 | project.git = true 4 | align.preset = none 5 | # Disabled in default since this operation is potentially 6 | # dangerous if you define your own stripMargin with different 7 | # semantics from the stdlib stripMargin. 8 | assumeStandardLibraryStripMargin = true 9 | onTestFailure = "To fix this, run 'sbt scalafmt' from the project root directory" 10 | maxColumn = 118 11 | rewrite.rules = [RedundantParens, PreferCurlyFors, SortModifiers] 12 | docstrings.style = SpaceAsterisk 13 | indent.main = 2 14 | 15 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | if OS.mac? 2 | # We need a version of a JDK older than what's kept in the main cask repository. 3 | tap "homebrew/cask-versions" 4 | # The JDK formerly known as AdoptOpenJDK, a build of OpenJDK 5 | # Use JDK8 because that's a solid base for older Hadoop clusters 6 | # Also, Spark 2.3.x essentially requires Scala 2.11.x, and that combination may necessitate Java 8. 7 | cask "homebrew/cask-versions/temurin8" 8 | end -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Data Validator 2 | 3 | ## Fixing a bug you found 4 | 5 | 1. Check existing [issues](https://github.com/target/data-validator/issues) first to see if anyone else has reported the bug 6 | 2. Report the bug if no one else has reported it. 7 | 3. Fix the bug. 8 | 4. Submit a PR. 9 | 5. Be ready for some back-and-forth between maintainers and you. 10 | 11 | ## Creating new checks 12 | 13 | 1. Check existing [issues](https://github.com/target/data-validator/issues) first to see if anyone else has reported a desire for the check. 14 | 2. If it doesn't exist (it probably won't!) then create a new issue: 15 | 1. Provide an example of how you would like the configuration for the check to look. Most of our rework requests are the result of an unclear vision for the interface to the check! 16 | 2. Clearly state if you are intending to work on it or if you are simply asking for it. If you're going to work on it, please provide a timeline for delivery. If you're just asking for it, you're done after you've submitted the request issue. 17 | 3. Work on it! 
18 | 1. Abide by the style checker requirements. 19 | 2. Include tests. Submissions without tests will not be considered. Test the following things: 20 | 1. Configuration parsing 21 | 2. Configuration sanity checking 22 | 3. Variable substitution 23 | 4. Actual check functionality 24 | 25 | ## Refactoring 26 | 27 | Follow the new checks procedure, but instead of providing a configuration example, clearly explain: 28 | 29 | 1. How the current state of things negatively affects the extensibility of Data Validator. 30 | 2. How you intend to remedy the situation with the minimum amount of code changed 31 | 32 | **Do not mix refactoring with the addition of a new check in the same pull request.** We will reject and ask that they be done in separate PRs to keep things manageable. 33 | 34 | ## Development Environment Setup 35 | 36 | Developers on **macOS** should be able to clone, run `make deps build`, and be ready for a development cycle. 37 | This assumes that [Homebrew](https://brew.sh/) is already installed, as is common for macOS developers. 38 | 39 | Developers on **Linux or Windows** will need to install a Java 8 JDK, preferably 40 | the [Temurin JDK from the Adoptium Working Group](https://adoptium.net/) of the [Eclipse Foundation](https://www.eclipse.org) 41 | or another JDK in the OpenJDK family. 42 | 43 | Run `make help` to see common tasks. Make tasks are provided for those unfamiliar with 44 | running `sbt` in console mode. 45 | Those preferring `sbt` are assumed to know what they're doing but can get a quick refresher 46 | by looking at the tasks in the `Makefile`. 47 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Target Brands, Inc. 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Target Brands, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SBT ?= bin/sbt 2 | 3 | ##@ Utilities 4 | 5 | .PHONY: help 6 | help: ## Prints help for targets with comments 7 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z0-9_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 8 | 9 | ##@ Development Setup 10 | 11 | deps-sys: Brewfile ## Installs system-wide dependencies 12 | (command -v brew > /dev/null && brew bundle) || true 13 | 14 | ##@ Development Cycle 15 | 16 | .PHONY: test 17 | test: ## Runs tests 18 | $(SBT) test 19 | 20 | .PHONY: check 21 | check: ## Runs linters and other checks 22 | $(SBT) scalastyle 23 | 24 | .PHONY: build 25 | build: 26 | $(SBT) assembly 27 | 28 | .PHONY: format-scala 29 | format-scala: ## Formats all Scala code 30 | $(SBT) scalafmt 31 | 32 | ##@ Maintenance Tasks 33 | 34 | refresh-sbt: ## Retrieve the latest version of sbt launcher 35 | curl https://raw.githubusercontent.com/paulp/sbt-extras/master/sbt > bin/sbt 36 | 37 | UNAME := $(shell uname -s) 38 | ifeq ($(UNAME), Linux) 39 | OS_INFO_CMD=lsb_release -a 2>/dev/null 40 | endif 41 | ifeq ($(UNAME), Darwin) 42 | OS_INFO_CMD=sw_vers 43 | endif 44 | 45 | ##@ Debugging 46 | 47 | doctor: ## Show important details about compilation environment 48 | java -version 49 | $(OS_INFO_CMD) 50 | bin/sbt version 51 | git log -1 HEAD --oneline 52 | 53 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | The following license applies to sbt-extras code incorporated into 2 | the bin/sbt file of this project: 3 | 4 | // Generated from http://www.opensource.org/licenses/bsd-license.php 5 | Copyright (c) 2011, Paul Phillips. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | * Neither the name of the author nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "data-validator" 2 | organization := "com.target" 3 | 4 | val sparkVersion = settingKey[String]("Spark version") 5 | 6 | sparkVersion := System.getProperty("sparkVersion", "2.3.4") 7 | 8 | scalaVersion := { 9 | if (sparkVersion.value > "3.0") { 10 | "2.12.19" 11 | } else { 12 | "2.11.12" 13 | } 14 | } 15 | 16 | val sparkValidationVersion = settingKey[String]("Version of package") 17 | 18 | sparkValidationVersion := "0.15.0" 19 | 20 | version := sparkVersion.value + "_" + sparkValidationVersion.value 21 | 22 | val circeVersion = settingKey[String]("Circe version") 23 | val circeYamlVersion = settingKey[String]("Circe YAML version") 24 | 25 | circeVersion := { 26 | if (sparkVersion.value > "3.0") { 27 | "0.14.6" 28 | } else { 29 | "0.11.2" 30 | } 31 | } 32 | 33 | circeYamlVersion := { 34 | if (sparkVersion.value > "3.0") { 35 | "0.15.1" 36 | } else { 37 | "0.10.1" 38 | } 39 | } 40 | 41 | //addDependencyTreePlugin 42 | enablePlugins(GitVersioning) 43 | git.useGitDescribe := true 44 | ThisBuild / versionScheme := Some("early-semver") 45 | 46 | ///////////// 47 | // Publishing 48 | ///////////// 49 | githubOwner := "target" 50 | githubRepository := "data-validator" 51 | // this unfortunately must be set strangely because GitHub requires a token for pulling packages 52 | // and sbt-github-packages does not allow the user to configure the resolver not to be used. 53 | // https://github.com/djspiewak/sbt-github-packages/issues/28 54 | githubTokenSource := (TokenSource.Environment("GITHUB_TOKEN") || 55 | TokenSource.GitConfig("github.token") || 56 | TokenSource.Environment("SHELL")) // it's safe to assume this exists and is not unique 57 | 58 | publishTo := githubPublishTo.value 59 | 60 | enablePlugins(BuildInfoPlugin) 61 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion) 62 | buildInfoPackage := "com.target.data_validator" 63 | 64 | libraryDependencies ++= Seq( 65 | "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5", 66 | "com.github.scopt" %% "scopt" % "4.1.0", 67 | "com.sun.mail" % "javax.mail" % "1.6.2", 68 | "com.lihaoyi" %% "scalatags" % "0.12.0", 69 | "io.circe" %% "circe-yaml" % circeYamlVersion.value, 70 | "io.circe" %% "circe-core" % circeVersion.value, 71 | "io.circe" %% "circe-generic" % circeVersion.value, 72 | "io.circe" %% "circe-parser" % circeVersion.value, 73 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided, 74 | "junit" % "junit" % "4.13.2" % Test, 75 | "org.scalatest" %% "scalatest" % "3.2.19" % Test, 76 | "com.github.sbt" % "junit-interface" % "0.13.3" % Test exclude ("junit", "junit-dep") 77 | ) 78 | 79 | Test / fork := true 80 | javaOptions ++= (if (sparkVersion.value > "3.0" && System.getenv("MODERN_JAVA") == "TRUE") { 81 | // For modern Java we need to open up a lot of config options. 82 | Seq("-Xms4048M", "-Xmx4048M", 83 | // these were added in JDK 11 and newer, apparently. 
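                 // In more detail: JDK 9 introduced the module system, and JDK 16+ denies the
                 // reflective access into JDK internals that Spark and Netty attempt, so the
                 // --add-opens flags below re-open those packages to classpath (unnamed-module)
                 // code, while io.netty.tryReflectionSetAccessible=true lets Netty attempt the
                 // setAccessible calls it needs for its direct-buffer handling.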
84 | "-Dio.netty.tryReflectionSetAccessible=true", 85 | "--add-opens=java.base/java.lang=ALL-UNNAMED", 86 | "--add-opens=java.base/java.io=ALL-UNNAMED", 87 | "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED") 88 | } else { 89 | Seq("-Xms4048M", "-Xmx4048M") 90 | }) 91 | Test / parallelExecution := false 92 | // required for unit tests, but not set in some environments 93 | Test / envVars ++= Map( 94 | "JAVA_HOME" -> 95 | Option(System.getenv("JAVA_HOME")) 96 | .getOrElse(System.getProperty("java.home")) 97 | ) 98 | 99 | assembly / mainClass := Some("com.target.data_validator.Main") 100 | 101 | assembly / assemblyShadeRules := Seq( 102 | ShadeRule.rename("shapeless.**" -> "new_shapeless.@1").inAll, 103 | ShadeRule.rename("cats.kernel.**" -> s"new_cats.kernel.@1").inAll 104 | ) 105 | 106 | // Enforces scalastyle checks 107 | val compileScalastyle = TaskKey[Unit]("compileScalastyle") 108 | scalastyleFailOnWarning := true 109 | scalastyleFailOnError := true 110 | 111 | compileScalastyle := (Compile / scalastyle).toTask("").value 112 | (Compile / compile) := ((Compile / compile) dependsOn compileScalastyle).value 113 | 114 | (Compile / run) := Defaults 115 | .runTask( 116 | Compile / fullClasspath, 117 | Compile / run / mainClass, 118 | Compile / run / runner 119 | ) 120 | .evaluated 121 | 122 | (Compile / runMain) := Defaults.runMainTask(Compile / fullClasspath, Compile / run / runner).evaluated 123 | TaskKey[Unit]("generateTestData") := { 124 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion.value 125 | (Compile / runMain).toTask(" com.target.data_validator.GenTestData").value 126 | } 127 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.1") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.10.10 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 2 | addSbtPlugin("com.github.sbt" % "sbt-git" % "2.1.0") 3 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.13.1") 4 | addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.3") 5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.4") 6 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.14.2") -------------------------------------------------------------------------------- /src/main/resources/log4j-dv-spark.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C{1}: %m%n 7 | 8 | 9 | # Settings to quiet third party logs that are too verbose 10 | log4j.logger.org.spark-project.jetty=WARN 11 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 12 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 13 | 
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 14 | log4j.logger.org.apache.parquet=ERROR 15 | log4j.logger.parquet=ERROR 16 | 17 | 18 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 19 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 20 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 21 | log4j.logger.org.apache.spark.sql=WARN 22 | 23 | log4j.logger.org.apache.spark.repl.Main=WARN 24 | log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR 25 | log4j.logger.org.apache.spark.ExecutorAllocationManager=ERROR 26 | 27 | # Logging for this application 28 | log4j.logger.com.target.data_validator=INFO 29 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/CliOptionParser.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scopt.OptionParser 4 | 5 | case class CliOptions( 6 | configFilename: String = "", 7 | verbose: Boolean = false, 8 | jsonReport: Option[String] = None, 9 | htmlReport: Option[String] = None, 10 | exitErrorOnFail: Boolean = true, 11 | vars: Map[String, String] = Map(), 12 | emailOnPass: Boolean = false 13 | ) 14 | 15 | object CliOptionParser { 16 | 17 | def parser: OptionParser[CliOptions] = new OptionParser[CliOptions]("data-validator") { 18 | head(BuildInfo.name, "v" + BuildInfo.version) 19 | 20 | version("version") 21 | 22 | opt[Unit]("verbose").action((_, c) => c.copy(verbose = true)).text("Print additional debug output.") 23 | 24 | opt[String]("config") 25 | .action((fn, c) => c.copy(configFilename = fn)) 26 | .text( 27 | "required validator config .yaml filename, " + 28 | "prefix w/ 'classpath:' to load configuration from JVM classpath/resources, " + 29 | "ex. '--config classpath:/config.yaml'" 30 | ) 31 | 32 | opt[String]("jsonReport").action((fn, c) => c.copy(jsonReport = Some(fn))).text("optional JSON report filename") 33 | 34 | opt[String]("htmlReport").action((fn, c) => c.copy(htmlReport = Some(fn))).text("optional HTML report filename") 35 | 36 | opt[Map[String, String]]("vars") 37 | .valueName("k1=v1,k2=v2...") 38 | .action((x, c) => c.copy(vars = x)) 39 | .text("other arguments") 40 | 41 | opt[Boolean]("exitErrorOnFail") 42 | .valueName("true|false") 43 | .action((x, c) => c.copy(exitErrorOnFail = x)) 44 | .text( 45 | "optional when true, if validator fails, call System.exit(-1) " + 46 | "Defaults to True, but will change to False in future version." 47 | ) 48 | 49 | opt[Boolean]("emailOnPass") 50 | .valueName("true|false") 51 | .action((x, c) => c.copy(emailOnPass = x)) 52 | .text("optional when true, sends email on validation success. 
Default: false") 53 | 54 | help("help").text("Show this help message and exit.") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ConfigParser.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import cats.syntax.either._ 4 | import cats.syntax.functor._ 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe._ 7 | import io.circe.generic.auto._ 8 | import io.circe.yaml.parser 9 | 10 | import scala.io.{BufferedSource, Source} 11 | import scala.util.{Failure, Success, Try} 12 | 13 | object ConfigParser extends LazyLogging { 14 | 15 | // IntelliJ Says this import isn't needed, but it won't compile without it. 16 | import validator.JsonDecoders._ 17 | import ConfigVar._ 18 | 19 | implicit val decodeTable: Decoder[ValidatorTable] = 20 | List[Decoder[ValidatorTable]]( 21 | Decoder[ValidatorHiveTable].widen, 22 | Decoder[ValidatorOrcFile].widen, 23 | Decoder[ValidatorParquetFile].widen, 24 | Decoder[ValidatorSpecifiedFormatLoader].widen 25 | ).reduceLeft(_ or _) 26 | 27 | def configFromJson(json: Json): Either[DecodingFailure, ValidatorConfig] = { 28 | logger.debug(s"Json config: $json") 29 | json.as[ValidatorConfig] 30 | } 31 | 32 | private def bufferContentsAsString(buffer: BufferedSource): String = { 33 | val contents = buffer.mkString 34 | buffer.close() 35 | contents 36 | } 37 | 38 | private def loadFromFile(filename: String): String = { 39 | logger.info(s"Attempting to load `$filename` from file system") 40 | val buffer = Source.fromFile(filename) 41 | bufferContentsAsString(buffer) 42 | } 43 | 44 | private def loadFromClasspath(filename: String): String = { 45 | logger.info(s"Attempting to load `$filename` from classpath") 46 | val is = getClass.getResourceAsStream(filename) 47 | val buffer = Source.fromInputStream(is) 48 | bufferContentsAsString(buffer) 49 | } 50 | 51 | def parseFile(filename: String, cliMap: Map[String, String]): Either[Error, ValidatorConfig] = { 52 | logger.info(s"Parsing `$filename`") 53 | 54 | Try { 55 | if (filename.startsWith("classpath:")) { 56 | loadFromClasspath(filename.stripPrefix("classpath:")) 57 | } else { 58 | loadFromFile(filename) 59 | } 60 | } match { 61 | case Success(contents) => parse(contents) 62 | case Failure(thr) => Left[Error, ValidatorConfig](DecodingFailure.fromThrowable(thr, List.empty)) 63 | } 64 | } 65 | 66 | def parse(conf: String): Either[Error, ValidatorConfig] = parser.parse(conf).flatMap(configFromJson) 67 | 68 | def main(args: Array[String]): Unit = { 69 | logger.info(s"Args[${args.length}]: $args") 70 | val filename = args(0) 71 | var error = false 72 | 73 | parseFile(filename, Map.empty) match { 74 | case Left(pe) => logger.error(s"Failed to parse $filename, ${pe.getMessage}"); error = true 75 | case Right(config) => logger.info(s"Config: $config") 76 | } 77 | 78 | System.exit(if (error) 1 else 0) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ConfigVar.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import cats.syntax.functor._ 4 | import com.target.data_validator.EnvironmentVariables.{Error, Inaccessible, Present, Unset} 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{Decoder, Json} 7 | import io.circe.generic.auto._ 8 | import 
org.apache.spark.sql.SparkSession 9 | 10 | import scala.sys.process.{Process, ProcessLogger} 11 | import scala.util.{Failure, Success, Try} 12 | 13 | sealed trait ConfigVar extends EventLog with Substitutable { 14 | def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean 15 | } 16 | 17 | case class NameValue(name: String, value: Json) extends ConfigVar { 18 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 19 | logger.debug(s"name: $name value: ${value.noSpaces}") 20 | varSub.add(name, getVarSubJson(value, name, varSub)) 21 | } 22 | } 23 | 24 | case class NameEnv(name: String, env: String) extends ConfigVar { 25 | 26 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 27 | val newEnv = getVarSub(env, name, varSub) 28 | EnvironmentVariables.get(newEnv) match { 29 | case Inaccessible(message) => logger.error(message); true 30 | case Error(message) => logger.error(message); true 31 | case Unset => { 32 | val msg = s"Variable '$name' cannot be processed env variable '$newEnv' not found!" 33 | logger.error(msg) 34 | addEvent(ValidatorError(msg)) 35 | true 36 | } 37 | case Present(value) => { 38 | val resolvedEnvVar = getVarSubJson(JsonUtils.string2Json(value), name, varSub) 39 | logger.info(s"name: $name env: $env getEnv: $value resolvedEnvVar: $resolvedEnvVar") 40 | varSub.add(name, resolvedEnvVar) 41 | } 42 | } 43 | } 44 | } 45 | 46 | case class NameShell(name: String, shell: String) extends ConfigVar { 47 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 48 | val newShell = getVarSub(shell, "shell", varSub) 49 | val timer = new ValidatorTimer(s"NameShell($name, $newShell)") 50 | val out = new StringBuilder 51 | val err = new StringBuilder 52 | val processLogger = ProcessLogger(out append _, err append _) 53 | addEvent(timer) 54 | timer.time { 55 | Try(Process(Seq("/bin/sh", "-c", newShell)) ! 
processLogger) match { 56 | case Failure(exception) => 57 | validatorError(s"NameShell($name, $newShell) Failed with exception $exception"); true 58 | case Success(exitCode) if exitCode != 0 => 59 | validatorError( 60 | s"NameShell($name, $newShell) Ran but returned exitCode: $exitCode stderr: ${err.toString()}" 61 | ) 62 | true 63 | case Success(0) if out.isEmpty => 64 | validatorError(s"NameShell($name, $newShell) Ran but returned No output") 65 | true 66 | case Success(0) => 67 | val value = out.toString.split("\n").head 68 | logger.debug(s"name: $name shell: $newShell output: $value") 69 | varSub.add(name, getVarSubJson(JsonUtils.string2Json(value), name, varSub)); false 70 | } 71 | } 72 | } 73 | } 74 | 75 | case class NameSql(name: String, sql: String) extends ConfigVar { 76 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 77 | val timer = new ValidatorTimer(s"NameSql($name, $sql)") 78 | addEvent(timer) 79 | timer.time { 80 | Try(spark.sql(getVarSub(sql, name, varSub)).head(1)) match { 81 | case Failure(exception) => 82 | validatorError(s"NameSql($name, $sql) Failed with exception $exception") 83 | true 84 | case Success(rows) if rows.isEmpty => 85 | validatorError(s"NameSql($name, $sql) Returned 0 rows.") 86 | true 87 | case Success(rows) => 88 | val json = JsonUtils.row2Json(rows.head, 0) 89 | logger.debug(s"name: $name sql: $sql result: ${rows.head} json: ${JsonUtils.debugJson(json)}") 90 | varSub.add(name, json) 91 | } 92 | } 93 | } 94 | } 95 | 96 | object ConfigVar extends LazyLogging { 97 | 98 | implicit val decodeConfigVar: Decoder[ConfigVar] = List[Decoder[ConfigVar]]( 99 | Decoder[NameValue].widen, 100 | Decoder[NameShell].widen, 101 | Decoder[NameEnv].widen, 102 | Decoder[NameSql].widen 103 | ).reduceLeft(_ or _) 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Emailer.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.{Date, Properties} 4 | 5 | import com.typesafe.scalalogging.LazyLogging 6 | import javax.mail._ 7 | import javax.mail.internet._ 8 | 9 | import scala.util.Try 10 | 11 | case class EmailConfig( 12 | smtpHost: String, 13 | subject: String, 14 | from: String, 15 | to: List[String], 16 | cc: Option[List[String]] = None, 17 | bcc: Option[List[String]] = None 18 | ) extends EventLog 19 | with Substitutable { 20 | def substituteVariables(dict: VarSubstitution): EmailConfig = { 21 | EmailConfig( 22 | getVarSub(smtpHost, "smtpHost", dict), 23 | getVarSub(subject, "email.subject", dict), 24 | getVarSub(from, "email.from", dict), 25 | to.map(getVarSub(_, "email.to", dict)), 26 | cc.map(_.map(getVarSub(_, "email.cc", dict))), 27 | bcc.map(_.map(getVarSub(_, "email.bcc", dict))) 28 | ) 29 | } 30 | } 31 | 32 | object Emailer extends LazyLogging { 33 | 34 | def createMessage(smtpHost: String): Message = { 35 | val properties = new Properties() 36 | properties.put("mail.smtp.host", smtpHost) 37 | val session = Session.getInstance(properties) 38 | new MimeMessage(session) 39 | } 40 | 41 | def setMessageRecipients(message: Message, recipients: Seq[String], recipientType: Message.RecipientType): Int = { 42 | val parsedAddresses = recipients.map(x => Try(InternetAddress.parse(x))) 43 | 44 | val (goodParsed, badParsed) = parsedAddresses.partition(_.isSuccess) 45 | 46 | badParsed.foreach(x => logger.error(s"EmailAddress from $recipientType threw exception 
$x")) 47 | 48 | val goodAddresses: Array[Address] = goodParsed.flatMap(_.get.toSeq).seq.toArray 49 | 50 | if (!goodAddresses.isEmpty) { 51 | message.setRecipients(recipientType, goodAddresses) 52 | } 53 | 54 | goodAddresses.length 55 | } 56 | 57 | def setFrom(message: Message, from: String): Boolean = { 58 | try { 59 | val frm = InternetAddress.parse(from, true) 60 | message.setFrom(frm.head) 61 | false 62 | } catch { 63 | case ae: AddressException => 64 | logger.error(s"setFrom InternetAddress parse failed, $ae") 65 | true 66 | case me: MessagingException => 67 | logger.error(s"setFrom failed, $me") 68 | true 69 | } 70 | } 71 | 72 | def createEmptyMessage( 73 | smtpHost: String, 74 | subject: String, 75 | from: String, 76 | to: Seq[String], 77 | cc: Seq[String], 78 | bcc: Seq[String] 79 | ): Option[Message] = { 80 | 81 | logger.debug(s"Creating Message frm: $from to: ${to.mkString(", ")} sub: $subject") 82 | val message = createMessage(smtpHost) 83 | 84 | val validRecipients = setMessageRecipients(message, cc, Message.RecipientType.CC) + 85 | setMessageRecipients(message, bcc, Message.RecipientType.BCC) + 86 | setMessageRecipients(message, to, Message.RecipientType.TO) 87 | if (validRecipients == 0) { 88 | logger.error("Must specify at least 1 valid email address in TO, CC, or BCC") 89 | None 90 | } else if (setFrom(message, from)) { 91 | logger.error(s"setFrom($from) failed!") 92 | None 93 | } else { 94 | message.setSentDate(new Date()) 95 | message.setSubject(subject) 96 | Some(message) 97 | } 98 | } 99 | 100 | def createEmptyMessage(ec: EmailConfig): Option[Message] = 101 | createEmptyMessage( 102 | ec.smtpHost, 103 | ec.subject, 104 | ec.from, 105 | ec.to, 106 | ec.cc.getOrElse(Seq.empty), 107 | ec.bcc.getOrElse(Seq.empty) 108 | ) 109 | 110 | def sendMessage(message: Message, body: String, mime: String): Boolean = { 111 | message.setContent(body, mime) 112 | val id = message.hashCode().toHexString 113 | try { 114 | logger.info(s"Sending email #$id [${message.getSubject}] to [${message.getAllRecipients.mkString(", ")}]") 115 | Transport.send(message) 116 | logger.info(s"Email #$id sent successfully to all recipients.") 117 | false 118 | } catch { 119 | case sfe: SendFailedException => 120 | handleSendFailedException(id, sfe) 121 | true 122 | case me: MessagingException => 123 | logger.error(s"Failure to send email #$id: $me") 124 | true 125 | } 126 | } 127 | 128 | private def handleSendFailedException(id: String, sfe: SendFailedException): Unit = { 129 | logger.warn(s"Failure to send email #$id: ${sfe.getMessage}") 130 | Option(sfe.getValidSentAddresses) match { 131 | case Some(addresses) => logger.warn(s"Email #$id was sent to [${addresses.mkString(", ")}]") 132 | case None => logger.info("No emails were sent successfully.") 133 | } 134 | Option(sfe.getValidUnsentAddresses) match { 135 | case Some(addresses) => logger.error(s"Email #$id was not sent to [${addresses.mkString(", ")}]") 136 | case None => 137 | } 138 | Option(sfe.getInvalidAddresses) match { 139 | case Some(addresses) => logger.error(s"Email #$id has invalid addresses: [${addresses.mkString(", ")}]") 140 | case None => 141 | } 142 | } 143 | 144 | def sendTextMessage( 145 | smtpHost: String, 146 | body: String, 147 | subject: String, 148 | from: String, 149 | to: Seq[String], 150 | cc: Seq[String] = Nil, 151 | bcc: Seq[String] = Nil 152 | ): Boolean = 153 | createEmptyMessage(smtpHost, subject, from, to, cc, bcc) match { 154 | case None => 155 | logger.error("createMessage failed!") 156 | true 157 | case Some(message) => 
158 | sendMessage(message, body, "text/plain; charset=us-ascii") 159 | } 160 | 161 | def sendTextMessage(emailConfig: EmailConfig, body: String): Boolean = { 162 | createEmptyMessage(emailConfig) match { 163 | case None => 164 | logger.error("createMessage failed!") 165 | true 166 | case Some(message) => 167 | sendMessage(message, body, "text/plain; charset=us-ascii") 168 | } 169 | } 170 | 171 | def sendHtmlMessage( 172 | smtpHost: String, 173 | body: String, 174 | subject: String, 175 | from: String, 176 | to: Seq[String], 177 | cc: Seq[String] = Nil, 178 | bcc: Seq[String] = Nil 179 | ): Boolean = 180 | createEmptyMessage(smtpHost, subject, from, to, cc, bcc) match { 181 | case None => 182 | logger.error("createMessage failed!") 183 | true 184 | case Some(message) => 185 | sendMessage(message, body, "text/html") 186 | } 187 | 188 | def sendHtmlMessage(config: EmailConfig, body: String): Boolean = { 189 | createEmptyMessage(config) match { 190 | case None => 191 | logger.error("createMessage failed!") 192 | true 193 | case Some(message) => 194 | sendMessage(message, body, "text/html") 195 | } 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/EnvironmentVariables.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scala.collection.mutable 4 | import scala.util.Try 5 | 6 | object EnvironmentVariables { 7 | type MaybeEnvVar = Try[Option[String]] 8 | 9 | val accessedEnvVars: mutable.Map[String, MaybeEnvVar] = mutable.Map.empty 10 | 11 | def get(key: String): EnvVarResult = { 12 | getWithHandlers(key)( 13 | whenError = { case throwable: Throwable => Inaccessible(throwable) }, 14 | whenUnset = { Unset }, 15 | whenPresent = { Present } 16 | ).recover { case throwable: Throwable => Error(throwable) }.get 17 | } 18 | 19 | def getWithHandlers[T](key: String)( 20 | whenError: PartialFunction[Throwable, T], 21 | whenUnset: => T, 22 | whenPresent: String => T 23 | ): Try[T] = { 24 | tryGet(key) 25 | .map(_.map(whenPresent).getOrElse(whenUnset)) 26 | .recover(whenError) 27 | } 28 | 29 | def tryGet(key: String): MaybeEnvVar = { 30 | val result = Try(System.getenv(key)).map(Option(_)) 31 | accessedEnvVars += key -> result 32 | result 33 | } 34 | 35 | sealed trait EnvVarResult { 36 | def toString: String 37 | } 38 | case class Present(value: String) extends EnvVarResult { 39 | override def toString: String = value 40 | } 41 | case class Inaccessible(message: String) extends EnvVarResult { 42 | override val toString: String = s"" 43 | } 44 | object Inaccessible { 45 | def apply(throwable: Throwable): Inaccessible = Inaccessible(throwable.getMessage) 46 | } 47 | case object Unset extends EnvVarResult { 48 | override val toString: String = "" 49 | } 50 | case class Error(message: String) extends EnvVarResult { 51 | override val toString: String = s"" 52 | } 53 | object Error { 54 | def apply(throwable: Throwable): Error = Error(throwable.getMessage) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/EventGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | trait EventGenerator { 4 | def addEvent(ve: ValidatorEvent): Unit 5 | } 6 | -------------------------------------------------------------------------------- 
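The EnvironmentVariables helper above wraps every lookup in a small result type and records each access in accessedEnvVars. A minimal usage sketch, assuming only what that object defines (the HOSTNAME key and the EnvVarExample name are placeholders, not part of the repository):

import com.target.data_validator.EnvironmentVariables
import com.target.data_validator.EnvironmentVariables.{Error, Inaccessible, Present, Unset}

object EnvVarExample {
  def main(args: Array[String]): Unit = {
    // get() never throws: lookup failures come back as Inaccessible or Error values.
    EnvironmentVariables.get("HOSTNAME") match {
      case Present(value)        => println(s"HOSTNAME=$value")
      case Unset                 => println("HOSTNAME is not set")
      case Inaccessible(message) => println(s"environment not readable: $message")
      case Error(message)        => println(s"unexpected failure: $message")
    }
    // Every key that was ever looked up is recorded, whether or not the lookup succeeded.
    println(EnvironmentVariables.accessedEnvVars.keys.mkString(", "))
  }
}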
/src/main/scala/com/target/data_validator/EventLog.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | trait EventLog extends EventGenerator with LazyLogging { 8 | def addEvent(ve: ValidatorEvent): Unit = EventLog.events.append(ve) 9 | 10 | def validatorError(msg: String): Unit = { 11 | logger.error(msg) 12 | addEvent(ValidatorError(msg)) 13 | } 14 | } 15 | 16 | object EventLog extends LazyLogging { 17 | val events = new ListBuffer[ValidatorEvent] 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ExpressionUtils.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.apache.spark.sql.catalyst.expressions.{Expression, Or} 4 | 5 | object ExpressionUtils { 6 | 7 | /** Takes a List[Expression] and joins them together into on big Or() expression. 8 | * @param exprs 9 | * \- Non Empty List of Expressions. 10 | * @return 11 | * Or of all Expressions. throws IllegalArgumentException if exprs is empty. 12 | */ 13 | @throws[IllegalArgumentException] 14 | def orFromList(exprs: List[Expression]): Expression = exprs match { 15 | case exp :: Nil => exp 16 | case lhs :: rhs :: Nil => Or(lhs, rhs) 17 | case lhs :: rhs :: rest => rest.foldRight(Or(lhs, rhs))(Or(_, _)) 18 | case Nil => throw new IllegalArgumentException("exprs must be nonEmpty") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/GenTestData.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object GenTestData { 7 | 8 | val schema = StructType( 9 | List( 10 | StructField("id", IntegerType), 11 | StructField("label", StringType), 12 | StructField("div7", StringType, nullable = true) 13 | ) 14 | ) 15 | 16 | val label: Vector[String] = Vector("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine") 17 | 18 | def mkLabel(x: Int): List[String] = { 19 | if (x == 0) { 20 | Nil 21 | } else { 22 | val y = x % 10 23 | label(y) :: mkLabel(x / 10) 24 | } 25 | } 26 | 27 | def genData(spark: SparkSession): DataFrame = { 28 | val rg = spark.sparkContext.parallelize(Range(0, 100)) // scalastyle:off magic.number 29 | spark.createDataFrame( 30 | rg.map(x => Row(x, mkLabel(x).reverse.mkString(" "), if (x % 7 == 0) null else "NotNull") // scalastyle:off null 31 | ), 32 | schema 33 | ) 34 | } 35 | 36 | def main(args: Array[String]): Unit = { 37 | val spark = SparkSession.builder 38 | .appName("genTestData") 39 | .master(args.headOption.getOrElse("local")) 40 | .getOrCreate() 41 | 42 | spark.sparkContext.setLogLevel("WARN") // Spark is very noisy. 
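    // The frame built by genData is coalesced to a single partition below, so the write
    // produces one ORC part file; the resulting testData.orc output is the autogenerated
    // test data entry ignored in .gitignore.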
43 | 44 | try { 45 | val df = genData(spark).coalesce(1) 46 | df.write.orc("testData.orc") 47 | } finally { 48 | spark.stop() 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/HTMLBits.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scalatags.Text.all._ 4 | 5 | /** Place for various HTMLBits that are used in generating HTML report. 6 | */ 7 | object HTMLBits { 8 | def pass: Tag = span(backgroundColor := "mediumseagreen")("PASS") 9 | def fail: Tag = span(backgroundColor := "tomato")("FAIL") 10 | 11 | def status(failed: Boolean): Tag = if (failed) { fail } 12 | else { pass } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/JsonEncoders.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.data_validator.validator._ 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe._ 6 | import io.circe.syntax._ 7 | 8 | object JsonEncoders extends LazyLogging { 9 | 10 | // Used by ValidatorQuickCheckError to make sure Json types are right. 11 | private def any2json(a: Any): Json = a match { 12 | case i: Int => i.asJson 13 | case l: Long => l.asJson 14 | case f: Float => f.asJson 15 | case d: Double => d.asJson 16 | case s: String => Json.fromString(s) 17 | case b: Boolean => b.asJson 18 | case a: Any => 19 | logger.warn(s"Unknown type `${a.getClass.getCanonicalName}` defaulting to string.") 20 | Json.fromString(a.toString) 21 | } 22 | 23 | // scalastyle:off cyclomatic.complexity 24 | implicit val eventEncoder: Encoder[ValidatorEvent] = new Encoder[ValidatorEvent] { 25 | override def apply(a: ValidatorEvent): Json = a match { 26 | case vc: ValidatorCounter => 27 | Json.obj( 28 | ("type", Json.fromString("counter")), 29 | ("name", Json.fromString(vc.name)), 30 | ("value", Json.fromLong(vc.value)) 31 | ) 32 | case vg: ValidatorGood => 33 | Json.obj( 34 | ("type", Json.fromString("good")), 35 | ("msg", Json.fromString(vg.msg)) 36 | ) 37 | case ve: ValidatorError => 38 | Json.obj( 39 | ("type", Json.fromString("error")), 40 | ("failed", Json.fromBoolean(ve.failed)), 41 | ("msg", Json.fromString(ve.msg)) 42 | ) 43 | case vt: ValidatorTimer => 44 | Json.obj( 45 | ("type", Json.fromString("timer")), 46 | ("label", Json.fromString(vt.label)), 47 | ("ns", Json.fromLong(vt.duration)) 48 | ) 49 | case vce: ValidatorCheckEvent => 50 | Json.obj( 51 | ("type", Json.fromString("checkEvent")), 52 | ("failed", Json.fromBoolean(vce.failed)), 53 | ("label", Json.fromString(vce.label)), 54 | ("count", Json.fromLong(vce.count)), 55 | ("errorCount", Json.fromLong(vce.errorCount)) 56 | ) 57 | case cbvce: ColumnBasedValidatorCheckEvent => 58 | Json.obj( 59 | ("type", Json.fromString("columnBasedCheckEvent")), 60 | ("failed", Json.fromBoolean(cbvce.failed)), 61 | ("message", Json.fromString(cbvce.msg)), 62 | ("data", Json.fromFields(cbvce.data.map(x => (x._1, Json.fromString(x._2))))) 63 | ) 64 | case qce: ValidatorQuickCheckError => 65 | Json.obj( 66 | ("type", Json.fromString("quickCheckError")), 67 | ("failed", Json.fromBoolean(qce.failed)), 68 | ("message", Json.fromString(qce.message)), 69 | ("key", Json.fromFields(qce.key.map(x => (x._1, any2json(x._2))))) 70 | ) 71 | case vs: VarSubEvent => 72 | Json.obj( 73 | ("type", 
Json.fromString("variableSubstitution")), 74 | ("src", Json.fromString(vs.src)), 75 | ("dest", Json.fromString(vs.dest)) 76 | ) 77 | case vs: VarSubJsonEvent => 78 | Json.obj( 79 | ("type", Json.fromString("variableSubstitution")), 80 | ("src", Json.fromString(vs.src)), 81 | ("dest", vs.dest) 82 | ) 83 | case vj: JsonEvent => vj.json 84 | } 85 | } 86 | // scalastyle:on cyclomatic.complexity 87 | 88 | implicit val baseEncoder: Encoder[ValidatorBase] = new Encoder[ValidatorBase] { 89 | final def apply(a: ValidatorBase): Json = a.toJson 90 | } 91 | 92 | implicit val tableEncoder: Encoder[ValidatorTable] = new Encoder[ValidatorTable] { 93 | final override def apply(a: ValidatorTable): Json = a match { 94 | case vh: ValidatorHiveTable => 95 | Json.obj( 96 | ("db", Json.fromString(vh.db)), 97 | ("table", Json.fromString(vh.table)), 98 | ("failed", vh.failed.asJson), 99 | ("keyColumns", vh.keyColumns.asJson), 100 | ("checks", vh.checks.asJson), 101 | ("events", vh.getEvents.asJson) 102 | ) 103 | case vo: ValidatorOrcFile => 104 | Json.obj( 105 | ("orcFile", Json.fromString(vo.orcFile)), 106 | ("failed", vo.failed.asJson), 107 | ("keyColumns", vo.keyColumns.asJson), 108 | ("checks", vo.checks.asJson), 109 | ("events", vo.getEvents.asJson) 110 | ) 111 | case vp: ValidatorParquetFile => 112 | Json.obj( 113 | ("parquetFile", Json.fromString(vp.parquetFile)), 114 | ("failed", vp.failed.asJson), 115 | ("keyColumns", vp.keyColumns.asJson), 116 | ("checks", vp.checks.asJson), 117 | ("events", vp.getEvents.asJson) 118 | ) 119 | case vdf: ValidatorDataFrame => 120 | Json.obj( 121 | ("dfLabel", vdf.label.asJson), 122 | ("failed", vdf.failed.asJson), 123 | ("keyColumns", vdf.keyColumns.asJson), 124 | ("checks", vdf.checks.asJson), 125 | ("events", vdf.getEvents.asJson) 126 | ) 127 | case vcf: ValidatorSpecifiedFormatLoader => 128 | Json.obj( 129 | ("format", Json.fromString(vcf.format)), 130 | ("options", vcf.options.asJson), 131 | ("loadData", vcf.loadData.asJson), 132 | ("failed", vcf.failed.asJson), 133 | ("keyColumns", vcf.keyColumns.asJson), 134 | ("checks", vcf.checks.asJson), 135 | ("events", vcf.getEvents.asJson) 136 | ) 137 | } 138 | } 139 | 140 | implicit val configVarEncoder: Encoder[ConfigVar] = new Encoder[ConfigVar] { 141 | override def apply(a: ConfigVar): Json = a match { 142 | case nv: NameValue => 143 | Json.obj( 144 | ("name", Json.fromString(nv.name)), 145 | ("value", nv.value) 146 | ) 147 | case ne: NameEnv => 148 | Json.obj( 149 | ("name", Json.fromString(ne.name)), 150 | ("env", Json.fromString(ne.env)) 151 | ) 152 | case nshell: NameShell => 153 | Json.obj( 154 | ("name", Json.fromString(nshell.name)), 155 | ("shell", Json.fromString(nshell.shell)) 156 | ) 157 | case nsql: NameSql => 158 | Json.obj( 159 | ("name", Json.fromString(nsql.name)), 160 | ("shell", Json.fromString(nsql.sql)) 161 | ) 162 | case x => 163 | logger.error(s"Unknown configVar type: $x") 164 | throw new RuntimeException(s"Unknown configVar type: $x") 165 | } 166 | } 167 | 168 | implicit val configOutputEncoder: Encoder[ValidatorOutput] = new Encoder[ValidatorOutput] { 169 | override def apply(a: ValidatorOutput): Json = a match { 170 | 171 | case file: FileOutput => 172 | Json.obj( 173 | ("filename", Json.fromString(file.filename)), 174 | ("append", Json.fromBoolean(file.append.getOrElse(false))) 175 | ) 176 | case pipe: PipeOutput => 177 | Json.obj( 178 | ("pipe", Json.fromString(pipe.pipe)), 179 | ("ignoreError", Json.fromBoolean(pipe.ignoreError.getOrElse(false))) 180 | ) 181 | case x => 182 | 
logger.error(s"Unknown output type: $x") 183 | throw new RuntimeException(s"Unknown output type: $x") 184 | } 185 | } 186 | 187 | } 188 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.{parser, Json} 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types._ 7 | 8 | object JsonUtils extends LazyLogging { 9 | 10 | def string2Json(v: String): Json = parser.parse(v) match { 11 | case Right(b) => b 12 | case Left(_) => Json.fromString(v) 13 | } 14 | 15 | // scalastyle:off cyclomatic.complexity 16 | def debugJson(j: Json): String = j match { 17 | case _ if j.isNull => s"Json NULL" 18 | case _ if j.isNumber => s"Json NUM: ${j.asNumber.get}" 19 | case _ if j.isArray => s"Json ARR: ${j.noSpaces}" 20 | case _ if j.isBoolean => s"Json BOOLEAN: ${j.asBoolean.get}" 21 | case _ if j.isObject => s"Json OBJECT: ${j.noSpaces}" 22 | case _ if j.asString.isDefined => s"Json STRING: ${j.asString.get}" 23 | case _ => s"Json UNKNOWN[${j.getClass.getSimpleName}]: ${j.noSpaces}" 24 | } 25 | 26 | /** Turn Row into JSon 27 | * @return 28 | * Json Object 29 | */ 30 | def row2Json(row: Row): Json = { 31 | val fields = row.schema.fieldNames.zipWithIndex.map { case (fieldName, idx) => 32 | (fieldName, row2Json(row, idx)) 33 | } 34 | Json.obj(fields: _*) 35 | } 36 | 37 | /** Take Row, and turn col into Json. 38 | * @return 39 | * Json 40 | */ 41 | def row2Json(row: Row, col: Int): Json = { 42 | val dataType = row.schema(col).dataType 43 | dataType match { 44 | case StringType => Json.fromString(row.getString(col)) 45 | case LongType => Json.fromLong(row.getLong(col)) 46 | case IntegerType => Json.fromInt(row.getInt(col)) 47 | case NullType => Json.Null 48 | case BooleanType => Json.fromBoolean(row.getBoolean(col)) 49 | case DoubleType => Json.fromDoubleOrNull(row.getDouble(col)) 50 | case _: StructType => row2Json(row.getStruct(col)) 51 | case _ => 52 | logger.error( 53 | s"Unimplemented dataType '${dataType.typeName}' in column: ${row.schema(col).name} " + 54 | "Please report this as a bug." 
55 | ) 56 | Json.Null 57 | } 58 | } 59 | // scalastyle:on cyclomatic.complexity 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Main.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.Properties 4 | 5 | import com.typesafe.scalalogging.LazyLogging 6 | import org.apache.log4j.{Level, Logger, PropertyConfigurator} 7 | import org.apache.spark.sql.SparkSession 8 | import scopt.OptionParser 9 | 10 | object Main extends LazyLogging with EventLog { 11 | 12 | def loadConfigRun(mainConfig: CliOptions): (Boolean, Boolean) = 13 | ConfigParser.parseFile(mainConfig.configFilename, mainConfig.vars) match { 14 | case Left(error) => 15 | logger.error(s"Failed to parse config file '${mainConfig.configFilename}, $error") 16 | (true, false) 17 | case Right(validatorConfig) => runChecks(mainConfig, validatorConfig) 18 | } 19 | 20 | def resolveVariables( 21 | spark: SparkSession, 22 | mainConfig: CliOptions, 23 | config: ValidatorConfig, 24 | varSub: VarSubstitution 25 | ): Option[ValidatorConfig] = { 26 | varSub.addMap(mainConfig.vars) 27 | 28 | config.vars match { 29 | case None => config.substituteVariables(varSub) 30 | case Some(vars) => 31 | if (vars.map(_.addEntry(spark, varSub)).exists(x => x)) { 32 | validatorError("Failed to resolve config variables") 33 | None 34 | } else { 35 | config.substituteVariables(varSub) 36 | } 37 | } 38 | } 39 | 40 | private def checkFile(spark: SparkSession, filename: Option[String], append: Boolean): Boolean = { 41 | logger.info(s"filename: $filename append: $append") 42 | if (filename.isDefined) { 43 | logger.info(s"CheckFile $filename") 44 | val ret = filename.exists(!IO.canAppendOrCreate(_, append)(spark)) 45 | logger.info(s"Checking file '${filename.get} append: $append failed: $ret") 46 | if (ret) { 47 | logger.error(s"Filename: ${filename.get} error!") 48 | } 49 | ret 50 | } else { 51 | false 52 | } 53 | } 54 | 55 | def checkCliOutputs(spark: SparkSession, mainConfig: CliOptions): Boolean = { 56 | logger.info(s"Checking Cli Outputs htmlReport: ${mainConfig.htmlReport} jsonReport: ${mainConfig.jsonReport}") 57 | checkFile(spark, mainConfig.htmlReport, append = false) || 58 | checkFile(spark, mainConfig.jsonReport, append = true) 59 | } 60 | 61 | def checkConfig( 62 | spark: SparkSession, 63 | mainConfig: CliOptions, 64 | config: ValidatorConfig, 65 | varSub: VarSubstitution 66 | ): Boolean = checkCliOutputs(spark, mainConfig) || config.configCheck(spark, varSub) 67 | 68 | def runSparkChecks( 69 | spark: SparkSession, 70 | mainConfig: CliOptions, 71 | config: ValidatorConfig, 72 | varSub: VarSubstitution 73 | ): Boolean = { 74 | logger.info("Running sparkChecks") 75 | Seq(config.quickChecks(spark, varSub), config.costlyChecks(spark, varSub)).exists(x => x) 76 | } 77 | 78 | /* 79 | * There are 2 types of errors we return (fatal, validator_status) 80 | * If fatal, we need to System.exit(1) 81 | * Otherwise we print a message `VALIDATOR_STATUS=PASS|FAIL 82 | */ 83 | def runChecks(mainConfig: CliOptions, origConfig: ValidatorConfig): (Boolean, Boolean) = { 84 | val varSub = new VarSubstitution 85 | 86 | implicit val spark: SparkSession = 87 | SparkSession.builder.appName("data-validator").enableHiveSupport().getOrCreate() 88 | 89 | if (mainConfig.verbose) { 90 | logger.info("Verbose Flag detected") 91 | logger.info(s"Original config: $origConfig") 92 | 
Logger.getRootLogger.setLevel(Level.DEBUG) 93 | } 94 | 95 | // Resolve config 96 | val (fatal, validator_fail) = resolveVariables(spark, mainConfig, origConfig, varSub) 97 | .map { config => 98 | val fatal = config.failed || checkConfig(spark, mainConfig, config, varSub) 99 | if (fatal) { 100 | (fatal, false) 101 | } else { 102 | // Result is true in case of validation failure, otherwise false. 103 | val validatorFail = runSparkChecks(spark, mainConfig, config, varSub) 104 | 105 | if (validatorFail || mainConfig.emailOnPass) { 106 | Reports.emailReport(mainConfig, config, varSub) 107 | } 108 | Reports.jsonReport(mainConfig, config, varSub) 109 | 110 | (fatal, validatorFail) 111 | } 112 | } 113 | .getOrElse((true, false)) 114 | spark.stop() 115 | 116 | (fatal, validator_fail) 117 | } 118 | 119 | def configLogging(): Unit = { 120 | val props = new Properties() 121 | props.load(getClass.getResourceAsStream("/log4j-dv-spark.properties")) 122 | // props.list(System.err) 123 | PropertyConfigurator.configure(props) 124 | logger.info("Logging configured!") 125 | } 126 | 127 | def main(args: Array[String]): Unit = { 128 | configLogging() 129 | 130 | val parser = CliOptionParser.parser 131 | 132 | logger.info("Data Validator") 133 | 134 | parser.parse(args, CliOptions()) match { 135 | case Some(cliConfig: CliOptions) => 136 | val (fatal, validatorFail) = loadConfigRun(cliConfig) 137 | 138 | if (fatal || validatorFail) { 139 | logger.error("data-validator failed!") 140 | println("DATA_VALIDATOR_STATUS=FAIL") // scalastyle:ignore 141 | } else { 142 | logger.info("data-validator success!") 143 | println("DATA_VALIDATOR_STATUS=PASS") // scalastyle:ignore 144 | } 145 | 146 | if (fatal || (validatorFail && cliConfig.exitErrorOnFail)) { 147 | System.exit(-1) 148 | } 149 | case None => 150 | logger.error("Failed to Parse Command line Options.") 151 | println("DATA_VALIDATOR_STATUS=FAIL") // scalastyle:ignore 152 | System.exit(-1) 153 | } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Reports.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object Reports extends LazyLogging with EventLog { 7 | 8 | def emailReport( 9 | mainConfig: CliOptions, 10 | config: ValidatorConfig, 11 | varSub: VarSubstitution 12 | )(implicit spark: SparkSession): Unit = { 13 | if (mainConfig.htmlReport.isDefined || config.email.isDefined) { 14 | val htmlReport = config.generateHTMLReport() 15 | 16 | mainConfig.htmlReport.foreach { htmlFilename => 17 | logger.info(s"Writing HTML report to $htmlFilename") 18 | IO.writeHTML(htmlFilename, htmlReport) 19 | } 20 | 21 | config.email.foreach { emailConfig => 22 | logger.info(s"Sending email report emailConfig: $emailConfig") 23 | Emailer.sendHtmlMessage(emailConfig, htmlReport.render) 24 | } 25 | } 26 | } 27 | 28 | def jsonReport( 29 | mainConfig: CliOptions, 30 | config: ValidatorConfig, 31 | varSub: VarSubstitution 32 | )(implicit spark: SparkSession): Unit = { 33 | if (config.outputs.isDefined || mainConfig.jsonReport.isDefined) { 34 | val jsonReport = config.genJsonReport(varSub) 35 | mainConfig.jsonReport.foreach { jsonFilename => 36 | logger.info(s"Writing JSON report to $jsonFilename") 37 | IO.writeJSON(jsonFilename, jsonReport, append = true) 38 | } 39 | 40 | for { 41 | outputs <- config.outputs 42 | out <- 
outputs 43 | } { 44 | if (out.write(jsonReport)) { 45 | val msg = s"ERROR: Failed to write out: $out" 46 | logger.error(msg) 47 | validatorError(msg) 48 | } 49 | } 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Substitutable.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.Json 5 | 6 | trait Substitutable extends LazyLogging with EventGenerator { 7 | def getVarSub(v: String, field: String, dict: VarSubstitution): String = 8 | dict.replaceVars(v) match { 9 | case Left(newV) => 10 | if (v != newV) { 11 | logger.info(s"Substituting $field var: $v with `$newV`") 12 | addEvent(VarSubEvent(v, newV)) 13 | } 14 | newV 15 | case Right(event) => 16 | addEvent(event) 17 | logger.warn(s"Field: $field msg: $event") 18 | v 19 | } 20 | 21 | def getVarSubJson(j: Json, field: String, dict: VarSubstitution): Json = 22 | dict.replaceJsonVars(j) match { 23 | case Left(newJ) => 24 | if (j != newJ) { 25 | logger.info(s"Substituting Json $field Json: $j with `$newJ`") 26 | addEvent(VarSubJsonEvent(j.toString(), newJ)) 27 | } 28 | newJ 29 | case Right(event) => 30 | addEvent(event) 31 | logger.warn(s"Field: $field msg: $event") 32 | j 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorConfig.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.net.InetAddress 4 | 5 | import com.target.data_validator.EnvironmentVariables.MaybeEnvVar 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.Json 8 | import io.circe.generic.auto._ 9 | import io.circe.syntax._ 10 | import org.apache.spark.sql.SparkSession 11 | 12 | import scala.collection.JavaConverters._ 13 | import scala.util.Try 14 | import scalatags.Text.all._ 15 | 16 | case class ValidatorConfig( 17 | numKeyCols: Int, 18 | numErrorsToReport: Int, 19 | email: Option[EmailConfig], 20 | detailedErrors: Boolean = true, 21 | vars: Option[List[ConfigVar]], 22 | outputs: Option[List[ValidatorOutput]], 23 | tables: List[ValidatorTable] 24 | ) extends LazyLogging { 25 | 26 | def failed: Boolean = tables.exists(_.failed) 27 | 28 | def checkOutputs(session: SparkSession): Boolean = outputs match { 29 | case Some(outs) => outs.map(_.configCheck(session)).exists(x => x) 30 | case None => false 31 | } 32 | 33 | def checkTables(session: SparkSession, dict: VarSubstitution): Boolean = { 34 | val error = tables.map(_.configCheck(session, dict)).exists(x => x) 35 | if (error) { 36 | logger.error("checkTables failed!") 37 | } 38 | error 39 | } 40 | 41 | def configCheck(session: SparkSession, dict: VarSubstitution): Boolean = { 42 | val outputsError = checkOutputs(session) 43 | val tableError = checkTables(session, dict) 44 | if (outputsError || tableError) { 45 | logger.error("configCheck failed!") 46 | } 47 | outputsError || tableError 48 | } 49 | 50 | def quickChecks(session: SparkSession, dict: VarSubstitution): Boolean = { 51 | logger.info("Running Quick Checks...") 52 | tables.map(_.quickChecks(session, dict)(this)).exists(x => x) 53 | } 54 | 55 | def costlyChecks(session: SparkSession, dict: VarSubstitution): Boolean = { 56 | logger.info("Running Costly Checks...") 57 | tables.map(_.costlyChecks(session, 
dict)(this)).exists(x => x) 58 | } 59 | 60 | def generateHTMLReport(): Tag = html(h1("Validator Report"), hr(), tables.map(_.generateHTMLReport())) 61 | 62 | def substituteVariables(varSub: VarSubstitution): Option[ValidatorConfig] = { 63 | logger.info("substituteVariables()") 64 | Some( 65 | this.copy( 66 | email = this.email.map(_.substituteVariables(varSub)), 67 | tables = this.tables.map(_.substituteVariables(varSub)), 68 | outputs = this.outputs.map(_.map(_.substituteVariables(varSub))) 69 | ) 70 | ) 71 | } 72 | 73 | def genJsonReport(varSub: VarSubstitution)(implicit spark: SparkSession): Json = { 74 | import JsonEncoders._ 75 | 76 | Json.obj( 77 | ("numKeyCols", numKeyCols.asJson), 78 | ("numErrorsToReport", numErrorsToReport.asJson), 79 | ("email", email.asJson), 80 | ("detailedErrors", detailedErrors.asJson), 81 | ("vars", vars.asJson), 82 | ("varSubDict", varSub.dict.asJson), 83 | ("failed", failed.asJson), 84 | ("buildInfo", ValidatorConfig.buildInfoJson), 85 | ("runtimeInfo", ValidatorConfig.runtimeInfoJson(spark)), 86 | ("outputs", outputs.asJson), 87 | ("tables", tables.asJson), 88 | ("events", EventLog.events.asJson) 89 | ) 90 | } 91 | } 92 | 93 | object ValidatorConfig { 94 | private def buildInfoJson: Json = Json.obj( 95 | ("name", Json.fromString(BuildInfo.name)), 96 | ("version", Json.fromString(BuildInfo.version)), 97 | ("scalaVersion", Json.fromString(BuildInfo.scalaVersion)), 98 | ("sbtVersion", Json.fromString(BuildInfo.sbtVersion)), 99 | ("sparkVersion", Json.fromString(org.apache.spark.SPARK_VERSION)), 100 | ("javaVersion", Json.fromString(System.getProperty("java.version"))) 101 | ) 102 | 103 | private def propsToJson: Json = { 104 | val props = System.getProperties.asScala.toList.map(x => (x._1, Json.fromString(x._2))) 105 | Json.obj(props: _*) 106 | } 107 | 108 | private def envToJson: Json = { 109 | def extractFromAccessionList(pair: (String, MaybeEnvVar)) = { 110 | pair._1 -> Json.fromString(pair._2.map(_.getOrElse("")).getOrElse("")) 111 | } 112 | 113 | val env = EnvironmentVariables.accessedEnvVars.map(extractFromAccessionList) 114 | Json.obj(env.toSeq: _*) 115 | } 116 | 117 | private def runtimeInfoJson(spark: SparkSession): Json = { 118 | val startTimeMs = spark.sparkContext.startTime 119 | val endTimeMs = System.currentTimeMillis() 120 | val durationMs = endTimeMs - startTimeMs 121 | Json.obj( 122 | ("hostname", Json.fromString(Try(InetAddress.getLocalHost.getHostName).getOrElse("UNKNOWN"))), 123 | ("applicationId", Json.fromString(spark.sparkContext.applicationId)), 124 | ("sparkUser", Json.fromString(spark.sparkContext.sparkUser)), 125 | ("startTimeMs", Json.fromLong(startTimeMs)), 126 | ("endTimeMs", Json.fromLong(endTimeMs)), 127 | ("durationMs", Json.fromLong(durationMs)), 128 | ("properties", propsToJson), 129 | ("environment", envToJson) 130 | ) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorEvent.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import io.circe.Json 6 | 7 | import scalatags.Text 8 | import scalatags.Text.all._ 9 | 10 | trait ValidatorEvent { 11 | def failed: Boolean 12 | def toHTML: Tag 13 | 14 | def failedHTML: Tag = HTMLBits.status(failed) 15 | } 16 | 17 | case class ValidatorCounter(name: String, value: Long) extends ValidatorEvent { 18 | override def failed: Boolean = false 19 | override def 
toHTML: Tag = { 20 | div(cls := "counter")(s"Counter - $name: $value") 21 | } 22 | } 23 | 24 | case class ValidatorError(msg: String) extends ValidatorEvent { 25 | override def failed: Boolean = true 26 | 27 | override def toHTML: Text.all.Tag = div(cls := "error")(failedHTML, msg) 28 | } 29 | 30 | case class ValidatorCheckEvent(failure: Boolean, label: String, count: Long, errorCount: Long) 31 | extends ValidatorEvent { 32 | override def failed: Boolean = failure 33 | 34 | override def toHTML: Text.all.Tag = { 35 | val pct = "%4.2f%%".format((errorCount * 100.0) / count) 36 | div(cls := "checkEvent")(failedHTML, s" - $label count: $count errors: $errorCount pct: $pct") 37 | } 38 | } 39 | 40 | case class ColumnBasedValidatorCheckEvent( 41 | failure: Boolean, 42 | data: Map[String, String], 43 | msg: String 44 | ) extends ValidatorEvent { 45 | override def failed: Boolean = failure 46 | 47 | override def toHTML: Text.all.Tag = { 48 | div(cls := "checkEvent")(failedHTML, s" - $msg") 49 | } 50 | } 51 | 52 | class ValidatorTimer(val label: String) extends ValidatorEvent { 53 | var duration = 0L 54 | 55 | override def failed: Boolean = false 56 | 57 | def time[R](block: => R): R = { 58 | val start = System.nanoTime() 59 | val result = 60 | try { 61 | block 62 | } finally { 63 | duration = System.nanoTime() - start 64 | } 65 | result 66 | } 67 | 68 | def toSecs: Long = TimeUnit.SECONDS.convert(duration, TimeUnit.NANOSECONDS) 69 | 70 | override def toHTML: Text.all.Tag = div(cls := "timer")(s"Timer: $label took $toSecs seconds.") 71 | 72 | override def toString: String = s"Time: $label Duration: $toSecs seconds" 73 | } 74 | 75 | case class ValidatorQuickCheckError(key: List[(String, Any)], value: Any, message: String) extends ValidatorEvent { 76 | override def failed: Boolean = true 77 | override def toHTML: Text.all.Tag = div(cls := "quickCheckError")(failedHTML, " - " + toString) 78 | 79 | def keyToString: String = "{" + key.map { case (c, v) => s"$c:$v" }.mkString(", ") + "}" 80 | 81 | override def toString: String = { 82 | val vStr = Option(value).getOrElse("(NULL)").toString 83 | s"ValidatorQuickCheckError(key: $keyToString, value: $vStr msg: $message)" 84 | } 85 | } 86 | 87 | case class ValidatorGood(msg: String) extends ValidatorEvent { 88 | override def failed: Boolean = false 89 | override def toString: String = msg 90 | override def toHTML: Text.all.Tag = div(cls := "good")(msg) 91 | } 92 | 93 | case class VarSubEvent(src: String, dest: String) extends ValidatorEvent { 94 | override def failed: Boolean = false 95 | override def toString: String = s"VarSub src: $src dest: $dest" 96 | override def toHTML: Text.all.Tag = div(cls := "subEvent")(toString) 97 | } 98 | 99 | case class VarSubJsonEvent(src: String, dest: Json) extends ValidatorEvent { 100 | override def failed: Boolean = false 101 | override def toString: String = s"VarSub src: $src dest: ${dest.noSpaces}" 102 | override def toHTML: Text.all.Tag = div(cls := "subEvent")(toString) 103 | } 104 | 105 | case class JsonEvent(json: Json) extends ValidatorEvent { 106 | override def failed: Boolean = false 107 | override def toString: String = s"JsonEvent: json:${json.noSpaces}" 108 | override def toHTML: Text.all.Tag = div(cls := "jsonEvent")(toString) 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorOutput.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | 
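// --- Illustrative sketch (not part of ValidatorOutput.scala): how the ValidatorTimer
// defined in ValidatorEvent.scala above is used. Wrapping a block in time(...) captures
// the elapsed nanoseconds even if the block throws; the label and workload here are
// hypothetical.
object ValidatorTimerExample {
  val timer = new ValidatorTimer("count multiples of seven")
  val multiples: Int = timer.time {
    (1 to 1000000).count(_ % 7 == 0)
  }
  // timer.toSecs converts the captured duration to whole seconds, and toString renders
  // something like "Time: count multiples of seven Duration: 0 seconds".
}
// ---------------------------------------------------------------------------------------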
import cats.syntax.functor._ 4 | import io.circe.{Decoder, Json} 5 | import io.circe.generic.auto._ 6 | import org.apache.spark.sql.SparkSession 7 | 8 | abstract class ValidatorOutput extends Substitutable with EventLog { 9 | def write(json: Json)(implicit spark: SparkSession): Boolean 10 | def substituteVariables(dict: VarSubstitution): ValidatorOutput 11 | def configCheck(spark: SparkSession): Boolean 12 | } 13 | 14 | case class PipeOutput(pipe: String, ignoreError: Option[Boolean]) extends ValidatorOutput { 15 | 16 | override def write(json: Json)(implicit spark: SparkSession): Boolean = { 17 | logger.info(s"Piping json output to '$pipe' ignoreError: ${ignoreError.getOrElse(false)}") 18 | val timer = new ValidatorTimer(s"PipeOutput($pipe)") 19 | addEvent(timer) 20 | 21 | val (fail, out, err) = timer.time(IO.writeStringToPipe(pipe, json.noSpaces)) 22 | 23 | if (fail) { 24 | logger.error(s"Program `$pipe` failed!") 25 | if (out.isEmpty) { 26 | logger.error("stdout empty!") 27 | } else { 28 | out.foreach(o => logger.error(s"stdout: $o")) 29 | } 30 | if (err.isEmpty) { 31 | logger.error("stderr empty!") 32 | } else { 33 | err.foreach(o => logger.error(s"stderr: $o")) 34 | } 35 | !ignoreError.getOrElse(false) 36 | } else { 37 | false 38 | } 39 | } 40 | 41 | override def substituteVariables(dict: VarSubstitution): ValidatorOutput = 42 | this.copy(getVarSub(pipe, "pipe", dict)) 43 | 44 | override def configCheck(spark: SparkSession): Boolean = { 45 | val ret = IO.canExecute(pipe.split("\\s").head)(spark) 46 | if (!ret) { 47 | val msg = s"Pipe:'$pipe' not executable!" 48 | validatorError(msg) 49 | } 50 | !ret 51 | } 52 | } 53 | 54 | case class FileOutput(filename: String, append: Option[Boolean]) extends ValidatorOutput { 55 | 56 | override def write(json: Json)(implicit spark: SparkSession): Boolean = { 57 | logger.info(s"Writing json output to file '$filename append: ${append.getOrElse(false)}") 58 | val timer = new ValidatorTimer(s"FileOutput($filename)") 59 | timer.time(IO.writeJSON(filename, json, append.getOrElse(false))) 60 | } 61 | 62 | override def substituteVariables(dict: VarSubstitution): ValidatorOutput = 63 | this.copy(getVarSub(filename, "filename", dict)) 64 | override def configCheck(spark: SparkSession): Boolean = { 65 | val ret = IO.canAppendOrCreate(filename, append.getOrElse(false))(spark) 66 | if (!ret) { 67 | val msg = s"FileOutput '$filename' append: $append cannot write or append!" 68 | logger.error(msg) 69 | validatorError(msg) 70 | } 71 | !ret 72 | } 73 | } 74 | 75 | object ValidatorOutput { 76 | implicit val decodeOutputs: Decoder[ValidatorOutput] = List[Decoder[ValidatorOutput]]( 77 | Decoder[PipeOutput].widen, 78 | Decoder[FileOutput].widen 79 | ).reduce(_ or _) 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/VarSubstitution.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.Json 5 | 6 | import scala.collection.mutable 7 | 8 | // Helper Class to handle variable substitution and manage dict of (k,v) 9 | 10 | class VarSubstitution() extends LazyLogging { 11 | import VarSubstitution._ 12 | 13 | val dict = new mutable.HashMap[String, Json]() 14 | 15 | /** Adds (k,v) to dictionary. 16 | * 17 | * @param key 18 | * \- key of value in dictionary. 19 | * @param value 20 | * \- value of key in dictionary. 21 | * @return 22 | * True on error. 
23 | */ 24 | def add(key: String, value: Json): Boolean = { 25 | if (VAR_REGEX.findFirstIn(key).isEmpty) { 26 | logger.error(s"Bad key: $key, must follow variable rules.") 27 | true 28 | } else if (value.asString.exists(VAR_BODY_REGEX.findFirstIn(_).isDefined)) { 29 | logger.error(s"Cannot have variable defined in value: $value!") 30 | true 31 | } else { 32 | if (dict.contains(key)) { 33 | logger.warn(s"Dict already contains key: '$key' current: '${dict(key)}' v: '$value' not overriding.") 34 | true 35 | } else { 36 | dict += (key -> value) 37 | false 38 | } 39 | } 40 | } 41 | 42 | /** Adds a String to dictionary. 43 | * @param value 44 | * \- gets converted to Json 45 | * @return 46 | * True on error 47 | */ 48 | def addString(key: String, value: String): Boolean = { 49 | replaceVars(value) match { 50 | case Left(newValue) => add(key, JsonUtils.string2Json(newValue)) 51 | case Right(_) => true // Bug: Ignoring the ValidatorError 52 | } 53 | } 54 | 55 | /** Removes key from dictionary. 56 | * 57 | * @param k 58 | * \- key to be removed. 59 | * @return 60 | * True on error. 61 | */ 62 | def remove(k: String): Boolean = { 63 | if (dict.contains(k)) { 64 | dict.remove(k) 65 | false 66 | } else { 67 | logger.warn(s"remove(k:$k) Dict doesn't contain specified key.") 68 | true 69 | } 70 | } 71 | 72 | /** replaces variables in String. 73 | * 74 | * @param s 75 | * \- string to replace variables in. 76 | * @return 77 | * Left(new string) on Success, Right(ValidatorEvent) on Error 78 | */ 79 | def replaceVars(s: String): Either[String, ValidatorEvent] = { 80 | val variableJson = findVars(s).toSeq.map(x => (x, getVarName(x).flatMap(dict.get))) 81 | val (foundVariableJson, missingVariableJson) = variableJson.partition(_._2.isDefined) 82 | foundVariableJson.foreach(x => logger.debug(s"foundVar: $x")) 83 | missingVariableJson.foreach(x => logger.debug(s"missingVar: $x")) 84 | 85 | val newString = foundVariableJson.foldRight(s) { (vj, ns) => 86 | val (variable, json) = vj 87 | logger.debug(s"accum: $ns variable: $variable json: $json") 88 | val replacement = jsonToString(json.get) 89 | replaceAll(ns, variable, replacement) 90 | } 91 | 92 | val errs = missingVariableJson.map(x => x._1) 93 | errs.foreach(x => logger.debug(s"errs: $x")) 94 | if (errs.nonEmpty) { 95 | Right( 96 | ValidatorError( 97 | "VariableSubstitution: Can't find values for the following keys, " + 98 | s"${errs.flatMap(getVarName).mkString(",")}" 99 | ) 100 | ) 101 | } else { 102 | if (s != newString) { 103 | logger.debug(s"Replaced '$s' with '$newString'") 104 | } 105 | Left(newString) 106 | } 107 | } 108 | 109 | private def jsonToString(j: Json): String = { 110 | if (j.isString) { 111 | j.asString.get 112 | } else { 113 | j.toString() 114 | } 115 | } 116 | 117 | def replaceJsonVars(j: Json): Either[Json, ValidatorEvent] = { 118 | if (j.isString) { 119 | replaceVars(j.asString.get).left.map(JsonUtils.string2Json) 120 | } else { 121 | // Since variables are only in Strings, return j. 
122 | Left(j) 123 | } 124 | } 125 | 126 | private def logDupKeys(k: String, v: String): Unit = { 127 | logger.info(s"Adding dict entry k: $k v:`$v`") 128 | if (dict.contains(k)) logger.warn(s"Replacing key: $k old: ${dict(k)} with new: $v") 129 | } 130 | 131 | /** Adds the map m to dict 132 | */ 133 | def addMap(m: Map[String, String]): Unit = { 134 | val kj = m.map { case (k, v) => 135 | logDupKeys(k, v) 136 | (k, JsonUtils.string2Json(v)) 137 | } 138 | dict ++= kj 139 | } 140 | 141 | override def equals(obj: Any): Boolean = 142 | obj.isInstanceOf[VarSubstitution] && obj.asInstanceOf[VarSubstitution].dict == dict 143 | 144 | override def hashCode(): Int = dict.hashCode() 145 | } 146 | 147 | object VarSubstitution extends LazyLogging { 148 | private val VAR_REGEX_STR = "[A-Za-z][A-Za-z0-9_]*" 149 | private val VAR_REGEX = VAR_REGEX_STR.r 150 | private val VAR_BODY_REGEX = ("\\$" + VAR_REGEX_STR + "|\\$\\{" + VAR_REGEX_STR + "\\}").r 151 | 152 | def findVars(s: String): Set[String] = { 153 | VAR_BODY_REGEX.findAllIn(s).toSet 154 | } 155 | 156 | /** Checks if s is a variable. 157 | * @param s 158 | * \- string to check 159 | * @return 160 | * true if s is a variable. 161 | */ 162 | def isVariable(s: String): Boolean = s.startsWith("$") && VAR_BODY_REGEX.findFirstMatchIn(s).isDefined 163 | 164 | /** Replaces all the occurrences of oldVal in src with newVal. 165 | * 166 | * @param src 167 | * \- source string that contains values to be replaced. 168 | * @param oldVal 169 | * \- old Value that will be replaced by newValue. 170 | * @param newVal 171 | * \- new Value that will replace oldValue. 172 | * @return 173 | * new string with newVal were oldVal was. 174 | */ 175 | def replaceAll(src: String, oldVal: String, newVal: String): String = { 176 | val buf = new StringBuffer(src) 177 | var idx = buf.indexOf(oldVal) 178 | while (idx >= 0) { 179 | buf.replace(idx, idx + oldVal.length, newVal) 180 | idx = buf.indexOf(oldVal, idx + newVal.length) 181 | } 182 | val ret = buf.toString 183 | logger.debug(s"src: $src oldVal: $oldVal newVal: $newVal ret: $ret") 184 | ret 185 | } 186 | 187 | /** gets the variable name from the variable. ie "$\{foo\}" returns "foo" 188 | * 189 | * @param rawVar 190 | * \- variable with control chars. 
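// --- Illustrative sketch of the VarSubstitution API defined above (not part of this
// file's source; the keys and template string are hypothetical). Note the unusual Either
// convention: replaceVars returns Left on success and Right(ValidatorEvent) on error.
//
//   import io.circe.Json
//
//   val vars = new VarSubstitution
//   vars.addString("env", "prod")            // returns false => added successfully
//   vars.add("numRows", Json.fromInt(100))   // returns false => added successfully
//
//   vars.replaceVars("select * from ${env}_db.events limit $numRows")
//   // => Left("select * from prod_db.events limit 100")
//
//   VarSubstitution.findVars("path/${env}/file")
//   // => Set("${env}") -- only well-formed $var / ${var} references are recognized
// ---------------------------------------------------------------------------------------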
191 | * @return 192 | * variable without '$' or '{','}' 193 | */ 194 | def getVarName(rawVar: String): Option[String] = { 195 | val ret = if (rawVar.startsWith("${")) { 196 | if (rawVar.endsWith("}")) { 197 | Some(rawVar.substring(2, rawVar.length - 1)) 198 | } else { 199 | None 200 | } 201 | } else if (rawVar.startsWith("$")) { 202 | Some(rawVar.substring(1)) 203 | } else { 204 | logger.error(s"Illegal Variable $rawVar") 205 | None 206 | } 207 | logger.debug(s"getVarName(K: $rawVar) ret: $ret") 208 | ret 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/Bin.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class Bin(lowerBound: Double, upperBound: Double, count: Long) 4 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/CompleteStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import io.circe._ 4 | import io.circe.generic.semiauto._ 5 | 6 | case class CompleteStats( 7 | name: String, 8 | column: String, 9 | count: Long, 10 | mean: Double, 11 | min: Double, 12 | max: Double, 13 | stdDev: Double, 14 | histogram: Histogram 15 | ) 16 | 17 | object CompleteStats { 18 | implicit val binEncoder: Encoder[Bin] = deriveEncoder 19 | implicit val histogramEncoder: Encoder[Histogram] = deriveEncoder 20 | implicit val encoder: Encoder[CompleteStats] = deriveEncoder 21 | 22 | implicit val binDecoder: Decoder[Bin] = deriveDecoder 23 | implicit val histogramDecoder: Decoder[Histogram] = deriveDecoder 24 | implicit val decoder: Decoder[CompleteStats] = deriveDecoder 25 | 26 | def apply( 27 | name: String, 28 | column: String, 29 | firstPassStats: FirstPassStats, 30 | secondPassStats: SecondPassStats 31 | ): CompleteStats = CompleteStats( 32 | name = name, 33 | column = column, 34 | count = firstPassStats.count, 35 | mean = firstPassStats.mean, 36 | min = firstPassStats.min, 37 | max = firstPassStats.max, 38 | stdDev = secondPassStats.stdDev, 39 | histogram = secondPassStats.histogram 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/FirstPassStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.catalyst.ScalaReflection 5 | import org.apache.spark.sql.types.DataType 6 | 7 | case class FirstPassStats(count: Long, mean: Double, min: Double, max: Double) 8 | 9 | object FirstPassStats { 10 | def dataType: DataType = ScalaReflection 11 | .schemaFor[FirstPassStats] 12 | .dataType 13 | 14 | /** Convert from Spark SQL row format to case class [[FirstPassStats]] format. 
15 | * 16 | * @param row 17 | * a complex column of [[org.apache.spark.sql.types.StructType]] output of [[FirstPassStatsAggregator]] 18 | * @return 19 | * struct format converted to [[FirstPassStats]] 20 | */ 21 | def fromRowRepr(row: Row): FirstPassStats = { 22 | FirstPassStats( 23 | count = row.getLong(0), 24 | mean = row.getDouble(1), 25 | min = row.getDouble(2), 26 | max = row.getDouble(3) 27 | ) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/FirstPassStatsAggregator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 5 | import org.apache.spark.sql.types._ 6 | 7 | /** Calculate the count, mean, min and maximum values of a numeric column. 8 | */ 9 | class FirstPassStatsAggregator extends UserDefinedAggregateFunction { 10 | 11 | /** input is a single column of `DoubleType` 12 | */ 13 | override def inputSchema: StructType = new StructType().add("value", DoubleType) 14 | 15 | /** buffer keeps state for the count, sum, min and max 16 | */ 17 | override def bufferSchema: StructType = new StructType() 18 | .add(StructField("count", LongType)) 19 | .add(StructField("sum", DoubleType)) 20 | .add(StructField("min", DoubleType)) 21 | .add(StructField("max", DoubleType)) 22 | 23 | private val count = bufferSchema.fieldIndex("count") 24 | private val sum = bufferSchema.fieldIndex("sum") 25 | private val min = bufferSchema.fieldIndex("min") 26 | private val max = bufferSchema.fieldIndex("max") 27 | 28 | /** specifies the return type when using the UDAF 29 | */ 30 | override def dataType: DataType = FirstPassStats.dataType 31 | 32 | /** These calculations are deterministic 33 | */ 34 | override def deterministic: Boolean = true 35 | 36 | /** set the initial values for count, sum, min and max 37 | */ 38 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 39 | buffer(count) = 0L 40 | buffer(sum) = 0.0 41 | buffer(min) = Double.MaxValue 42 | buffer(max) = Double.MinValue 43 | } 44 | 45 | /** update the count, sum, min and max buffer values 46 | */ 47 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 48 | buffer(count) = buffer.getLong(count) + 1 49 | buffer(sum) = buffer.getDouble(sum) + input.getDouble(0) 50 | buffer(min) = math.min(input.getDouble(0), buffer.getDouble(min)) 51 | buffer(max) = math.max(input.getDouble(0), buffer.getDouble(max)) 52 | } 53 | 54 | /** reduce the count, sum, min and max values of two buffers 55 | */ 56 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 57 | buffer1(count) = buffer1.getLong(count) + buffer2.getLong(count) 58 | buffer1(sum) = buffer1.getDouble(sum) + buffer2.getDouble(sum) 59 | buffer1(min) = math.min(buffer1.getDouble(min), buffer2.getDouble(min)) 60 | buffer1(max) = math.max(buffer1.getDouble(max), buffer2.getDouble(max)) 61 | } 62 | 63 | /** evaluate the count, mean, min and max values of a column 64 | */ 65 | override def evaluate(buffer: Row): Any = { 66 | FirstPassStats( 67 | buffer.getLong(count), 68 | buffer.getDouble(sum) / buffer.getLong(count), 69 | buffer.getDouble(min), 70 | buffer.getDouble(max) 71 | ) 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- 
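// --- Illustrative sketch (not part of the repository source): applying
// FirstPassStatsAggregator, defined above, to a numeric column and converting the
// resulting struct back into FirstPassStats. The DataFrame and the column name "price"
// are hypothetical.
object FirstPassStatsExample {
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.col

  def firstPass(df: DataFrame): FirstPassStats = {
    val agg = new FirstPassStatsAggregator
    val statsRow = df.agg(agg(col("price")).as("stats")).head.getStruct(0)
    FirstPassStats.fromRowRepr(statsRow) // count, mean, min, max
  }
}
// ---------------------------------------------------------------------------------------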
/src/main/scala/com/target/data_validator/stats/Histogram.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class Histogram(bins: Seq[Bin]) 4 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/SecondPassStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.catalyst.ScalaReflection 5 | import org.apache.spark.sql.types.DataType 6 | 7 | case class SecondPassStats(stdDev: Double, histogram: Histogram) 8 | 9 | object SecondPassStats { 10 | def dataType: DataType = ScalaReflection 11 | .schemaFor[SecondPassStats] 12 | .dataType 13 | 14 | /** Convert from Spark SQL row format to case class [[SecondPassStats]] format. 15 | * 16 | * @param row 17 | * a complex column of [[org.apache.spark.sql.types.StructType]] output of [[SecondPassStatsAggregator]] 18 | * @return 19 | * struct format converted to [[SecondPassStats]] 20 | */ 21 | def fromRowRepr(row: Row): SecondPassStats = { 22 | SecondPassStats( 23 | stdDev = row.getDouble(0), 24 | histogram = Histogram( 25 | row.getStruct(1).getSeq[Row](0) map { bin => 26 | Bin( 27 | lowerBound = bin.getDouble(0), 28 | upperBound = bin.getDouble(1), 29 | count = bin.getLong(2) 30 | ) 31 | } 32 | ) 33 | ) 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/SecondPassStatsAggregator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 5 | import org.apache.spark.sql.types._ 6 | 7 | /** Calculate the standard deviation and histogram of a numeric column 8 | */ 9 | class SecondPassStatsAggregator(firstPassStats: FirstPassStats) extends UserDefinedAggregateFunction { 10 | 11 | val NUMBER_OF_BINS = 10 12 | 13 | private val binSize = (firstPassStats.max - firstPassStats.min) / NUMBER_OF_BINS 14 | private val upperBounds = for (i <- 1 to NUMBER_OF_BINS) yield { firstPassStats.min + i * binSize } 15 | 16 | /** input is a single column of `DoubleType` 17 | */ 18 | override def inputSchema: StructType = new StructType().add("value", DoubleType) 19 | 20 | /** buffer keeps state for the total count, sumOfSquares, and individual bin counts 21 | */ 22 | override def bufferSchema: StructType = StructType( 23 | List( 24 | StructField("count", LongType), 25 | StructField("sumOfSquares", DoubleType), 26 | StructField("bin1count", LongType), 27 | StructField("bin2count", LongType), 28 | StructField("bin3count", LongType), 29 | StructField("bin4count", LongType), 30 | StructField("bin5count", LongType), 31 | StructField("bin6count", LongType), 32 | StructField("bin7count", LongType), 33 | StructField("bin8count", LongType), 34 | StructField("bin9count", LongType), 35 | StructField("bin10count", LongType) 36 | ) 37 | ) 38 | 39 | private val count = bufferSchema.fieldIndex("count") 40 | private val sumOfSquares = bufferSchema.fieldIndex("sumOfSquares") 41 | private val binStart = bufferSchema.fieldIndex("bin1count") 42 | private val binEnd = bufferSchema.fieldIndex("bin10count") 43 | 44 | /** specifies the return type when using the UDAF 45 | 
*/ 46 | override def dataType: DataType = SecondPassStats.dataType 47 | 48 | /** these calculations are deterministic 49 | */ 50 | override def deterministic: Boolean = true 51 | 52 | /** set the initial values for count, sum of squares and individual bin counts 53 | */ 54 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 55 | buffer(count) = 0L 56 | buffer(sumOfSquares) = 0.0 57 | for (i <- binStart to binEnd) { buffer(i) = 0L } 58 | } 59 | 60 | /** update the count, sum of squares and individual bin counts 61 | */ 62 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 63 | buffer(count) = buffer.getLong(count) + 1 64 | buffer(sumOfSquares) = buffer.getDouble(sumOfSquares) + math.pow(input.getDouble(0) - firstPassStats.mean, 2) 65 | // determine the index of the bin that we should increment 66 | val binIndex = 67 | binStart + math.min(NUMBER_OF_BINS - 1, math.floor((input.getDouble(0) - firstPassStats.min) / binSize).toInt) 68 | buffer(binIndex) = buffer.getLong(binIndex) + 1 69 | } 70 | 71 | /** reduce the count, sum of squares and individual bin counts 72 | */ 73 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 74 | buffer1(count) = buffer1.getLong(count) + buffer2.getLong(count) 75 | buffer1(sumOfSquares) = buffer1.getDouble(sumOfSquares) + buffer2.getDouble(sumOfSquares) 76 | for (i <- binStart to binEnd) { 77 | buffer1(i) = buffer1.getLong(i) + buffer2.getLong(i) 78 | } 79 | } 80 | 81 | /** evaluate the standard deviation and define bins of histogram 82 | */ 83 | override def evaluate(buffer: Row): Any = { 84 | val bins: Seq[Bin] = for (i <- binStart to binEnd) yield { 85 | val bIndex = i - binStart 86 | i match { 87 | case start if i == binStart => Bin(firstPassStats.min, upperBounds(bIndex), buffer.getLong(start)) 88 | case end if i == binEnd => Bin(upperBounds(bIndex - 1), firstPassStats.max, buffer.getLong(end)) 89 | case _ => Bin(upperBounds(bIndex - 1), upperBounds(bIndex), buffer.getLong(i)) 90 | } 91 | } 92 | SecondPassStats( 93 | math.sqrt(buffer.getDouble(sumOfSquares) / (buffer.getLong(count) - 1)), 94 | Histogram(bins) 95 | ) 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import com.target.data_validator.stats._ 5 | import io.circe 6 | import io.circe.Json 7 | import io.circe.generic.semiauto._ 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions.Expression 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.types.{NumericType, StructType} 14 | 15 | import scala.concurrent.Promise 16 | import scala.util._ 17 | 18 | /** This validator implements a set of column metrics on a specified column by performing two I/O passes over the 19 | * input table. 
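// --- Worked example of the histogram binning in SecondPassStatsAggregator above (the
// numbers are hypothetical and this is not part of ColStats.scala):
//   with firstPassStats.min = 0.0, firstPassStats.max = 100.0 and NUMBER_OF_BINS = 10,
//   binSize = (100.0 - 0.0) / 10 = 10.0
//   value 37.0  -> floor((37.0 - 0.0) / 10.0) = 3           -> increments bin4count
//   value 100.0 -> floor(100.0 / 10.0) = 10, clamped by
//                  math.min(NUMBER_OF_BINS - 1, _) to 9     -> increments bin10count,
//                  so the column maximum lands in the last bin instead of overflowing.
// ---------------------------------------------------------------------------------------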
20 | * 21 | * @param column 22 | * the column to collect stats on 23 | */ 24 | final case class ColStats(column: String) extends TwoPassCheapCheck { 25 | import ValidatorBase._ 26 | import com.target.data_validator.JsonEncoders.eventEncoder 27 | 28 | private val tempColumnName = column + "_" + hashCode 29 | 30 | override def name: String = "colstats" 31 | 32 | private val promiseToDoFirstPass: Promise[FirstPassStats] = Promise() 33 | 34 | // invoke only after first pass has completed 35 | private def unsafeFirstPassAccessor = promiseToDoFirstPass.future.value match { 36 | case None => 37 | throw new IllegalStateException( 38 | "ColStats costly histograms requires that the pre-processing projections " + 39 | "are executed first to generate first pass stats." 40 | ) 41 | case Some(Success(firstPassStats)) => 42 | firstPassStats 43 | case Some(Failure(e)) => 44 | throw e 45 | } 46 | 47 | // expression to aggregate first pass of stats 48 | override def firstPassSelect(): Column = { 49 | val firstPassAgg = new FirstPassStatsAggregator 50 | firstPassAgg(new Column(UnresolvedAttribute(column))) as tempColumnName 51 | } 52 | 53 | // extract first pass stats from output row 54 | override def sinkFirstPassRow(row: Row): Unit = { 55 | promiseToDoFirstPass complete Try { 56 | val rStats = row.getAs[Row](tempColumnName) 57 | FirstPassStats.fromRowRepr(rStats) 58 | } 59 | } 60 | 61 | // generate second pass ("Quick Check") expression 62 | // NOTE: this call implicitly REQUIRES that the first pass has completed 63 | override def select(schema: StructType, dict: VarSubstitution): Expression = { 64 | val agg = new SecondPassStatsAggregator(unsafeFirstPassAccessor) 65 | agg(col(column)).expr 66 | } 67 | 68 | // construct complete stats from quick check output row 69 | override def quickCheck(r: Row, count: Long, idx: Int): Boolean = { 70 | val rStats = r.getAs[Row](idx) 71 | val secondPassStats = SecondPassStats.fromRowRepr(rStats) 72 | val completeStats = CompleteStats( 73 | name = s"`$column` stats", 74 | column = column, 75 | firstPassStats = unsafeFirstPassAccessor, 76 | secondPassStats = secondPassStats 77 | ) 78 | 79 | val json = completeStats.asJson 80 | logger.info(s"VarJsonEvent:${json.spaces2}") 81 | addEvent(JsonEvent(json)) 82 | 83 | false 84 | } 85 | 86 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = this 87 | 88 | override def configCheck(df: DataFrame): Boolean = { 89 | if (isColumnInDataFrame(df, column)) { 90 | df.schema(column).dataType match { 91 | case _: NumericType => false 92 | case badType => 93 | val msg = s"Column $name type:$badType is not Numeric." 94 | logger.error(msg) 95 | addEvent(ValidatorError(msg)) 96 | failed 97 | } 98 | } else { 99 | val msg = s"Column $name not in data frame." 
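// --- Illustrative sketch of the two-pass protocol above (TwoPassCheapCheck), driven by
// hand; not part of ColStats.scala. The DataFrame `df` and column "price" are
// hypothetical, and in practice the passes are orchestrated by the surrounding validator
// machinery rather than called directly.
//
//   import org.apache.spark.sql.Column
//
//   val check = ColStats("price")
//   // Pass 1: aggregate count/mean/min/max and hand the result row back to the check.
//   check.sinkFirstPassRow(df.agg(check.firstPassSelect()).head)
//   // Pass 2: the quick-check expression can only be built once pass-1 stats exist.
//   val secondPassRow = df.agg(new Column(check.select(df.schema, new VarSubstitution))).head
//   check.quickCheck(secondPassRow, count = df.count(), idx = 0) // records a JsonEvent; never fails
// ---------------------------------------------------------------------------------------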
100 | logger.error(msg) 101 | addEvent(ValidatorError(msg)) 102 | failed 103 | } 104 | } 105 | 106 | override def toJson: Json = Json.obj( 107 | ("type", Json.fromString("colstats")), 108 | ("column", Json.fromString(column)), 109 | ("failed", Json.fromBoolean(failed)), 110 | ("events", this.getEvents.asJson) 111 | ) 112 | 113 | } 114 | 115 | object ColStats { 116 | implicit val encoder: circe.Encoder[ColumnSumCheck] = deriveEncoder[ColumnSumCheck] 117 | implicit val decoder: circe.Decoder[ColumnSumCheck] = deriveDecoder[ColumnSumCheck] 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColumnBased.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ColumnBasedValidatorCheckEvent, ValidatorCounter, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonEncoders.eventEncoder 5 | import io.circe.Json 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{DataFrame, Row} 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.Expression 10 | import org.apache.spark.sql.catalyst.expressions.aggregate.Max 11 | import org.apache.spark.sql.types._ 12 | 13 | import scala.collection.immutable.ListMap 14 | import scala.math.abs 15 | 16 | abstract class ColumnBased(column: String, condTest: Expression) extends CheapCheck { 17 | override def select(schema: StructType, dict: VarSubstitution): Expression = condTest 18 | 19 | // ColumnBased checks don't have per row error details. 20 | def hasQuickErrorDetails: Boolean = false 21 | 22 | // calculates and returns the pct error as a string 23 | def calculatePctError(expected: Double, actual: Double, formatStr: String = "%4.2f%%"): String = { 24 | 25 | if (expected == actual) { 26 | formatStr.format(0.00) // if expected == actual, error % should be 0, even if expected is 0 27 | } else if (expected == 0.0) { 28 | "undefined" 29 | } else { 30 | val pct = abs(((expected - actual) * 100.0) / expected) 31 | formatStr.format(pct) 32 | } 33 | } 34 | } 35 | 36 | case class MinNumRows(minNumRows: Json) extends ColumnBased("", ValidatorBase.L0) { 37 | override def name: String = "MinNumRows" 38 | 39 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 40 | val ret = MinNumRows(getVarSubJson(minNumRows, "minNumRows", dict)) 41 | getEvents.foreach(ret.addEvent) 42 | ret 43 | } 44 | 45 | override def configCheck(df: DataFrame): Boolean = { 46 | 47 | def notNaturalNumber(): Unit = { 48 | val msg = "minNumRows must be a natural number" 49 | logger.error(msg) 50 | addEvent(ValidatorError(msg)) 51 | } 52 | 53 | minNumRows.asNumber match { 54 | case Some(jsonNumber) => 55 | jsonNumber.toLong match { 56 | case Some(x) if x > 0 => 57 | case _ => notNaturalNumber() 58 | } 59 | case _ => notNaturalNumber() 60 | } 61 | failed 62 | } 63 | 64 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 65 | // Convert to `JsonNumber` then to `Long` 66 | // safe because already handled in `configCheck` 67 | val minNumRowsLong = minNumRows.asNumber.get.toLong.get 68 | 69 | failed = count < minNumRowsLong 70 | val pctError = if (failed) calculatePctError(minNumRowsLong, count) else "0.00%" 71 | addEvent(ValidatorCounter("rowCount", count)) 72 | val msg = s"MinNumRowsCheck Expected: $minNumRows Actual: $count Relative Error: $pctError" 73 | 
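// --- Worked example of calculatePctError above (values hypothetical; not part of
// ColumnBased.scala):
//   calculatePctError(1000.0, 850.0)  // abs((1000 - 850) * 100.0 / 1000) => "15.00%"
//   calculatePctError(0.0, 42.0)      // expected == 0                    => "undefined"
//   calculatePctError(7.0, 7.0)       // expected == actual               => "0.00%"
// ---------------------------------------------------------------------------------------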
val data = ListMap("expected" -> minNumRows.toString, "actual" -> count.toString, "relative_error" -> pctError) 74 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, msg)) 75 | failed 76 | } 77 | 78 | override def toJson: Json = Json.obj( 79 | ("type", Json.fromString("rowCount")), 80 | ("minNumRows", minNumRows), 81 | ("failed", Json.fromBoolean(failed)), 82 | ("events", this.getEvents.asJson) 83 | ) 84 | 85 | override def toString: String = name + s"(minNumRows: $minNumRows)" 86 | } 87 | 88 | case class ColumnMaxCheck(column: String, value: Json) 89 | extends ColumnBased(column, Max(UnresolvedAttribute.quoted(column)).toAggregateExpression()) { 90 | 91 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 92 | val ret = copy(column = getVarSub(column, "column", dict), value = getVarSubJson(value, "value", dict)) 93 | this.getEvents.foreach(ret.addEvent) 94 | ret 95 | } 96 | 97 | override def configCheck(df: DataFrame): Boolean = checkTypes(df, column, value) 98 | 99 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 100 | val dataType = row.schema(idx).dataType 101 | val rMax = row(idx) 102 | logger.info(s"rMax: $rMax colType: $dataType value: $value valueClass: ${value.getClass.getCanonicalName}") 103 | 104 | def resultForString: (ListMap[String, String], String) = { 105 | val (expected, actual) = (value.asString.getOrElse(""), row.getString(idx)) 106 | 107 | failed = expected != actual 108 | val data = ListMap("expected" -> expected, "actual" -> actual) 109 | val errorMsg = s"ColumnMaxCheck $column[StringType]: Expected: $expected Actual: $actual" 110 | 111 | (data, errorMsg) 112 | } 113 | 114 | def resultForNumeric: (ListMap[String, String], String) = { 115 | val num = value.asNumber.get 116 | var cmp_params = (0.0, 0.0) // (expected, actual) 117 | 118 | dataType match { 119 | case ByteType => cmp_params = (num.toByte.getOrElse[Byte](-1), row.getByte(idx)) 120 | case ShortType => cmp_params = (num.toShort.getOrElse[Short](-1), row.getShort(idx)) 121 | case IntegerType => cmp_params = (num.toInt.getOrElse[Int](-1), row.getInt(idx)) 122 | case LongType => cmp_params = (num.toLong.getOrElse[Long](-1), row.getLong(idx)) 123 | case FloatType => cmp_params = (num.toDouble, row.getFloat(idx)) 124 | case DoubleType => cmp_params = (num.toDouble, row.getDouble(idx)) 125 | } 126 | 127 | failed = cmp_params._1 != cmp_params._2 128 | val pctError = if (failed) calculatePctError(cmp_params._1, cmp_params._2) else "0.00%" 129 | val data = ListMap("expected" -> num.toString, "actual" -> rMax.toString, "relative_error" -> pctError) 130 | val errorMsg = s"ColumnMaxCheck $column[$dataType]: Expected: $num Actual: $rMax Relative Error: $pctError" 131 | 132 | (data, errorMsg) 133 | } 134 | 135 | def resultForOther: (ListMap[String, String], String) = { 136 | logger.error( 137 | s"""ColumnMaxCheck for type: $dataType, Row: $row not implemented! 
138 | |Please open a bug report on the data-validator issue tracker.""".stripMargin 139 | ) 140 | failed = true 141 | val errorMsg = s"ColumnMaxCheck is not supported for data type $dataType" 142 | 143 | (ListMap.empty[String, String], errorMsg) 144 | } 145 | 146 | val (data, errorMsg) = dataType match { 147 | case StringType => resultForString 148 | case _: NumericType => resultForNumeric 149 | case _ => resultForOther 150 | } 151 | 152 | logger.debug(s"MaxValue compared Row: $row with value: $value failed: $failed") 153 | if (failed) { 154 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, errorMsg)) 155 | } 156 | failed 157 | } 158 | 159 | override def toJson: Json = Json.obj( 160 | ("type", Json.fromString("columnMaxCheck")), 161 | ("column", Json.fromString(column)), 162 | ("value", value), 163 | ("failed", Json.fromBoolean(failed)), 164 | ("events", this.getEvents.asJson) 165 | ) 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ColumnBasedValidatorCheckEvent, JsonEncoders, ValidatorError, VarSubstitution} 4 | import io.circe._ 5 | import io.circe.generic.semiauto._ 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{DataFrame, Row} 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.aggregate.Sum 10 | import org.apache.spark.sql.types._ 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | case class ColumnSumCheck( 15 | column: String, 16 | minValue: Option[Json] = None, 17 | maxValue: Option[Json] = None, 18 | inclusive: Option[Json] = None 19 | ) extends ColumnBased(column, Sum(UnresolvedAttribute.quoted(column)).toAggregateExpression()) { 20 | 21 | private val minOrMax: Either[String, Unit] = if (minValue.isEmpty && maxValue.isEmpty) { 22 | Left("'minValue' or 'maxValue' or both must be defined") 23 | } else { 24 | Right() 25 | } 26 | 27 | private val lowerBound: Either[String, Double] = minValue match { 28 | case Some(json) => 29 | if (json.isNumber) { Right(json.asNumber.get.toDouble) } 30 | else { Left(s"'minValue' defined but type is not a Number, is: ${json.name}") } 31 | case None => Right(Double.MinValue) 32 | } 33 | 34 | private val upperBound: Either[String, Double] = maxValue match { 35 | case Some(json) => 36 | if (json.isNumber) { Right(json.asNumber.get.toDouble) } 37 | else { Left(s"'maxValue' defined but type is not a Number, is: ${json.name}") } 38 | case None => Right(Double.MaxValue) 39 | } 40 | 41 | private val minLessThanMax: Either[String, Unit] = (lowerBound, upperBound) match { 42 | case (Right(lower), Right(upper)) if lower >= upper => 43 | Left(s"'minValue': $lower must be less than 'maxValue': $upper") 44 | case _ => Right() 45 | } 46 | 47 | private val inclusiveBounds: Either[String, Boolean] = inclusive match { 48 | case Some(json) => 49 | if (json.isBoolean) { Right(json.asBoolean.get) } 50 | else { Left(s"'inclusive' defined but type is not Boolean, is: ${json.name}") } 51 | case None => Right(false) 52 | } 53 | 54 | override def name: String = "columnSumCheck" 55 | 56 | override def quickCheck(r: Row, count: Long, idx: Int): Boolean = { 57 | 58 | val dataType = r.schema(idx).dataType 59 | val isInclusive = inclusiveBounds.right.get 60 | val lowerBoundValue = 
lowerBound.right.get 61 | val upperBoundValue = upperBound.right.get 62 | 63 | def evaluate(sum: Double): Boolean = { 64 | if (isInclusive) { sum > upperBoundValue || sum < lowerBoundValue } 65 | else { sum >= upperBoundValue || sum <= lowerBoundValue } 66 | } 67 | 68 | def getPctError(sum: Double): String = { 69 | if (sum < lowerBoundValue) { 70 | calculatePctError(lowerBoundValue, sum) 71 | } else if (sum > upperBoundValue) { 72 | calculatePctError(upperBoundValue, sum) 73 | } else if (!isInclusive && (sum == upperBoundValue || sum == lowerBoundValue)) { 74 | "undefined" 75 | } else { 76 | "0.00%" 77 | } 78 | } 79 | 80 | def getData(pctError: String): ListMap[String, String] = { 81 | val initial: ListMap[String, String] = ((minValue, maxValue) match { 82 | case (Some(x), Some(y)) => 83 | ListMap("lower_bound" -> x.asNumber.get.toString, "upper_bound" -> y.asNumber.get.toString) 84 | case (None, Some(y)) => ListMap("upper_bound" -> y.asNumber.get.toString) 85 | case (Some(x), None) => ListMap("lower_bound" -> x.asNumber.get.toString) 86 | case (None, None) => throw new RuntimeException("Must define at least one of minValue or maxValue.") 87 | }) 88 | initial ++ List("inclusive" -> isInclusive.toString, "actual" -> r(idx).toString, "relative_error" -> pctError) 89 | } 90 | 91 | val actualSum: Double = dataType match { 92 | case ByteType => r.getByte(idx) 93 | case ShortType => r.getShort(idx) 94 | case IntegerType => r.getInt(idx) 95 | case LongType => r.getLong(idx) 96 | case FloatType => r.getFloat(idx) 97 | case DoubleType => r.getDouble(idx) 98 | case ut => throw new Exception(s"Unsupported type for $name found in schema: $ut") 99 | } 100 | 101 | failed = evaluate(actualSum) 102 | val pctError = getPctError(actualSum) 103 | val data = getData(pctError) 104 | 105 | val bounds = minValue.getOrElse(" ") :: maxValue.getOrElse("") :: Nil 106 | val prettyBounds = if (isInclusive) { 107 | bounds.mkString("[", ", ", "]") 108 | } else { 109 | bounds.mkString("(", ", ", ")") 110 | } 111 | 112 | val msg = 113 | s"$name on $column[$dataType]: Expected Range: $prettyBounds Actual: ${r(idx)} Relative Error: $pctError" 114 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, msg)) 115 | failed 116 | } 117 | 118 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 119 | val ret = copy( 120 | column = getVarSub(column, "column", dict), 121 | minValue = minValue.map(getVarSubJson(_, "minValue", dict)), 122 | maxValue = maxValue.map(getVarSubJson(_, "maxValue", dict)), 123 | inclusive = inclusive.map(getVarSubJson(_, "inclusive", dict)) 124 | ) 125 | this.getEvents.foreach(ret.addEvent) 126 | ret 127 | } 128 | 129 | override def configCheck(df: DataFrame): Boolean = { 130 | logger.debug(s"Full check config: ${this.toString}") 131 | Seq( 132 | minOrMax, 133 | lowerBound, 134 | upperBound, 135 | minLessThanMax, 136 | inclusiveBounds 137 | ).foreach { 138 | case Left(msg) => 139 | logger.error(msg) 140 | addEvent(ValidatorError(msg)) 141 | case _ => 142 | } 143 | 144 | findColumnInDataFrame(df, column) match { 145 | case Some(ft) if ft.dataType.isInstanceOf[NumericType] => 146 | case Some(ft) => 147 | val msg = s"Column: $column found, but not of numericType type: ${ft.dataType}" 148 | logger.error(msg) 149 | addEvent(ValidatorError(msg)) 150 | case None => 151 | val msg = s"Column: $column not found in schema" 152 | logger.error(msg) 153 | addEvent(ValidatorError(msg)) 154 | } 155 | failed 156 | } 157 | 158 | override def toJson: Json = { 159 | import JsonEncoders.eventEncoder 
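    // The semiauto-derived ColumnSumCheck.encoder (companion object below) serializes only the
    // configured fields (column, minValue, maxValue, inclusive); the report-only fields
    // (type, failed, events) are overlaid onto that JSON via deepMerge.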
160 | val additionalFieldsForReport = Json.fromFields( 161 | Set( 162 | "type" -> Json.fromString("columnSumCheck"), 163 | "failed" -> Json.fromBoolean(failed), 164 | "events" -> getEvents.asJson 165 | ) 166 | ) 167 | 168 | val base = ColumnSumCheck.encoder(this) 169 | base.deepMerge(additionalFieldsForReport) 170 | } 171 | } 172 | 173 | object ColumnSumCheck { 174 | val encoder: Encoder[ColumnSumCheck] = deriveEncoder[ColumnSumCheck] 175 | val decoder: Decoder[ColumnSumCheck] = deriveDecoder[ColumnSumCheck] 176 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = decoder.apply(c) 177 | } 178 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/JsonDecoders.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import cats.syntax.either._ 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe.{Decoder, DecodingFailure, HCursor} 6 | import io.circe.generic.auto._ 7 | 8 | object JsonDecoders extends LazyLogging { 9 | 10 | implicit val decodeChecks: Decoder[ValidatorBase] = new Decoder[ValidatorBase] { 11 | // FIXME: specifying this Function here instead of Decoder[ValidatorBase] is a smell that these checks 12 | // ought to have proper decoder objects instead of a method. 13 | // I.e., we're not using the Circe Decoder API as intended. 14 | private lazy val decoders = Map[String, HCursor => Either[DecodingFailure, ValidatorBase]]( 15 | "rowCount" -> { _.as[MinNumRows] }, 16 | "nullCheck" -> NullCheck.fromJson, 17 | "negativeCheck" -> NegativeCheck.fromJson, 18 | "columnMaxCheck" -> { _.as[ColumnMaxCheck] }, 19 | "rangeCheck" -> RangeCheck.fromJson, 20 | "uniqueCheck" -> UniqueCheck.fromJson, 21 | "stringLengthCheck" -> StringLengthCheck.fromJson, 22 | "stringRegexCheck" -> StringRegexCheck.fromJson, 23 | "columnSumCheck" -> ColumnSumCheck.fromJson, 24 | "colstats" -> implicitly[Decoder[ColStats]].apply // serdes defined implicitly on companion 25 | ) 26 | 27 | final def apply(c: HCursor): Decoder.Result[ValidatorBase] = c.downField("type").as[String].flatMap(getDecoder(c)) 28 | 29 | private def getDecoder(cursor: HCursor)(checkType: String) = { 30 | decoders 31 | .get(checkType) 32 | .map(_(cursor)) match { 33 | case Some(x) => x 34 | case None => 35 | logger.error(s"Unknown Check `$checkType` in config! 
Choose one of: ${decoders.keys.mkString(", ")}.") 36 | throw new RuntimeException(s"Unknown Check in config `$checkType`") 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/NegativeCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonEncoders.eventEncoder 5 | import com.target.data_validator.validator.ValidatorBase.I0 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions.{Expression, LessThan} 12 | import org.apache.spark.sql.types.{NumericType, StructType} 13 | 14 | case class NegativeCheck(column: String, threshold: Option[String]) extends RowBased { 15 | 16 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 17 | val ret = NegativeCheck(getVarSub(column, "column", dict), threshold.map(getVarSub(_, "threshold", dict))) 18 | getEvents.foreach(ret.addEvent) 19 | ret 20 | } 21 | 22 | override def configCheck(df: DataFrame): Boolean = { 23 | findColumnInDataFrame(df, column) match { 24 | case Some(ft) if ft.dataType.isInstanceOf[NumericType] => Unit 25 | case Some(ft) => 26 | val msg = s"Column: $column found, but not of numericType type: ${ft.dataType}" 27 | logger.error(msg) 28 | addEvent(ValidatorError(msg)) 29 | case None => 30 | val msg = s"Column: $column not found in schema." 
31 | logger.error(msg) 32 | addEvent(ValidatorError(msg)) 33 | } 34 | configCheckThreshold 35 | failed 36 | } 37 | 38 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = 39 | LessThan(UnresolvedAttribute(column), I0) 40 | 41 | override def toJson: Json = Json.obj( 42 | ("type", Json.fromString("negativeCheck")), 43 | ("column", Json.fromString(column)), 44 | ("threshold", Json.fromString(threshold.getOrElse("0"))), 45 | ("failed", Json.fromBoolean(failed)), 46 | ("events", this.getEvents.asJson) 47 | ) 48 | } 49 | 50 | object NegativeCheck extends LazyLogging { 51 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 52 | val column = c.downField("column").as[String].right.get 53 | val threshold = c.downField("threshold").as[String].right.toOption 54 | 55 | logger.debug(s"Parsing NegativeCheck(column:$column, threshold:$threshold) config.") 56 | scala.util.Right(NegativeCheck(column, threshold)) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/NullCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.JsonEncoders.eventEncoder 4 | import com.target.data_validator.VarSubstitution 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{DecodingFailure, HCursor, Json} 7 | import io.circe.syntax._ 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.{Expression, IsNull} 10 | import org.apache.spark.sql.types.StructType 11 | 12 | case class NullCheck(column: String, threshold: Option[String]) extends RowBased { 13 | 14 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 15 | val ret = NullCheck(getVarSub(column, "column", dict), threshold.map(getVarSub(_, "threshold", dict))) 16 | getEvents.foreach(ret.addEvent) 17 | ret 18 | } 19 | 20 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = IsNull(UnresolvedAttribute(column)) 21 | 22 | override def toJson: Json = Json.obj( 23 | ("type", Json.fromString("nullCheck")), 24 | ("column", Json.fromString(column)), 25 | ("failed", Json.fromBoolean(failed)), 26 | ("events", this.getEvents.asJson) 27 | ) 28 | } 29 | 30 | object NullCheck extends LazyLogging { 31 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 32 | val column = c.downField("column").as[String].right.get 33 | val threshold = c.downField("threshold").as[String].right.toOption 34 | 35 | logger.debug(s"Parsing NullCheck(column:$column, threshold:$threshold) config.") 36 | scala.util.Right(NullCheck(column, threshold)) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/RangeCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonUtils.debugJson 5 | import com.target.data_validator.validator.ValidatorBase._ 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | 
import org.apache.spark.sql.catalyst.expressions._ 12 | import org.apache.spark.sql.types.{DataType, StructType} 13 | 14 | case class RangeCheck( 15 | column: String, 16 | minValue: Option[Json], 17 | maxValue: Option[Json], 18 | inclusive: Option[Json], 19 | threshold: Option[String] 20 | ) extends RowBased { 21 | 22 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 23 | val ret = RangeCheck( 24 | getVarSub(column, "column", dict), 25 | minValue.map(getVarSubJson(_, "minValue", dict)), 26 | maxValue.map(getVarSubJson(_, "maxValue", dict)), 27 | inclusive.map(getVarSubJson(_, "inclusive", dict)), 28 | threshold.map(getVarSub(_, "threshold", dict)) 29 | ) 30 | getEvents.foreach(ret.addEvent) 31 | ret 32 | } 33 | 34 | private def cmpExpr( 35 | colExpr: Expression, 36 | value: Option[Json], 37 | colType: DataType, 38 | cmp: (Expression, Expression) => Expression 39 | ): Option[Expression] = { 40 | value.map { v => cmp(colExpr, createLiteralOrUnresolvedAttribute(colType, v)) } 41 | } 42 | 43 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 44 | val colType = schema(column).dataType 45 | val colExp = UnresolvedAttribute(column) 46 | val (minCmpExp, maxCmpExp) = if (inclusive.flatMap(_.asBoolean).getOrElse(false)) { 47 | (LessThan, GreaterThan) 48 | } else { 49 | (LessThanOrEqual, GreaterThanOrEqual) 50 | } 51 | 52 | val minValueExpression = cmpExpr(colExp, minValue, colType, minCmpExp) 53 | val maxValueExpression = cmpExpr(colExp, maxValue, colType, maxCmpExp) 54 | 55 | val ret = (minValueExpression, maxValueExpression) match { 56 | case (Some(x), None) => x 57 | case (None, Some(y)) => y 58 | case (Some(x), Some(y)) => Or(x, y) 59 | case _ => throw new RuntimeException("Must define min or max value.") 60 | } 61 | logger.debug(s"Expr: $ret") 62 | ret 63 | } 64 | 65 | private def checkMinLessThanMax(values: List[Json]): Unit = { 66 | 67 | if (values.forall(_.isNumber)) { 68 | values.flatMap(_.asNumber) match { 69 | case mv :: xv :: Nil if mv.toDouble >= xv.toDouble => 70 | addEvent(ValidatorError(s"Min: ${minValue.get} must be less than max: ${maxValue.get}")) 71 | case _ => 72 | } 73 | } else if (values.forall(_.isString)) { 74 | values.flatMap(_.asString) match { 75 | case mv :: xv :: Nil if mv == xv => 76 | addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) 77 | case _ => 78 | } 79 | } else { 80 | // Not Strings or Numbers 81 | addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) 82 | } 83 | } 84 | 85 | override def configCheck(df: DataFrame): Boolean = { 86 | 87 | val values = (minValue :: maxValue :: Nil).flatten 88 | if (values.isEmpty) { 89 | addEvent(ValidatorError("Must defined minValue or maxValue or both.")) 90 | } 91 | 92 | checkMinLessThanMax(values) 93 | 94 | val colType = findColumnInDataFrame(df, column) 95 | if (colType.isDefined) { 96 | val dataType = colType.get.dataType 97 | 98 | if (values.map(c => checkValue(df.schema, column, dataType, c)).exists(x => x)) { 99 | addEvent(ValidatorError(s"Range constraint types not compatible with column[$dataType]:'$column'")) 100 | } 101 | } 102 | 103 | if (inclusive.isDefined && inclusive.get.asBoolean.isEmpty) { 104 | logger.error(s"Inclusive defined but not Bool, $inclusive") 105 | addEvent(ValidatorError(s"Inclusive flag is defined, but is not a boolean, inclusive: ${inclusive.get}")) 106 | } 107 | 108 | failed 109 | } 110 | 111 | override def toJson: Json = { 112 | import JsonEncoders.eventEncoder 113 | val 
fields = Seq( 114 | ("type", Json.fromString("rangeCheck")), 115 | ("column", Json.fromString(column)) 116 | ) ++ 117 | minValue.map(mv => ("minValue", mv)) ++ 118 | maxValue.map(mv => ("maxValue", mv)) ++ 119 | Seq( 120 | ("inclusive", Json.fromBoolean(inclusive.flatMap(_.asBoolean).getOrElse(false))), 121 | ("events", getEvents.asJson) 122 | ) 123 | Json.obj(fields: _*) 124 | } 125 | } 126 | 127 | object RangeCheck extends LazyLogging { 128 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 129 | val column = c.downField("column").as[String].right.get 130 | val minValueJ = c.downField("minValue").as[Json].right.toOption 131 | val maxValueJ = c.downField("maxValue").as[Json].right.toOption 132 | val inclusiveJ = c.downField("inclusive").as[Json].right.toOption 133 | val threshold = c.downField("threshold").as[String].right.toOption 134 | 135 | logger.debug(s"column: $column") 136 | logger.debug(s"minValue: $minValueJ type: ${minValueJ.getClass.getCanonicalName}") 137 | logger.debug(s"maxValue: $maxValueJ type: ${maxValueJ.getClass.getCanonicalName}") 138 | logger.debug(s"inclusive: $inclusiveJ type: ${inclusiveJ.getClass.getCanonicalName}") 139 | logger.debug(s"threshold: $threshold") 140 | 141 | c.focus.foreach { f => logger.debug(s"RangeCheckJson: ${f.spaces2}") } 142 | scala.util.Right(RangeCheck(column, minValueJ, maxValueJ, inclusiveJ, threshold)) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/RowBased.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import com.target.data_validator.validator.ValidatorBase.{isColumnInDataFrame, L0, L1} 5 | import org.apache.spark.sql.{DataFrame, Row} 6 | import org.apache.spark.sql.catalyst.expressions._ 7 | import org.apache.spark.sql.types.StructType 8 | 9 | import scala.util.matching.Regex 10 | 11 | abstract class RowBased extends CheapCheck { 12 | 13 | val column: String 14 | val threshold: Option[String] 15 | 16 | def configCheck(df: DataFrame): Boolean = { 17 | configCheckColumn(df) 18 | configCheckThreshold 19 | failed 20 | } 21 | 22 | def configCheckColumn(df: DataFrame): Boolean = { 23 | if (isColumnInDataFrame(df, column)) { 24 | logger.debug(s"Column: $column found in table.") 25 | false 26 | } else { 27 | val msg = s"Column: $column not found in schema." 28 | logger.error(msg) 29 | addEvent(ValidatorError(msg)) 30 | failed 31 | } 32 | } 33 | 34 | def configCheckThreshold: Boolean = { 35 | if (threshold.isDefined) { 36 | val ret = threshold.flatMap(RowBased.THRESHOLD_NUMBER_REGEX.findFirstIn).isEmpty 37 | if (ret) { 38 | val msg = s"Threshold `${threshold.get}` not parsable." 39 | logger.error(msg) 40 | addEvent(ValidatorError(msg)) 41 | } 42 | ret 43 | } else { 44 | false 45 | } 46 | } 47 | 48 | def colTest(schema: StructType, dict: VarSubstitution): Expression 49 | 50 | def select(schema: StructType, dict: VarSubstitution): Expression = If(colTest(schema, dict), L1, L0) 51 | 52 | /** Calculates the max acceptable number of errors from threshold and rowCount. 53 | * @param rowCount 54 | * of table. 55 | * @return 56 | * max number of errors we can tolerate. if threshold < 1, then its a percentage of rowCount. if threshold ends 57 | * with '%' then its percentage of rowCount if threshold is > 1, then its maxErrors. 
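    * For example (illustrative values): with rowCount = 1000, a threshold of "5%" allows up to 50
    * errors, "0.1" allows up to 100, and "25" allows up to 25.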
58 | */ 59 | def calcErrorCountThreshold(rowCount: Long): Long = { 60 | threshold 61 | .map { t => 62 | val tempThreshold = t.stripSuffix("%").toDouble 63 | val ret: Long = if (t.endsWith("%")) { 64 | // Has '%', so divide by 100.0 65 | (tempThreshold * (rowCount / 100.0)).toLong 66 | } else if (tempThreshold < 1.0) { 67 | // Percentage without the '%' 68 | (tempThreshold * rowCount).toLong 69 | } else { 70 | // Number of rows 71 | tempThreshold.toLong 72 | } 73 | logger.info(s"Threshold:${threshold.get} tempThreshold:$tempThreshold ret:$ret") 74 | ret 75 | } 76 | .getOrElse(0) 77 | } 78 | 79 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 80 | logger.debug(s"quickCheck $column Row: $row count: $count idx: $idx") 81 | if (count > 0) { 82 | val errorCount = row.getLong(idx) 83 | val errorCountThreshold = calcErrorCountThreshold(count) 84 | 85 | addEvent(ValidatorCounter("rowCount", count)) 86 | addEvent(ValidatorCounter("errorCount", errorCount)) 87 | if (errorCountThreshold > 0) { 88 | addEvent(ValidatorCounter("errorCountThreshold", errorCountThreshold)) 89 | } 90 | 91 | val failure = errorCount > errorCountThreshold 92 | if (failure) { 93 | logger.error( 94 | s"Quick check for $name on $column failed, $errorCount errors in $count rows" 95 | + s" errorCountThreshold: $errorCountThreshold" 96 | ) 97 | } 98 | addEvent(ValidatorCheckEvent(failure, s"$name on column '$column'", count, errorCount)) 99 | } else { 100 | logger.warn(s"No Rows to check for $toString!") 101 | } 102 | failed 103 | } 104 | 105 | def quickCheckDetail(row: Row, key: Seq[(String, Any)], idx: Int, dict: VarSubstitution): Unit = { 106 | val r = row.get(idx) 107 | val column = row.schema.fieldNames(idx) 108 | addEvent( 109 | ValidatorQuickCheckError(key.toList, r, name + s" failed! 
$column = $r and ${colTest(row.schema, dict)}") 110 | ) 111 | } 112 | } 113 | 114 | object RowBased { 115 | val THRESHOLD_NUMBER_REGEX: Regex = "^([0-9]+\\.*[0-9]*)\\s*%{0,1}$".r // scalastyle:ignore 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonUtils.debugJson 5 | import com.target.data_validator.validator.ValidatorBase._ 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions._ 12 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 13 | 14 | case class StringLengthCheck( 15 | column: String, 16 | minLength: Option[Json], 17 | maxLength: Option[Json], 18 | threshold: Option[String] 19 | ) extends RowBased { 20 | 21 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 22 | 23 | val ret = StringLengthCheck( 24 | getVarSub(column, "column", dict), 25 | minLength.map(getVarSubJson(_, "minLength", dict)), 26 | maxLength.map(getVarSubJson(_, "maxLength", dict)), 27 | threshold.map(getVarSub(_, "threshold", dict)) 28 | ) 29 | getEvents.foreach(ret.addEvent) 30 | ret 31 | } 32 | 33 | private def cmpExpr( 34 | colExpr: Expression, 35 | value: Option[Json], 36 | cmp: (Expression, Expression) => Expression 37 | ): Option[Expression] = { 38 | value.map { v => cmp(colExpr, createLiteralOrUnresolvedAttribute(IntegerType, v)) } 39 | } 40 | 41 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 42 | 43 | val colExp = Length(UnresolvedAttribute(column)) 44 | 45 | val minLengthExpression = cmpExpr(colExp, minLength, LessThan) 46 | val maxLengthExpression = cmpExpr(colExp, maxLength, GreaterThan) 47 | 48 | val ret = (minLengthExpression, maxLengthExpression) match { 49 | case (Some(x), None) => x 50 | case (None, Some(y)) => y 51 | case (Some(x), Some(y)) => Or(x, y) 52 | case _ => throw new RuntimeException("Must define min or max value.") 53 | } 54 | logger.debug(s"Expr: $ret") 55 | ret 56 | } 57 | 58 | private def checkMinLessThanOrEqualToMax(values: List[Json]): Unit = { 59 | 60 | if (values.forall(_.isNumber)) { 61 | values.flatMap(_.asNumber) match { 62 | case mv :: xv :: Nil if mv.toDouble > xv.toDouble => 63 | addEvent(ValidatorError(s"min: ${minLength.get} must be less than or equal to max: ${maxLength.get}")) 64 | case _ => 65 | } 66 | } else if (values.forall(_.isString)) { 67 | values.flatMap(_.asString) match { 68 | case mv :: xv :: Nil if mv == xv => 69 | addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) 70 | case _ => 71 | } 72 | } else { 73 | // Not Strings or Numbers 74 | addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) 75 | } 76 | } 77 | 78 | override def configCheck(df: DataFrame): Boolean = { 79 | 80 | // Verify if at least one of min or max is specified. 
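    // An illustrative YAML stanza for this check (the column name is hypothetical):
    //   - type: stringLengthCheck
    //     column: name
    //     minLength: 1
    //     maxLength: 64
    // At least one of minLength/maxLength is required, and minLength must not exceed maxLength.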
81 | val values = (minLength :: maxLength :: Nil).flatten 82 | if (values.isEmpty) { 83 | addEvent(ValidatorError("Must define minLength or maxLength or both.")) 84 | } 85 | 86 | // Verify that min is less than max 87 | checkMinLessThanOrEqualToMax(values) 88 | 89 | // Verify that the data type of the specified column is a String. 90 | val colType = findColumnInDataFrame(df, column) 91 | if (colType.isDefined) { 92 | val dataType = colType.get.dataType 93 | if (!dataType.isInstanceOf[StringType]) { 94 | addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) 95 | } 96 | } 97 | 98 | failed 99 | } 100 | 101 | override def toJson: Json = { 102 | import JsonEncoders.eventEncoder 103 | val fields = Seq( 104 | ("type", Json.fromString("stringLengthCheck")), 105 | ("column", Json.fromString(column)) 106 | ) ++ 107 | minLength.map(mv => ("minLength", mv)) ++ 108 | maxLength.map(mv => ("maxLength", mv)) ++ 109 | Seq( 110 | ("events", getEvents.asJson) 111 | ) 112 | Json.obj(fields: _*) 113 | } 114 | } 115 | 116 | object StringLengthCheck extends LazyLogging { 117 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 118 | val column = c.downField("column").as[String].right.get 119 | val minLengthJ = c.downField("minLength").as[Json].right.toOption 120 | val maxLengthJ = c.downField("maxLength").as[Json].right.toOption 121 | val threshold = c.downField("threshold").as[String].right.toOption 122 | 123 | logger.debug(s"column: $column") 124 | logger.debug(s"minLength: $minLengthJ type: ${minLengthJ.getClass.getCanonicalName}") 125 | logger.debug(s"maxLength: $maxLengthJ type: ${maxLengthJ.getClass.getCanonicalName}") 126 | logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") 127 | scala.util.Right(StringLengthCheck(column, minLengthJ, maxLengthJ, threshold)) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.validator.ValidatorBase._ 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{DecodingFailure, HCursor, Json} 7 | import io.circe.syntax._ 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 10 | import org.apache.spark.sql.catalyst.expressions._ 11 | import org.apache.spark.sql.types.{StringType, StructType} 12 | 13 | case class StringRegexCheck( 14 | column: String, 15 | regex: Option[Json], 16 | threshold: Option[String] 17 | ) extends RowBased { 18 | 19 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 20 | 21 | val ret = StringRegexCheck( 22 | getVarSub(column, "column", dict), 23 | regex.map(getVarSubJson(_, "regex", dict)), 24 | threshold.map(getVarSub(_, "threshold", dict)) 25 | ) 26 | getEvents.foreach(ret.addEvent) 27 | ret 28 | } 29 | 30 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 31 | 32 | val colExp = UnresolvedAttribute(column) 33 | 34 | val regexExpression = regex.map { r => RLike(colExp, createLiteralOrUnresolvedAttribute(StringType, r)) } 35 | 36 | val ret = regexExpression match { 37 | /* 38 | RLike returns false if the column value is null. 
39 | To avoid counting null values as validation failures (like other validations), 40 | an explicit non null check on the column value is required. 41 | */ 42 | case Some(x) => And(Not(x), IsNotNull(colExp)) 43 | case _ => throw new RuntimeException("Must define a regex.") 44 | } 45 | logger.debug(s"Expr: $ret") 46 | ret 47 | } 48 | 49 | override def configCheck(df: DataFrame): Boolean = { 50 | 51 | // Verify if regex is specified. 52 | val values = (regex :: Nil).flatten 53 | if (values.isEmpty) { 54 | addEvent(ValidatorError("Must define a regex.")) 55 | } 56 | 57 | // Verify that the data type of the specified column is a String. 58 | val colType = findColumnInDataFrame(df, column) 59 | if (colType.isDefined) { 60 | val dataType = colType.get.dataType 61 | if (!dataType.isInstanceOf[StringType]) { 62 | addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) 63 | } 64 | } 65 | 66 | failed 67 | } 68 | 69 | override def toJson: Json = { 70 | import JsonEncoders.eventEncoder 71 | val fields = Seq( 72 | ("type", Json.fromString("stringRegexCheck")), 73 | ("column", Json.fromString(column)) 74 | ) ++ 75 | regex.map(r => ("regex", r)) ++ 76 | Seq( 77 | ("events", getEvents.asJson) 78 | ) 79 | Json.obj(fields: _*) 80 | } 81 | } 82 | 83 | object StringRegexCheck extends LazyLogging { 84 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 85 | val column = c.downField("column").as[String].right.get 86 | val regex = c.downField("regex").as[Json].right.toOption 87 | val threshold = c.downField("threshold").as[String].right.toOption 88 | 89 | logger.debug(s"column: $column") 90 | logger.debug(s"regex: $regex type: ${regex.getClass.getCanonicalName}") 91 | logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") 92 | scala.util.Right(StringRegexCheck(column, regex, threshold)) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/TwoPassCheapCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import org.apache.spark.sql._ 4 | 5 | /** extension of [[CheapCheck]] with an assumption that DV will: 6 | * - complete a pre-pass stage that generates an intermediary aggregate 7 | * - provide that intermediary aggregate so it can be used in generating the final check expression 8 | */ 9 | abstract class TwoPassCheapCheck extends CheapCheck { 10 | 11 | def hasQuickErrorDetails: Boolean = false 12 | 13 | /** defined by implementor, should generate one row of aggregated output that can then be handled by 14 | * [[sinkFirstPassRow]] 15 | */ 16 | def firstPassSelect(): Column 17 | 18 | /** defined by implementor, notify the cheap check of the result of the first pass projection 19 | * 20 | * NOTE: the contract for this check type assumes you call this method BEFORE [[CheapCheck.select]] 21 | */ 22 | def sinkFirstPassRow(row: Row): Unit 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/UniqueCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ValidatorError, ValidatorGood, ValidatorTimer, VarSubstitution} 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe.{DecodingFailure, HCursor, 
Json} 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{Column, DataFrame} 8 | 9 | case class UniqueCheck(columns: Seq[String]) extends CostlyCheck { 10 | 11 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 12 | val newColumns = columns.map(getVarSub(_, "columns", dict)) 13 | val ret = UniqueCheck(newColumns) 14 | this.getEvents.foreach(ret.addEvent) 15 | ret 16 | } 17 | 18 | override def configCheck(df: DataFrame): Boolean = { 19 | columns.exists(findColumnInDataFrame(df, _).isEmpty) 20 | } 21 | 22 | override def toJson: Json = { 23 | import com.target.data_validator.JsonEncoders.eventEncoder 24 | val fields = Seq( 25 | ("type", Json.fromString("uniqueCheck")), 26 | ("columns", Json.fromValues(columns.map(Json.fromString))), 27 | ("failed", Json.fromBoolean(failed)), 28 | ("events", this.getEvents.asJson) 29 | ) 30 | 31 | Json.fromFields(fields) 32 | } 33 | 34 | override def costlyCheck(df: DataFrame): Boolean = { 35 | val cols = columns.map(new Column(_)) 36 | val timer = new ValidatorTimer(s"UniqueCheck($columns)") 37 | addEvent(timer) 38 | // Note: this computes the count of the number of distinct keys (if you will) that have at least one duplicated row. 39 | // It's not number of duplicated rows. 40 | val ret = timer.time(df.select(cols: _*).groupBy(cols: _*).count().where("count > 1").count()) 41 | logger.info(s"costlyCheck: cols:$cols ret:$ret") 42 | if (ret > 0) { 43 | addEvent(ValidatorError(s"$ret duplicates found!")) 44 | } else { 45 | addEvent(ValidatorGood("no duplicates found.")) 46 | } 47 | 48 | failed 49 | } 50 | } 51 | 52 | object UniqueCheck extends LazyLogging { 53 | 54 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 55 | val columns = c.downField("columns").as[Seq[String]] 56 | columns.right.map(UniqueCheck(_)) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/test/resources/format_test.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "Mal", "age": 49} 2 | {"name": "Zoe", "age": 33} 3 | {"name": "Wash", "age": 39} 4 | {"name": "Jayne", "age": 43} 5 | {"name": "Kaylee", "age": 26} 6 | {"name": "Simon", "age": 27} 7 | {"name": "River", "age": 18} 8 | {"name": "Inara", "age": 25} 9 | {"name": "Book", "age": 68} 10 | -------------------------------------------------------------------------------- /src/test/resources/test_config.yaml: -------------------------------------------------------------------------------- 1 | numKeyCols: 2 2 | numErrorsToReport: 742 3 | email: 4 | smtpHost: smtpHost 5 | subject: subject 6 | from: from 7 | to: 8 | - to 9 | detailedErrors: true 10 | vars: 11 | - name: foo 12 | value: bar 13 | 14 | outputs: 15 | - filename: /user/home/sample.json 16 | 17 | - pipe: /apps/dv2kafka.py 18 | ignoreError: true 19 | tables: 20 | - db: foo 21 | table: bar 22 | keyColumns: 23 | - one 24 | - two 25 | checks: 26 | - type: rowCount 27 | minNumRows: 10294 28 | - type: nullCheck 29 | column: mdse_item_i 30 | - orcFile: LocalFile.orc 31 | condition: "foo < 10" 32 | checks: 33 | - type: nullCheck 34 | column: start_d 35 | - parquetFile: LocFile.parquet 36 | condition: "bar < 10" 37 | checks: 38 | - type: nullCheck 39 | column: end_d 40 | - format: llama 41 | checks: 42 | - type: nullCheck 43 | column: start_d 44 | options: 45 | key: value 46 | loadData: 47 | - data.llama -------------------------------------------------------------------------------- 
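Note: the checks implemented under src/main/scala/com/target/data_validator/validator/ are configured
as YAML stanzas like the ones in test_config.yaml above. The fragment below is an illustrative sketch
(table and column names are hypothetical); the field names follow each check's fromJson decoder and
the type names registered in JsonDecoders.scala.

      checks:
        - type: rangeCheck
          column: price
          minValue: 0
          maxValue: 100
          inclusive: true
        - type: stringRegexCheck
          column: email
          regex: "^.+@.+$"
        - type: columnSumCheck
          column: quantity
          minValue: 1
        - type: uniqueCheck
          columns:
            - item
            - event_d
        - type: negativeCheck
          column: price
          threshold: "5%"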
/src/test/scala/com/target/TestingSparkSession.scala: -------------------------------------------------------------------------------- 1 | package com.target 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.PropertyConfigurator 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.sql.{SparkSession, SQLContext} 8 | import org.scalatest._ 9 | 10 | trait TestingSparkSession extends BeforeAndAfterAll { self: Suite => 11 | 12 | lazy val spark: SparkSession = TestingSparkSession.sparkSingleton 13 | lazy val sc: SparkContext = spark.sparkContext 14 | lazy val sqlContext: SQLContext = spark.sqlContext 15 | 16 | } 17 | 18 | object TestingSparkSession { 19 | 20 | /** config a log4j properties used for testsuite. Copied from org.apache.spark.utils.Util because it private. 21 | */ 22 | def configTestLog4j(levelOther: String, levelMe: String): Unit = { 23 | val pro = new Properties() 24 | pro.put("log4j.rootLogger", s"$levelOther, console") 25 | pro.put("log4j.appender.console", "org.apache.log4j.ConsoleAppender") 26 | pro.put("log4j.appender.console.target", "System.err") 27 | pro.put("log4j.appender.console.layout", "org.apache.log4j.PatternLayout") 28 | pro.put( 29 | "log4j.appender.console.layout.ConversionPattern", 30 | "%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n" 31 | ) // scalastyle:ignore regex 32 | pro.put(s"log4j.logger.${this.getClass.getPackage.getName}", levelMe) 33 | PropertyConfigurator.configure(pro) 34 | } 35 | 36 | lazy val sparkSingleton: SparkSession = { 37 | configTestLog4j("OFF", "OFF") 38 | SparkSession 39 | .builder() 40 | .config("spark.executor.memory", "512mb") 41 | .config("spark.ui.showConsoleProgress", value = false) 42 | .master("local[2]") 43 | .getOrCreate() 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/CliOptionParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class CliOptionParserSpec extends AnyFunSpec with Matchers { 7 | 8 | describe("CliOptionParser") { 9 | describe("parsing") { 10 | it("does not handle var option values with commas") { 11 | val args = Array("--vars", "keyA=value1,value2,keyB=value3") 12 | CliOptionParser.parser.parse(args, CliOptions()) should be(None) 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ConfigParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.validator.{MinNumRows, NullCheck} 5 | import io.circe.Json 6 | import org.scalatest.BeforeAndAfterAll 7 | import org.scalatest.funspec.AnyFunSpec 8 | 9 | class ConfigParserSpec extends AnyFunSpec with BeforeAndAfterAll { 10 | 11 | // Silence is golden! 
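  // (i.e. TestingSparkSession.configTestLog4j turns log4j output off for this suite)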
12 | override def beforeAll(): Unit = TestingSparkSession.configTestLog4j("OFF", "OFF") 13 | 14 | val expectedConfiguration = ValidatorConfig( 15 | 2, 16 | 742, // scalastyle:ignore magic.number 17 | Some(EmailConfig("smtpHost", "subject", "from", List("to"))), 18 | detailedErrors = true, 19 | Some(List(NameValue("foo", Json.fromString("bar")))), 20 | Some( 21 | List[ValidatorOutput]( 22 | FileOutput("/user/home/sample.json", None), 23 | PipeOutput("/apps/dv2kafka.py", Some(true)) 24 | ) 25 | ), 26 | List( 27 | ValidatorHiveTable( 28 | "foo", 29 | "bar", 30 | Some(List("one", "two")), 31 | None, 32 | List(MinNumRows(Json.fromInt(10294)), NullCheck("mdse_item_i", None)) // scalastyle:ignore magic.number 33 | ), 34 | ValidatorOrcFile("LocalFile.orc", None, Some("foo < 10"), List(NullCheck("start_d", None))), 35 | ValidatorParquetFile("LocFile.parquet", None, Some("bar < 10"), List(NullCheck("end_d", None))), 36 | ValidatorSpecifiedFormatLoader( 37 | format = "llama", 38 | keyColumns = None, 39 | condition = None, 40 | checks = List(NullCheck("start_d", None)), 41 | options = Some(Map("key" -> "value")), 42 | loadData = Some(List("data.llama")) 43 | ) 44 | ) 45 | ) 46 | 47 | describe("ConfigParser") { 48 | 49 | describe("parse") { 50 | 51 | it("should correctly parse simple yaml config") { 52 | val config = ConfigParser.parse(""" 53 | | numKeyCols: 2 54 | | numErrorsToReport: 742 55 | | email: 56 | | smtpHost: smtpHost 57 | | subject: subject 58 | | from: from 59 | | to: 60 | | - to 61 | | detailedErrors: true 62 | | vars: 63 | | - name: foo 64 | | value: bar 65 | | 66 | | outputs: 67 | | - filename: /user/home/sample.json 68 | | 69 | | - pipe: /apps/dv2kafka.py 70 | | ignoreError: true 71 | | tables: 72 | | - db: foo 73 | | table: bar 74 | | keyColumns: 75 | | - one 76 | | - two 77 | | checks: 78 | | - type: rowCount 79 | | minNumRows: 10294 80 | | - type: nullCheck 81 | | column: mdse_item_i 82 | | - orcFile: LocalFile.orc 83 | | condition: "foo < 10" 84 | | checks: 85 | | - type: nullCheck 86 | | column: start_d 87 | | - parquetFile: LocFile.parquet 88 | | condition: "bar < 10" 89 | | checks: 90 | | - type: nullCheck 91 | | column: end_d 92 | | - format: llama 93 | | checks: 94 | | - type: nullCheck 95 | | column: start_d 96 | | options: 97 | | key: value 98 | | loadData: 99 | | - data.llama 100 | """.stripMargin) 101 | 102 | assert(config == Right(expectedConfiguration)) 103 | } 104 | 105 | } 106 | 107 | describe("parseFile") { 108 | 109 | it("should support loading config files by path") { 110 | val output = ConfigParser.parseFile("src/test/resources/test_config.yaml", Map.empty) 111 | assert(output == Right(expectedConfiguration)) 112 | } 113 | 114 | it("should support classpath configuration loading with the prefix 'classpath:'") { 115 | val output = ConfigParser.parseFile("classpath:/test_config.yaml", Map.empty) 116 | assert(output == Right(expectedConfiguration)) 117 | } 118 | 119 | it("should not confuse classpath and non classpath file loading") { 120 | val paths = Seq("classpath:src/test/resources/test_config.yaml", "test_config.yaml") 121 | 122 | paths.foreach { path => 123 | val output = ConfigParser.parseFile(path, Map.empty) 124 | assert(output.isLeft) 125 | } 126 | } 127 | 128 | } 129 | 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ConfigVarSubSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 
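// Exercises $name and ${name} variable substitution across the ValidatorTable and check
// implementations; see VarSubstitution.scala for the substitution rules.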
2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.validator.{ColumnMaxCheck, MinNumRows, NegativeCheck, NullCheck} 5 | import io.circe.Json 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class ConfigVarSubSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | val baseMap: Map[String, String] = 12 | Map("one" -> "1", "two" -> "2", "three" -> "3", "four" -> "4", "five" -> "5", "six" -> "6") 13 | 14 | val dict: VarSubstitution = { 15 | val d = new VarSubstitution 16 | d.addMap(baseMap) 17 | d 18 | } 19 | 20 | describe("ConfigVariable Substitutions") { 21 | 22 | describe("ValidatorTable children") { 23 | 24 | it("ValidatorHiveTable var substitution should work") { 25 | val sut = ValidatorHiveTable( 26 | "database$one", 27 | "table$two", 28 | Some(List("Col$three", "Col$four")), 29 | Some("$five == $six"), 30 | List.empty 31 | ) 32 | assert( 33 | sut.substituteVariables(dict) == 34 | ValidatorHiveTable("database1", "table2", Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 35 | ) 36 | } 37 | 38 | it("Validator OrcFile substitution should work") { 39 | val sut = ValidatorOrcFile( 40 | "/$one/$two/orcFile", 41 | Some(List("Col$three", "Col$four")), 42 | Some("$five == $six"), 43 | List.empty 44 | ) 45 | assert( 46 | sut.substituteVariables(dict) == 47 | ValidatorOrcFile("/1/2/orcFile", Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 48 | ) 49 | } 50 | 51 | it("ValidatorDataFrame substitution should work") { 52 | val df = spark.emptyDataFrame 53 | val sut = ValidatorDataFrame(df, Some(List("Col$three", "Col$four")), Some("$five == $six"), List.empty) 54 | assert( 55 | sut.substituteVariables(dict) == 56 | ValidatorDataFrame(df, Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 57 | ) 58 | } 59 | 60 | } 61 | 62 | describe("ValidatorBase children") { 63 | 64 | describe("ColumnBased children") { 65 | 66 | describe("MinNumRows") { 67 | 68 | it("should substitute variables properly") { 69 | val sut = MinNumRows(Json.fromString("$one")) 70 | assert(sut.substituteVariables(dict) == MinNumRows(Json.fromInt(1))) 71 | } 72 | 73 | } 74 | 75 | describe("ColumnMaxCheck") { 76 | 77 | it("should substitute variables properly") { 78 | val sut = ColumnMaxCheck("Col$six", Json.fromString("$five")) 79 | val newColMaxCheck = sut.substituteVariables(dict).asInstanceOf[ColumnMaxCheck] 80 | assert(newColMaxCheck.column == "Col6") 81 | assert(newColMaxCheck.value == Json.fromInt(5)) // scalastyle:ignore 82 | assert(sut.substituteVariables(dict) == ColumnMaxCheck("Col6", Json.fromInt(5))) // scalastyle:ignore 83 | assert(!sut.failed) 84 | } 85 | 86 | it("should fail on bad variables") { 87 | val check = ColumnMaxCheck("Col$six", Json.fromString("$fivefour")) 88 | val sut = check.substituteVariables(dict) 89 | assert(sut.failed) 90 | } 91 | 92 | } 93 | 94 | } 95 | 96 | describe("RowBased children") { 97 | 98 | describe("NegativeCheck") { 99 | 100 | it("NegativeCheck") { 101 | val sut = NegativeCheck("Col$four", None) 102 | assert(sut.substituteVariables(dict) == NegativeCheck("Col4", None)) 103 | } 104 | 105 | it("NegativeCheck bad variable substitution should fail") { 106 | val check = NegativeCheck("Col$fourfour", None) 107 | val sut = check.substituteVariables(dict) 108 | assert(sut.failed) 109 | } 110 | 111 | } 112 | 113 | describe("NullCheck") { 114 | 115 | it("should substitute variables properly") { 116 | val sut = NullCheck("Col${one}", None) 117 | assert(sut.substituteVariables(dict) == 
NullCheck("Col1", None)) 118 | } 119 | 120 | it("bad variable substitution should fail") { 121 | val check = NullCheck("Col${unknown}", None) 122 | val sut = check.substituteVariables(dict) 123 | assert(sut.failed) 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | } 131 | 132 | } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/EnvironmentVariablesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.data_validator.EnvironmentVariables.{Inaccessible, Present, Unset} 4 | import org.scalatest.matchers.should.Matchers 5 | import org.scalatest.wordspec.AnyWordSpec 6 | 7 | class EnvironmentVariablesSpec extends AnyWordSpec with Matchers { 8 | 9 | "EnvironmentVariables" should { 10 | "get envvars" when { 11 | "an envvar exists" in { 12 | EnvironmentVariables.get("HOME") should be(Present(System.getenv("HOME"))) 13 | } 14 | "an envvar doesn't exist" in { 15 | EnvironmentVariables.get("NOPE") should be(Unset) 16 | } 17 | "an envvar isn't an envvar" in { 18 | EnvironmentVariables.get(null) shouldBe a[Inaccessible] // scalastyle:ignore 19 | } 20 | } 21 | "log envvars" when { 22 | "using get" in { 23 | EnvironmentVariables.get("HOME") 24 | EnvironmentVariables.accessedEnvVars.keySet should contain("HOME") 25 | } 26 | "using tryGet" in { 27 | EnvironmentVariables.tryGet("HOME") 28 | EnvironmentVariables.accessedEnvVars.keySet should contain("HOME") 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ExpressionUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import ExpressionUtils.orFromList 4 | import com.target.data_validator.validator.ValidatorBase._ 5 | import org.apache.spark.sql.catalyst.expressions.{GreaterThan, Or} 6 | import org.scalatest._ 7 | import org.scalatest.funspec.AnyFunSpec 8 | import org.scalatest.matchers.should.Matchers 9 | 10 | class ExpressionUtilsSpec extends AnyFunSpec with Matchers { 11 | 12 | describe("ExpressionUtils") { 13 | 14 | describe("orFromList()") { 15 | 16 | val expr1 = GreaterThan(L0, L1) 17 | 18 | it("Simpler case 1 expression") { 19 | assert(orFromList(expr1 :: Nil) == expr1) 20 | } 21 | 22 | it("Standard case 2 expressions") { 23 | assert(orFromList(expr1 :: expr1 :: Nil) == Or(expr1, expr1)) 24 | } 25 | 26 | it("More then 2 case") { 27 | assert(orFromList(expr1 :: expr1 :: expr1 :: Nil) == Or(expr1, Or(expr1, expr1))) 28 | } 29 | 30 | it("Failure case, empty list.") { 31 | assertThrows[java.lang.IllegalArgumentException](orFromList(Nil)) 32 | } 33 | 34 | } 35 | 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/IOSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.nio.file.Files 4 | 5 | import com.target.TestingSparkSession 6 | import io.circe.{parser, Json} 7 | import scala.io.Source.fromFile 8 | import scala.util.Random._ 9 | import scalatags.Text.all._ 10 | import org.scalatest.funspec.AnyFunSpec 11 | import org.scalatest.matchers.should.Matchers 12 | 13 | class IOSpec extends AnyFunSpec with Matchers with TestingSparkSession { 14 | val SAMPLE_HTML = html(h1("H1"), "Sample HTML 
Doc") 15 | val SAMPLE_JSON: Json = parser 16 | .parse("""{ 17 | | "one" : 1, 18 | | "two" : [ 1,2 ], 19 | | "three": { "a":1.0,"b":2.0,"c":3.0 } 20 | |} 21 | """.stripMargin) 22 | .right 23 | .getOrElse(Json.Null) 24 | 25 | def createRandomTempFilename: String = { 26 | val useDefault = null 27 | val file = Files.createTempFile("data-validator_temp", useDefault) 28 | file.toAbsolutePath.toString 29 | } 30 | 31 | def rm(filename: String): Boolean = { 32 | val f = new java.io.File(filename) 33 | f.delete() 34 | } 35 | 36 | describe("Local Disk") { 37 | 38 | it("should write HTML") { 39 | val filename = createRandomTempFilename 40 | assert(!IO.writeHTML(filename, SAMPLE_HTML)(spark)) 41 | fromFile(filename).mkString should be(SAMPLE_HTML.render + "\n") 42 | assert(rm(filename)) 43 | } 44 | 45 | it("should write JSON") { 46 | val filename = createRandomTempFilename 47 | assert(!IO.writeJSON(filename, SAMPLE_JSON)(spark)) 48 | fromFile(filename).mkString should be(SAMPLE_JSON.noSpaces + "\n") 49 | assert(rm(filename)) 50 | } 51 | 52 | it("file:/// should be able to write") { 53 | val baseFilename = createRandomTempFilename 54 | val filename = "file://" + baseFilename 55 | val data = List.fill(128)(nextPrintableChar).mkString + IO.NEW_LINE // scalastyle:ignore 56 | assert(!IO.writeString(filename, data)(spark)) 57 | fromFile(baseFilename).mkString should be(data) 58 | assert(rm(baseFilename)) 59 | } 60 | 61 | describe("canAppendOrWrite") { 62 | 63 | it("returns false when it should") { 64 | val badFilename = "/dir/that/does/not/exist/junk.txt" 65 | assert(!IO.canAppendOrCreate(badFilename, append = false)(spark)) 66 | } 67 | 68 | it("returns true when it should") { 69 | assert(IO.canAppendOrCreate(createRandomTempFilename, append = false)(spark)) 70 | } 71 | 72 | } 73 | 74 | describe("canExecute") { 75 | 76 | it("returns true for local executable") { 77 | assert(IO.canExecute("/usr/bin/wc")(spark)) 78 | } 79 | 80 | it("returns false for hdfs file") { 81 | assert(!IO.canExecute(IO.HDFS_SCHEMA_PREFIX + "foo.bar")(spark)) 82 | } 83 | 84 | it("returns false for local non-executable") { 85 | assert(!IO.canExecute("/etc/passwd")(spark)) 86 | } 87 | 88 | it("returns false for non-existent file") { 89 | assert(!IO.canExecute(createRandomTempFilename)(spark)) 90 | } 91 | 92 | } 93 | 94 | } 95 | 96 | describe("writeStringToPipe") { 97 | 98 | it("Fails for bad path") { 99 | val (fail, out, _) = IO.writeStringToPipe("/bad/path", nextString(200)) // scalastyle:ignore 100 | assert(fail) 101 | assert(out.isEmpty) 102 | } 103 | 104 | it("Works for wc and captures stdout") { 105 | val str = nextString(200) // scalastyle:ignore 106 | val (fail, out, err) = IO.writeStringToPipe("/usr/bin/wc -c", str) 107 | assert(!fail) 108 | assert(out.length == 1) 109 | assert(out.head.dropWhile(_.isWhitespace).toInt == str.getBytes.length) 110 | assert(err.isEmpty) 111 | } 112 | 113 | it("works when program fails") { 114 | val (fail, out, err) = IO.writeStringToPipe("false", "") 115 | assert(fail) 116 | assert(out.isEmpty) 117 | assert(err.isEmpty) 118 | } 119 | 120 | it("captures stderr and doesn't fail") { 121 | val (fail, out, err) = IO.writeStringToPipe("echo ERR >&2", "") 122 | assert(!fail) 123 | assert(out.isEmpty) 124 | assert(err == List("ERR")) 125 | } 126 | 127 | } 128 | // TODO: Add hdfs tests using https://github.com/sakserv/hadoop-mini-clusters 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/JsonUtilsSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.JsonUtils._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | 9 | import scala.util.Random 10 | import org.scalatest.funspec.AnyFunSpec 11 | import org.scalatest.matchers.should.Matchers 12 | 13 | class JsonUtilsSpec extends AnyFunSpec with Matchers with TestingSparkSession { 14 | val TEST_STRING_LENGTH = 10 15 | 16 | describe("JsonUtils") { 17 | 18 | describe("string2Json") { 19 | 20 | it("Simple Int into Json") { 21 | val randInt = Random.nextInt 22 | assert(string2Json(s"$randInt") == Json.fromInt(randInt)) 23 | } 24 | 25 | it("String into Json") { 26 | val randString = Random.nextString(TEST_STRING_LENGTH) 27 | assert(string2Json(randString) == Json.fromString(randString)) 28 | } 29 | 30 | it("garbage doesn't crash") { 31 | val garbageString = "{]2389s0fj2}" 32 | assert(string2Json(garbageString) == Json.fromString(garbageString)) 33 | } 34 | 35 | } 36 | 37 | describe("debugJson") { 38 | 39 | it("Int") { 40 | val randInt = Random.nextInt() 41 | assert(debugJson(Json.fromInt(randInt)) == s"Json NUM: $randInt") 42 | } 43 | 44 | it("Double") { 45 | val randDouble = Random.nextDouble() 46 | assert(debugJson(Json.fromDoubleOrNull(randDouble)) == s"Json NUM: $randDouble") 47 | } 48 | 49 | it("String") { 50 | val randString = Random.nextString(TEST_STRING_LENGTH) 51 | assert(debugJson(Json.fromString(randString)) == s"Json STRING: $randString") 52 | } 53 | 54 | it("Boolean") { 55 | val randBool = Random.nextBoolean() 56 | assert(debugJson(Json.fromBoolean(randBool)) == s"Json BOOLEAN: $randBool") 57 | 58 | } 59 | 60 | it("Array") { 61 | val randArray = Range(0, TEST_STRING_LENGTH).map(_ => Json.fromInt(Random.nextInt)) 62 | assert(debugJson(Json.fromValues(randArray)) contains "Json ARR:") 63 | } 64 | 65 | it("Null") { 66 | assert(debugJson(Json.Null) == "Json NULL") 67 | } 68 | 69 | } 70 | 71 | describe("row2Json") { 72 | 73 | val TEST_STRING = Random.nextString(TEST_STRING_LENGTH) 74 | val TEST_LONG = Random.nextLong 75 | val TEST_INT = Random.nextInt 76 | val TEST_BOOLEAN = Random.nextBoolean 77 | val TEST_DOUBLE = Random.nextDouble 78 | 79 | val schema = StructType( 80 | List( 81 | StructField("string", StringType), 82 | StructField("long", LongType), 83 | StructField("int", IntegerType), 84 | StructField("null", NullType), 85 | StructField("bool", BooleanType), 86 | StructField("double", DoubleType) 87 | ) 88 | ) 89 | 90 | val sampleData = 91 | List(Row(TEST_STRING, TEST_LONG, TEST_INT, null, TEST_BOOLEAN, TEST_DOUBLE)) // scalastyle:ignore 92 | 93 | def mkRow: Row = spark.createDataFrame(sc.parallelize(sampleData), schema).head() 94 | 95 | it("Row with String") { 96 | val sut = mkRow 97 | assert(row2Json(sut, 0) == Json.fromString(TEST_STRING)) 98 | } 99 | 100 | it("Row with long") { 101 | val sut = mkRow 102 | assert(row2Json(sut, 1) == Json.fromLong(TEST_LONG)) 103 | } 104 | 105 | it("Row with int") { 106 | val sut = mkRow 107 | assert(row2Json(sut, 2) == Json.fromInt(TEST_INT)) 108 | } 109 | 110 | it("Row with null") { 111 | val sut = mkRow 112 | assert(row2Json(sut, 3) == Json.Null) 113 | } 114 | 115 | it("Row with bool") { 116 | val sut = mkRow 117 | assert(row2Json(sut, 4) == Json.fromBoolean(TEST_BOOLEAN)) // scalastyle:ignore 118 | } 119 | 120 | it("Row with double") { 121 | val sut = mkRow 122 | assert(row2Json(sut, 5) == 
Json.fromDoubleOrNull(TEST_DOUBLE)) // scalastyle:ignore 123 | } 124 | 125 | it("Full Row") { 126 | val sut = mkRow 127 | assert( 128 | row2Json(sut) == Json.obj( 129 | ("string", Json.fromString(TEST_STRING)), 130 | ("long", Json.fromLong(TEST_LONG)), 131 | ("int", Json.fromInt(TEST_INT)), 132 | ("null", Json.Null), 133 | ("bool", Json.fromBoolean(TEST_BOOLEAN)), 134 | ("double", Json.fromDoubleOrNull(TEST_DOUBLE)) 135 | ) 136 | ) 137 | } 138 | 139 | } 140 | 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/TestHelpers.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import io.circe.Json 4 | import io.circe.yaml.parser 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 7 | import org.apache.spark.sql.types._ 8 | 9 | object TestHelpers { 10 | 11 | def parseYaml(yaml: String): Json = { 12 | parser.parse(yaml).right.getOrElse(Json.Null) 13 | } 14 | 15 | def mkDict(args: (String, String)*): VarSubstitution = { 16 | val ret = new VarSubstitution 17 | args.foreach(kv => ret.addString(kv._1, kv._2)) 18 | ret 19 | } 20 | 21 | def mkDictJson(elems: (String, String)*): VarSubstitution = { 22 | val ret = new VarSubstitution 23 | elems.foreach(e => ret.add(e._1, JsonUtils.string2Json(e._2))) 24 | ret 25 | } 26 | 27 | def mkConfig(tables: List[ValidatorTable]): ValidatorConfig = 28 | ValidatorConfig(2, 10, None, detailedErrors = false, None, None, tables) // scalastyle:ignore 29 | 30 | def mkDataFrame(data: List[Row], schema: StructType)(implicit spark: SparkSession, sc: SparkContext): DataFrame = 31 | spark.createDataFrame(sc.parallelize(data), schema) 32 | 33 | def guessType(v: Any): DataType = v.getClass.getCanonicalName match { 34 | case "java.lang.Short" => ShortType 35 | case "java.lang.String" => StringType 36 | case "java.lang.Integer" => IntegerType 37 | case "java.lang.Double" => DoubleType 38 | case "java.lang.Boolean" => BooleanType 39 | case "java.lang.Long" => LongType 40 | case "java.lang.Byte" => ByteType 41 | case _ => throw new IllegalArgumentException(s"Unknown type '${v.getClass.getCanonicalName}'") 42 | } 43 | 44 | def mkSchema(args: (String, List[Any])*): StructType = { 45 | StructType(args.map(x => StructField(x._1, guessType(x._2.head)))) 46 | } 47 | 48 | def mkRows(args: (String, List[Any])*): List[Row] = { 49 | val len = args.head._2.length 50 | require(args.forall(_._2.length == len)) 51 | (0 until len).map(i => Row(args.map(_._2.apply(i)): _*)).toList 52 | } 53 | 54 | /** creates dataFrame from array of (label, List[Any]) 55 | * @param spark 56 | * @param args 57 | * \- is array of tuple(String, List[Any]) supported types are String, Double, Int, Long 58 | * @return 59 | * DataFrame 60 | * 61 | * ie mkDf(("item" -> List("Eggs", "Milk", "Bread", "Cheese")), ("price" -> List( 5.49, 3.89, 4.50, 6.00), 62 | * ("quantity" -> List( 12, 5, 2, 10))) 63 | * 64 | * will return a dataframe 65 | */ 66 | def mkDf(spark: SparkSession, args: (String, List[Any])*): DataFrame = { 67 | require(args.forall(_._2.length == args.head._2.length)) 68 | val schema = mkSchema(args: _*) 69 | val data = mkRows(args: _*) 70 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ValidatorOutputSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class ValidatorOutputSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | val dict = new VarSubstitution() 10 | 11 | describe("ValidatorOutput") { 12 | 13 | describe("PipeOutput") { 14 | 15 | it("variable substitution") { 16 | dict.addString("TMPDIR", "/tmp") 17 | val sut = PipeOutput("$TMPDIR/foo.sh", None) 18 | assert(sut.substituteVariables(dict) == PipeOutput("/tmp/foo.sh", None)) 19 | } 20 | 21 | } 22 | 23 | describe("FileOutput") { 24 | 25 | it("variable substitution") { 26 | dict.addString("TMPDIR", "/tmp") 27 | val sut = FileOutput("$TMPDIR/foo.json", None) 28 | assert(sut.substituteVariables(dict) == FileOutput("/tmp/foo.json", None)) 29 | } 30 | 31 | } 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ValidatorSpecifiedFormatLoaderSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers.{mkConfig, mkDict} 5 | import com.target.data_validator.validator.{ColumnMaxCheck, MinNumRows, NegativeCheck, NullCheck} 6 | import org.scalatest.matchers.should.Matchers 7 | import org.scalatest.wordspec.AnyWordSpec 8 | 9 | class ValidatorSpecifiedFormatLoaderSpec extends AnyWordSpec with Matchers with TestingSparkSession { 10 | "ValidatorSpecifiedFormatLoader" should { 11 | "load json" in { 12 | val loader = ValidatorSpecifiedFormatLoader( 13 | format = "json", 14 | keyColumns = Some(List("age")), 15 | condition = None, 16 | checks = List( 17 | NegativeCheck("age", None), 18 | NullCheck("age", None), 19 | ColumnMaxCheck("age", JsonUtils.string2Json("68")), 20 | MinNumRows(JsonUtils.string2Json("9")) 21 | ), 22 | options = None, 23 | loadData = Some(List("src/test/resources/format_test.jsonl")) 24 | ) 25 | 26 | val didFail = loader.quickChecks(spark, mkDict())(mkConfig(List(loader))) 27 | 28 | didFail should be(false) 29 | loader.getEvents should have size 2 30 | loader.getEvents 31 | .collectFirst { case vc: ValidatorCounter => vc } 32 | .get 33 | .value should be(9) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/VarSubstitutionSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class VarSubstitutionSpec extends AnyFunSpec with Matchers { 7 | 8 | describe("VarSubstitution") { 9 | 10 | it("adding var twice fails") { 11 | val dict = new VarSubstitution() 12 | assert(!dict.addString("foo", "bar")) 13 | assert(dict.addString("foo", "baz")) 14 | assert(dict.replaceVars("$foo") == Left("bar")) 15 | } 16 | 17 | it("adding invalid variable fails") { 18 | val dict = new VarSubstitution() 19 | assert(dict.addString("99", "99")) 20 | } 21 | 22 | it("simple var substitution works") { 23 | val dict = new VarSubstitution 24 | assert(!dict.addString("animal", "fox")) 25 | assert(dict.replaceVars("The quick brown $animal.") == Left("The quick brown fox.")) 26 | } 27 | 28 | 
it("simple var substitution works for scala type variables") { 29 | val dict = new VarSubstitution 30 | assert(!dict.addString("animal", "fox")) 31 | assert(dict.replaceVars("The quick brown ${animal}.") == Left("The quick brown fox.")) 32 | } 33 | 34 | it("missing var produces error") { 35 | val dict = new VarSubstitution 36 | assert( 37 | dict.replaceVars("The quick $color fox.") == 38 | Right(ValidatorError("VariableSubstitution: Can't find values for the following keys, color")) 39 | ) 40 | } 41 | 42 | it("missing scala var produces error") { 43 | val dict = new VarSubstitution 44 | assert( 45 | dict.replaceVars("The quick ${color} fox.") == 46 | Right(ValidatorError("VariableSubstitution: Can't find values for the following keys, color")) 47 | ) 48 | } 49 | 50 | it("adding map works") { 51 | val dict = new VarSubstitution 52 | dict.addMap(Map[String, String]("one" -> "1", "two" -> "2")) 53 | assert(dict.dict.size == 2) 54 | assert(dict.replaceVars("$one, $two") == Left("1, 2")) 55 | } 56 | 57 | it("short 1 char variables") { 58 | val dict = new VarSubstitution 59 | dict.addString("f", "foo") 60 | assert(dict.replaceVars("${f}|$f") == Left("foo|foo")) // scalastyle:ignore 61 | } 62 | 63 | describe("VarSubstitution.replaceAll") { 64 | 65 | it("single replacement") { 66 | assert(VarSubstitution.replaceAll("This is a test.", " a ", " not a ") == "This is not a test.") 67 | } 68 | 69 | it("multiple replacements") { 70 | assert(VarSubstitution.replaceAll("$o $o $o", "$o", "one") == "one one one") 71 | } 72 | 73 | it("no replacement") { 74 | val str = "String with nothing to replace." 75 | assert(VarSubstitution.replaceAll(str, "xx", "yy") == str) 76 | } 77 | 78 | } 79 | 80 | describe("VarSubstitution.getVarName") { 81 | 82 | it("normal var") { 83 | assert(VarSubstitution.getVarName("$foo").contains("foo")) 84 | } 85 | 86 | it("scala type var") { 87 | assert(VarSubstitution.getVarName("${foo}").contains("foo")) 88 | } 89 | 90 | it("bad scala type variable") { 91 | assert(VarSubstitution.getVarName("${foo").isEmpty) 92 | } 93 | 94 | } 95 | 96 | describe("VarSubstitution findVars") { 97 | 98 | it("approves of simple var") { 99 | assert(VarSubstitution.findVars("$one, $two, $three") == Set("$one", "$two", "$three")) 100 | } 101 | 102 | it("approves of scala vars") { 103 | assert(VarSubstitution.findVars("${one}, ${two}, ${three}") == Set("${one}", "${two}", "${three}")) 104 | } 105 | 106 | it("does find bad vars") { 107 | assert(VarSubstitution.findVars("$11, $6nop, ${junk") == Set.empty) 108 | } 109 | 110 | } 111 | 112 | describe("VarSubstitution isVar") { 113 | 114 | it("simple var") { 115 | assert(VarSubstitution.isVariable("$foo")) 116 | } 117 | 118 | it("scala var") { 119 | assert(VarSubstitution.isVariable("${foo}")) 120 | } 121 | 122 | it("not a var") { 123 | assert(!VarSubstitution.isVariable("foo")) 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/FirstPassStatsAggregatorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class FirstPassStatsAggregatorSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | describe("FirstPassStatsAggregator") { 10 | 11 | it("should correctly calculate the count, mean, min 
and max values") { 12 | 13 | import spark.implicits._ 14 | val data = NumericData.data.toDS 15 | 16 | val agg1 = new FirstPassStatsAggregator 17 | val stats = data 18 | .select(agg1(data("value1")).as("stats")) 19 | .select( 20 | "stats.count", 21 | "stats.mean", 22 | "stats.min", 23 | "stats.max" 24 | ) 25 | .as[FirstPassStats] 26 | .collect 27 | 28 | stats.headOption match { 29 | case Some(s) => 30 | assert(s.count === NumericData.firstPassStats.count) 31 | assert(s.mean === NumericData.firstPassStats.mean) 32 | assert(s.min === NumericData.firstPassStats.min) 33 | assert(s.max === NumericData.firstPassStats.max) 34 | case None => assert(false) 35 | } 36 | 37 | } 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/NumericData.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class NumericData(value1: Double) 4 | 5 | object NumericData { 6 | 7 | val data: Seq[NumericData] = Seq( 8 | NumericData(0.0), 9 | NumericData(1.0), 10 | NumericData(2.0), 11 | NumericData(3.0), 12 | NumericData(4.0), 13 | NumericData(5.0), 14 | NumericData(6.0), 15 | NumericData(7.0), 16 | NumericData(8.0), 17 | NumericData(9.0) 18 | ) 19 | 20 | // scalastyle:off 21 | val firstPassStats = FirstPassStats(10, 4.5, 0, 9) 22 | val secondPassStats = SecondPassStats( 23 | 3.0276503540974917, 24 | Histogram( 25 | Seq( 26 | Bin(0.0, 0.9, 1), 27 | Bin(0.9, 1.8, 1), 28 | Bin(1.8, 2.7, 1), 29 | Bin(2.7, 3.6, 1), 30 | Bin(3.6, 4.5, 1), 31 | Bin(4.5, 5.4, 1), 32 | Bin(5.4, 6.3, 1), 33 | Bin(6.3, 7.2, 1), 34 | Bin(7.2, 8.1, 1), 35 | Bin(8.1, 9.0, 1) 36 | ) 37 | ) 38 | ) 39 | // scalastyle:on 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/SecondPassStatsAggregatorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class SecondPassStatsAggregatorSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | describe("SecondPassStatsAggregator") { 10 | 11 | import spark.implicits._ 12 | val data = NumericData.data.toDS 13 | 14 | it("should correctly calculate the standard deviation and histogram") { 15 | val stats1 = NumericData.firstPassStats 16 | val agg2 = new SecondPassStatsAggregator(stats1) 17 | 18 | val stats2 = data 19 | .select(agg2(data("value1")).as("stats")) 20 | .select( 21 | "stats.stdDev", 22 | "stats.histogram" 23 | ) 24 | .as[SecondPassStats] 25 | .collect 26 | 27 | stats2.headOption match { 28 | case Some(s) => 29 | assert(s.stdDev === NumericData.secondPassStats.stdDev) 30 | assert(s.histogram === NumericData.secondPassStats.histogram) 31 | case None => assert(false) 32 | } 33 | 34 | } 35 | 36 | it("should freely convert from spark Row type with the provided companion function") { 37 | val stats1 = NumericData.firstPassStats 38 | val agg2 = new SecondPassStatsAggregator(stats1) 39 | val outputRow = data.select(agg2(data("value1"))).head 40 | val outputStruct = outputRow.getStruct(0) 41 | 42 | SecondPassStats.fromRowRepr(outputStruct) shouldBe NumericData.secondPassStats 43 | } 44 | 45 | } 46 | 47 | } 48 | 
-------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/ColStatsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import com.target.data_validator.stats._ 6 | import io.circe.Json 7 | import org.scalatest._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | // scalastyle:off magic.number 12 | class ColStatsSpec extends AnyFunSpec with Matchers with TestingSparkSession { 13 | import spark.implicits._ 14 | 15 | describe("ColStats + ValidatorDataFrame") { 16 | 17 | val variables = new VarSubstitution 18 | val sampleDS = spark.createDataset(ColStatsSpec.sample) 19 | val validatorTable = ValidatorDataFrame( 20 | df = sampleDS.toDF, 21 | checks = List( 22 | new ColStats("a"), 23 | new ColStats("b"), 24 | new NullCheck("a", None), 25 | new NullCheck("b", None), 26 | new ColumnSumCheck("a", minValue = Some(Json.fromInt(0))), 27 | new ColumnSumCheck("b", minValue = Some(Json.fromInt(0))) 28 | ), 29 | keyColumns = None, 30 | condition = None 31 | ) 32 | val validatorConfig = ValidatorConfig(0, 5, None, true, None, None, List(validatorTable)) 33 | 34 | it("should run ColStats alongside other row and column based checks without error") { 35 | validatorConfig.quickChecks(spark, variables) shouldBe false 36 | validatorConfig.costlyChecks(spark, variables) shouldBe false 37 | } 38 | 39 | it("should generate the appropriate ColStats entries in report.json") { 40 | val report = validatorConfig.genJsonReport(variables)(spark) 41 | val summaries = report \\ "events" flatMap { json => 42 | json.as[Seq[CompleteStats]] match { 43 | case Right(summary) => summary 44 | case _ => Seq.empty 45 | } 46 | } 47 | 48 | summaries.toSet shouldBe Set(ColStatsSpec.statsA, ColStatsSpec.statsB) 49 | } 50 | 51 | } 52 | 53 | } 54 | 55 | object ColStatsSpec { 56 | 57 | case class Sample(a: Long, b: Double) 58 | 59 | val sample = Seq( 60 | Sample(2, 0.3922), 61 | Sample(3, 0.4765), 62 | Sample(4, 0.1918), 63 | Sample(5, 0.0536), 64 | Sample(6, 0.4949), 65 | Sample(7, 0.5810), 66 | Sample(8, 0.2978), 67 | Sample(9, 0.0729), 68 | Sample(10, 0.868), 69 | Sample(11, 0.325), 70 | Sample(12, 0.305), 71 | Sample(13, 0.217), 72 | Sample(14, 0.193), 73 | Sample(15, 0.405), 74 | Sample(16, 0.443), 75 | Sample(17, 0.103), 76 | Sample(18, 0.435), 77 | Sample(19, 0.953), 78 | Sample(20, 0.519), 79 | Sample(21, 0.958) 80 | ) 81 | 82 | val statsA = CompleteStats( 83 | "`a` stats", 84 | "a", 85 | 20, 86 | 11.5, 87 | 2.0, 88 | 21.0, 89 | 5.916079783099616, 90 | Histogram( 91 | Seq( 92 | Bin(2.0, 3.9, 2), 93 | Bin(3.9, 5.8, 2), 94 | Bin(5.8, 7.699999999999999, 2), 95 | Bin(7.699999999999999, 9.6, 2), 96 | Bin(9.6, 11.5, 2), 97 | Bin(11.5, 13.399999999999999, 2), 98 | Bin(13.399999999999999, 15.299999999999999, 2), 99 | Bin(15.299999999999999, 17.2, 2), 100 | Bin(17.2, 19.099999999999998, 2), 101 | Bin(19.099999999999998, 21.0, 2) 102 | ) 103 | ) 104 | ) 105 | 106 | val statsB = CompleteStats( 107 | "`b` stats", 108 | "b", 109 | 20, 110 | 0.414235, 111 | 0.0536, 112 | 0.958, 113 | 0.26725316654123255, 114 | Histogram( 115 | Seq( 116 | Bin(0.0536, 0.14404, 3), 117 | Bin(0.14404, 0.23448, 3), 118 | Bin(0.23448, 0.32492, 2), 119 | Bin(0.32492, 0.41535999999999995, 3), 120 | Bin(0.41535999999999995, 0.5057999999999999, 4), 121 | 
Bin(0.5057999999999999, 0.59624, 2), 122 | Bin(0.59624, 0.68668, 0), 123 | Bin(0.68668, 0.7771199999999999, 0), 124 | Bin(0.7771199999999999, 0.8675599999999999, 0), 125 | Bin(0.8675599999999999, 0.958, 3) 126 | ) 127 | ) 128 | ) 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/ColumnBasedSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import io.circe.Json 6 | import io.circe.parser._ 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.types._ 9 | import org.scalatest._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import org.scalatest.funspec.AnyFunSpec 13 | import org.scalatest.matchers.should.Matchers 14 | 15 | class ColumnBasedSpec extends AnyFunSpec with Matchers with TestingSparkSession { 16 | 17 | describe("columnMaxCheck") { 18 | 19 | val schema = StructType( 20 | List( 21 | StructField("key", StringType), 22 | StructField("data", StringType), 23 | StructField("number", IntegerType), 24 | StructField("byte", ByteType), 25 | StructField("double", DoubleType) 26 | ) 27 | ) 28 | 29 | val sampleData = List( 30 | Row("one", "2018/10/01", 3, 10.toByte, 2.0), 31 | Row("two", "2018/10/02", 2, 20.toByte, 3.5), 32 | Row("three", "2018/10/31", 1, 30.toByte, 1.7) 33 | ) 34 | 35 | def mkValidatorConfig(checks: List[ValidatorBase]): ValidatorConfig = 36 | ValidatorConfig( 37 | 1, 38 | 10, // scalastyle:ignore magic.number 39 | None, 40 | detailedErrors = false, 41 | None, 42 | None, 43 | List(ValidatorDataFrame(spark.createDataFrame(sc.parallelize(sampleData), schema), None, None, checks)) 44 | ) 45 | 46 | it("should be able to be configured from json/YAML") { 47 | val json = """{ "type": "columnMaxCheck", "column": "rel_d", "value": "2018/10/20" }""" 48 | assert( 49 | decode[ValidatorBase](json)(JsonDecoders.decodeChecks) == 50 | Right(ColumnMaxCheck("rel_d", Json.fromString("2018/10/20"))) 51 | ) 52 | } 53 | 54 | it("should fail when column doesn't exist") { 55 | val dict = new VarSubstitution 56 | val sut = mkValidatorConfig(List(ColumnMaxCheck("junk", Json.fromString("2018/10/31")))) 57 | assert(sut.configCheck(spark, dict)) 58 | assert(sut.failed) 59 | } 60 | 61 | it("should not fail when value matches max column value") { 62 | val dict = new VarSubstitution 63 | val sut = mkValidatorConfig(List(ColumnMaxCheck("data", Json.fromString("2018/10/31")))) 64 | assert(!sut.configCheck(spark, dict)) 65 | assert(!sut.quickChecks(spark, dict)) 66 | assert(!sut.failed) 67 | } 68 | 69 | it("should fail when value doesn't match max column value") { 70 | val dict = new VarSubstitution 71 | val columnMaxCheck = ColumnMaxCheck("data", Json.fromString("2018/11/01")) 72 | val sut = mkValidatorConfig(List(columnMaxCheck)) 73 | assert(!sut.configCheck(spark, dict)) 74 | assert(sut.quickChecks(spark, dict)) 75 | assert(sut.failed) 76 | assert( 77 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 78 | failure = true, 79 | ListMap("expected" -> "2018/11/01", "actual" -> "2018/10/31"), 80 | "ColumnMaxCheck data[StringType]: Expected: 2018/11/01 Actual: 2018/10/31" 81 | ) 82 | ) 83 | } 84 | 85 | it("should not fail with numeric column matches max value") { 86 | val dict = new VarSubstitution 87 | val sut = mkValidatorConfig(List(ColumnMaxCheck("number", Json.fromInt(3)))) 88 | 
assert(!sut.configCheck(spark, dict)) 89 | assert(!sut.quickChecks(spark, dict)) 90 | assert(!sut.failed) 91 | } 92 | 93 | it("should fail when numeric column doesn't match max value") { 94 | val dict = new VarSubstitution 95 | val columnMaxCheck = ColumnMaxCheck("number", Json.fromInt(100)) // scalastyle:ignore magic.number 96 | val sut = mkValidatorConfig(List(columnMaxCheck)) 97 | assert(!sut.configCheck(spark, dict)) 98 | assert(sut.quickChecks(spark, dict)) 99 | assert(sut.failed) 100 | assert( 101 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 102 | failure = true, 103 | ListMap("expected" -> "100", "actual" -> "3", "relative_error" -> "97.00%"), 104 | "ColumnMaxCheck number[IntegerType]: Expected: 100 Actual: 3 Relative Error: 97.00%" 105 | ) 106 | ) 107 | } 108 | 109 | it("should fail with undefined error % when numeric column doesn't match max value and expected value is 0") { 110 | val dict = new VarSubstitution 111 | val columnMaxCheck = ColumnMaxCheck("number", Json.fromInt(0)) 112 | val sut = mkValidatorConfig(List(columnMaxCheck)) 113 | assert(!sut.configCheck(spark, dict)) 114 | assert(sut.quickChecks(spark, dict)) 115 | assert(sut.failed) 116 | assert( 117 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 118 | failure = true, 119 | ListMap("expected" -> "0", "actual" -> "3", "relative_error" -> "undefined"), 120 | "ColumnMaxCheck number[IntegerType]: Expected: 0 Actual: 3 Relative Error: undefined" 121 | ) 122 | ) 123 | } 124 | 125 | it("should not fail when double column matches max value") { 126 | val dict = new VarSubstitution 127 | val sut = mkValidatorConfig(List(ColumnMaxCheck("double", Json.fromDouble(3.5).get))) 128 | assert(!sut.configCheck(spark, dict)) 129 | assert(!sut.quickChecks(spark, dict)) 130 | assert(!sut.failed) 131 | } 132 | 133 | it("should fail when double column doesn't match max value") { 134 | val dict = new VarSubstitution 135 | val columnMaxCheck = ColumnMaxCheck("double", Json.fromDouble(5.0).get) 136 | val sut = mkValidatorConfig(List(columnMaxCheck)) 137 | assert(!sut.configCheck(spark, dict)) 138 | assert(sut.quickChecks(spark, dict)) 139 | assert(sut.failed) 140 | assert( 141 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 142 | failure = true, 143 | ListMap("expected" -> "5.0", "actual" -> "3.5", "relative_error" -> "30.00%"), 144 | "ColumnMaxCheck double[DoubleType]: Expected: 5.0 Actual: 3.5 Relative Error: 30.00%" 145 | ) 146 | ) 147 | } 148 | 149 | it("should fail when byte column and value overflows") { 150 | val dict = new VarSubstitution 151 | val sut = mkValidatorConfig(List(ColumnMaxCheck("byte", Json.fromInt(1000)))) // scalastyle:ignore 152 | assert(sut.configCheck(spark, dict)) 153 | assert(sut.failed) 154 | } 155 | 156 | it("should fail when byte column and string value") { 157 | val dict = new VarSubstitution 158 | val sut = mkValidatorConfig(List(ColumnMaxCheck("byte", Json.fromString("bit")))) 159 | assert(sut.configCheck(spark, dict)) 160 | assert(sut.failed) 161 | } 162 | 163 | it("variable substitution should produce VarSubJsonEvent()") { 164 | val vars = new VarSubstitution 165 | vars.addString("col", "byte") 166 | val sut = ColumnMaxCheck("${col}", Json.fromInt(100)).substituteVariables(vars) // scalastyle:ignore 167 | assert(!sut.failed) 168 | assert(sut.getEvents contains VarSubEvent("${col}", "byte")) 169 | } 170 | 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- 
/src/test/scala/com/target/data_validator/validator/ConfigVarSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import cats.syntax.either._ 4 | import com.target.TestingSparkSession 5 | import com.target.data_validator._ 6 | import com.target.data_validator.ConfigVar._ 7 | import io.circe._ 8 | import io.circe.generic.auto._ 9 | import io.circe.parser._ 10 | import io.circe.syntax._ 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.types.{IntegerType, StructField, StructType} 13 | 14 | import scala.util.Random 15 | import org.scalatest.funspec.AnyFunSpec 16 | import org.scalatest.matchers.should.Matchers 17 | 18 | class ConfigVarSpec extends AnyFunSpec with Matchers with TestingSparkSession { 19 | 20 | describe("ConfigVar") { 21 | 22 | describe("NameValue") { 23 | 24 | it("from Json snippet") { 25 | val json: Json = 26 | parse("""{ "name": "foo", "value": "bar" }""").getOrElse(Json.Null) 27 | val sut = json.as[ConfigVar] 28 | assert(sut == Right(NameValue("foo", Json.fromString("bar")))) 29 | } 30 | 31 | it("addEntry works") { 32 | val bar = Json.fromString("bar") 33 | val sut = NameValue("foo", bar) 34 | val varSub = new VarSubstitution 35 | assert(!sut.addEntry(spark, varSub)) 36 | assert(varSub.dict.get("foo") contains bar) 37 | } 38 | 39 | it("asJson works") { 40 | val sut = NameValue("bar", Json.fromString("foo")) 41 | assert(sut.asJson.noSpaces == """{"name":"bar","value":"foo"}""") 42 | } 43 | 44 | it("var sub in value") { 45 | val varSub = new VarSubstitution 46 | assert(!varSub.addString("four", "4")) 47 | val sut = NameValue("foo", Json.fromString("${four} score")) 48 | assert(!sut.addEntry(spark, varSub)) 49 | assert(varSub.dict.get("foo").contains(Json.fromString("4 score"))) 50 | } 51 | 52 | it("var sub fails when value doesn't exist") { 53 | val varSub = new VarSubstitution 54 | val sut = NameValue("foo", Json.fromString("${four} score")) 55 | assert(sut.addEntry(spark, varSub)) 56 | } 57 | 58 | } 59 | 60 | describe("NameEnv") { 61 | 62 | it("from Json snippet") { 63 | val json: Json = parse("""{ "name":"foo", "env":"ENV"}""").getOrElse(Json.Null) 64 | val sut = json.as[ConfigVar] 65 | assert(sut == Right(NameEnv("foo", "ENV"))) 66 | } 67 | 68 | it("addEntry works") { 69 | val sut = NameEnv("java_home", "JAVA_HOME") 70 | val varSub = new VarSubstitution 71 | assert(!sut.addEntry(spark, varSub)) 72 | assert(varSub.dict.get("java_home") contains Json.fromString(System.getenv("JAVA_HOME"))) 73 | } 74 | 75 | it("asJson works") { 76 | val sut = NameEnv("foo", "bar") 77 | assert(sut.asJson.noSpaces == """{"name":"foo","env":"bar"}""") 78 | } 79 | 80 | it("var sub in env value") { 81 | val sut = NameEnv("java_home", "JAVA_${h}") 82 | val varSub = new VarSubstitution 83 | assert(!varSub.addString("h", "HOME")) 84 | assert(!sut.addEntry(spark, varSub)) 85 | assert(varSub.dict("java_home").asString contains System.getenv("JAVA_HOME")) 86 | } 87 | 88 | it("var sub fails when value doesn't exist") { 89 | val sut = NameEnv("java_home", "JAVA_${h}") 90 | val varSub = new VarSubstitution 91 | assert(sut.addEntry(spark, varSub)) 92 | } 93 | 94 | } 95 | 96 | describe("NameShell") { 97 | 98 | it("from Json snippet") { 99 | val json: Json = parse("""{ "name":"foo", "shell":"false"}""").getOrElse(Json.Null) 100 | val sut = json.as[ConfigVar] 101 | assert(sut == Right(NameShell("foo", "false"))) 102 | } 103 | 104 | it("addEntry works as expected") { 105 | val sut = 
NameShell("one", "echo 1") 106 | val varSub = new VarSubstitution 107 | assert(!sut.addEntry(spark, varSub)) 108 | assert(varSub.dict("one") == Json.fromInt(1)) 109 | } 110 | 111 | it("asJson works") { 112 | val sut = NameShell("one", "echo 1") 113 | assert(sut.asJson.noSpaces == """{"name":"one","shell":"echo 1"}""") 114 | } 115 | 116 | it("bad command works as expected") { 117 | val sut = NameShell("one", "/bad/command") 118 | val varSub = new VarSubstitution 119 | assert(sut.addEntry(spark, varSub)) 120 | assert(EventLog.events exists { 121 | case ValidatorError(msg) => 122 | msg.startsWith("NameShell(one, /bad/command) Ran but returned exitCode: 127 stderr:") 123 | case _ => false 124 | }) 125 | } 126 | 127 | it("no output works as expected") { 128 | val sut = NameShell("one", "true") 129 | val varSub = new VarSubstitution 130 | assert(sut.addEntry(spark, varSub)) 131 | assert(!varSub.dict.contains("one")) 132 | } 133 | 134 | it("command failing works as expected") { 135 | val sut = NameShell("one", "echo 1 && false") 136 | val varSub = new VarSubstitution 137 | assert(sut.addEntry(spark, varSub)) 138 | assert(!varSub.dict.contains("one")) 139 | assert( 140 | EventLog.events contains 141 | ValidatorError("NameShell(one, echo 1 && false) Ran but returned exitCode: 1 stderr: ") 142 | ) 143 | } 144 | 145 | it("variable substitution in command works") { 146 | val varSub = new VarSubstitution 147 | val valueJson = Json.fromInt(Random.nextInt) 148 | varSub.add("one", valueJson) 149 | val sut = NameShell("one", "echo $one") 150 | assert(!sut.addEntry(spark, varSub)) 151 | assert(varSub.dict("one") == valueJson) 152 | } 153 | } 154 | 155 | describe("NameSql") { 156 | 157 | it("from Json snippet") { 158 | val json: Json = parse("""{ "name":"foo", "sql":"select 1"}""").getOrElse(Json.Null) 159 | val sut = json.as[ConfigVar] 160 | assert(sut == Right(NameSql("foo", "select 1"))) 161 | } 162 | 163 | it("addEntry works as expected") { 164 | val sut = NameSql("one", "select 1") 165 | val varSub = new VarSubstitution 166 | assert(!sut.addEntry(spark, varSub)) 167 | assert(varSub.dict("one") == Json.fromInt(1)) 168 | } 169 | 170 | it("asJson works") { 171 | val sut = NameSql("one", "select 1") 172 | assert(sut.asJson.noSpaces == """{"name":"one","sql":"select 1"}""") 173 | } 174 | 175 | it("bad sql works as expected") { 176 | val sut = NameSql("one", "bad sql") 177 | val varSub = new VarSubstitution 178 | assert(sut.addEntry(spark, varSub)) 179 | } 180 | 181 | it("empty query") { 182 | val schema = StructType(List(StructField("data", IntegerType))) 183 | val df = spark.createDataFrame(sc.parallelize(List(Row(10))), schema) // scalastyle:ignore 184 | df.createTempView("MyTable") 185 | val sut = NameSql("one", "select data from MyTable where data < 10") 186 | val varSub = new VarSubstitution 187 | assert(sut.addEntry(spark, varSub)) 188 | } 189 | 190 | } 191 | 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/Mocker.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import org.apache.spark.sql.types.StructType 5 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 6 | import io.circe.Json 7 | 8 | trait Mocker { 9 | 10 | def mkDataFrame(spark: SparkSession, data: List[Row], schema: StructType): DataFrame = 11 | 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 12 | 13 | def mkParams(params: List[Tuple2[String, Any]] = List.empty): VarSubstitution = { 14 | val dict = new VarSubstitution 15 | params.foreach { pair => 16 | pair._2 match { 17 | case p: Json => dict.add(pair._1, pair._2.asInstanceOf[Json]) 18 | case p: String => dict.addString(pair._1, pair._2.asInstanceOf[String]) 19 | } 20 | } 21 | dict 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/NegativeCheckSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers.{mkDf, mkDict, parseYaml} 5 | import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame, ValidatorError} 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class NegativeCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | describe("NegativeCheck") { 12 | 13 | describe("config parsing") { 14 | it("basic config") { 15 | val json = parseYaml(""" 16 | |type: negativeCheck 17 | |column: foo 18 | """.stripMargin) 19 | 20 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 21 | assert(sut == Right(NegativeCheck("foo", None))) 22 | } 23 | 24 | it("optional threshold") { 25 | val json = parseYaml(""" 26 | |type: negativeCheck 27 | |column: foo 28 | |threshold: 10.0% 29 | """.stripMargin) 30 | 31 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 32 | assert(sut == Right(NegativeCheck("foo", Some("10.0%")))) 33 | } 34 | it("config error") { 35 | val json = parseYaml(""" 36 | |type: negativeCheck 37 | |garbage 38 | """.stripMargin) 39 | 40 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 41 | assert(sut.isLeft) // Todo: Maybe add better check. Left is generic failure.
42 | } 43 | 44 | } 45 | 46 | describe("variable substitution") { 47 | it("success substitution") { 48 | var dict = mkDict("threshold" -> "20%", "column" -> "foo") 49 | var sut = NegativeCheck("$column", Some("$threshold")) 50 | assert(sut.substituteVariables(dict) == NegativeCheck("foo", Some("20%"))) 51 | assert(!sut.failed) 52 | } 53 | 54 | it("error on substitution issues") { 55 | var dict = mkDict() 56 | var sut = NegativeCheck("$column", Some("$threshold")) 57 | assert(sut.substituteVariables(dict) == sut) 58 | assert(sut.failed) 59 | assert( 60 | sut.getEvents contains 61 | ValidatorError( 62 | "VariableSubstitution: Can't find values for the following keys, " 63 | + "column" 64 | ) 65 | ) 66 | assert( 67 | sut.getEvents contains 68 | ValidatorError( 69 | "VariableSubstitution: Can't find values for the following keys, " 70 | + "threshold" 71 | ) 72 | ) 73 | } 74 | } 75 | 76 | describe("check configuration") { 77 | it("Column Exists") { 78 | val df = mkDf(spark = spark, "price" -> List(1.99)) 79 | val sut = NegativeCheck("price", None) 80 | assert(!sut.configCheck(df)) 81 | } 82 | 83 | it("Column doesn't exist") { 84 | val df = mkDf(spark = spark, "price" -> List(1.99)) 85 | val sut = NegativeCheck("junk", None) 86 | assert(sut.configCheck(df)) 87 | assert(sut.failed) 88 | assert(sut.getEvents contains ValidatorError("Column: junk not found in schema.")) 89 | } 90 | 91 | it("Column exists but is wrong type") { 92 | val df = mkDf(spark = spark, "item" -> List("eggs")) 93 | val sut = NegativeCheck("item", None) 94 | assert(sut.configCheck(df)) 95 | assert(sut.failed) 96 | assert(sut.getEvents contains ValidatorError("Column: item found, but not of numericType type: StringType")) 97 | } 98 | 99 | } 100 | 101 | describe("functionality") { 102 | 103 | it("success") { 104 | val df = mkDf(spark, "price" -> List(1.99, 1.50, 2.50)) // scalastyle:ignore 105 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", None))) 106 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 107 | assert(!sut.quickChecks(spark, mkDict())(config)) 108 | assert(!sut.failed) 109 | } 110 | 111 | it("fails") { 112 | val df = mkDf(spark, "price" -> List(1.99, -1.50, 2.50)) // scalastyle:ignore 113 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", None))) 114 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 115 | assert(sut.quickChecks(spark, mkDict())(config)) 116 | assert(sut.failed) 117 | 118 | } 119 | 120 | it("threshold success") { 121 | val df = mkDf(spark, "price" -> List(1.99, -1.50, 2.50)) // scalastyle:ignore 122 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", Some("1")))) 123 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 124 | assert(!sut.quickChecks(spark, mkDict())(config)) 125 | assert(!sut.failed) 126 | 127 | } 128 | 129 | it("threshold failure") { 130 | val df = mkDf(spark, "price" -> List(1.99, -1.50, -2.50)) // scalastyle:ignore 131 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", Some("1")))) 132 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 133 | assert(sut.quickChecks(spark, mkDict())(config)) 134 | assert(sut.failed) 135 | } 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/NullCheckSpec.scala: -------------------------------------------------------------------------------- 1 | package 
com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame, ValidatorError} 5 | import com.target.data_validator.TestHelpers._ 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class NullCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | describe("NullCheck") { 12 | describe("config parsing") { 13 | it("basic config") { 14 | val json = parseYaml(""" 15 | |type: nullCheck 16 | |column: foo 17 | """.stripMargin) 18 | 19 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 20 | assert(sut == Right(NullCheck("foo", None))) 21 | } 22 | 23 | it("optional threshold") { 24 | val json = parseYaml(""" 25 | |type: nullCheck 26 | |column: foo 27 | |threshold: 10.0% 28 | """.stripMargin) 29 | 30 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 31 | assert(sut == Right(NullCheck("foo", Some("10.0%")))) 32 | } 33 | it("config error") { 34 | val json = parseYaml(""" 35 | |type: nullCheck 36 | |garbage 37 | """.stripMargin) 38 | 39 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 40 | assert(sut.isLeft) // Todo: Maybe add better check. Left is generic failure. 41 | } 42 | 43 | } 44 | 45 | describe("variable substitution") { 46 | it("success substitution") { 47 | val dict = mkDict("threshold" -> "20%", "column" -> "foo") 48 | val sut = NullCheck("$column", Some("$threshold")) 49 | assert(sut.substituteVariables(dict) == NullCheck("foo", Some("20%"))) 50 | assert(!sut.failed) 51 | } 52 | 53 | it("error on substitution issues") { 54 | val dict = mkDict() 55 | val sut = NullCheck("$column", Some("$threshold")) 56 | assert(sut.substituteVariables(dict) == sut) 57 | assert(sut.failed) 58 | assert( 59 | sut.getEvents contains 60 | ValidatorError( 61 | "VariableSubstitution: Can't find values for the following keys, " 62 | + "column" 63 | ) 64 | ) 65 | assert( 66 | sut.getEvents contains 67 | ValidatorError( 68 | "VariableSubstitution: Can't find values for the following keys, " 69 | + "threshold" 70 | ) 71 | ) 72 | } 73 | } 74 | 75 | describe("checkconfiguration") { 76 | 77 | it("Column Exists") { 78 | val df = mkDf(spark = spark, "item" -> List("Eggs")) 79 | val sut = NullCheck("item", None) 80 | assert(!sut.configCheck(df)) 81 | } 82 | 83 | it("Column doesn't exist") { 84 | val df = mkDf(spark = spark, "item" -> List("Eggs"), "price" -> List(0.99), "perishable" -> List(true)) 85 | val sut = NullCheck("junk", None) 86 | assert(sut.configCheck(df)) 87 | assert(sut.failed) 88 | assert(sut.getEvents contains ValidatorError("Column: junk not found in schema.")) 89 | } 90 | } 91 | 92 | describe("functionality") { 93 | 94 | it("success") { 95 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3")) 96 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", None))) 97 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 98 | assert(!sut.quickChecks(spark, mkDict())(config)) 99 | assert(!sut.failed) 100 | } 101 | 102 | it("fails") { 103 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null)) // scalastyle:ignore 104 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", None))) 105 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 106 | assert(sut.quickChecks(spark, mkDict())(config)) 107 | assert(sut.failed) 108 | 109 | } 110 | 111 | it("threshold success") { 112 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null)) 
// scalastyle:ignore 113 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", Some("1")))) 114 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 115 | assert(!sut.quickChecks(spark, mkDict())(config)) 116 | assert(!sut.failed) 117 | 118 | } 119 | 120 | it("threshold failure") { 121 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null, null)) // scalastyle:ignore 122 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", Some("1")))) 123 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 124 | assert(sut.quickChecks(spark, mkDict())(config)) 125 | assert(sut.failed) 126 | } 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/TestHelpersSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | class TestHelpersSpec extends AnyFunSpec with Matchers with TestingSparkSession { 12 | 13 | val data = List( 14 | "item" -> List("item1", "item2", "item3", null), // scalastyle:ignore 15 | "price" -> List(1.99, 2.99, 3.99, 0.0), 16 | "count" -> List(1, 2, 3, 0), 17 | "instock" -> List(true, false, true, false) 18 | ) 19 | 20 | val expectedSchema = StructType( 21 | List( 22 | StructField("item", StringType), 23 | StructField("price", DoubleType), 24 | StructField("count", IntegerType), 25 | StructField("instock", BooleanType) 26 | ) 27 | ) 28 | 29 | describe("parseYaml") { 30 | it("parses simple yaml") { 31 | val sut = parseYaml(""" 32 | |double: 2.01 33 | |int: 10293 34 | |string: foo 35 | |array: 36 | | - one 37 | | - two 38 | | - three 39 | """.stripMargin) 40 | assert( 41 | sut == Json.obj( 42 | ("double", Json.fromDouble(2.01).get), 43 | ("int", Json.fromInt(10293)), // scalastyle:ignore 44 | ("string", Json.fromString("foo")), 45 | ("array", Json.arr(Seq("one", "two", "three").map(Json.fromString): _*)) 46 | ) 47 | ) 48 | } 49 | } 50 | 51 | describe("mkDict") { 52 | it("simple case") { 53 | val sut = mkDict("key" -> "value") 54 | assert(sut.dict("key") == Json.fromString("value")) 55 | } 56 | } 57 | 58 | describe("guessType") { 59 | it("double") { 60 | assert(guessType(1.99) == DoubleType) // scalastyle:ignore 61 | } 62 | 63 | it("int") { 64 | assert(guessType(1) == IntegerType) // scalastyle:ignore 65 | } 66 | 67 | it("string") { 68 | assert(guessType("string") == StringType) 69 | } 70 | 71 | it("boolean") { 72 | assert(guessType(true) == BooleanType) 73 | } 74 | 75 | } 76 | 77 | describe("mkSchema") { 78 | it("simple") { 79 | assert(mkSchema(data: _*) == expectedSchema) 80 | } 81 | } 82 | 83 | describe("mkRows") { 84 | it("builds the expected rows") { 85 | assert( 86 | mkRows(data: _*) == List( 87 | Row("item1", 1.99, 1, true), 88 | Row("item2", 2.99, 2, false), 89 | Row("item3", 3.99, 3, true), 90 | Row(null, 0.0, 0, false) 91 | ) 92 | ) // scalastyle:ignore 93 | } 94 | } 95 | // mkDf 96 | 97 | describe("mkDf") {} 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/UniqueCheckSpec.scala:
-------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | class UniqueCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 12 | 13 | val schema = StructType( 14 | List(StructField("item", StringType), StructField("location", IntegerType), StructField("price", DoubleType)) 15 | ) 16 | 17 | val defData = List(Row("Eggs", 1, 4.00), Row("Milk", 1, 10.27), Row("Eggs", 1, 5.00), Row("Eggs", 2, 2.00)) 18 | def mkDataFrame(spark: SparkSession, data: List[Row]): DataFrame = 19 | spark.createDataFrame(sc.parallelize(data), schema) 20 | 21 | describe("fromJson") { 22 | it("create fromJson") { 23 | import com.target.data_validator.validator.JsonDecoders.decodeChecks 24 | val yaml = 25 | """--- 26 | |- type: uniqueCheck 27 | | columns: 28 | | - foo 29 | | - bar 30 | """.stripMargin 31 | val json = io.circe.yaml.parser.parse(yaml).right.getOrElse(Json.Null) 32 | val sut = json.as[Array[ValidatorBase]] 33 | assert(sut.isRight) 34 | assert(sut.right.get contains UniqueCheck(Array("foo", "bar"))) 35 | } 36 | } 37 | 38 | describe("substituteVariables") { 39 | it("replaces variables") { 40 | val dict = new VarSubstitution 41 | dict.addString("col1", "foo") 42 | dict.addString("col2", "bar") 43 | val sut = UniqueCheck(List("${col1}", "$col2")) 44 | assert(sut.substituteVariables(dict) == UniqueCheck(List("foo", "bar"))) 45 | assert(!sut.failed) 46 | } 47 | 48 | } 49 | 50 | describe("configCheck") { 51 | it("good columns") { 52 | val sut = UniqueCheck(List("item", "location")) 53 | val df = mkDataFrame(spark, defData) 54 | assert(!sut.configCheck(df)) 55 | assert(!sut.failed) 56 | } 57 | 58 | it("bad column") { 59 | val sut = UniqueCheck(List("item", "city")) 60 | val df = mkDataFrame(spark, defData) 61 | assert(sut.configCheck(df)) 62 | assert(sut.failed) 63 | } 64 | 65 | } 66 | 67 | describe("costlyCheck") { 68 | 69 | it("finds error") { 70 | val sut = UniqueCheck(Seq("item")) 71 | val df = mkDataFrame(spark, defData) 72 | assert(sut.costlyCheck(df)) 73 | assert(sut.failed) 74 | assert(sut.getEvents contains ValidatorError("1 duplicates found!")) 75 | } 76 | 77 | it("finds error with multiple columns") { 78 | val sut = UniqueCheck(Seq("item", "location")) 79 | val df = mkDataFrame(spark, defData) 80 | assert(sut.costlyCheck(df)) 81 | assert(sut.failed) 82 | assert(sut.getEvents contains ValidatorError("1 duplicates found!")) 83 | } 84 | 85 | it("no error") { 86 | val sut = UniqueCheck(Seq("price")) 87 | val df = mkDataFrame(spark, defData) 88 | assert(!sut.costlyCheck(df)) 89 | assert(!sut.failed) 90 | assert(sut.getEvents contains ValidatorGood("no duplicates found.")) 91 | } 92 | } 93 | 94 | describe("toJson") { 95 | 96 | it("generates correct json") { 97 | val sut = UniqueCheck(Seq("item")) 98 | assert( 99 | sut.toJson == Json.fromFields( 100 | Seq( 101 | ("type", Json.fromString("uniqueCheck")), 102 | ("columns", Json.fromValues(List(Json.fromString("item")))), 103 | ("failed", Json.fromBoolean(false)), 104 | ("events", Json.fromValues(Seq.empty)) 105 | ) 106 | ) 107 | ) 108 | } 109 | } 110 | 111 | describe("completeExample") { 112 | it("happy path that finds error") { 113 | val uc = 
UniqueCheck(List("item")) 114 | val dict = new VarSubstitution 115 | val df = mkDataFrame(spark, defData) 116 | val sut = ValidatorConfig( 117 | 1, 118 | 1, 119 | None, 120 | detailedErrors = false, 121 | None, 122 | None, 123 | List(ValidatorDataFrame(df, None, None, List(uc))) 124 | ) 125 | 126 | assert(!sut.configCheck(spark, dict)) 127 | assert(!sut.quickChecks(spark, dict)) 128 | assert(sut.costlyChecks(spark, dict)) 129 | assert(sut.failed) 130 | } 131 | 132 | it("happy path that doesn't find error") { 133 | val uc = UniqueCheck(List("price")) 134 | val dict = new VarSubstitution 135 | val df = mkDataFrame(spark, defData) 136 | val sut = ValidatorConfig( 137 | 1, 138 | 1, 139 | None, 140 | detailedErrors = false, 141 | None, 142 | None, 143 | List(ValidatorDataFrame(df, None, None, List(uc))) 144 | ) 145 | 146 | assert(!sut.configCheck(spark, dict)) 147 | assert(!sut.costlyChecks(spark, dict)) 148 | assert(!sut.failed) 149 | } 150 | } 151 | 152 | } 153 | --------------------------------------------------------------------------------
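Taken together, the check specs above share one shape: decode the check from YAML with TestHelpers.parseYaml and JsonDecoders.decodeChecks, run configCheck against a small DataFrame built with TestHelpers.mkDf, then drive quickChecks through a ValidatorDataFrame wrapped in a ValidatorConfig. A minimal skeleton of that pattern, shown with the existing NullCheck; the class name is hypothetical and this is an illustration only, not part of the repository:

// Hypothetical spec skeleton mirroring NullCheckSpec and NegativeCheckSpec above.
package com.target.data_validator.validator

import com.target.TestingSparkSession
import com.target.data_validator.TestHelpers._
import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame}
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class CheckSpecPatternSpec extends AnyFunSpec with Matchers with TestingSparkSession {

  describe("a typical check spec") {

    it("decodes the check from YAML") {
      val json = parseYaml("type: nullCheck\ncolumn: foo")
      assert(JsonDecoders.decodeChecks.decodeJson(json) == Right(NullCheck("foo", None)))
    }

    it("runs the check as a quick check against a small DataFrame") {
      val df = mkDf(spark, "foo" -> List("a", "b", null)) // scalastyle:ignore
      val table = ValidatorDataFrame(df, None, None, List(NullCheck("foo", None)))
      val config = ValidatorConfig(1, 1, None, detailedErrors = false, None, None, List.empty)
      assert(table.quickChecks(spark, mkDict())(config)) // the null row makes the check fail
    }
  }
}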