├── .git-blame-ignore-revs ├── .github ├── .scala-steward.conf ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── new_check.md │ └── other-feature-request.md ├── PULL_REQUEST_TEMPLATE │ └── new_check.md ├── dependabot.yml ├── stale.yaml └── workflows │ ├── ci.yaml │ ├── release.yaml │ └── scala-steward.yaml ├── .gitignore ├── .scalafmt.conf ├── Brewfile ├── CONTRIBUTING.md ├── COPYRIGHT ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── bin └── sbt ├── build.sbt ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── scalastyle-config.xml └── src ├── main ├── resources │ └── log4j-dv-spark.properties └── scala │ └── com │ └── target │ └── data_validator │ ├── CliOptionParser.scala │ ├── ConfigParser.scala │ ├── ConfigVar.scala │ ├── Emailer.scala │ ├── EnvironmentVariables.scala │ ├── EventGenerator.scala │ ├── EventLog.scala │ ├── ExpressionUtils.scala │ ├── GenTestData.scala │ ├── HTMLBits.scala │ ├── IO.scala │ ├── JsonEncoders.scala │ ├── JsonUtils.scala │ ├── Main.scala │ ├── Reports.scala │ ├── Substitutable.scala │ ├── ValidatorConfig.scala │ ├── ValidatorEvent.scala │ ├── ValidatorOutput.scala │ ├── ValidatorTable.scala │ ├── VarSubstitution.scala │ ├── stats │ ├── Bin.scala │ ├── CompleteStats.scala │ ├── FirstPassStats.scala │ ├── FirstPassStatsAggregator.scala │ ├── Histogram.scala │ ├── SecondPassStats.scala │ └── SecondPassStatsAggregator.scala │ └── validator │ ├── ColStats.scala │ ├── ColumnBased.scala │ ├── ColumnSumCheck.scala │ ├── JsonDecoders.scala │ ├── NegativeCheck.scala │ ├── NullCheck.scala │ ├── RangeCheck.scala │ ├── RowBased.scala │ ├── StringLengthCheck.scala │ ├── StringRegexCheck.scala │ ├── TwoPassCheapCheck.scala │ ├── UniqueCheck.scala │ └── ValidatorBase.scala └── test ├── resources ├── format_test.jsonl └── test_config.yaml └── scala └── com └── target ├── TestingSparkSession.scala └── data_validator ├── CliOptionParserSpec.scala ├── ConfigParserSpec.scala ├── ConfigVarSubSpec.scala ├── EmailerSpec.scala ├── EnvironmentVariablesSpec.scala ├── ExpressionUtilsSpec.scala ├── IOSpec.scala ├── JsonUtilsSpec.scala ├── TestHelpers.scala ├── ValidatorBaseSpec.scala ├── ValidatorOutputSpec.scala ├── ValidatorSpecifiedFormatLoaderSpec.scala ├── ValidatorTableSpec.scala ├── VarSubstitutionSpec.scala ├── stats ├── FirstPassStatsAggregatorSpec.scala ├── NumericData.scala └── SecondPassStatsAggregatorSpec.scala └── validator ├── ColStatsSpec.scala ├── ColumnBasedSpec.scala ├── ColumnSumCheckSpec.scala ├── ConfigVarSpec.scala ├── Mocker.scala ├── NegativeCheckSpec.scala ├── NullCheckSpec.scala ├── RangeCheckSpec.scala ├── RowBasedSpec.scala ├── StringLengthCheckSpec.scala ├── StringRegexCheckSpec.scala ├── TestHelpersSpec.scala └── UniqueCheckSpec.scala /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Scala Steward: Reformat with scalafmt 3.5.9 2 | 4a3a718ed6d94e89d7a478aa040831b85b5c5580 3 | -------------------------------------------------------------------------------- /.github/.scala-steward.conf: -------------------------------------------------------------------------------- 1 | # pullRequests.frequency allows to control how often or when Scala Steward 2 | # is allowed to create pull requests. 3 | # 4 | # Possible values: 5 | # @asap 6 | # PRs are created without delay. 7 | # 8 | # 9 | # PRs are created only again after the given timespan since the last PR 10 | # has passed. Example values are "36 hours", "1 day", or "14 days". 
11 | 12 | # 13 | # PRs are created roughly according to the given CRON expression. 14 | # 15 | # CRON expressions consist of five fields: 16 | # minutes, hour of day, day of month, month, and day of week. 17 | # 18 | # See https://www.alonsodomin.me/cron4s/userguide/index.html#parsing for 19 | # more information about the CRON expressions that are supported. 20 | # 21 | # Note that the date parts of the CRON expression are matched exactly 22 | # while the time parts are only used to abide to the frequency of 23 | # the given expression. 24 | # 25 | # Default: @asap 26 | # 27 | #pullRequests.frequency = "0 0 ? * 3" # every thursday on midnight 28 | pullRequests.frequency = "7 days" 29 | 30 | # Only these dependencies which match the given patterns are updated. 31 | # 32 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 33 | # Defaults to empty `[]` which mean Scala Steward will update all dependencies. 34 | # updates.allow = [ { groupId = "com.example" } ] 35 | 36 | # The dependencies which match the given version pattern are updated. 37 | # Dependencies that are not listed will be updated. 38 | # 39 | # Each pattern must have `groupId`, `version` and optional `artifactId`. 40 | # Defaults to empty `[]` which mean Scala Steward will update all dependencies. 41 | updates.pin = [ 42 | { groupId = "org.apache.spark", artifactId = "spark-sql", version = "2.3.4" } 43 | ] 44 | 45 | # The dependencies which match the given pattern are NOT updated. 46 | # 47 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 48 | # Defaults to empty `[]` which mean Scala Steward will not ignore dependencies. 49 | # TODO: multi-version build coming soon 50 | updates.ignore = [ { groupId = "org.scala-lang", artifactId = "scala-library" } ] 51 | 52 | # If set, Scala Steward will only create or update `n` PRs each time it runs (see `pullRequests.frequency` above). 53 | # Useful if running frequently and/or CI build are costly 54 | # Default: None 55 | # updates.limit = 5 56 | 57 | # The extensions of files that should be updated. 58 | # Default: [".scala", ".sbt", ".sbt.shared", ".sc", ".yml", "pom.xml"] 59 | # updates.fileExtensions = [".scala", ".sbt", ".sbt.shared", ".sc", ".yml", ".md", ".markdown", ".txt"] 60 | 61 | # If "on-conflicts", Scala Steward will update the PR it created to resolve conflicts as 62 | # long as you don't change it yourself. 63 | # If "always", Scala Steward will always update the PR it created as long as 64 | # you don't change it yourself. 65 | # If "never", Scala Steward will never update the PR 66 | # Default: "on-conflicts" 67 | # updatePullRequests = "always" | "on-conflicts" | "never" 68 | 69 | # If set, Scala Steward will use this message template for the commit messages and PR titles. 70 | # Supported variables: ${artifactName}, ${currentVersion}, ${nextVersion} and ${default} 71 | # Default: "${default}" which is equivalent to "Update ${artifactName} to ${nextVersion}" 72 | commits.message = "Update ${artifactName} from ${currentVersion} to ${nextVersion}" 73 | 74 | # If true and when upgrading version in .scalafmt.conf, Scala Steward will perform scalafmt 75 | # and add a separate commit when format changed. So you don't need reformat manually and can merge PR. 76 | # If false, Scala Steward will not perform scalafmt, so your CI may abort when reformat needed. 77 | # Default: true 78 | scalafmt.runAfterUpgrading = false 79 | 80 | # It is possible to have multiple scala projects in a single repository. 
In that case the folders containing the projects (build.sbt folders) 81 | # are specified using the buildRoots property. Note that the paths used there are relative and if the repo directory itself also contains a build.sbt the dot can be used to specify it. 82 | # Default: ["."] 83 | # buildRoots = [ ".", "subfolder/projectA" ] 84 | 85 | # Define commands that are executed after an update via a hook. 86 | # A groupId and/or artifactId can be defined to only execute after certain dependencies are updated. If neither is defined, the hook runs for every update. 87 | # postUpdateHooks = [{ 88 | # command = ["sbt", "protobufGenerate"], 89 | # commitMessage = "Regenerated protobuf files", 90 | # groupId = "com.github.sbt", 91 | # artifactId = "sbt-protobuf" 92 | # }] 93 | 94 | # You can override some config options for dependencies that matches the given pattern. 95 | # Currently, "pullRequests" can be overridden. 96 | # Each pattern must have `groupId`, and may have `artifactId` and `version`. 97 | # First-matched entry is used. 98 | # More-specific entry should be placed before less-specific entry. 99 | # 100 | # Default: empty `[]` 101 | # dependencyOverrides = [ 102 | # { 103 | # dependency = { groupId = "com.example", artifactId = "foo", version = "2." }, 104 | # pullRequests = { frequency = "1 day" }, 105 | # }, 106 | # { 107 | # dependency = { groupId = "com.example", artifactId = "foo" }, 108 | # pullRequests = { frequency = "30 day" }, 109 | # }, 110 | # { 111 | # dependency = { groupId = "com.example" }, 112 | # pullRequests = { frequency = "14 day" }, 113 | # } 114 | # ] 115 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @target/data-validator-members @c-horn 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | 20 | 21 | **Expected behavior** 22 | 23 | 24 | **Log output** 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_check.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New check 3 | about: Suggest a new check 4 | title: 'New check:' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | # What would you like to check? 11 | 12 | 13 | 14 | # What does the configuration look like? 15 | 16 | 20 | 21 | # Are you going to work on it, or are you asking for it? 22 | 23 | - [ ] Asking 24 | - [ ] Working 25 | 26 | ## If _working_ on it, when do you think you'll have a PR ready? 27 | 28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/other-feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Other feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/new_check.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New check 3 | about: Submit a new check 4 | title: 'New check:' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # What issue is this PR solving? 11 | 12 | 13 | 14 | # What does the configuration look like? 15 | 16 | 24 | 25 | # Have you completed all of these? 26 | 27 | - [ ] Add configuration documentation to the Validators section of the README. You should be able to copy this from the previous section. 28 | - [ ] Pass the style checker requirements without warnings or errors (`sbt test` will not work without compliance!) 29 | - [ ] Does not modify any of the other validators. Please review the section Refactoring in the CONTRIBUTING.md. 30 | - [ ] Include tests. Submissions without tests will not be considered. Test the following things: 31 | - [ ] Configuration parsing 32 | - [ ] Configuration sanity checking 33 | - [ ] Variable substitution 34 | - [ ] Actual check functionality 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Check for updates to GitHub Actions every weekday 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | 9 | # Until Dependabot provides Scala updates, they are handled by the Scala Steward action. 10 | # https://github.com/target/data-validator/blob/master/.github/workflows/scala-steward.yaml 11 | # Check periodically for updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem 12 | -------------------------------------------------------------------------------- /.github/stale.yaml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 28 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - hold 8 | - blocked 9 | - security 10 | # Label to use when marking an issue as stale 11 | staleLabel: stale 12 | # Comment to post when marking an issue as stale. Set to `false` to disable 13 | markComment: > 14 | This issue has been automatically marked as stale because it has not had 15 | recent activity. It will be closed if no further activity occurs. Thank you 16 | for your contributions. @target/data-validator-members, please take a look. 17 | # Comment to post when closing a stale issue. Set to `false` to disable 18 | closeComment: > 19 | This issue was closed because it did not see activity within five weeks. 20 | @target/data-validator-members, please reopen and reexamine at your earliest 21 | convenience if this ticket should not be lost to the ether.
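The pull request template above asks every new check to ship with tests for configuration parsing, configuration sanity checking, variable substitution, and the check behavior itself. As a rough, hypothetical sketch only (MyNewCheck and MyNewCheckSpec are placeholder names, not part of this repository), such a spec might start from a skeleton that mirrors the existing *Spec suites listed under src/test/scala in the tree above:

// Hypothetical skeleton for a new check's tests; class and test names are placeholders.
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class MyNewCheckSpec extends AnyFunSpec with Matchers {
  describe("MyNewCheck") {
    it("parses its YAML configuration")(pending)
    it("rejects nonsensical configuration values")(pending)
    it("applies variable substitution to its arguments")(pending)
    it("flags data that violates the check")(pending)
  }
}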
22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | workflow_call: 5 | merge_group: 6 | branches: ['*'] 7 | pull_request: 8 | branches: ['*'] 9 | push: 10 | branches: ['*'] 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | 19 | jobs: 20 | build: 21 | name: Build and Test 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout current branch (full) 25 | uses: actions/checkout@v4 26 | - uses: coursier/cache-action@v6 27 | with: 28 | extraKey: ${{ runner.os }} 29 | - uses: coursier/setup-action@v1 30 | with: 31 | jvm: adopt:1.8 32 | - name: Build, test, and package project on Spark 3.5 33 | run: bin/sbt clean compile test package makePom -DsparkVersion=3.5.1 34 | - name: Build and package project on "legacy" Spark 35 | run: bin/sbt clean compile package makePom 36 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | #push: 7 | # tags: "[1-9]+.[0-9]+.[0-9]+" 8 | 9 | env: 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | 12 | jobs: 13 | deploy: 14 | name: Release 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout current branch (full) 18 | uses: actions/checkout@v4 19 | - uses: coursier/cache-action@v6 20 | with: 21 | extraKey: ${{ runner.os }} 22 | - uses: coursier/setup-action@v1 23 | with: 24 | jvm: adopt:1.8 25 | # uses sbt-github-packages, see build.sbt 26 | - name: Publish with SBT 27 | run: bin/sbt publish 28 | - name: Publish with SBT 29 | run: bin/sbt publish -DsparkVersion=3.5.1 30 | -------------------------------------------------------------------------------- /.github/workflows/scala-steward.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will launch at 00:00 every Sunday 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # At 09:00 on day-of-month 1 and 15 6 | # https://crontab.guru/#0_9_1,15_*_* 7 | - cron: '0 9 1,15 * *' 8 | 9 | name: Launch Scala Steward 10 | 11 | env: 12 | # This is required because SBT is configured to look at env:GITHUB_TOKEN or git:github.token or env:SHELL 13 | # to get a token for publishing even when not executing the publish task. Most of the time, SHELL is set 14 | # but apparently not inside GitHub Actions runners. Setting _something invalid_ satisfies the 15 | # GitHub Packages plugin safely and allows the operation to proceed. 
16 | SHELL: "/bin/bash" 17 | 18 | jobs: 19 | scala-steward: 20 | runs-on: ubuntu-latest 21 | name: Launch Scala Steward 22 | steps: 23 | - name: Install JDK for Scala Steward use 24 | uses: actions/setup-java@v4 25 | with: 26 | distribution: 'temurin' 27 | java-version: '11' 28 | - name: Launch Scala Steward 29 | uses: scala-steward-org/scala-steward-action@v2 30 | with: 31 | github-token: ${{ secrets.REPO_GITHUB_TOKEN }} 32 | author-email: "41898282+github-actions[bot]@users.noreply.github.com" 33 | author-name: "github-actions[bot]" 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generated artifacts 2 | project/target 3 | project/project/target 4 | target 5 | !src/main/scala/com/target 6 | !src/test/scala/com/target 7 | 8 | # build server stuff 9 | .bsp 10 | # autogenerated test data 11 | testData.orc 12 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.8.6 2 | runner.dialect = scala211 3 | project.git = true 4 | align.preset = none 5 | # Disabled in default since this operation is potentially 6 | # dangerous if you define your own stripMargin with different 7 | # semantics from the stdlib stripMargin. 8 | assumeStandardLibraryStripMargin = true 9 | onTestFailure = "To fix this, run 'sbt scalafmt' from the project root directory" 10 | maxColumn = 118 11 | rewrite.rules = [RedundantParens, PreferCurlyFors, SortModifiers] 12 | docstrings.style = SpaceAsterisk 13 | indent.main = 2 14 | 15 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | if OS.mac? 2 | # We need a version of a JDK older than what's kept in the main cask repository. 3 | tap "homebrew/cask-versions" 4 | # The JDK formerly known as AdoptOpenJDK, a build of OpenJDK 5 | # Use JDK8 because that's a solid base for older Hadoop clusters 6 | # Also, Spark 2.3.x essentially requires Scala 2.11.x, and that combination may necessitate Java 8. 7 | cask "homebrew/cask-versions/temurin8" 8 | end -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Data Validator 2 | 3 | ## Fixing a bug you found 4 | 5 | 1. Check existing [issues](https://github.com/target/data-validator/issues) first to see if anyone else has reported the bug 6 | 2. Report the bug if no one else has reported it. 7 | 3. Fix the bug. 8 | 4. Submit a PR. 9 | 5. Be ready for some back-and-forth between maintainers and you. 10 | 11 | ## Creating new checks 12 | 13 | 1. Check existing [issues](https://github.com/target/data-validator/issues) first to see if anyone else has reported a desire for the check. 14 | 2. If it doesn't exist (it probably won't!) then create a new issue: 15 | 1. Provide an example of how you would like the configuration for the check to look. Most of our rework requests are the result of an unclear vision for the interface to the check! 16 | 2. Clearly state if you are intending to work on it or if you are simply asking for it. If you're going to work on it, please provide a timeline for delivery. If you're just asking for it, you're done after you've submitted the request issue. 17 | 3. Work on it! 
18 | 1. Abide by the style checker requirements. 19 | 2. Include tests. Submissions without tests will not be considered. Test the following things: 20 | 1. Configuration parsing 21 | 2. Configuration sanity checking 22 | 3. Variable substitution 23 | 4. Actual check functionality 24 | 25 | ## Refactoring 26 | 27 | Follow the new checks procedure, but instead of providing a configuration example, clearly explain: 28 | 29 | 1. How the current state of things negatively affects the extensibility of Data Validator. 30 | 2. How you intend to remedy the situation with the minimum amount of code changed 31 | 32 | **Do not mix refactoring with the addition of a new check in the same pull request.** We will reject and ask that they be done in separate PRs to keep things manageable. 33 | 34 | ## Development Environment Setup 35 | 36 | Developers on **macOS** should be able to clone, run `make deps build`, and be ready for a development cycle. 37 | This assumes that [Homebrew](https://brew.sh/) is already installed, as is common for macOS developers. 38 | 39 | Developers on **Linux or Windows** will need to install a Java 8 JDK, preferably 40 | the [Temurin JDK from the Adoptium Working Group](https://adoptium.net/) of the [Eclipse Foundation](https://www.eclipse.org) 41 | or another JDK in the OpenJDK family. 42 | 43 | Run `make help` to see common tasks. Make tasks are provided for those unfamiliar with 44 | running `sbt` in console mode. 45 | Those preferring `sbt` are assumed to know what they're doing but can get a quick refresher 46 | by looking at the tasks in the `Makefile`. 47 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Target Brands, Inc. 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Target Brands, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SBT ?= bin/sbt 2 | 3 | ##@ Utilities 4 | 5 | .PHONY: help 6 | help: ## Prints help for targets with comments 7 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z0-9_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 8 | 9 | ##@ Development Setup 10 | 11 | deps-sys: Brewfile ## Installs system-wide dependencies 12 | (command -v brew > /dev/null && brew bundle) || true 13 | 14 | ##@ Development Cycle 15 | 16 | .PHONY: test 17 | test: ## Runs tests 18 | $(SBT) test 19 | 20 | .PHONY: check 21 | check: ## Runs linters and other checks 22 | $(SBT) scalastyle 23 | 24 | .PHONY: build 25 | build: 26 | $(SBT) assembly 27 | 28 | .PHONY: format-scala 29 | format-scala: ## Formats all Scala code 30 | $(SBT) scalafmt 31 | 32 | ##@ Maintenance Tasks 33 | 34 | refresh-sbt: ## Retrieve the latest version of sbt launcher 35 | curl https://raw.githubusercontent.com/paulp/sbt-extras/master/sbt > bin/sbt 36 | 37 | UNAME := $(shell uname -s) 38 | ifeq ($(UNAME), Linux) 39 | OS_INFO_CMD=lsb_release -a 2>/dev/null 40 | endif 41 | ifeq ($(UNAME), Darwin) 42 | OS_INFO_CMD=sw_vers 43 | endif 44 | 45 | ##@ Debugging 46 | 47 | doctor: ## Show important details about compilation environment 48 | java -version 49 | $(OS_INFO_CMD) 50 | bin/sbt version 51 | git log -1 HEAD --oneline 52 | 53 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | The following license applies to sbt-extras code incorporated into 2 | the bin/sbt file of this project: 3 | 4 | // Generated from http://www.opensource.org/licenses/bsd-license.php 5 | Copyright (c) 2011, Paul Phillips. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | * Neither the name of the author nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "data-validator" 2 | organization := "com.target" 3 | 4 | val sparkVersion = settingKey[String]("Spark version") 5 | 6 | sparkVersion := System.getProperty("sparkVersion", "2.3.4") 7 | 8 | scalaVersion := { 9 | if (sparkVersion.value > "3.0") { 10 | "2.12.19" 11 | } else { 12 | "2.11.12" 13 | } 14 | } 15 | 16 | val sparkValidationVersion = settingKey[String]("Version of package") 17 | 18 | sparkValidationVersion := "0.15.0" 19 | 20 | version := sparkVersion.value + "_" + sparkValidationVersion.value 21 | 22 | val circeVersion = settingKey[String]("Circe version") 23 | val circeYamlVersion = settingKey[String]("Circe YAML version") 24 | 25 | circeVersion := { 26 | if (sparkVersion.value > "3.0") { 27 | "0.14.6" 28 | } else { 29 | "0.11.2" 30 | } 31 | } 32 | 33 | circeYamlVersion := { 34 | if (sparkVersion.value > "3.0") { 35 | "0.15.1" 36 | } else { 37 | "0.10.1" 38 | } 39 | } 40 | 41 | //addDependencyTreePlugin 42 | enablePlugins(GitVersioning) 43 | git.useGitDescribe := true 44 | ThisBuild / versionScheme := Some("early-semver") 45 | 46 | ///////////// 47 | // Publishing 48 | ///////////// 49 | githubOwner := "target" 50 | githubRepository := "data-validator" 51 | // this unfortunately must be set strangely because GitHub requires a token for pulling packages 52 | // and sbt-github-packages does not allow the user to configure the resolver not to be used. 53 | // https://github.com/djspiewak/sbt-github-packages/issues/28 54 | githubTokenSource := (TokenSource.Environment("GITHUB_TOKEN") || 55 | TokenSource.GitConfig("github.token") || 56 | TokenSource.Environment("SHELL")) // it's safe to assume this exists and is not unique 57 | 58 | publishTo := githubPublishTo.value 59 | 60 | enablePlugins(BuildInfoPlugin) 61 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion) 62 | buildInfoPackage := "com.target.data_validator" 63 | 64 | libraryDependencies ++= Seq( 65 | "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5", 66 | "com.github.scopt" %% "scopt" % "4.1.0", 67 | "com.sun.mail" % "javax.mail" % "1.6.2", 68 | "com.lihaoyi" %% "scalatags" % "0.12.0", 69 | "io.circe" %% "circe-yaml" % circeYamlVersion.value, 70 | "io.circe" %% "circe-core" % circeVersion.value, 71 | "io.circe" %% "circe-generic" % circeVersion.value, 72 | "io.circe" %% "circe-parser" % circeVersion.value, 73 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided, 74 | "junit" % "junit" % "4.13.2" % Test, 75 | "org.scalatest" %% "scalatest" % "3.2.19" % Test, 76 | "com.github.sbt" % "junit-interface" % "0.13.3" % Test exclude ("junit", "junit-dep") 77 | ) 78 | 79 | Test / fork := true 80 | javaOptions ++= (if (sparkVersion.value > "3.0" && System.getenv("MODERN_JAVA") == "TRUE") { 81 | // For modern Java we need to open up a lot of config options. 82 | Seq("-Xms4048M", "-Xmx4048M", 83 | // these were added in JDK 11 and newer, apparently. 
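                 // In more detail: JDK 9 introduced the module system, and JDK 16+ denies the
                 // reflective access into JDK internals that Spark and Netty attempt, so the
                 // --add-opens flags below re-open those packages to classpath (unnamed-module)
                 // code, while io.netty.tryReflectionSetAccessible=true lets Netty attempt the
                 // setAccessible calls it needs for its direct-buffer handling.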
84 | "-Dio.netty.tryReflectionSetAccessible=true", 85 | "--add-opens=java.base/java.lang=ALL-UNNAMED", 86 | "--add-opens=java.base/java.io=ALL-UNNAMED", 87 | "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED") 88 | } else { 89 | Seq("-Xms4048M", "-Xmx4048M") 90 | }) 91 | Test / parallelExecution := false 92 | // required for unit tests, but not set in some environments 93 | Test / envVars ++= Map( 94 | "JAVA_HOME" -> 95 | Option(System.getenv("JAVA_HOME")) 96 | .getOrElse(System.getProperty("java.home")) 97 | ) 98 | 99 | assembly / mainClass := Some("com.target.data_validator.Main") 100 | 101 | assembly / assemblyShadeRules := Seq( 102 | ShadeRule.rename("shapeless.**" -> "new_shapeless.@1").inAll, 103 | ShadeRule.rename("cats.kernel.**" -> s"new_cats.kernel.@1").inAll 104 | ) 105 | 106 | // Enforces scalastyle checks 107 | val compileScalastyle = TaskKey[Unit]("compileScalastyle") 108 | scalastyleFailOnWarning := true 109 | scalastyleFailOnError := true 110 | 111 | compileScalastyle := (Compile / scalastyle).toTask("").value 112 | (Compile / compile) := ((Compile / compile) dependsOn compileScalastyle).value 113 | 114 | (Compile / run) := Defaults 115 | .runTask( 116 | Compile / fullClasspath, 117 | Compile / run / mainClass, 118 | Compile / run / runner 119 | ) 120 | .evaluated 121 | 122 | (Compile / runMain) := Defaults.runMainTask(Compile / fullClasspath, Compile / run / runner).evaluated 123 | TaskKey[Unit]("generateTestData") := { 124 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion.value 125 | (Compile / runMain).toTask(" com.target.data_validator.GenTestData").value 126 | } 127 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.1") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.10.10 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 2 | addSbtPlugin("com.github.sbt" % "sbt-git" % "2.1.0") 3 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.13.1") 4 | addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.3") 5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.4") 6 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.14.2") -------------------------------------------------------------------------------- /src/main/resources/log4j-dv-spark.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C{1}: %m%n 7 | 8 | 9 | # Settings to quiet third party logs that are too verbose 10 | log4j.logger.org.spark-project.jetty=WARN 11 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 12 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 13 | 
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 14 | log4j.logger.org.apache.parquet=ERROR 15 | log4j.logger.parquet=ERROR 16 | 17 | 18 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 19 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 20 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 21 | log4j.logger.org.apache.spark.sql=WARN 22 | 23 | log4j.logger.org.apache.spark.repl.Main=WARN 24 | log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR 25 | log4j.logger.org.apache.spark.ExecutorAllocationManager=ERROR 26 | 27 | # Logging for this application 28 | log4j.logger.com.target.data_validator=INFO 29 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/CliOptionParser.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scopt.OptionParser 4 | 5 | case class CliOptions( 6 | configFilename: String = "", 7 | verbose: Boolean = false, 8 | jsonReport: Option[String] = None, 9 | htmlReport: Option[String] = None, 10 | exitErrorOnFail: Boolean = true, 11 | vars: Map[String, String] = Map(), 12 | emailOnPass: Boolean = false 13 | ) 14 | 15 | object CliOptionParser { 16 | 17 | def parser: OptionParser[CliOptions] = new OptionParser[CliOptions]("data-validator") { 18 | head(BuildInfo.name, "v" + BuildInfo.version) 19 | 20 | version("version") 21 | 22 | opt[Unit]("verbose").action((_, c) => c.copy(verbose = true)).text("Print additional debug output.") 23 | 24 | opt[String]("config") 25 | .action((fn, c) => c.copy(configFilename = fn)) 26 | .text( 27 | "required validator config .yaml filename, " + 28 | "prefix w/ 'classpath:' to load configuration from JVM classpath/resources, " + 29 | "ex. '--config classpath:/config.yaml'" 30 | ) 31 | 32 | opt[String]("jsonReport").action((fn, c) => c.copy(jsonReport = Some(fn))).text("optional JSON report filename") 33 | 34 | opt[String]("htmlReport").action((fn, c) => c.copy(htmlReport = Some(fn))).text("optional HTML report filename") 35 | 36 | opt[Map[String, String]]("vars") 37 | .valueName("k1=v1,k2=v2...") 38 | .action((x, c) => c.copy(vars = x)) 39 | .text("other arguments") 40 | 41 | opt[Boolean]("exitErrorOnFail") 42 | .valueName("true|false") 43 | .action((x, c) => c.copy(exitErrorOnFail = x)) 44 | .text( 45 | "optional when true, if validator fails, call System.exit(-1) " + 46 | "Defaults to True, but will change to False in future version." 47 | ) 48 | 49 | opt[Boolean]("emailOnPass") 50 | .valueName("true|false") 51 | .action((x, c) => c.copy(emailOnPass = x)) 52 | .text("optional when true, sends email on validation success. 
Default: false") 53 | 54 | help("help").text("Show this help message and exit.") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ConfigParser.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import cats.syntax.either._ 4 | import cats.syntax.functor._ 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe._ 7 | import io.circe.generic.auto._ 8 | import io.circe.yaml.parser 9 | 10 | import scala.io.{BufferedSource, Source} 11 | import scala.util.{Failure, Success, Try} 12 | 13 | object ConfigParser extends LazyLogging { 14 | 15 | // IntelliJ Says this import isn't needed, but it won't compile without it. 16 | import validator.JsonDecoders._ 17 | import ConfigVar._ 18 | 19 | implicit val decodeTable: Decoder[ValidatorTable] = 20 | List[Decoder[ValidatorTable]]( 21 | Decoder[ValidatorHiveTable].widen, 22 | Decoder[ValidatorOrcFile].widen, 23 | Decoder[ValidatorParquetFile].widen, 24 | Decoder[ValidatorSpecifiedFormatLoader].widen 25 | ).reduceLeft(_ or _) 26 | 27 | def configFromJson(json: Json): Either[DecodingFailure, ValidatorConfig] = { 28 | logger.debug(s"Json config: $json") 29 | json.as[ValidatorConfig] 30 | } 31 | 32 | private def bufferContentsAsString(buffer: BufferedSource): String = { 33 | val contents = buffer.mkString 34 | buffer.close() 35 | contents 36 | } 37 | 38 | private def loadFromFile(filename: String): String = { 39 | logger.info(s"Attempting to load `$filename` from file system") 40 | val buffer = Source.fromFile(filename) 41 | bufferContentsAsString(buffer) 42 | } 43 | 44 | private def loadFromClasspath(filename: String): String = { 45 | logger.info(s"Attempting to load `$filename` from classpath") 46 | val is = getClass.getResourceAsStream(filename) 47 | val buffer = Source.fromInputStream(is) 48 | bufferContentsAsString(buffer) 49 | } 50 | 51 | def parseFile(filename: String, cliMap: Map[String, String]): Either[Error, ValidatorConfig] = { 52 | logger.info(s"Parsing `$filename`") 53 | 54 | Try { 55 | if (filename.startsWith("classpath:")) { 56 | loadFromClasspath(filename.stripPrefix("classpath:")) 57 | } else { 58 | loadFromFile(filename) 59 | } 60 | } match { 61 | case Success(contents) => parse(contents) 62 | case Failure(thr) => Left[Error, ValidatorConfig](DecodingFailure.fromThrowable(thr, List.empty)) 63 | } 64 | } 65 | 66 | def parse(conf: String): Either[Error, ValidatorConfig] = parser.parse(conf).flatMap(configFromJson) 67 | 68 | def main(args: Array[String]): Unit = { 69 | logger.info(s"Args[${args.length}]: $args") 70 | val filename = args(0) 71 | var error = false 72 | 73 | parseFile(filename, Map.empty) match { 74 | case Left(pe) => logger.error(s"Failed to parse $filename, ${pe.getMessage}"); error = true 75 | case Right(config) => logger.info(s"Config: $config") 76 | } 77 | 78 | System.exit(if (error) 1 else 0) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ConfigVar.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import cats.syntax.functor._ 4 | import com.target.data_validator.EnvironmentVariables.{Error, Inaccessible, Present, Unset} 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{Decoder, Json} 7 | import io.circe.generic.auto._ 8 | import 
org.apache.spark.sql.SparkSession 9 | 10 | import scala.sys.process.{Process, ProcessLogger} 11 | import scala.util.{Failure, Success, Try} 12 | 13 | sealed trait ConfigVar extends EventLog with Substitutable { 14 | def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean 15 | } 16 | 17 | case class NameValue(name: String, value: Json) extends ConfigVar { 18 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 19 | logger.debug(s"name: $name value: ${value.noSpaces}") 20 | varSub.add(name, getVarSubJson(value, name, varSub)) 21 | } 22 | } 23 | 24 | case class NameEnv(name: String, env: String) extends ConfigVar { 25 | 26 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 27 | val newEnv = getVarSub(env, name, varSub) 28 | EnvironmentVariables.get(newEnv) match { 29 | case Inaccessible(message) => logger.error(message); true 30 | case Error(message) => logger.error(message); true 31 | case Unset => { 32 | val msg = s"Variable '$name' cannot be processed env variable '$newEnv' not found!" 33 | logger.error(msg) 34 | addEvent(ValidatorError(msg)) 35 | true 36 | } 37 | case Present(value) => { 38 | val resolvedEnvVar = getVarSubJson(JsonUtils.string2Json(value), name, varSub) 39 | logger.info(s"name: $name env: $env getEnv: $value resolvedEnvVar: $resolvedEnvVar") 40 | varSub.add(name, resolvedEnvVar) 41 | } 42 | } 43 | } 44 | } 45 | 46 | case class NameShell(name: String, shell: String) extends ConfigVar { 47 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 48 | val newShell = getVarSub(shell, "shell", varSub) 49 | val timer = new ValidatorTimer(s"NameShell($name, $newShell)") 50 | val out = new StringBuilder 51 | val err = new StringBuilder 52 | val processLogger = ProcessLogger(out append _, err append _) 53 | addEvent(timer) 54 | timer.time { 55 | Try(Process(Seq("/bin/sh", "-c", newShell)) ! 
processLogger) match { 56 | case Failure(exception) => 57 | validatorError(s"NameShell($name, $newShell) Failed with exception $exception"); true 58 | case Success(exitCode) if exitCode != 0 => 59 | validatorError( 60 | s"NameShell($name, $newShell) Ran but returned exitCode: $exitCode stderr: ${err.toString()}" 61 | ) 62 | true 63 | case Success(0) if out.isEmpty => 64 | validatorError(s"NameShell($name, $newShell) Ran but returned No output") 65 | true 66 | case Success(0) => 67 | val value = out.toString.split("\n").head 68 | logger.debug(s"name: $name shell: $newShell output: $value") 69 | varSub.add(name, getVarSubJson(JsonUtils.string2Json(value), name, varSub)); false 70 | } 71 | } 72 | } 73 | } 74 | 75 | case class NameSql(name: String, sql: String) extends ConfigVar { 76 | override def addEntry(spark: SparkSession, varSub: VarSubstitution): Boolean = { 77 | val timer = new ValidatorTimer(s"NameSql($name, $sql)") 78 | addEvent(timer) 79 | timer.time { 80 | Try(spark.sql(getVarSub(sql, name, varSub)).head(1)) match { 81 | case Failure(exception) => 82 | validatorError(s"NameSql($name, $sql) Failed with exception $exception") 83 | true 84 | case Success(rows) if rows.isEmpty => 85 | validatorError(s"NameSql($name, $sql) Returned 0 rows.") 86 | true 87 | case Success(rows) => 88 | val json = JsonUtils.row2Json(rows.head, 0) 89 | logger.debug(s"name: $name sql: $sql result: ${rows.head} json: ${JsonUtils.debugJson(json)}") 90 | varSub.add(name, json) 91 | } 92 | } 93 | } 94 | } 95 | 96 | object ConfigVar extends LazyLogging { 97 | 98 | implicit val decodeConfigVar: Decoder[ConfigVar] = List[Decoder[ConfigVar]]( 99 | Decoder[NameValue].widen, 100 | Decoder[NameShell].widen, 101 | Decoder[NameEnv].widen, 102 | Decoder[NameSql].widen 103 | ).reduceLeft(_ or _) 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Emailer.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.{Date, Properties} 4 | 5 | import com.typesafe.scalalogging.LazyLogging 6 | import javax.mail._ 7 | import javax.mail.internet._ 8 | 9 | import scala.util.Try 10 | 11 | case class EmailConfig( 12 | smtpHost: String, 13 | subject: String, 14 | from: String, 15 | to: List[String], 16 | cc: Option[List[String]] = None, 17 | bcc: Option[List[String]] = None 18 | ) extends EventLog 19 | with Substitutable { 20 | def substituteVariables(dict: VarSubstitution): EmailConfig = { 21 | EmailConfig( 22 | getVarSub(smtpHost, "smtpHost", dict), 23 | getVarSub(subject, "email.subject", dict), 24 | getVarSub(from, "email.from", dict), 25 | to.map(getVarSub(_, "email.to", dict)), 26 | cc.map(_.map(getVarSub(_, "email.cc", dict))), 27 | bcc.map(_.map(getVarSub(_, "email.bcc", dict))) 28 | ) 29 | } 30 | } 31 | 32 | object Emailer extends LazyLogging { 33 | 34 | def createMessage(smtpHost: String): Message = { 35 | val properties = new Properties() 36 | properties.put("mail.smtp.host", smtpHost) 37 | val session = Session.getInstance(properties) 38 | new MimeMessage(session) 39 | } 40 | 41 | def setMessageRecipients(message: Message, recipients: Seq[String], recipientType: Message.RecipientType): Int = { 42 | val parsedAddresses = recipients.map(x => Try(InternetAddress.parse(x))) 43 | 44 | val (goodParsed, badParsed) = parsedAddresses.partition(_.isSuccess) 45 | 46 | badParsed.foreach(x => logger.error(s"EmailAddress from $recipientType threw exception 
$x")) 47 | 48 | val goodAddresses: Array[Address] = goodParsed.flatMap(_.get.toSeq).seq.toArray 49 | 50 | if (!goodAddresses.isEmpty) { 51 | message.setRecipients(recipientType, goodAddresses) 52 | } 53 | 54 | goodAddresses.length 55 | } 56 | 57 | def setFrom(message: Message, from: String): Boolean = { 58 | try { 59 | val frm = InternetAddress.parse(from, true) 60 | message.setFrom(frm.head) 61 | false 62 | } catch { 63 | case ae: AddressException => 64 | logger.error(s"setFrom InternetAddress parse failed, $ae") 65 | true 66 | case me: MessagingException => 67 | logger.error(s"setFrom failed, $me") 68 | true 69 | } 70 | } 71 | 72 | def createEmptyMessage( 73 | smtpHost: String, 74 | subject: String, 75 | from: String, 76 | to: Seq[String], 77 | cc: Seq[String], 78 | bcc: Seq[String] 79 | ): Option[Message] = { 80 | 81 | logger.debug(s"Creating Message frm: $from to: ${to.mkString(", ")} sub: $subject") 82 | val message = createMessage(smtpHost) 83 | 84 | val validRecipients = setMessageRecipients(message, cc, Message.RecipientType.CC) + 85 | setMessageRecipients(message, bcc, Message.RecipientType.BCC) + 86 | setMessageRecipients(message, to, Message.RecipientType.TO) 87 | if (validRecipients == 0) { 88 | logger.error("Must specify at least 1 valid email address in TO, CC, or BCC") 89 | None 90 | } else if (setFrom(message, from)) { 91 | logger.error(s"setFrom($from) failed!") 92 | None 93 | } else { 94 | message.setSentDate(new Date()) 95 | message.setSubject(subject) 96 | Some(message) 97 | } 98 | } 99 | 100 | def createEmptyMessage(ec: EmailConfig): Option[Message] = 101 | createEmptyMessage( 102 | ec.smtpHost, 103 | ec.subject, 104 | ec.from, 105 | ec.to, 106 | ec.cc.getOrElse(Seq.empty), 107 | ec.bcc.getOrElse(Seq.empty) 108 | ) 109 | 110 | def sendMessage(message: Message, body: String, mime: String): Boolean = { 111 | message.setContent(body, mime) 112 | val id = message.hashCode().toHexString 113 | try { 114 | logger.info(s"Sending email #$id [${message.getSubject}] to [${message.getAllRecipients.mkString(", ")}]") 115 | Transport.send(message) 116 | logger.info(s"Email #$id sent successfully to all recipients.") 117 | false 118 | } catch { 119 | case sfe: SendFailedException => 120 | handleSendFailedException(id, sfe) 121 | true 122 | case me: MessagingException => 123 | logger.error(s"Failure to send email #$id: $me") 124 | true 125 | } 126 | } 127 | 128 | private def handleSendFailedException(id: String, sfe: SendFailedException): Unit = { 129 | logger.warn(s"Failure to send email #$id: ${sfe.getMessage}") 130 | Option(sfe.getValidSentAddresses) match { 131 | case Some(addresses) => logger.warn(s"Email #$id was sent to [${addresses.mkString(", ")}]") 132 | case None => logger.info("No emails were sent successfully.") 133 | } 134 | Option(sfe.getValidUnsentAddresses) match { 135 | case Some(addresses) => logger.error(s"Email #$id was not sent to [${addresses.mkString(", ")}]") 136 | case None => 137 | } 138 | Option(sfe.getInvalidAddresses) match { 139 | case Some(addresses) => logger.error(s"Email #$id has invalid addresses: [${addresses.mkString(", ")}]") 140 | case None => 141 | } 142 | } 143 | 144 | def sendTextMessage( 145 | smtpHost: String, 146 | body: String, 147 | subject: String, 148 | from: String, 149 | to: Seq[String], 150 | cc: Seq[String] = Nil, 151 | bcc: Seq[String] = Nil 152 | ): Boolean = 153 | createEmptyMessage(smtpHost, subject, from, to, cc, bcc) match { 154 | case None => 155 | logger.error("createMessage failed!") 156 | true 157 | case Some(message) => 
158 | sendMessage(message, body, "text/plain; charset=us-ascii") 159 | } 160 | 161 | def sendTextMessage(emailConfig: EmailConfig, body: String): Boolean = { 162 | createEmptyMessage(emailConfig) match { 163 | case None => 164 | logger.error("createMessage failed!") 165 | true 166 | case Some(message) => 167 | sendMessage(message, body, "text/plain; charset=us-ascii") 168 | } 169 | } 170 | 171 | def sendHtmlMessage( 172 | smtpHost: String, 173 | body: String, 174 | subject: String, 175 | from: String, 176 | to: Seq[String], 177 | cc: Seq[String] = Nil, 178 | bcc: Seq[String] = Nil 179 | ): Boolean = 180 | createEmptyMessage(smtpHost, subject, from, to, cc, bcc) match { 181 | case None => 182 | logger.error("createMessage failed!") 183 | true 184 | case Some(message) => 185 | sendMessage(message, body, "text/html") 186 | } 187 | 188 | def sendHtmlMessage(config: EmailConfig, body: String): Boolean = { 189 | createEmptyMessage(config) match { 190 | case None => 191 | logger.error("createMessage failed!") 192 | true 193 | case Some(message) => 194 | sendMessage(message, body, "text/html") 195 | } 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/EnvironmentVariables.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scala.collection.mutable 4 | import scala.util.Try 5 | 6 | object EnvironmentVariables { 7 | type MaybeEnvVar = Try[Option[String]] 8 | 9 | val accessedEnvVars: mutable.Map[String, MaybeEnvVar] = mutable.Map.empty 10 | 11 | def get(key: String): EnvVarResult = { 12 | getWithHandlers(key)( 13 | whenError = { case throwable: Throwable => Inaccessible(throwable) }, 14 | whenUnset = { Unset }, 15 | whenPresent = { Present } 16 | ).recover { case throwable: Throwable => Error(throwable) }.get 17 | } 18 | 19 | def getWithHandlers[T](key: String)( 20 | whenError: PartialFunction[Throwable, T], 21 | whenUnset: => T, 22 | whenPresent: String => T 23 | ): Try[T] = { 24 | tryGet(key) 25 | .map(_.map(whenPresent).getOrElse(whenUnset)) 26 | .recover(whenError) 27 | } 28 | 29 | def tryGet(key: String): MaybeEnvVar = { 30 | val result = Try(System.getenv(key)).map(Option(_)) 31 | accessedEnvVars += key -> result 32 | result 33 | } 34 | 35 | sealed trait EnvVarResult { 36 | def toString: String 37 | } 38 | case class Present(value: String) extends EnvVarResult { 39 | override def toString: String = value 40 | } 41 | case class Inaccessible(message: String) extends EnvVarResult { 42 | override val toString: String = s"" 43 | } 44 | object Inaccessible { 45 | def apply(throwable: Throwable): Inaccessible = Inaccessible(throwable.getMessage) 46 | } 47 | case object Unset extends EnvVarResult { 48 | override val toString: String = "" 49 | } 50 | case class Error(message: String) extends EnvVarResult { 51 | override val toString: String = s"" 52 | } 53 | object Error { 54 | def apply(throwable: Throwable): Error = Error(throwable.getMessage) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/EventGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | trait EventGenerator { 4 | def addEvent(ve: ValidatorEvent): Unit 5 | } 6 | -------------------------------------------------------------------------------- 
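The EnvironmentVariables helper above wraps every lookup in a small result type and records each access in accessedEnvVars. A minimal usage sketch, assuming only what that object defines (the HOSTNAME key and the EnvVarExample name are placeholders, not part of the repository):

import com.target.data_validator.EnvironmentVariables
import com.target.data_validator.EnvironmentVariables.{Error, Inaccessible, Present, Unset}

object EnvVarExample {
  def main(args: Array[String]): Unit = {
    // get() never throws: lookup failures come back as Inaccessible or Error values.
    EnvironmentVariables.get("HOSTNAME") match {
      case Present(value)        => println(s"HOSTNAME=$value")
      case Unset                 => println("HOSTNAME is not set")
      case Inaccessible(message) => println(s"environment not readable: $message")
      case Error(message)        => println(s"unexpected failure: $message")
    }
    // Every key that was ever looked up is recorded, whether or not the lookup succeeded.
    println(EnvironmentVariables.accessedEnvVars.keys.mkString(", "))
  }
}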
/src/main/scala/com/target/data_validator/EventLog.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | trait EventLog extends EventGenerator with LazyLogging { 8 | def addEvent(ve: ValidatorEvent): Unit = EventLog.events.append(ve) 9 | 10 | def validatorError(msg: String): Unit = { 11 | logger.error(msg) 12 | addEvent(ValidatorError(msg)) 13 | } 14 | } 15 | 16 | object EventLog extends LazyLogging { 17 | val events = new ListBuffer[ValidatorEvent] 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ExpressionUtils.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.apache.spark.sql.catalyst.expressions.{Expression, Or} 4 | 5 | object ExpressionUtils { 6 | 7 | /** Takes a List[Expression] and joins them together into on big Or() expression. 8 | * @param exprs 9 | * \- Non Empty List of Expressions. 10 | * @return 11 | * Or of all Expressions. throws IllegalArgumentException if exprs is empty. 12 | */ 13 | @throws[IllegalArgumentException] 14 | def orFromList(exprs: List[Expression]): Expression = exprs match { 15 | case exp :: Nil => exp 16 | case lhs :: rhs :: Nil => Or(lhs, rhs) 17 | case lhs :: rhs :: rest => rest.foldRight(Or(lhs, rhs))(Or(_, _)) 18 | case Nil => throw new IllegalArgumentException("exprs must be nonEmpty") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/GenTestData.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object GenTestData { 7 | 8 | val schema = StructType( 9 | List( 10 | StructField("id", IntegerType), 11 | StructField("label", StringType), 12 | StructField("div7", StringType, nullable = true) 13 | ) 14 | ) 15 | 16 | val label: Vector[String] = Vector("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine") 17 | 18 | def mkLabel(x: Int): List[String] = { 19 | if (x == 0) { 20 | Nil 21 | } else { 22 | val y = x % 10 23 | label(y) :: mkLabel(x / 10) 24 | } 25 | } 26 | 27 | def genData(spark: SparkSession): DataFrame = { 28 | val rg = spark.sparkContext.parallelize(Range(0, 100)) // scalastyle:off magic.number 29 | spark.createDataFrame( 30 | rg.map(x => Row(x, mkLabel(x).reverse.mkString(" "), if (x % 7 == 0) null else "NotNull") // scalastyle:off null 31 | ), 32 | schema 33 | ) 34 | } 35 | 36 | def main(args: Array[String]): Unit = { 37 | val spark = SparkSession.builder 38 | .appName("genTestData") 39 | .master(args.headOption.getOrElse("local")) 40 | .getOrCreate() 41 | 42 | spark.sparkContext.setLogLevel("WARN") // Spark is very noisy. 
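    // The frame built by genData is coalesced to a single partition below, so the write
    // produces one ORC part file; the resulting testData.orc output is the autogenerated
    // test data entry ignored in .gitignore.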
43 | 44 | try { 45 | val df = genData(spark).coalesce(1) 46 | df.write.orc("testData.orc") 47 | } finally { 48 | spark.stop() 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/HTMLBits.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import scalatags.Text.all._ 4 | 5 | /** Place for various HTMLBits that are used in generating HTML report. 6 | */ 7 | object HTMLBits { 8 | def pass: Tag = span(backgroundColor := "mediumseagreen")("PASS") 9 | def fail: Tag = span(backgroundColor := "tomato")("FAIL") 10 | 11 | def status(failed: Boolean): Tag = if (failed) { fail } 12 | else { pass } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/JsonEncoders.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.data_validator.validator._ 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe._ 6 | import io.circe.syntax._ 7 | 8 | object JsonEncoders extends LazyLogging { 9 | 10 | // Used by ValidatorQuickCheckError to make sure Json types are right. 11 | private def any2json(a: Any): Json = a match { 12 | case i: Int => i.asJson 13 | case l: Long => l.asJson 14 | case f: Float => f.asJson 15 | case d: Double => d.asJson 16 | case s: String => Json.fromString(s) 17 | case b: Boolean => b.asJson 18 | case a: Any => 19 | logger.warn(s"Unknown type `${a.getClass.getCanonicalName}` defaulting to string.") 20 | Json.fromString(a.toString) 21 | } 22 | 23 | // scalastyle:off cyclomatic.complexity 24 | implicit val eventEncoder: Encoder[ValidatorEvent] = new Encoder[ValidatorEvent] { 25 | override def apply(a: ValidatorEvent): Json = a match { 26 | case vc: ValidatorCounter => 27 | Json.obj( 28 | ("type", Json.fromString("counter")), 29 | ("name", Json.fromString(vc.name)), 30 | ("value", Json.fromLong(vc.value)) 31 | ) 32 | case vg: ValidatorGood => 33 | Json.obj( 34 | ("type", Json.fromString("good")), 35 | ("msg", Json.fromString(vg.msg)) 36 | ) 37 | case ve: ValidatorError => 38 | Json.obj( 39 | ("type", Json.fromString("error")), 40 | ("failed", Json.fromBoolean(ve.failed)), 41 | ("msg", Json.fromString(ve.msg)) 42 | ) 43 | case vt: ValidatorTimer => 44 | Json.obj( 45 | ("type", Json.fromString("timer")), 46 | ("label", Json.fromString(vt.label)), 47 | ("ns", Json.fromLong(vt.duration)) 48 | ) 49 | case vce: ValidatorCheckEvent => 50 | Json.obj( 51 | ("type", Json.fromString("checkEvent")), 52 | ("failed", Json.fromBoolean(vce.failed)), 53 | ("label", Json.fromString(vce.label)), 54 | ("count", Json.fromLong(vce.count)), 55 | ("errorCount", Json.fromLong(vce.errorCount)) 56 | ) 57 | case cbvce: ColumnBasedValidatorCheckEvent => 58 | Json.obj( 59 | ("type", Json.fromString("columnBasedCheckEvent")), 60 | ("failed", Json.fromBoolean(cbvce.failed)), 61 | ("message", Json.fromString(cbvce.msg)), 62 | ("data", Json.fromFields(cbvce.data.map(x => (x._1, Json.fromString(x._2))))) 63 | ) 64 | case qce: ValidatorQuickCheckError => 65 | Json.obj( 66 | ("type", Json.fromString("quickCheckError")), 67 | ("failed", Json.fromBoolean(qce.failed)), 68 | ("message", Json.fromString(qce.message)), 69 | ("key", Json.fromFields(qce.key.map(x => (x._1, any2json(x._2))))) 70 | ) 71 | case vs: VarSubEvent => 72 | Json.obj( 73 | ("type", 
Json.fromString("variableSubstitution")), 74 | ("src", Json.fromString(vs.src)), 75 | ("dest", Json.fromString(vs.dest)) 76 | ) 77 | case vs: VarSubJsonEvent => 78 | Json.obj( 79 | ("type", Json.fromString("variableSubstitution")), 80 | ("src", Json.fromString(vs.src)), 81 | ("dest", vs.dest) 82 | ) 83 | case vj: JsonEvent => vj.json 84 | } 85 | } 86 | // scalastyle:on cyclomatic.complexity 87 | 88 | implicit val baseEncoder: Encoder[ValidatorBase] = new Encoder[ValidatorBase] { 89 | final def apply(a: ValidatorBase): Json = a.toJson 90 | } 91 | 92 | implicit val tableEncoder: Encoder[ValidatorTable] = new Encoder[ValidatorTable] { 93 | final override def apply(a: ValidatorTable): Json = a match { 94 | case vh: ValidatorHiveTable => 95 | Json.obj( 96 | ("db", Json.fromString(vh.db)), 97 | ("table", Json.fromString(vh.table)), 98 | ("failed", vh.failed.asJson), 99 | ("keyColumns", vh.keyColumns.asJson), 100 | ("checks", vh.checks.asJson), 101 | ("events", vh.getEvents.asJson) 102 | ) 103 | case vo: ValidatorOrcFile => 104 | Json.obj( 105 | ("orcFile", Json.fromString(vo.orcFile)), 106 | ("failed", vo.failed.asJson), 107 | ("keyColumns", vo.keyColumns.asJson), 108 | ("checks", vo.checks.asJson), 109 | ("events", vo.getEvents.asJson) 110 | ) 111 | case vp: ValidatorParquetFile => 112 | Json.obj( 113 | ("parquetFile", Json.fromString(vp.parquetFile)), 114 | ("failed", vp.failed.asJson), 115 | ("keyColumns", vp.keyColumns.asJson), 116 | ("checks", vp.checks.asJson), 117 | ("events", vp.getEvents.asJson) 118 | ) 119 | case vdf: ValidatorDataFrame => 120 | Json.obj( 121 | ("dfLabel", vdf.label.asJson), 122 | ("failed", vdf.failed.asJson), 123 | ("keyColumns", vdf.keyColumns.asJson), 124 | ("checks", vdf.checks.asJson), 125 | ("events", vdf.getEvents.asJson) 126 | ) 127 | case vcf: ValidatorSpecifiedFormatLoader => 128 | Json.obj( 129 | ("format", Json.fromString(vcf.format)), 130 | ("options", vcf.options.asJson), 131 | ("loadData", vcf.loadData.asJson), 132 | ("failed", vcf.failed.asJson), 133 | ("keyColumns", vcf.keyColumns.asJson), 134 | ("checks", vcf.checks.asJson), 135 | ("events", vcf.getEvents.asJson) 136 | ) 137 | } 138 | } 139 | 140 | implicit val configVarEncoder: Encoder[ConfigVar] = new Encoder[ConfigVar] { 141 | override def apply(a: ConfigVar): Json = a match { 142 | case nv: NameValue => 143 | Json.obj( 144 | ("name", Json.fromString(nv.name)), 145 | ("value", nv.value) 146 | ) 147 | case ne: NameEnv => 148 | Json.obj( 149 | ("name", Json.fromString(ne.name)), 150 | ("env", Json.fromString(ne.env)) 151 | ) 152 | case nshell: NameShell => 153 | Json.obj( 154 | ("name", Json.fromString(nshell.name)), 155 | ("shell", Json.fromString(nshell.shell)) 156 | ) 157 | case nsql: NameSql => 158 | Json.obj( 159 | ("name", Json.fromString(nsql.name)), 160 | ("shell", Json.fromString(nsql.sql)) 161 | ) 162 | case x => 163 | logger.error(s"Unknown configVar type: $x") 164 | throw new RuntimeException(s"Unknown configVar type: $x") 165 | } 166 | } 167 | 168 | implicit val configOutputEncoder: Encoder[ValidatorOutput] = new Encoder[ValidatorOutput] { 169 | override def apply(a: ValidatorOutput): Json = a match { 170 | 171 | case file: FileOutput => 172 | Json.obj( 173 | ("filename", Json.fromString(file.filename)), 174 | ("append", Json.fromBoolean(file.append.getOrElse(false))) 175 | ) 176 | case pipe: PipeOutput => 177 | Json.obj( 178 | ("pipe", Json.fromString(pipe.pipe)), 179 | ("ignoreError", Json.fromBoolean(pipe.ignoreError.getOrElse(false))) 180 | ) 181 | case x => 182 | 
logger.error(s"Unknown output type: $x") 183 | throw new RuntimeException(s"Unknown output type: $x") 184 | } 185 | } 186 | 187 | } 188 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.{parser, Json} 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types._ 7 | 8 | object JsonUtils extends LazyLogging { 9 | 10 | def string2Json(v: String): Json = parser.parse(v) match { 11 | case Right(b) => b 12 | case Left(_) => Json.fromString(v) 13 | } 14 | 15 | // scalastyle:off cyclomatic.complexity 16 | def debugJson(j: Json): String = j match { 17 | case _ if j.isNull => s"Json NULL" 18 | case _ if j.isNumber => s"Json NUM: ${j.asNumber.get}" 19 | case _ if j.isArray => s"Json ARR: ${j.noSpaces}" 20 | case _ if j.isBoolean => s"Json BOOLEAN: ${j.asBoolean.get}" 21 | case _ if j.isObject => s"Json OBJECT: ${j.noSpaces}" 22 | case _ if j.asString.isDefined => s"Json STRING: ${j.asString.get}" 23 | case _ => s"Json UNKNOWN[${j.getClass.getSimpleName}]: ${j.noSpaces}" 24 | } 25 | 26 | /** Turn Row into JSon 27 | * @return 28 | * Json Object 29 | */ 30 | def row2Json(row: Row): Json = { 31 | val fields = row.schema.fieldNames.zipWithIndex.map { case (fieldName, idx) => 32 | (fieldName, row2Json(row, idx)) 33 | } 34 | Json.obj(fields: _*) 35 | } 36 | 37 | /** Take Row, and turn col into Json. 38 | * @return 39 | * Json 40 | */ 41 | def row2Json(row: Row, col: Int): Json = { 42 | val dataType = row.schema(col).dataType 43 | dataType match { 44 | case StringType => Json.fromString(row.getString(col)) 45 | case LongType => Json.fromLong(row.getLong(col)) 46 | case IntegerType => Json.fromInt(row.getInt(col)) 47 | case NullType => Json.Null 48 | case BooleanType => Json.fromBoolean(row.getBoolean(col)) 49 | case DoubleType => Json.fromDoubleOrNull(row.getDouble(col)) 50 | case _: StructType => row2Json(row.getStruct(col)) 51 | case _ => 52 | logger.error( 53 | s"Unimplemented dataType '${dataType.typeName}' in column: ${row.schema(col).name} " + 54 | "Please report this as a bug." 
55 | ) 56 | Json.Null 57 | } 58 | } 59 | // scalastyle:on cyclomatic.complexity 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Main.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.Properties 4 | 5 | import com.typesafe.scalalogging.LazyLogging 6 | import org.apache.log4j.{Level, Logger, PropertyConfigurator} 7 | import org.apache.spark.sql.SparkSession 8 | import scopt.OptionParser 9 | 10 | object Main extends LazyLogging with EventLog { 11 | 12 | def loadConfigRun(mainConfig: CliOptions): (Boolean, Boolean) = 13 | ConfigParser.parseFile(mainConfig.configFilename, mainConfig.vars) match { 14 | case Left(error) => 15 | logger.error(s"Failed to parse config file '${mainConfig.configFilename}, $error") 16 | (true, false) 17 | case Right(validatorConfig) => runChecks(mainConfig, validatorConfig) 18 | } 19 | 20 | def resolveVariables( 21 | spark: SparkSession, 22 | mainConfig: CliOptions, 23 | config: ValidatorConfig, 24 | varSub: VarSubstitution 25 | ): Option[ValidatorConfig] = { 26 | varSub.addMap(mainConfig.vars) 27 | 28 | config.vars match { 29 | case None => config.substituteVariables(varSub) 30 | case Some(vars) => 31 | if (vars.map(_.addEntry(spark, varSub)).exists(x => x)) { 32 | validatorError("Failed to resolve config variables") 33 | None 34 | } else { 35 | config.substituteVariables(varSub) 36 | } 37 | } 38 | } 39 | 40 | private def checkFile(spark: SparkSession, filename: Option[String], append: Boolean): Boolean = { 41 | logger.info(s"filename: $filename append: $append") 42 | if (filename.isDefined) { 43 | logger.info(s"CheckFile $filename") 44 | val ret = filename.exists(!IO.canAppendOrCreate(_, append)(spark)) 45 | logger.info(s"Checking file '${filename.get} append: $append failed: $ret") 46 | if (ret) { 47 | logger.error(s"Filename: ${filename.get} error!") 48 | } 49 | ret 50 | } else { 51 | false 52 | } 53 | } 54 | 55 | def checkCliOutputs(spark: SparkSession, mainConfig: CliOptions): Boolean = { 56 | logger.info(s"Checking Cli Outputs htmlReport: ${mainConfig.htmlReport} jsonReport: ${mainConfig.jsonReport}") 57 | checkFile(spark, mainConfig.htmlReport, append = false) || 58 | checkFile(spark, mainConfig.jsonReport, append = true) 59 | } 60 | 61 | def checkConfig( 62 | spark: SparkSession, 63 | mainConfig: CliOptions, 64 | config: ValidatorConfig, 65 | varSub: VarSubstitution 66 | ): Boolean = checkCliOutputs(spark, mainConfig) || config.configCheck(spark, varSub) 67 | 68 | def runSparkChecks( 69 | spark: SparkSession, 70 | mainConfig: CliOptions, 71 | config: ValidatorConfig, 72 | varSub: VarSubstitution 73 | ): Boolean = { 74 | logger.info("Running sparkChecks") 75 | Seq(config.quickChecks(spark, varSub), config.costlyChecks(spark, varSub)).exists(x => x) 76 | } 77 | 78 | /* 79 | * There are 2 types of errors we return (fatal, validator_status) 80 | * If fatal, we need to System.exit(1) 81 | * Otherwise we print a message `VALIDATOR_STATUS=PASS|FAIL 82 | */ 83 | def runChecks(mainConfig: CliOptions, origConfig: ValidatorConfig): (Boolean, Boolean) = { 84 | val varSub = new VarSubstitution 85 | 86 | implicit val spark: SparkSession = 87 | SparkSession.builder.appName("data-validator").enableHiveSupport().getOrCreate() 88 | 89 | if (mainConfig.verbose) { 90 | logger.info("Verbose Flag detected") 91 | logger.info(s"Original config: $origConfig") 92 | 
Logger.getRootLogger.setLevel(Level.DEBUG) 93 | } 94 | 95 | // Resolve config 96 | val (fatal, validator_fail) = resolveVariables(spark, mainConfig, origConfig, varSub) 97 | .map { config => 98 | val fatal = config.failed || checkConfig(spark, mainConfig, config, varSub) 99 | if (fatal) { 100 | (fatal, false) 101 | } else { 102 | // Result is true in case of validation failure, otherwise false. 103 | val validatorFail = runSparkChecks(spark, mainConfig, config, varSub) 104 | 105 | if (validatorFail || mainConfig.emailOnPass) { 106 | Reports.emailReport(mainConfig, config, varSub) 107 | } 108 | Reports.jsonReport(mainConfig, config, varSub) 109 | 110 | (fatal, validatorFail) 111 | } 112 | } 113 | .getOrElse((true, false)) 114 | spark.stop() 115 | 116 | (fatal, validator_fail) 117 | } 118 | 119 | def configLogging(): Unit = { 120 | val props = new Properties() 121 | props.load(getClass.getResourceAsStream("/log4j-dv-spark.properties")) 122 | // props.list(System.err) 123 | PropertyConfigurator.configure(props) 124 | logger.info("Logging configured!") 125 | } 126 | 127 | def main(args: Array[String]): Unit = { 128 | configLogging() 129 | 130 | val parser = CliOptionParser.parser 131 | 132 | logger.info("Data Validator") 133 | 134 | parser.parse(args, CliOptions()) match { 135 | case Some(cliConfig: CliOptions) => 136 | val (fatal, validatorFail) = loadConfigRun(cliConfig) 137 | 138 | if (fatal || validatorFail) { 139 | logger.error("data-validator failed!") 140 | println("DATA_VALIDATOR_STATUS=FAIL") // scalastyle:ignore 141 | } else { 142 | logger.info("data-validator success!") 143 | println("DATA_VALIDATOR_STATUS=PASS") // scalastyle:ignore 144 | } 145 | 146 | if (fatal || (validatorFail && cliConfig.exitErrorOnFail)) { 147 | System.exit(-1) 148 | } 149 | case None => 150 | logger.error("Failed to Parse Command line Options.") 151 | println("DATA_VALIDATOR_STATUS=FAIL") // scalastyle:ignore 152 | System.exit(-1) 153 | } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Reports.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object Reports extends LazyLogging with EventLog { 7 | 8 | def emailReport( 9 | mainConfig: CliOptions, 10 | config: ValidatorConfig, 11 | varSub: VarSubstitution 12 | )(implicit spark: SparkSession): Unit = { 13 | if (mainConfig.htmlReport.isDefined || config.email.isDefined) { 14 | val htmlReport = config.generateHTMLReport() 15 | 16 | mainConfig.htmlReport.foreach { htmlFilename => 17 | logger.info(s"Writing HTML report to $htmlFilename") 18 | IO.writeHTML(htmlFilename, htmlReport) 19 | } 20 | 21 | config.email.foreach { emailConfig => 22 | logger.info(s"Sending email report emailConfig: $emailConfig") 23 | Emailer.sendHtmlMessage(emailConfig, htmlReport.render) 24 | } 25 | } 26 | } 27 | 28 | def jsonReport( 29 | mainConfig: CliOptions, 30 | config: ValidatorConfig, 31 | varSub: VarSubstitution 32 | )(implicit spark: SparkSession): Unit = { 33 | if (config.outputs.isDefined || mainConfig.jsonReport.isDefined) { 34 | val jsonReport = config.genJsonReport(varSub) 35 | mainConfig.jsonReport.foreach { jsonFilename => 36 | logger.info(s"Writing JSON report to $jsonFilename") 37 | IO.writeJSON(jsonFilename, jsonReport, append = true) 38 | } 39 | 40 | for { 41 | outputs <- config.outputs 42 | out <- 
outputs 43 | } { 44 | if (out.write(jsonReport)) { 45 | val msg = s"ERROR: Failed to write out: $out" 46 | logger.error(msg) 47 | validatorError(msg) 48 | } 49 | } 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/Substitutable.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.Json 5 | 6 | trait Substitutable extends LazyLogging with EventGenerator { 7 | def getVarSub(v: String, field: String, dict: VarSubstitution): String = 8 | dict.replaceVars(v) match { 9 | case Left(newV) => 10 | if (v != newV) { 11 | logger.info(s"Substituting $field var: $v with `$newV`") 12 | addEvent(VarSubEvent(v, newV)) 13 | } 14 | newV 15 | case Right(event) => 16 | addEvent(event) 17 | logger.warn(s"Field: $field msg: $event") 18 | v 19 | } 20 | 21 | def getVarSubJson(j: Json, field: String, dict: VarSubstitution): Json = 22 | dict.replaceJsonVars(j) match { 23 | case Left(newJ) => 24 | if (j != newJ) { 25 | logger.info(s"Substituting Json $field Json: $j with `$newJ`") 26 | addEvent(VarSubJsonEvent(j.toString(), newJ)) 27 | } 28 | newJ 29 | case Right(event) => 30 | addEvent(event) 31 | logger.warn(s"Field: $field msg: $event") 32 | j 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorConfig.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.net.InetAddress 4 | 5 | import com.target.data_validator.EnvironmentVariables.MaybeEnvVar 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.Json 8 | import io.circe.generic.auto._ 9 | import io.circe.syntax._ 10 | import org.apache.spark.sql.SparkSession 11 | 12 | import scala.collection.JavaConverters._ 13 | import scala.util.Try 14 | import scalatags.Text.all._ 15 | 16 | case class ValidatorConfig( 17 | numKeyCols: Int, 18 | numErrorsToReport: Int, 19 | email: Option[EmailConfig], 20 | detailedErrors: Boolean = true, 21 | vars: Option[List[ConfigVar]], 22 | outputs: Option[List[ValidatorOutput]], 23 | tables: List[ValidatorTable] 24 | ) extends LazyLogging { 25 | 26 | def failed: Boolean = tables.exists(_.failed) 27 | 28 | def checkOutputs(session: SparkSession): Boolean = outputs match { 29 | case Some(outs) => outs.map(_.configCheck(session)).exists(x => x) 30 | case None => false 31 | } 32 | 33 | def checkTables(session: SparkSession, dict: VarSubstitution): Boolean = { 34 | val error = tables.map(_.configCheck(session, dict)).exists(x => x) 35 | if (error) { 36 | logger.error("checkTables failed!") 37 | } 38 | error 39 | } 40 | 41 | def configCheck(session: SparkSession, dict: VarSubstitution): Boolean = { 42 | val outputsError = checkOutputs(session) 43 | val tableError = checkTables(session, dict) 44 | if (outputsError || tableError) { 45 | logger.error("configCheck failed!") 46 | } 47 | outputsError || tableError 48 | } 49 | 50 | def quickChecks(session: SparkSession, dict: VarSubstitution): Boolean = { 51 | logger.info("Running Quick Checks...") 52 | tables.map(_.quickChecks(session, dict)(this)).exists(x => x) 53 | } 54 | 55 | def costlyChecks(session: SparkSession, dict: VarSubstitution): Boolean = { 56 | logger.info("Running Costly Checks...") 57 | tables.map(_.costlyChecks(session, 
dict)(this)).exists(x => x) 58 | } 59 | 60 | def generateHTMLReport(): Tag = html(h1("Validator Report"), hr(), tables.map(_.generateHTMLReport())) 61 | 62 | def substituteVariables(varSub: VarSubstitution): Option[ValidatorConfig] = { 63 | logger.info("substituteVariables()") 64 | Some( 65 | this.copy( 66 | email = this.email.map(_.substituteVariables(varSub)), 67 | tables = this.tables.map(_.substituteVariables(varSub)), 68 | outputs = this.outputs.map(_.map(_.substituteVariables(varSub))) 69 | ) 70 | ) 71 | } 72 | 73 | def genJsonReport(varSub: VarSubstitution)(implicit spark: SparkSession): Json = { 74 | import JsonEncoders._ 75 | 76 | Json.obj( 77 | ("numKeyCols", numKeyCols.asJson), 78 | ("numErrorsToReport", numErrorsToReport.asJson), 79 | ("email", email.asJson), 80 | ("detailedErrors", detailedErrors.asJson), 81 | ("vars", vars.asJson), 82 | ("varSubDict", varSub.dict.asJson), 83 | ("failed", failed.asJson), 84 | ("buildInfo", ValidatorConfig.buildInfoJson), 85 | ("runtimeInfo", ValidatorConfig.runtimeInfoJson(spark)), 86 | ("outputs", outputs.asJson), 87 | ("tables", tables.asJson), 88 | ("events", EventLog.events.asJson) 89 | ) 90 | } 91 | } 92 | 93 | object ValidatorConfig { 94 | private def buildInfoJson: Json = Json.obj( 95 | ("name", Json.fromString(BuildInfo.name)), 96 | ("version", Json.fromString(BuildInfo.version)), 97 | ("scalaVersion", Json.fromString(BuildInfo.scalaVersion)), 98 | ("sbtVersion", Json.fromString(BuildInfo.sbtVersion)), 99 | ("sparkVersion", Json.fromString(org.apache.spark.SPARK_VERSION)), 100 | ("javaVersion", Json.fromString(System.getProperty("java.version"))) 101 | ) 102 | 103 | private def propsToJson: Json = { 104 | val props = System.getProperties.asScala.toList.map(x => (x._1, Json.fromString(x._2))) 105 | Json.obj(props: _*) 106 | } 107 | 108 | private def envToJson: Json = { 109 | def extractFromAccessionList(pair: (String, MaybeEnvVar)) = { 110 | pair._1 -> Json.fromString(pair._2.map(_.getOrElse("")).getOrElse("")) 111 | } 112 | 113 | val env = EnvironmentVariables.accessedEnvVars.map(extractFromAccessionList) 114 | Json.obj(env.toSeq: _*) 115 | } 116 | 117 | private def runtimeInfoJson(spark: SparkSession): Json = { 118 | val startTimeMs = spark.sparkContext.startTime 119 | val endTimeMs = System.currentTimeMillis() 120 | val durationMs = endTimeMs - startTimeMs 121 | Json.obj( 122 | ("hostname", Json.fromString(Try(InetAddress.getLocalHost.getHostName).getOrElse("UNKNOWN"))), 123 | ("applicationId", Json.fromString(spark.sparkContext.applicationId)), 124 | ("sparkUser", Json.fromString(spark.sparkContext.sparkUser)), 125 | ("startTimeMs", Json.fromLong(startTimeMs)), 126 | ("endTimeMs", Json.fromLong(endTimeMs)), 127 | ("durationMs", Json.fromLong(durationMs)), 128 | ("properties", propsToJson), 129 | ("environment", envToJson) 130 | ) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorEvent.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import io.circe.Json 6 | 7 | import scalatags.Text 8 | import scalatags.Text.all._ 9 | 10 | trait ValidatorEvent { 11 | def failed: Boolean 12 | def toHTML: Tag 13 | 14 | def failedHTML: Tag = HTMLBits.status(failed) 15 | } 16 | 17 | case class ValidatorCounter(name: String, value: Long) extends ValidatorEvent { 18 | override def failed: Boolean = false 19 | override def 
toHTML: Tag = { 20 | div(cls := "counter")(s"Counter - $name: $value") 21 | } 22 | } 23 | 24 | case class ValidatorError(msg: String) extends ValidatorEvent { 25 | override def failed: Boolean = true 26 | 27 | override def toHTML: Text.all.Tag = div(cls := "error")(failedHTML, msg) 28 | } 29 | 30 | case class ValidatorCheckEvent(failure: Boolean, label: String, count: Long, errorCount: Long) 31 | extends ValidatorEvent { 32 | override def failed: Boolean = failure 33 | 34 | override def toHTML: Text.all.Tag = { 35 | val pct = "%4.2f%%".format((errorCount * 100.0) / count) 36 | div(cls := "checkEvent")(failedHTML, s" - $label count: $count errors: $errorCount pct: $pct") 37 | } 38 | } 39 | 40 | case class ColumnBasedValidatorCheckEvent( 41 | failure: Boolean, 42 | data: Map[String, String], 43 | msg: String 44 | ) extends ValidatorEvent { 45 | override def failed: Boolean = failure 46 | 47 | override def toHTML: Text.all.Tag = { 48 | div(cls := "checkEvent")(failedHTML, s" - $msg") 49 | } 50 | } 51 | 52 | class ValidatorTimer(val label: String) extends ValidatorEvent { 53 | var duration = 0L 54 | 55 | override def failed: Boolean = false 56 | 57 | def time[R](block: => R): R = { 58 | val start = System.nanoTime() 59 | val result = 60 | try { 61 | block 62 | } finally { 63 | duration = System.nanoTime() - start 64 | } 65 | result 66 | } 67 | 68 | def toSecs: Long = TimeUnit.SECONDS.convert(duration, TimeUnit.NANOSECONDS) 69 | 70 | override def toHTML: Text.all.Tag = div(cls := "timer")(s"Timer: $label took $toSecs seconds.") 71 | 72 | override def toString: String = s"Time: $label Duration: $toSecs seconds" 73 | } 74 | 75 | case class ValidatorQuickCheckError(key: List[(String, Any)], value: Any, message: String) extends ValidatorEvent { 76 | override def failed: Boolean = true 77 | override def toHTML: Text.all.Tag = div(cls := "quickCheckError")(failedHTML, " - " + toString) 78 | 79 | def keyToString: String = "{" + key.map { case (c, v) => s"$c:$v" }.mkString(", ") + "}" 80 | 81 | override def toString: String = { 82 | val vStr = Option(value).getOrElse("(NULL)").toString 83 | s"ValidatorQuickCheckError(key: $keyToString, value: $vStr msg: $message)" 84 | } 85 | } 86 | 87 | case class ValidatorGood(msg: String) extends ValidatorEvent { 88 | override def failed: Boolean = false 89 | override def toString: String = msg 90 | override def toHTML: Text.all.Tag = div(cls := "good")(msg) 91 | } 92 | 93 | case class VarSubEvent(src: String, dest: String) extends ValidatorEvent { 94 | override def failed: Boolean = false 95 | override def toString: String = s"VarSub src: $src dest: $dest" 96 | override def toHTML: Text.all.Tag = div(cls := "subEvent")(toString) 97 | } 98 | 99 | case class VarSubJsonEvent(src: String, dest: Json) extends ValidatorEvent { 100 | override def failed: Boolean = false 101 | override def toString: String = s"VarSub src: $src dest: ${dest.noSpaces}" 102 | override def toHTML: Text.all.Tag = div(cls := "subEvent")(toString) 103 | } 104 | 105 | case class JsonEvent(json: Json) extends ValidatorEvent { 106 | override def failed: Boolean = false 107 | override def toString: String = s"JsonEvent: json:${json.noSpaces}" 108 | override def toHTML: Text.all.Tag = div(cls := "jsonEvent")(toString) 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/ValidatorOutput.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | 
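// --- Illustrative sketch (not part of ValidatorOutput.scala): how the ValidatorTimer
// defined in ValidatorEvent.scala above is used. Wrapping a block in time(...) captures
// the elapsed nanoseconds even if the block throws; the label and workload here are
// hypothetical.
object ValidatorTimerExample {
  val timer = new ValidatorTimer("count multiples of seven")
  val multiples: Int = timer.time {
    (1 to 1000000).count(_ % 7 == 0)
  }
  // timer.toSecs converts the captured duration to whole seconds, and toString renders
  // something like "Time: count multiples of seven Duration: 0 seconds".
}
// ---------------------------------------------------------------------------------------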
import cats.syntax.functor._ 4 | import io.circe.{Decoder, Json} 5 | import io.circe.generic.auto._ 6 | import org.apache.spark.sql.SparkSession 7 | 8 | abstract class ValidatorOutput extends Substitutable with EventLog { 9 | def write(json: Json)(implicit spark: SparkSession): Boolean 10 | def substituteVariables(dict: VarSubstitution): ValidatorOutput 11 | def configCheck(spark: SparkSession): Boolean 12 | } 13 | 14 | case class PipeOutput(pipe: String, ignoreError: Option[Boolean]) extends ValidatorOutput { 15 | 16 | override def write(json: Json)(implicit spark: SparkSession): Boolean = { 17 | logger.info(s"Piping json output to '$pipe' ignoreError: ${ignoreError.getOrElse(false)}") 18 | val timer = new ValidatorTimer(s"PipeOutput($pipe)") 19 | addEvent(timer) 20 | 21 | val (fail, out, err) = timer.time(IO.writeStringToPipe(pipe, json.noSpaces)) 22 | 23 | if (fail) { 24 | logger.error(s"Program `$pipe` failed!") 25 | if (out.isEmpty) { 26 | logger.error("stdout empty!") 27 | } else { 28 | out.foreach(o => logger.error(s"stdout: $o")) 29 | } 30 | if (err.isEmpty) { 31 | logger.error("stderr empty!") 32 | } else { 33 | err.foreach(o => logger.error(s"stderr: $o")) 34 | } 35 | !ignoreError.getOrElse(false) 36 | } else { 37 | false 38 | } 39 | } 40 | 41 | override def substituteVariables(dict: VarSubstitution): ValidatorOutput = 42 | this.copy(getVarSub(pipe, "pipe", dict)) 43 | 44 | override def configCheck(spark: SparkSession): Boolean = { 45 | val ret = IO.canExecute(pipe.split("\\s").head)(spark) 46 | if (!ret) { 47 | val msg = s"Pipe:'$pipe' not executable!" 48 | validatorError(msg) 49 | } 50 | !ret 51 | } 52 | } 53 | 54 | case class FileOutput(filename: String, append: Option[Boolean]) extends ValidatorOutput { 55 | 56 | override def write(json: Json)(implicit spark: SparkSession): Boolean = { 57 | logger.info(s"Writing json output to file '$filename append: ${append.getOrElse(false)}") 58 | val timer = new ValidatorTimer(s"FileOutput($filename)") 59 | timer.time(IO.writeJSON(filename, json, append.getOrElse(false))) 60 | } 61 | 62 | override def substituteVariables(dict: VarSubstitution): ValidatorOutput = 63 | this.copy(getVarSub(filename, "filename", dict)) 64 | override def configCheck(spark: SparkSession): Boolean = { 65 | val ret = IO.canAppendOrCreate(filename, append.getOrElse(false))(spark) 66 | if (!ret) { 67 | val msg = s"FileOutput '$filename' append: $append cannot write or append!" 68 | logger.error(msg) 69 | validatorError(msg) 70 | } 71 | !ret 72 | } 73 | } 74 | 75 | object ValidatorOutput { 76 | implicit val decodeOutputs: Decoder[ValidatorOutput] = List[Decoder[ValidatorOutput]]( 77 | Decoder[PipeOutput].widen, 78 | Decoder[FileOutput].widen 79 | ).reduce(_ or _) 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/VarSubstitution.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import io.circe.Json 5 | 6 | import scala.collection.mutable 7 | 8 | // Helper Class to handle variable substitution and manage dict of (k,v) 9 | 10 | class VarSubstitution() extends LazyLogging { 11 | import VarSubstitution._ 12 | 13 | val dict = new mutable.HashMap[String, Json]() 14 | 15 | /** Adds (k,v) to dictionary. 16 | * 17 | * @param key 18 | * \- key of value in dictionary. 19 | * @param value 20 | * \- value of key in dictionary. 21 | * @return 22 | * True on error. 
23 | */ 24 | def add(key: String, value: Json): Boolean = { 25 | if (VAR_REGEX.findFirstIn(key).isEmpty) { 26 | logger.error(s"Bad key: $key, must follow variable rules.") 27 | true 28 | } else if (value.asString.exists(VAR_BODY_REGEX.findFirstIn(_).isDefined)) { 29 | logger.error(s"Cannot have variable defined in value: $value!") 30 | true 31 | } else { 32 | if (dict.contains(key)) { 33 | logger.warn(s"Dict already contains key: '$key' current: '${dict(key)}' v: '$value' not overriding.") 34 | true 35 | } else { 36 | dict += (key -> value) 37 | false 38 | } 39 | } 40 | } 41 | 42 | /** Adds a String to dictionary. 43 | * @param value 44 | * \- gets converted to Json 45 | * @return 46 | * True on error 47 | */ 48 | def addString(key: String, value: String): Boolean = { 49 | replaceVars(value) match { 50 | case Left(newValue) => add(key, JsonUtils.string2Json(newValue)) 51 | case Right(_) => true // Bug: Ignoring the ValidatorError 52 | } 53 | } 54 | 55 | /** Removes key from dictionary. 56 | * 57 | * @param k 58 | * \- key to be removed. 59 | * @return 60 | * True on error. 61 | */ 62 | def remove(k: String): Boolean = { 63 | if (dict.contains(k)) { 64 | dict.remove(k) 65 | false 66 | } else { 67 | logger.warn(s"remove(k:$k) Dict doesn't contain specified key.") 68 | true 69 | } 70 | } 71 | 72 | /** replaces variables in String. 73 | * 74 | * @param s 75 | * \- string to replace variables in. 76 | * @return 77 | * Left(new string) on Success, Right(ValidatorEvent) on Error 78 | */ 79 | def replaceVars(s: String): Either[String, ValidatorEvent] = { 80 | val variableJson = findVars(s).toSeq.map(x => (x, getVarName(x).flatMap(dict.get))) 81 | val (foundVariableJson, missingVariableJson) = variableJson.partition(_._2.isDefined) 82 | foundVariableJson.foreach(x => logger.debug(s"foundVar: $x")) 83 | missingVariableJson.foreach(x => logger.debug(s"missingVar: $x")) 84 | 85 | val newString = foundVariableJson.foldRight(s) { (vj, ns) => 86 | val (variable, json) = vj 87 | logger.debug(s"accum: $ns variable: $variable json: $json") 88 | val replacement = jsonToString(json.get) 89 | replaceAll(ns, variable, replacement) 90 | } 91 | 92 | val errs = missingVariableJson.map(x => x._1) 93 | errs.foreach(x => logger.debug(s"errs: $x")) 94 | if (errs.nonEmpty) { 95 | Right( 96 | ValidatorError( 97 | "VariableSubstitution: Can't find values for the following keys, " + 98 | s"${errs.flatMap(getVarName).mkString(",")}" 99 | ) 100 | ) 101 | } else { 102 | if (s != newString) { 103 | logger.debug(s"Replaced '$s' with '$newString'") 104 | } 105 | Left(newString) 106 | } 107 | } 108 | 109 | private def jsonToString(j: Json): String = { 110 | if (j.isString) { 111 | j.asString.get 112 | } else { 113 | j.toString() 114 | } 115 | } 116 | 117 | def replaceJsonVars(j: Json): Either[Json, ValidatorEvent] = { 118 | if (j.isString) { 119 | replaceVars(j.asString.get).left.map(JsonUtils.string2Json) 120 | } else { 121 | // Since variables are only in Strings, return j. 
122 | Left(j) 123 | } 124 | } 125 | 126 | private def logDupKeys(k: String, v: String): Unit = { 127 | logger.info(s"Adding dict entry k: $k v:`$v`") 128 | if (dict.contains(k)) logger.warn(s"Replacing key: $k old: ${dict(k)} with new: $v") 129 | } 130 | 131 | /** Adds the map m to dict 132 | */ 133 | def addMap(m: Map[String, String]): Unit = { 134 | val kj = m.map { case (k, v) => 135 | logDupKeys(k, v) 136 | (k, JsonUtils.string2Json(v)) 137 | } 138 | dict ++= kj 139 | } 140 | 141 | override def equals(obj: Any): Boolean = 142 | obj.isInstanceOf[VarSubstitution] && obj.asInstanceOf[VarSubstitution].dict == dict 143 | 144 | override def hashCode(): Int = dict.hashCode() 145 | } 146 | 147 | object VarSubstitution extends LazyLogging { 148 | private val VAR_REGEX_STR = "[A-Za-z][A-Za-z0-9_]*" 149 | private val VAR_REGEX = VAR_REGEX_STR.r 150 | private val VAR_BODY_REGEX = ("\\$" + VAR_REGEX_STR + "|\\$\\{" + VAR_REGEX_STR + "\\}").r 151 | 152 | def findVars(s: String): Set[String] = { 153 | VAR_BODY_REGEX.findAllIn(s).toSet 154 | } 155 | 156 | /** Checks if s is a variable. 157 | * @param s 158 | * \- string to check 159 | * @return 160 | * true if s is a variable. 161 | */ 162 | def isVariable(s: String): Boolean = s.startsWith("$") && VAR_BODY_REGEX.findFirstMatchIn(s).isDefined 163 | 164 | /** Replaces all the occurrences of oldVal in src with newVal. 165 | * 166 | * @param src 167 | * \- source string that contains values to be replaced. 168 | * @param oldVal 169 | * \- old Value that will be replaced by newValue. 170 | * @param newVal 171 | * \- new Value that will replace oldValue. 172 | * @return 173 | * new string with newVal were oldVal was. 174 | */ 175 | def replaceAll(src: String, oldVal: String, newVal: String): String = { 176 | val buf = new StringBuffer(src) 177 | var idx = buf.indexOf(oldVal) 178 | while (idx >= 0) { 179 | buf.replace(idx, idx + oldVal.length, newVal) 180 | idx = buf.indexOf(oldVal, idx + newVal.length) 181 | } 182 | val ret = buf.toString 183 | logger.debug(s"src: $src oldVal: $oldVal newVal: $newVal ret: $ret") 184 | ret 185 | } 186 | 187 | /** gets the variable name from the variable. ie "$\{foo\}" returns "foo" 188 | * 189 | * @param rawVar 190 | * \- variable with control chars. 
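// --- Illustrative sketch of the VarSubstitution API defined above (not part of this
// file's source; the keys and template string are hypothetical). Note the unusual Either
// convention: replaceVars returns Left on success and Right(ValidatorEvent) on error.
//
//   import io.circe.Json
//
//   val vars = new VarSubstitution
//   vars.addString("env", "prod")            // returns false => added successfully
//   vars.add("numRows", Json.fromInt(100))   // returns false => added successfully
//
//   vars.replaceVars("select * from ${env}_db.events limit $numRows")
//   // => Left("select * from prod_db.events limit 100")
//
//   VarSubstitution.findVars("path/${env}/file")
//   // => Set("${env}") -- only well-formed $var / ${var} references are recognized
// ---------------------------------------------------------------------------------------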
191 | * @return 192 | * variable without '$' or '{','}' 193 | */ 194 | def getVarName(rawVar: String): Option[String] = { 195 | val ret = if (rawVar.startsWith("${")) { 196 | if (rawVar.endsWith("}")) { 197 | Some(rawVar.substring(2, rawVar.length - 1)) 198 | } else { 199 | None 200 | } 201 | } else if (rawVar.startsWith("$")) { 202 | Some(rawVar.substring(1)) 203 | } else { 204 | logger.error(s"Illegal Variable $rawVar") 205 | None 206 | } 207 | logger.debug(s"getVarName(K: $rawVar) ret: $ret") 208 | ret 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/Bin.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class Bin(lowerBound: Double, upperBound: Double, count: Long) 4 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/CompleteStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import io.circe._ 4 | import io.circe.generic.semiauto._ 5 | 6 | case class CompleteStats( 7 | name: String, 8 | column: String, 9 | count: Long, 10 | mean: Double, 11 | min: Double, 12 | max: Double, 13 | stdDev: Double, 14 | histogram: Histogram 15 | ) 16 | 17 | object CompleteStats { 18 | implicit val binEncoder: Encoder[Bin] = deriveEncoder 19 | implicit val histogramEncoder: Encoder[Histogram] = deriveEncoder 20 | implicit val encoder: Encoder[CompleteStats] = deriveEncoder 21 | 22 | implicit val binDecoder: Decoder[Bin] = deriveDecoder 23 | implicit val histogramDecoder: Decoder[Histogram] = deriveDecoder 24 | implicit val decoder: Decoder[CompleteStats] = deriveDecoder 25 | 26 | def apply( 27 | name: String, 28 | column: String, 29 | firstPassStats: FirstPassStats, 30 | secondPassStats: SecondPassStats 31 | ): CompleteStats = CompleteStats( 32 | name = name, 33 | column = column, 34 | count = firstPassStats.count, 35 | mean = firstPassStats.mean, 36 | min = firstPassStats.min, 37 | max = firstPassStats.max, 38 | stdDev = secondPassStats.stdDev, 39 | histogram = secondPassStats.histogram 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/FirstPassStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.catalyst.ScalaReflection 5 | import org.apache.spark.sql.types.DataType 6 | 7 | case class FirstPassStats(count: Long, mean: Double, min: Double, max: Double) 8 | 9 | object FirstPassStats { 10 | def dataType: DataType = ScalaReflection 11 | .schemaFor[FirstPassStats] 12 | .dataType 13 | 14 | /** Convert from Spark SQL row format to case class [[FirstPassStats]] format. 
15 | * 16 | * @param row 17 | * a complex column of [[org.apache.spark.sql.types.StructType]] output of [[FirstPassStatsAggregator]] 18 | * @return 19 | * struct format converted to [[FirstPassStats]] 20 | */ 21 | def fromRowRepr(row: Row): FirstPassStats = { 22 | FirstPassStats( 23 | count = row.getLong(0), 24 | mean = row.getDouble(1), 25 | min = row.getDouble(2), 26 | max = row.getDouble(3) 27 | ) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/FirstPassStatsAggregator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 5 | import org.apache.spark.sql.types._ 6 | 7 | /** Calculate the count, mean, min and maximum values of a numeric column. 8 | */ 9 | class FirstPassStatsAggregator extends UserDefinedAggregateFunction { 10 | 11 | /** input is a single column of `DoubleType` 12 | */ 13 | override def inputSchema: StructType = new StructType().add("value", DoubleType) 14 | 15 | /** buffer keeps state for the count, sum, min and max 16 | */ 17 | override def bufferSchema: StructType = new StructType() 18 | .add(StructField("count", LongType)) 19 | .add(StructField("sum", DoubleType)) 20 | .add(StructField("min", DoubleType)) 21 | .add(StructField("max", DoubleType)) 22 | 23 | private val count = bufferSchema.fieldIndex("count") 24 | private val sum = bufferSchema.fieldIndex("sum") 25 | private val min = bufferSchema.fieldIndex("min") 26 | private val max = bufferSchema.fieldIndex("max") 27 | 28 | /** specifies the return type when using the UDAF 29 | */ 30 | override def dataType: DataType = FirstPassStats.dataType 31 | 32 | /** These calculations are deterministic 33 | */ 34 | override def deterministic: Boolean = true 35 | 36 | /** set the initial values for count, sum, min and max 37 | */ 38 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 39 | buffer(count) = 0L 40 | buffer(sum) = 0.0 41 | buffer(min) = Double.MaxValue 42 | buffer(max) = Double.MinValue 43 | } 44 | 45 | /** update the count, sum, min and max buffer values 46 | */ 47 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 48 | buffer(count) = buffer.getLong(count) + 1 49 | buffer(sum) = buffer.getDouble(sum) + input.getDouble(0) 50 | buffer(min) = math.min(input.getDouble(0), buffer.getDouble(min)) 51 | buffer(max) = math.max(input.getDouble(0), buffer.getDouble(max)) 52 | } 53 | 54 | /** reduce the count, sum, min and max values of two buffers 55 | */ 56 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 57 | buffer1(count) = buffer1.getLong(count) + buffer2.getLong(count) 58 | buffer1(sum) = buffer1.getDouble(sum) + buffer2.getDouble(sum) 59 | buffer1(min) = math.min(buffer1.getDouble(min), buffer2.getDouble(min)) 60 | buffer1(max) = math.max(buffer1.getDouble(max), buffer2.getDouble(max)) 61 | } 62 | 63 | /** evaluate the count, mean, min and max values of a column 64 | */ 65 | override def evaluate(buffer: Row): Any = { 66 | FirstPassStats( 67 | buffer.getLong(count), 68 | buffer.getDouble(sum) / buffer.getLong(count), 69 | buffer.getDouble(min), 70 | buffer.getDouble(max) 71 | ) 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- 
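// --- Illustrative sketch (not part of the repository source): applying
// FirstPassStatsAggregator, defined above, to a numeric column and converting the
// resulting struct back into FirstPassStats. The DataFrame and the column name "price"
// are hypothetical.
object FirstPassStatsExample {
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.col

  def firstPass(df: DataFrame): FirstPassStats = {
    val agg = new FirstPassStatsAggregator
    val statsRow = df.agg(agg(col("price")).as("stats")).head.getStruct(0)
    FirstPassStats.fromRowRepr(statsRow) // count, mean, min, max
  }
}
// ---------------------------------------------------------------------------------------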
/src/main/scala/com/target/data_validator/stats/Histogram.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class Histogram(bins: Seq[Bin]) 4 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/SecondPassStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.catalyst.ScalaReflection 5 | import org.apache.spark.sql.types.DataType 6 | 7 | case class SecondPassStats(stdDev: Double, histogram: Histogram) 8 | 9 | object SecondPassStats { 10 | def dataType: DataType = ScalaReflection 11 | .schemaFor[SecondPassStats] 12 | .dataType 13 | 14 | /** Convert from Spark SQL row format to case class [[SecondPassStats]] format. 15 | * 16 | * @param row 17 | * a complex column of [[org.apache.spark.sql.types.StructType]] output of [[SecondPassStatsAggregator]] 18 | * @return 19 | * struct format converted to [[SecondPassStats]] 20 | */ 21 | def fromRowRepr(row: Row): SecondPassStats = { 22 | SecondPassStats( 23 | stdDev = row.getDouble(0), 24 | histogram = Histogram( 25 | row.getStruct(1).getSeq[Row](0) map { bin => 26 | Bin( 27 | lowerBound = bin.getDouble(0), 28 | upperBound = bin.getDouble(1), 29 | count = bin.getLong(2) 30 | ) 31 | } 32 | ) 33 | ) 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/stats/SecondPassStatsAggregator.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 5 | import org.apache.spark.sql.types._ 6 | 7 | /** Calculate the standard deviation and histogram of a numeric column 8 | */ 9 | class SecondPassStatsAggregator(firstPassStats: FirstPassStats) extends UserDefinedAggregateFunction { 10 | 11 | val NUMBER_OF_BINS = 10 12 | 13 | private val binSize = (firstPassStats.max - firstPassStats.min) / NUMBER_OF_BINS 14 | private val upperBounds = for (i <- 1 to NUMBER_OF_BINS) yield { firstPassStats.min + i * binSize } 15 | 16 | /** input is a single column of `DoubleType` 17 | */ 18 | override def inputSchema: StructType = new StructType().add("value", DoubleType) 19 | 20 | /** buffer keeps state for the total count, sumOfSquares, and individual bin counts 21 | */ 22 | override def bufferSchema: StructType = StructType( 23 | List( 24 | StructField("count", LongType), 25 | StructField("sumOfSquares", DoubleType), 26 | StructField("bin1count", LongType), 27 | StructField("bin2count", LongType), 28 | StructField("bin3count", LongType), 29 | StructField("bin4count", LongType), 30 | StructField("bin5count", LongType), 31 | StructField("bin6count", LongType), 32 | StructField("bin7count", LongType), 33 | StructField("bin8count", LongType), 34 | StructField("bin9count", LongType), 35 | StructField("bin10count", LongType) 36 | ) 37 | ) 38 | 39 | private val count = bufferSchema.fieldIndex("count") 40 | private val sumOfSquares = bufferSchema.fieldIndex("sumOfSquares") 41 | private val binStart = bufferSchema.fieldIndex("bin1count") 42 | private val binEnd = bufferSchema.fieldIndex("bin10count") 43 | 44 | /** specifies the return type when using the UDAF 45 | 
*/ 46 | override def dataType: DataType = SecondPassStats.dataType 47 | 48 | /** these calculations are deterministic 49 | */ 50 | override def deterministic: Boolean = true 51 | 52 | /** set the initial values for count, sum of squares and individual bin counts 53 | */ 54 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 55 | buffer(count) = 0L 56 | buffer(sumOfSquares) = 0.0 57 | for (i <- binStart to binEnd) { buffer(i) = 0L } 58 | } 59 | 60 | /** update the count, sum of squares and individual bin counts 61 | */ 62 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 63 | buffer(count) = buffer.getLong(count) + 1 64 | buffer(sumOfSquares) = buffer.getDouble(sumOfSquares) + math.pow(input.getDouble(0) - firstPassStats.mean, 2) 65 | // determine the index of the bin that we should increment 66 | val binIndex = 67 | binStart + math.min(NUMBER_OF_BINS - 1, math.floor((input.getDouble(0) - firstPassStats.min) / binSize).toInt) 68 | buffer(binIndex) = buffer.getLong(binIndex) + 1 69 | } 70 | 71 | /** reduce the count, sum of squares and individual bin counts 72 | */ 73 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 74 | buffer1(count) = buffer1.getLong(count) + buffer2.getLong(count) 75 | buffer1(sumOfSquares) = buffer1.getDouble(sumOfSquares) + buffer2.getDouble(sumOfSquares) 76 | for (i <- binStart to binEnd) { 77 | buffer1(i) = buffer1.getLong(i) + buffer2.getLong(i) 78 | } 79 | } 80 | 81 | /** evaluate the standard deviation and define bins of histogram 82 | */ 83 | override def evaluate(buffer: Row): Any = { 84 | val bins: Seq[Bin] = for (i <- binStart to binEnd) yield { 85 | val bIndex = i - binStart 86 | i match { 87 | case start if i == binStart => Bin(firstPassStats.min, upperBounds(bIndex), buffer.getLong(start)) 88 | case end if i == binEnd => Bin(upperBounds(bIndex - 1), firstPassStats.max, buffer.getLong(end)) 89 | case _ => Bin(upperBounds(bIndex - 1), upperBounds(bIndex), buffer.getLong(i)) 90 | } 91 | } 92 | SecondPassStats( 93 | math.sqrt(buffer.getDouble(sumOfSquares) / (buffer.getLong(count) - 1)), 94 | Histogram(bins) 95 | ) 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColStats.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import com.target.data_validator.stats._ 5 | import io.circe 6 | import io.circe.Json 7 | import io.circe.generic.semiauto._ 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions.Expression 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.types.{NumericType, StructType} 14 | 15 | import scala.concurrent.Promise 16 | import scala.util._ 17 | 18 | /** This validator implements a set of column metrics on a specified column by performing two I/O passes over the 19 | * input table. 
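// --- Worked example of the histogram binning in SecondPassStatsAggregator above (the
// numbers are hypothetical and this is not part of ColStats.scala):
//   with firstPassStats.min = 0.0, firstPassStats.max = 100.0 and NUMBER_OF_BINS = 10,
//   binSize = (100.0 - 0.0) / 10 = 10.0
//   value 37.0  -> floor((37.0 - 0.0) / 10.0) = 3           -> increments bin4count
//   value 100.0 -> floor(100.0 / 10.0) = 10, clamped by
//                  math.min(NUMBER_OF_BINS - 1, _) to 9     -> increments bin10count,
//                  so the column maximum lands in the last bin instead of overflowing.
// ---------------------------------------------------------------------------------------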
20 | * 21 | * @param column 22 | * the column to collect stats on 23 | */ 24 | final case class ColStats(column: String) extends TwoPassCheapCheck { 25 | import ValidatorBase._ 26 | import com.target.data_validator.JsonEncoders.eventEncoder 27 | 28 | private val tempColumnName = column + "_" + hashCode 29 | 30 | override def name: String = "colstats" 31 | 32 | private val promiseToDoFirstPass: Promise[FirstPassStats] = Promise() 33 | 34 | // invoke only after first pass has completed 35 | private def unsafeFirstPassAccessor = promiseToDoFirstPass.future.value match { 36 | case None => 37 | throw new IllegalStateException( 38 | "ColStats costly histograms requires that the pre-processing projections " + 39 | "are executed first to generate first pass stats." 40 | ) 41 | case Some(Success(firstPassStats)) => 42 | firstPassStats 43 | case Some(Failure(e)) => 44 | throw e 45 | } 46 | 47 | // expression to aggregate first pass of stats 48 | override def firstPassSelect(): Column = { 49 | val firstPassAgg = new FirstPassStatsAggregator 50 | firstPassAgg(new Column(UnresolvedAttribute(column))) as tempColumnName 51 | } 52 | 53 | // extract first pass stats from output row 54 | override def sinkFirstPassRow(row: Row): Unit = { 55 | promiseToDoFirstPass complete Try { 56 | val rStats = row.getAs[Row](tempColumnName) 57 | FirstPassStats.fromRowRepr(rStats) 58 | } 59 | } 60 | 61 | // generate second pass ("Quick Check") expression 62 | // NOTE: this call implicitly REQUIRES that the first pass has completed 63 | override def select(schema: StructType, dict: VarSubstitution): Expression = { 64 | val agg = new SecondPassStatsAggregator(unsafeFirstPassAccessor) 65 | agg(col(column)).expr 66 | } 67 | 68 | // construct complete stats from quick check output row 69 | override def quickCheck(r: Row, count: Long, idx: Int): Boolean = { 70 | val rStats = r.getAs[Row](idx) 71 | val secondPassStats = SecondPassStats.fromRowRepr(rStats) 72 | val completeStats = CompleteStats( 73 | name = s"`$column` stats", 74 | column = column, 75 | firstPassStats = unsafeFirstPassAccessor, 76 | secondPassStats = secondPassStats 77 | ) 78 | 79 | val json = completeStats.asJson 80 | logger.info(s"VarJsonEvent:${json.spaces2}") 81 | addEvent(JsonEvent(json)) 82 | 83 | false 84 | } 85 | 86 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = this 87 | 88 | override def configCheck(df: DataFrame): Boolean = { 89 | if (isColumnInDataFrame(df, column)) { 90 | df.schema(column).dataType match { 91 | case _: NumericType => false 92 | case badType => 93 | val msg = s"Column $name type:$badType is not Numeric." 94 | logger.error(msg) 95 | addEvent(ValidatorError(msg)) 96 | failed 97 | } 98 | } else { 99 | val msg = s"Column $name not in data frame." 
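// --- Illustrative sketch of the two-pass protocol above (TwoPassCheapCheck), driven by
// hand; not part of ColStats.scala. The DataFrame `df` and column "price" are
// hypothetical, and in practice the passes are orchestrated by the surrounding validator
// machinery rather than called directly.
//
//   import org.apache.spark.sql.Column
//
//   val check = ColStats("price")
//   // Pass 1: aggregate count/mean/min/max and hand the result row back to the check.
//   check.sinkFirstPassRow(df.agg(check.firstPassSelect()).head)
//   // Pass 2: the quick-check expression can only be built once pass-1 stats exist.
//   val secondPassRow = df.agg(new Column(check.select(df.schema, new VarSubstitution))).head
//   check.quickCheck(secondPassRow, count = df.count(), idx = 0) // records a JsonEvent; never fails
// ---------------------------------------------------------------------------------------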
100 | logger.error(msg) 101 | addEvent(ValidatorError(msg)) 102 | failed 103 | } 104 | } 105 | 106 | override def toJson: Json = Json.obj( 107 | ("type", Json.fromString("colstats")), 108 | ("column", Json.fromString(column)), 109 | ("failed", Json.fromBoolean(failed)), 110 | ("events", this.getEvents.asJson) 111 | ) 112 | 113 | } 114 | 115 | object ColStats { 116 | implicit val encoder: circe.Encoder[ColumnSumCheck] = deriveEncoder[ColumnSumCheck] 117 | implicit val decoder: circe.Decoder[ColumnSumCheck] = deriveDecoder[ColumnSumCheck] 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColumnBased.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ColumnBasedValidatorCheckEvent, ValidatorCounter, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonEncoders.eventEncoder 5 | import io.circe.Json 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{DataFrame, Row} 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.Expression 10 | import org.apache.spark.sql.catalyst.expressions.aggregate.Max 11 | import org.apache.spark.sql.types._ 12 | 13 | import scala.collection.immutable.ListMap 14 | import scala.math.abs 15 | 16 | abstract class ColumnBased(column: String, condTest: Expression) extends CheapCheck { 17 | override def select(schema: StructType, dict: VarSubstitution): Expression = condTest 18 | 19 | // ColumnBased checks don't have per row error details. 20 | def hasQuickErrorDetails: Boolean = false 21 | 22 | // calculates and returns the pct error as a string 23 | def calculatePctError(expected: Double, actual: Double, formatStr: String = "%4.2f%%"): String = { 24 | 25 | if (expected == actual) { 26 | formatStr.format(0.00) // if expected == actual, error % should be 0, even if expected is 0 27 | } else if (expected == 0.0) { 28 | "undefined" 29 | } else { 30 | val pct = abs(((expected - actual) * 100.0) / expected) 31 | formatStr.format(pct) 32 | } 33 | } 34 | } 35 | 36 | case class MinNumRows(minNumRows: Json) extends ColumnBased("", ValidatorBase.L0) { 37 | override def name: String = "MinNumRows" 38 | 39 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 40 | val ret = MinNumRows(getVarSubJson(minNumRows, "minNumRows", dict)) 41 | getEvents.foreach(ret.addEvent) 42 | ret 43 | } 44 | 45 | override def configCheck(df: DataFrame): Boolean = { 46 | 47 | def notNaturalNumber(): Unit = { 48 | val msg = "minNumRows must be a natural number" 49 | logger.error(msg) 50 | addEvent(ValidatorError(msg)) 51 | } 52 | 53 | minNumRows.asNumber match { 54 | case Some(jsonNumber) => 55 | jsonNumber.toLong match { 56 | case Some(x) if x > 0 => 57 | case _ => notNaturalNumber() 58 | } 59 | case _ => notNaturalNumber() 60 | } 61 | failed 62 | } 63 | 64 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 65 | // Convert to `JsonNumber` then to `Long` 66 | // safe because already handled in `configCheck` 67 | val minNumRowsLong = minNumRows.asNumber.get.toLong.get 68 | 69 | failed = count < minNumRowsLong 70 | val pctError = if (failed) calculatePctError(minNumRowsLong, count) else "0.00%" 71 | addEvent(ValidatorCounter("rowCount", count)) 72 | val msg = s"MinNumRowsCheck Expected: $minNumRows Actual: $count Relative Error: $pctError" 73 | 
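// --- Worked example of calculatePctError above (values hypothetical; not part of
// ColumnBased.scala):
//   calculatePctError(1000.0, 850.0)  // abs((1000 - 850) * 100.0 / 1000) => "15.00%"
//   calculatePctError(0.0, 42.0)      // expected == 0                    => "undefined"
//   calculatePctError(7.0, 7.0)       // expected == actual               => "0.00%"
// ---------------------------------------------------------------------------------------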
val data = ListMap("expected" -> minNumRows.toString, "actual" -> count.toString, "relative_error" -> pctError) 74 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, msg)) 75 | failed 76 | } 77 | 78 | override def toJson: Json = Json.obj( 79 | ("type", Json.fromString("rowCount")), 80 | ("minNumRows", minNumRows), 81 | ("failed", Json.fromBoolean(failed)), 82 | ("events", this.getEvents.asJson) 83 | ) 84 | 85 | override def toString: String = name + s"(minNumRows: $minNumRows)" 86 | } 87 | 88 | case class ColumnMaxCheck(column: String, value: Json) 89 | extends ColumnBased(column, Max(UnresolvedAttribute.quoted(column)).toAggregateExpression()) { 90 | 91 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 92 | val ret = copy(column = getVarSub(column, "column", dict), value = getVarSubJson(value, "value", dict)) 93 | this.getEvents.foreach(ret.addEvent) 94 | ret 95 | } 96 | 97 | override def configCheck(df: DataFrame): Boolean = checkTypes(df, column, value) 98 | 99 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 100 | val dataType = row.schema(idx).dataType 101 | val rMax = row(idx) 102 | logger.info(s"rMax: $rMax colType: $dataType value: $value valueClass: ${value.getClass.getCanonicalName}") 103 | 104 | def resultForString: (ListMap[String, String], String) = { 105 | val (expected, actual) = (value.asString.getOrElse(""), row.getString(idx)) 106 | 107 | failed = expected != actual 108 | val data = ListMap("expected" -> expected, "actual" -> actual) 109 | val errorMsg = s"ColumnMaxCheck $column[StringType]: Expected: $expected Actual: $actual" 110 | 111 | (data, errorMsg) 112 | } 113 | 114 | def resultForNumeric: (ListMap[String, String], String) = { 115 | val num = value.asNumber.get 116 | var cmp_params = (0.0, 0.0) // (expected, actual) 117 | 118 | dataType match { 119 | case ByteType => cmp_params = (num.toByte.getOrElse[Byte](-1), row.getByte(idx)) 120 | case ShortType => cmp_params = (num.toShort.getOrElse[Short](-1), row.getShort(idx)) 121 | case IntegerType => cmp_params = (num.toInt.getOrElse[Int](-1), row.getInt(idx)) 122 | case LongType => cmp_params = (num.toLong.getOrElse[Long](-1), row.getLong(idx)) 123 | case FloatType => cmp_params = (num.toDouble, row.getFloat(idx)) 124 | case DoubleType => cmp_params = (num.toDouble, row.getDouble(idx)) 125 | } 126 | 127 | failed = cmp_params._1 != cmp_params._2 128 | val pctError = if (failed) calculatePctError(cmp_params._1, cmp_params._2) else "0.00%" 129 | val data = ListMap("expected" -> num.toString, "actual" -> rMax.toString, "relative_error" -> pctError) 130 | val errorMsg = s"ColumnMaxCheck $column[$dataType]: Expected: $num Actual: $rMax Relative Error: $pctError" 131 | 132 | (data, errorMsg) 133 | } 134 | 135 | def resultForOther: (ListMap[String, String], String) = { 136 | logger.error( 137 | s"""ColumnMaxCheck for type: $dataType, Row: $row not implemented! 
138 | |Please open a bug report on the data-validator issue tracker.""".stripMargin 139 | ) 140 | failed = true 141 | val errorMsg = s"ColumnMaxCheck is not supported for data type $dataType" 142 | 143 | (ListMap.empty[String, String], errorMsg) 144 | } 145 | 146 | val (data, errorMsg) = dataType match { 147 | case StringType => resultForString 148 | case _: NumericType => resultForNumeric 149 | case _ => resultForOther 150 | } 151 | 152 | logger.debug(s"MaxValue compared Row: $row with value: $value failed: $failed") 153 | if (failed) { 154 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, errorMsg)) 155 | } 156 | failed 157 | } 158 | 159 | override def toJson: Json = Json.obj( 160 | ("type", Json.fromString("columnMaxCheck")), 161 | ("column", Json.fromString(column)), 162 | ("value", value), 163 | ("failed", Json.fromBoolean(failed)), 164 | ("events", this.getEvents.asJson) 165 | ) 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ColumnBasedValidatorCheckEvent, JsonEncoders, ValidatorError, VarSubstitution} 4 | import io.circe._ 5 | import io.circe.generic.semiauto._ 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{DataFrame, Row} 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.aggregate.Sum 10 | import org.apache.spark.sql.types._ 11 | 12 | import scala.collection.immutable.ListMap 13 | 14 | case class ColumnSumCheck( 15 | column: String, 16 | minValue: Option[Json] = None, 17 | maxValue: Option[Json] = None, 18 | inclusive: Option[Json] = None 19 | ) extends ColumnBased(column, Sum(UnresolvedAttribute.quoted(column)).toAggregateExpression()) { 20 | 21 | private val minOrMax: Either[String, Unit] = if (minValue.isEmpty && maxValue.isEmpty) { 22 | Left("'minValue' or 'maxValue' or both must be defined") 23 | } else { 24 | Right() 25 | } 26 | 27 | private val lowerBound: Either[String, Double] = minValue match { 28 | case Some(json) => 29 | if (json.isNumber) { Right(json.asNumber.get.toDouble) } 30 | else { Left(s"'minValue' defined but type is not a Number, is: ${json.name}") } 31 | case None => Right(Double.MinValue) 32 | } 33 | 34 | private val upperBound: Either[String, Double] = maxValue match { 35 | case Some(json) => 36 | if (json.isNumber) { Right(json.asNumber.get.toDouble) } 37 | else { Left(s"'maxValue' defined but type is not a Number, is: ${json.name}") } 38 | case None => Right(Double.MaxValue) 39 | } 40 | 41 | private val minLessThanMax: Either[String, Unit] = (lowerBound, upperBound) match { 42 | case (Right(lower), Right(upper)) if lower >= upper => 43 | Left(s"'minValue': $lower must be less than 'maxValue': $upper") 44 | case _ => Right() 45 | } 46 | 47 | private val inclusiveBounds: Either[String, Boolean] = inclusive match { 48 | case Some(json) => 49 | if (json.isBoolean) { Right(json.asBoolean.get) } 50 | else { Left(s"'inclusive' defined but type is not Boolean, is: ${json.name}") } 51 | case None => Right(false) 52 | } 53 | 54 | override def name: String = "columnSumCheck" 55 | 56 | override def quickCheck(r: Row, count: Long, idx: Int): Boolean = { 57 | 58 | val dataType = r.schema(idx).dataType 59 | val isInclusive = inclusiveBounds.right.get 60 | val lowerBoundValue = 
lowerBound.right.get 61 | val upperBoundValue = upperBound.right.get 62 | 63 | def evaluate(sum: Double): Boolean = { 64 | if (isInclusive) { sum > upperBoundValue || sum < lowerBoundValue } 65 | else { sum >= upperBoundValue || sum <= lowerBoundValue } 66 | } 67 | 68 | def getPctError(sum: Double): String = { 69 | if (sum < lowerBoundValue) { 70 | calculatePctError(lowerBoundValue, sum) 71 | } else if (sum > upperBoundValue) { 72 | calculatePctError(upperBoundValue, sum) 73 | } else if (!isInclusive && (sum == upperBoundValue || sum == lowerBoundValue)) { 74 | "undefined" 75 | } else { 76 | "0.00%" 77 | } 78 | } 79 | 80 | def getData(pctError: String): ListMap[String, String] = { 81 | val initial: ListMap[String, String] = ((minValue, maxValue) match { 82 | case (Some(x), Some(y)) => 83 | ListMap("lower_bound" -> x.asNumber.get.toString, "upper_bound" -> y.asNumber.get.toString) 84 | case (None, Some(y)) => ListMap("upper_bound" -> y.asNumber.get.toString) 85 | case (Some(x), None) => ListMap("lower_bound" -> x.asNumber.get.toString) 86 | case (None, None) => throw new RuntimeException("Must define at least one of minValue or maxValue.") 87 | }) 88 | initial ++ List("inclusive" -> isInclusive.toString, "actual" -> r(idx).toString, "relative_error" -> pctError) 89 | } 90 | 91 | val actualSum: Double = dataType match { 92 | case ByteType => r.getByte(idx) 93 | case ShortType => r.getShort(idx) 94 | case IntegerType => r.getInt(idx) 95 | case LongType => r.getLong(idx) 96 | case FloatType => r.getFloat(idx) 97 | case DoubleType => r.getDouble(idx) 98 | case ut => throw new Exception(s"Unsupported type for $name found in schema: $ut") 99 | } 100 | 101 | failed = evaluate(actualSum) 102 | val pctError = getPctError(actualSum) 103 | val data = getData(pctError) 104 | 105 | val bounds = minValue.getOrElse(" ") :: maxValue.getOrElse("") :: Nil 106 | val prettyBounds = if (isInclusive) { 107 | bounds.mkString("[", ", ", "]") 108 | } else { 109 | bounds.mkString("(", ", ", ")") 110 | } 111 | 112 | val msg = 113 | s"$name on $column[$dataType]: Expected Range: $prettyBounds Actual: ${r(idx)} Relative Error: $pctError" 114 | addEvent(ColumnBasedValidatorCheckEvent(failed, data, msg)) 115 | failed 116 | } 117 | 118 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 119 | val ret = copy( 120 | column = getVarSub(column, "column", dict), 121 | minValue = minValue.map(getVarSubJson(_, "minValue", dict)), 122 | maxValue = maxValue.map(getVarSubJson(_, "maxValue", dict)), 123 | inclusive = inclusive.map(getVarSubJson(_, "inclusive", dict)) 124 | ) 125 | this.getEvents.foreach(ret.addEvent) 126 | ret 127 | } 128 | 129 | override def configCheck(df: DataFrame): Boolean = { 130 | logger.debug(s"Full check config: ${this.toString}") 131 | Seq( 132 | minOrMax, 133 | lowerBound, 134 | upperBound, 135 | minLessThanMax, 136 | inclusiveBounds 137 | ).foreach { 138 | case Left(msg) => 139 | logger.error(msg) 140 | addEvent(ValidatorError(msg)) 141 | case _ => 142 | } 143 | 144 | findColumnInDataFrame(df, column) match { 145 | case Some(ft) if ft.dataType.isInstanceOf[NumericType] => 146 | case Some(ft) => 147 | val msg = s"Column: $column found, but not of numericType type: ${ft.dataType}" 148 | logger.error(msg) 149 | addEvent(ValidatorError(msg)) 150 | case None => 151 | val msg = s"Column: $column not found in schema" 152 | logger.error(msg) 153 | addEvent(ValidatorError(msg)) 154 | } 155 | failed 156 | } 157 | 158 | override def toJson: Json = { 159 | import JsonEncoders.eventEncoder 
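    // The semiauto-derived ColumnSumCheck.encoder (companion object below) serializes only the
    // configured fields (column, minValue, maxValue, inclusive); the report-only fields
    // (type, failed, events) are overlaid onto that JSON via deepMerge.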
160 | val additionalFieldsForReport = Json.fromFields( 161 | Set( 162 | "type" -> Json.fromString("columnSumCheck"), 163 | "failed" -> Json.fromBoolean(failed), 164 | "events" -> getEvents.asJson 165 | ) 166 | ) 167 | 168 | val base = ColumnSumCheck.encoder(this) 169 | base.deepMerge(additionalFieldsForReport) 170 | } 171 | } 172 | 173 | object ColumnSumCheck { 174 | val encoder: Encoder[ColumnSumCheck] = deriveEncoder[ColumnSumCheck] 175 | val decoder: Decoder[ColumnSumCheck] = deriveDecoder[ColumnSumCheck] 176 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = decoder.apply(c) 177 | } 178 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/JsonDecoders.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import cats.syntax.either._ 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe.{Decoder, DecodingFailure, HCursor} 6 | import io.circe.generic.auto._ 7 | 8 | object JsonDecoders extends LazyLogging { 9 | 10 | implicit val decodeChecks: Decoder[ValidatorBase] = new Decoder[ValidatorBase] { 11 | // FIXME: specifying this Function here instead of Decoder[ValidatorBase] is a smell that these checks 12 | // ought to have proper decoder objects instead of a method. 13 | // I.e., we're not using the Circe Decoder API as intended. 14 | private lazy val decoders = Map[String, HCursor => Either[DecodingFailure, ValidatorBase]]( 15 | "rowCount" -> { _.as[MinNumRows] }, 16 | "nullCheck" -> NullCheck.fromJson, 17 | "negativeCheck" -> NegativeCheck.fromJson, 18 | "columnMaxCheck" -> { _.as[ColumnMaxCheck] }, 19 | "rangeCheck" -> RangeCheck.fromJson, 20 | "uniqueCheck" -> UniqueCheck.fromJson, 21 | "stringLengthCheck" -> StringLengthCheck.fromJson, 22 | "stringRegexCheck" -> StringRegexCheck.fromJson, 23 | "columnSumCheck" -> ColumnSumCheck.fromJson, 24 | "colstats" -> implicitly[Decoder[ColStats]].apply // serdes defined implicitly on companion 25 | ) 26 | 27 | final def apply(c: HCursor): Decoder.Result[ValidatorBase] = c.downField("type").as[String].flatMap(getDecoder(c)) 28 | 29 | private def getDecoder(cursor: HCursor)(checkType: String) = { 30 | decoders 31 | .get(checkType) 32 | .map(_(cursor)) match { 33 | case Some(x) => x 34 | case None => 35 | logger.error(s"Unknown Check `$checkType` in config! 
Choose one of: ${decoders.keys.mkString(", ")}.") 36 | throw new RuntimeException(s"Unknown Check in config `$checkType`") 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/NegativeCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonEncoders.eventEncoder 5 | import com.target.data_validator.validator.ValidatorBase.I0 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions.{Expression, LessThan} 12 | import org.apache.spark.sql.types.{NumericType, StructType} 13 | 14 | case class NegativeCheck(column: String, threshold: Option[String]) extends RowBased { 15 | 16 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 17 | val ret = NegativeCheck(getVarSub(column, "column", dict), threshold.map(getVarSub(_, "threshold", dict))) 18 | getEvents.foreach(ret.addEvent) 19 | ret 20 | } 21 | 22 | override def configCheck(df: DataFrame): Boolean = { 23 | findColumnInDataFrame(df, column) match { 24 | case Some(ft) if ft.dataType.isInstanceOf[NumericType] => Unit 25 | case Some(ft) => 26 | val msg = s"Column: $column found, but not of numericType type: ${ft.dataType}" 27 | logger.error(msg) 28 | addEvent(ValidatorError(msg)) 29 | case None => 30 | val msg = s"Column: $column not found in schema." 
31 | logger.error(msg) 32 | addEvent(ValidatorError(msg)) 33 | } 34 | configCheckThreshold 35 | failed 36 | } 37 | 38 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = 39 | LessThan(UnresolvedAttribute(column), I0) 40 | 41 | override def toJson: Json = Json.obj( 42 | ("type", Json.fromString("negativeCheck")), 43 | ("column", Json.fromString(column)), 44 | ("threshold", Json.fromString(threshold.getOrElse("0"))), 45 | ("failed", Json.fromBoolean(failed)), 46 | ("events", this.getEvents.asJson) 47 | ) 48 | } 49 | 50 | object NegativeCheck extends LazyLogging { 51 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 52 | val column = c.downField("column").as[String].right.get 53 | val threshold = c.downField("threshold").as[String].right.toOption 54 | 55 | logger.debug(s"Parsing NegativeCheck(column:$column, threshold:$threshold) config.") 56 | scala.util.Right(NegativeCheck(column, threshold)) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/NullCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.JsonEncoders.eventEncoder 4 | import com.target.data_validator.VarSubstitution 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{DecodingFailure, HCursor, Json} 7 | import io.circe.syntax._ 8 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 9 | import org.apache.spark.sql.catalyst.expressions.{Expression, IsNull} 10 | import org.apache.spark.sql.types.StructType 11 | 12 | case class NullCheck(column: String, threshold: Option[String]) extends RowBased { 13 | 14 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 15 | val ret = NullCheck(getVarSub(column, "column", dict), threshold.map(getVarSub(_, "threshold", dict))) 16 | getEvents.foreach(ret.addEvent) 17 | ret 18 | } 19 | 20 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = IsNull(UnresolvedAttribute(column)) 21 | 22 | override def toJson: Json = Json.obj( 23 | ("type", Json.fromString("nullCheck")), 24 | ("column", Json.fromString(column)), 25 | ("failed", Json.fromBoolean(failed)), 26 | ("events", this.getEvents.asJson) 27 | ) 28 | } 29 | 30 | object NullCheck extends LazyLogging { 31 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 32 | val column = c.downField("column").as[String].right.get 33 | val threshold = c.downField("threshold").as[String].right.toOption 34 | 35 | logger.debug(s"Parsing NullCheck(column:$column, threshold:$threshold) config.") 36 | scala.util.Right(NullCheck(column, threshold)) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/RangeCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonUtils.debugJson 5 | import com.target.data_validator.validator.ValidatorBase._ 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | 
import org.apache.spark.sql.catalyst.expressions._ 12 | import org.apache.spark.sql.types.{DataType, StructType} 13 | 14 | case class RangeCheck( 15 | column: String, 16 | minValue: Option[Json], 17 | maxValue: Option[Json], 18 | inclusive: Option[Json], 19 | threshold: Option[String] 20 | ) extends RowBased { 21 | 22 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 23 | val ret = RangeCheck( 24 | getVarSub(column, "column", dict), 25 | minValue.map(getVarSubJson(_, "minValue", dict)), 26 | maxValue.map(getVarSubJson(_, "maxValue", dict)), 27 | inclusive.map(getVarSubJson(_, "inclusive", dict)), 28 | threshold.map(getVarSub(_, "threshold", dict)) 29 | ) 30 | getEvents.foreach(ret.addEvent) 31 | ret 32 | } 33 | 34 | private def cmpExpr( 35 | colExpr: Expression, 36 | value: Option[Json], 37 | colType: DataType, 38 | cmp: (Expression, Expression) => Expression 39 | ): Option[Expression] = { 40 | value.map { v => cmp(colExpr, createLiteralOrUnresolvedAttribute(colType, v)) } 41 | } 42 | 43 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 44 | val colType = schema(column).dataType 45 | val colExp = UnresolvedAttribute(column) 46 | val (minCmpExp, maxCmpExp) = if (inclusive.flatMap(_.asBoolean).getOrElse(false)) { 47 | (LessThan, GreaterThan) 48 | } else { 49 | (LessThanOrEqual, GreaterThanOrEqual) 50 | } 51 | 52 | val minValueExpression = cmpExpr(colExp, minValue, colType, minCmpExp) 53 | val maxValueExpression = cmpExpr(colExp, maxValue, colType, maxCmpExp) 54 | 55 | val ret = (minValueExpression, maxValueExpression) match { 56 | case (Some(x), None) => x 57 | case (None, Some(y)) => y 58 | case (Some(x), Some(y)) => Or(x, y) 59 | case _ => throw new RuntimeException("Must define min or max value.") 60 | } 61 | logger.debug(s"Expr: $ret") 62 | ret 63 | } 64 | 65 | private def checkMinLessThanMax(values: List[Json]): Unit = { 66 | 67 | if (values.forall(_.isNumber)) { 68 | values.flatMap(_.asNumber) match { 69 | case mv :: xv :: Nil if mv.toDouble >= xv.toDouble => 70 | addEvent(ValidatorError(s"Min: ${minValue.get} must be less than max: ${maxValue.get}")) 71 | case _ => 72 | } 73 | } else if (values.forall(_.isString)) { 74 | values.flatMap(_.asString) match { 75 | case mv :: xv :: Nil if mv == xv => 76 | addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) 77 | case _ => 78 | } 79 | } else { 80 | // Not Strings or Numbers 81 | addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) 82 | } 83 | } 84 | 85 | override def configCheck(df: DataFrame): Boolean = { 86 | 87 | val values = (minValue :: maxValue :: Nil).flatten 88 | if (values.isEmpty) { 89 | addEvent(ValidatorError("Must defined minValue or maxValue or both.")) 90 | } 91 | 92 | checkMinLessThanMax(values) 93 | 94 | val colType = findColumnInDataFrame(df, column) 95 | if (colType.isDefined) { 96 | val dataType = colType.get.dataType 97 | 98 | if (values.map(c => checkValue(df.schema, column, dataType, c)).exists(x => x)) { 99 | addEvent(ValidatorError(s"Range constraint types not compatible with column[$dataType]:'$column'")) 100 | } 101 | } 102 | 103 | if (inclusive.isDefined && inclusive.get.asBoolean.isEmpty) { 104 | logger.error(s"Inclusive defined but not Bool, $inclusive") 105 | addEvent(ValidatorError(s"Inclusive flag is defined, but is not a boolean, inclusive: ${inclusive.get}")) 106 | } 107 | 108 | failed 109 | } 110 | 111 | override def toJson: Json = { 112 | import JsonEncoders.eventEncoder 113 | val 
fields = Seq( 114 | ("type", Json.fromString("rangeCheck")), 115 | ("column", Json.fromString(column)) 116 | ) ++ 117 | minValue.map(mv => ("minValue", mv)) ++ 118 | maxValue.map(mv => ("maxValue", mv)) ++ 119 | Seq( 120 | ("inclusive", Json.fromBoolean(inclusive.flatMap(_.asBoolean).getOrElse(false))), 121 | ("events", getEvents.asJson) 122 | ) 123 | Json.obj(fields: _*) 124 | } 125 | } 126 | 127 | object RangeCheck extends LazyLogging { 128 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 129 | val column = c.downField("column").as[String].right.get 130 | val minValueJ = c.downField("minValue").as[Json].right.toOption 131 | val maxValueJ = c.downField("maxValue").as[Json].right.toOption 132 | val inclusiveJ = c.downField("inclusive").as[Json].right.toOption 133 | val threshold = c.downField("threshold").as[String].right.toOption 134 | 135 | logger.debug(s"column: $column") 136 | logger.debug(s"minValue: $minValueJ type: ${minValueJ.getClass.getCanonicalName}") 137 | logger.debug(s"maxValue: $maxValueJ type: ${maxValueJ.getClass.getCanonicalName}") 138 | logger.debug(s"inclusive: $inclusiveJ type: ${inclusiveJ.getClass.getCanonicalName}") 139 | logger.debug(s"threshold: $threshold") 140 | 141 | c.focus.foreach { f => logger.debug(s"RangeCheckJson: ${f.spaces2}") } 142 | scala.util.Right(RangeCheck(column, minValueJ, maxValueJ, inclusiveJ, threshold)) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/RowBased.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import com.target.data_validator.validator.ValidatorBase.{isColumnInDataFrame, L0, L1} 5 | import org.apache.spark.sql.{DataFrame, Row} 6 | import org.apache.spark.sql.catalyst.expressions._ 7 | import org.apache.spark.sql.types.StructType 8 | 9 | import scala.util.matching.Regex 10 | 11 | abstract class RowBased extends CheapCheck { 12 | 13 | val column: String 14 | val threshold: Option[String] 15 | 16 | def configCheck(df: DataFrame): Boolean = { 17 | configCheckColumn(df) 18 | configCheckThreshold 19 | failed 20 | } 21 | 22 | def configCheckColumn(df: DataFrame): Boolean = { 23 | if (isColumnInDataFrame(df, column)) { 24 | logger.debug(s"Column: $column found in table.") 25 | false 26 | } else { 27 | val msg = s"Column: $column not found in schema." 28 | logger.error(msg) 29 | addEvent(ValidatorError(msg)) 30 | failed 31 | } 32 | } 33 | 34 | def configCheckThreshold: Boolean = { 35 | if (threshold.isDefined) { 36 | val ret = threshold.flatMap(RowBased.THRESHOLD_NUMBER_REGEX.findFirstIn).isEmpty 37 | if (ret) { 38 | val msg = s"Threshold `${threshold.get}` not parsable." 39 | logger.error(msg) 40 | addEvent(ValidatorError(msg)) 41 | } 42 | ret 43 | } else { 44 | false 45 | } 46 | } 47 | 48 | def colTest(schema: StructType, dict: VarSubstitution): Expression 49 | 50 | def select(schema: StructType, dict: VarSubstitution): Expression = If(colTest(schema, dict), L1, L0) 51 | 52 | /** Calculates the max acceptable number of errors from threshold and rowCount. 53 | * @param rowCount 54 | * of table. 55 | * @return 56 | * max number of errors we can tolerate. if threshold < 1, then its a percentage of rowCount. if threshold ends 57 | * with '%' then its percentage of rowCount if threshold is > 1, then its maxErrors. 
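    * For example (illustrative values): with rowCount = 1000, a threshold of "5%" allows up to 50
    * errors, "0.1" allows up to 100, and "25" allows up to 25.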
58 | */ 59 | def calcErrorCountThreshold(rowCount: Long): Long = { 60 | threshold 61 | .map { t => 62 | val tempThreshold = t.stripSuffix("%").toDouble 63 | val ret: Long = if (t.endsWith("%")) { 64 | // Has '%', so divide by 100.0 65 | (tempThreshold * (rowCount / 100.0)).toLong 66 | } else if (tempThreshold < 1.0) { 67 | // Percentage without the '%' 68 | (tempThreshold * rowCount).toLong 69 | } else { 70 | // Number of rows 71 | tempThreshold.toLong 72 | } 73 | logger.info(s"Threshold:${threshold.get} tempThreshold:$tempThreshold ret:$ret") 74 | ret 75 | } 76 | .getOrElse(0) 77 | } 78 | 79 | override def quickCheck(row: Row, count: Long, idx: Int): Boolean = { 80 | logger.debug(s"quickCheck $column Row: $row count: $count idx: $idx") 81 | if (count > 0) { 82 | val errorCount = row.getLong(idx) 83 | val errorCountThreshold = calcErrorCountThreshold(count) 84 | 85 | addEvent(ValidatorCounter("rowCount", count)) 86 | addEvent(ValidatorCounter("errorCount", errorCount)) 87 | if (errorCountThreshold > 0) { 88 | addEvent(ValidatorCounter("errorCountThreshold", errorCountThreshold)) 89 | } 90 | 91 | val failure = errorCount > errorCountThreshold 92 | if (failure) { 93 | logger.error( 94 | s"Quick check for $name on $column failed, $errorCount errors in $count rows" 95 | + s" errorCountThreshold: $errorCountThreshold" 96 | ) 97 | } 98 | addEvent(ValidatorCheckEvent(failure, s"$name on column '$column'", count, errorCount)) 99 | } else { 100 | logger.warn(s"No Rows to check for $toString!") 101 | } 102 | failed 103 | } 104 | 105 | def quickCheckDetail(row: Row, key: Seq[(String, Any)], idx: Int, dict: VarSubstitution): Unit = { 106 | val r = row.get(idx) 107 | val column = row.schema.fieldNames(idx) 108 | addEvent( 109 | ValidatorQuickCheckError(key.toList, r, name + s" failed! 
$column = $r and ${colTest(row.schema, dict)}") 110 | ) 111 | } 112 | } 113 | 114 | object RowBased { 115 | val THRESHOLD_NUMBER_REGEX: Regex = "^([0-9]+\\.*[0-9]*)\\s*%{0,1}$".r // scalastyle:ignore 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/StringLengthCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.JsonUtils.debugJson 5 | import com.target.data_validator.validator.ValidatorBase._ 6 | import com.typesafe.scalalogging.LazyLogging 7 | import io.circe.{DecodingFailure, HCursor, Json} 8 | import io.circe.syntax._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 11 | import org.apache.spark.sql.catalyst.expressions._ 12 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 13 | 14 | case class StringLengthCheck( 15 | column: String, 16 | minLength: Option[Json], 17 | maxLength: Option[Json], 18 | threshold: Option[String] 19 | ) extends RowBased { 20 | 21 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 22 | 23 | val ret = StringLengthCheck( 24 | getVarSub(column, "column", dict), 25 | minLength.map(getVarSubJson(_, "minLength", dict)), 26 | maxLength.map(getVarSubJson(_, "maxLength", dict)), 27 | threshold.map(getVarSub(_, "threshold", dict)) 28 | ) 29 | getEvents.foreach(ret.addEvent) 30 | ret 31 | } 32 | 33 | private def cmpExpr( 34 | colExpr: Expression, 35 | value: Option[Json], 36 | cmp: (Expression, Expression) => Expression 37 | ): Option[Expression] = { 38 | value.map { v => cmp(colExpr, createLiteralOrUnresolvedAttribute(IntegerType, v)) } 39 | } 40 | 41 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 42 | 43 | val colExp = Length(UnresolvedAttribute(column)) 44 | 45 | val minLengthExpression = cmpExpr(colExp, minLength, LessThan) 46 | val maxLengthExpression = cmpExpr(colExp, maxLength, GreaterThan) 47 | 48 | val ret = (minLengthExpression, maxLengthExpression) match { 49 | case (Some(x), None) => x 50 | case (None, Some(y)) => y 51 | case (Some(x), Some(y)) => Or(x, y) 52 | case _ => throw new RuntimeException("Must define min or max value.") 53 | } 54 | logger.debug(s"Expr: $ret") 55 | ret 56 | } 57 | 58 | private def checkMinLessThanOrEqualToMax(values: List[Json]): Unit = { 59 | 60 | if (values.forall(_.isNumber)) { 61 | values.flatMap(_.asNumber) match { 62 | case mv :: xv :: Nil if mv.toDouble > xv.toDouble => 63 | addEvent(ValidatorError(s"min: ${minLength.get} must be less than or equal to max: ${maxLength.get}")) 64 | case _ => 65 | } 66 | } else if (values.forall(_.isString)) { 67 | values.flatMap(_.asString) match { 68 | case mv :: xv :: Nil if mv == xv => 69 | addEvent(ValidatorError(s"Min[String]: $mv must be less than max[String]: $xv")) 70 | case _ => 71 | } 72 | } else { 73 | // Not Strings or Numbers 74 | addEvent(ValidatorError(s"Unsupported type in ${values.map(debugJson).mkString(", ")}")) 75 | } 76 | } 77 | 78 | override def configCheck(df: DataFrame): Boolean = { 79 | 80 | // Verify if at least one of min or max is specified. 
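    // An illustrative YAML stanza for this check (the column name is hypothetical):
    //   - type: stringLengthCheck
    //     column: name
    //     minLength: 1
    //     maxLength: 64
    // At least one of minLength/maxLength is required, and minLength must not exceed maxLength.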
81 | val values = (minLength :: maxLength :: Nil).flatten 82 | if (values.isEmpty) { 83 | addEvent(ValidatorError("Must define minLength or maxLength or both.")) 84 | } 85 | 86 | // Verify that min is less than max 87 | checkMinLessThanOrEqualToMax(values) 88 | 89 | // Verify that the data type of the specified column is a String. 90 | val colType = findColumnInDataFrame(df, column) 91 | if (colType.isDefined) { 92 | val dataType = colType.get.dataType 93 | if (!dataType.isInstanceOf[StringType]) { 94 | addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) 95 | } 96 | } 97 | 98 | failed 99 | } 100 | 101 | override def toJson: Json = { 102 | import JsonEncoders.eventEncoder 103 | val fields = Seq( 104 | ("type", Json.fromString("stringLengthCheck")), 105 | ("column", Json.fromString(column)) 106 | ) ++ 107 | minLength.map(mv => ("minLength", mv)) ++ 108 | maxLength.map(mv => ("maxLength", mv)) ++ 109 | Seq( 110 | ("events", getEvents.asJson) 111 | ) 112 | Json.obj(fields: _*) 113 | } 114 | } 115 | 116 | object StringLengthCheck extends LazyLogging { 117 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 118 | val column = c.downField("column").as[String].right.get 119 | val minLengthJ = c.downField("minLength").as[Json].right.toOption 120 | val maxLengthJ = c.downField("maxLength").as[Json].right.toOption 121 | val threshold = c.downField("threshold").as[String].right.toOption 122 | 123 | logger.debug(s"column: $column") 124 | logger.debug(s"minLength: $minLengthJ type: ${minLengthJ.getClass.getCanonicalName}") 125 | logger.debug(s"maxLength: $maxLengthJ type: ${maxLengthJ.getClass.getCanonicalName}") 126 | logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") 127 | scala.util.Right(StringLengthCheck(column, minLengthJ, maxLengthJ, threshold)) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution} 4 | import com.target.data_validator.validator.ValidatorBase._ 5 | import com.typesafe.scalalogging.LazyLogging 6 | import io.circe.{DecodingFailure, HCursor, Json} 7 | import io.circe.syntax._ 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 10 | import org.apache.spark.sql.catalyst.expressions._ 11 | import org.apache.spark.sql.types.{StringType, StructType} 12 | 13 | case class StringRegexCheck( 14 | column: String, 15 | regex: Option[Json], 16 | threshold: Option[String] 17 | ) extends RowBased { 18 | 19 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 20 | 21 | val ret = StringRegexCheck( 22 | getVarSub(column, "column", dict), 23 | regex.map(getVarSubJson(_, "regex", dict)), 24 | threshold.map(getVarSub(_, "threshold", dict)) 25 | ) 26 | getEvents.foreach(ret.addEvent) 27 | ret 28 | } 29 | 30 | override def colTest(schema: StructType, dict: VarSubstitution): Expression = { 31 | 32 | val colExp = UnresolvedAttribute(column) 33 | 34 | val regexExpression = regex.map { r => RLike(colExp, createLiteralOrUnresolvedAttribute(StringType, r)) } 35 | 36 | val ret = regexExpression match { 37 | /* 38 | RLike returns false if the column value is null. 
39 | To avoid counting null values as validation failures (like other validations), 40 | an explicit non null check on the column value is required. 41 | */ 42 | case Some(x) => And(Not(x), IsNotNull(colExp)) 43 | case _ => throw new RuntimeException("Must define a regex.") 44 | } 45 | logger.debug(s"Expr: $ret") 46 | ret 47 | } 48 | 49 | override def configCheck(df: DataFrame): Boolean = { 50 | 51 | // Verify if regex is specified. 52 | val values = (regex :: Nil).flatten 53 | if (values.isEmpty) { 54 | addEvent(ValidatorError("Must define a regex.")) 55 | } 56 | 57 | // Verify that the data type of the specified column is a String. 58 | val colType = findColumnInDataFrame(df, column) 59 | if (colType.isDefined) { 60 | val dataType = colType.get.dataType 61 | if (!dataType.isInstanceOf[StringType]) { 62 | addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType")) 63 | } 64 | } 65 | 66 | failed 67 | } 68 | 69 | override def toJson: Json = { 70 | import JsonEncoders.eventEncoder 71 | val fields = Seq( 72 | ("type", Json.fromString("stringRegexCheck")), 73 | ("column", Json.fromString(column)) 74 | ) ++ 75 | regex.map(r => ("regex", r)) ++ 76 | Seq( 77 | ("events", getEvents.asJson) 78 | ) 79 | Json.obj(fields: _*) 80 | } 81 | } 82 | 83 | object StringRegexCheck extends LazyLogging { 84 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 85 | val column = c.downField("column").as[String].right.get 86 | val regex = c.downField("regex").as[Json].right.toOption 87 | val threshold = c.downField("threshold").as[String].right.toOption 88 | 89 | logger.debug(s"column: $column") 90 | logger.debug(s"regex: $regex type: ${regex.getClass.getCanonicalName}") 91 | logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}") 92 | scala.util.Right(StringRegexCheck(column, regex, threshold)) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/TwoPassCheapCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import org.apache.spark.sql._ 4 | 5 | /** extension of [[CheapCheck]] with an assumption that DV will: 6 | * - complete a pre-pass stage that generates an intermediary aggregate 7 | * - provide that intermediary aggregate so it can be used in generating the final check expression 8 | */ 9 | abstract class TwoPassCheapCheck extends CheapCheck { 10 | 11 | def hasQuickErrorDetails: Boolean = false 12 | 13 | /** defined by implementor, should generate one row of aggregated output that can then be handled by 14 | * [[sinkFirstPassRow]] 15 | */ 16 | def firstPassSelect(): Column 17 | 18 | /** defined by implementor, notify the cheap check of the result of the first pass projection 19 | * 20 | * NOTE: the contract for this check type assumes you call this method BEFORE [[CheapCheck.select]] 21 | */ 22 | def sinkFirstPassRow(row: Row): Unit 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/target/data_validator/validator/UniqueCheck.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator.{ValidatorError, ValidatorGood, ValidatorTimer, VarSubstitution} 4 | import com.typesafe.scalalogging.LazyLogging 5 | import io.circe.{DecodingFailure, HCursor, 
Json} 6 | import io.circe.syntax._ 7 | import org.apache.spark.sql.{Column, DataFrame} 8 | 9 | case class UniqueCheck(columns: Seq[String]) extends CostlyCheck { 10 | 11 | override def substituteVariables(dict: VarSubstitution): ValidatorBase = { 12 | val newColumns = columns.map(getVarSub(_, "columns", dict)) 13 | val ret = UniqueCheck(newColumns) 14 | this.getEvents.foreach(ret.addEvent) 15 | ret 16 | } 17 | 18 | override def configCheck(df: DataFrame): Boolean = { 19 | columns.exists(findColumnInDataFrame(df, _).isEmpty) 20 | } 21 | 22 | override def toJson: Json = { 23 | import com.target.data_validator.JsonEncoders.eventEncoder 24 | val fields = Seq( 25 | ("type", Json.fromString("uniqueCheck")), 26 | ("columns", Json.fromValues(columns.map(Json.fromString))), 27 | ("failed", Json.fromBoolean(failed)), 28 | ("events", this.getEvents.asJson) 29 | ) 30 | 31 | Json.fromFields(fields) 32 | } 33 | 34 | override def costlyCheck(df: DataFrame): Boolean = { 35 | val cols = columns.map(new Column(_)) 36 | val timer = new ValidatorTimer(s"UniqueCheck($columns)") 37 | addEvent(timer) 38 | // Note: this computes the count of the number of distinct keys (if you will) that have at least one duplicated row. 39 | // It's not number of duplicated rows. 40 | val ret = timer.time(df.select(cols: _*).groupBy(cols: _*).count().where("count > 1").count()) 41 | logger.info(s"costlyCheck: cols:$cols ret:$ret") 42 | if (ret > 0) { 43 | addEvent(ValidatorError(s"$ret duplicates found!")) 44 | } else { 45 | addEvent(ValidatorGood("no duplicates found.")) 46 | } 47 | 48 | failed 49 | } 50 | } 51 | 52 | object UniqueCheck extends LazyLogging { 53 | 54 | def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = { 55 | val columns = c.downField("columns").as[Seq[String]] 56 | columns.right.map(UniqueCheck(_)) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/test/resources/format_test.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "Mal", "age": 49} 2 | {"name": "Zoe", "age": 33} 3 | {"name": "Wash", "age": 39} 4 | {"name": "Jayne", "age": 43} 5 | {"name": "Kaylee", "age": 26} 6 | {"name": "Simon", "age": 27} 7 | {"name": "River", "age": 18} 8 | {"name": "Inara", "age": 25} 9 | {"name": "Book", "age": 68} 10 | -------------------------------------------------------------------------------- /src/test/resources/test_config.yaml: -------------------------------------------------------------------------------- 1 | numKeyCols: 2 2 | numErrorsToReport: 742 3 | email: 4 | smtpHost: smtpHost 5 | subject: subject 6 | from: from 7 | to: 8 | - to 9 | detailedErrors: true 10 | vars: 11 | - name: foo 12 | value: bar 13 | 14 | outputs: 15 | - filename: /user/home/sample.json 16 | 17 | - pipe: /apps/dv2kafka.py 18 | ignoreError: true 19 | tables: 20 | - db: foo 21 | table: bar 22 | keyColumns: 23 | - one 24 | - two 25 | checks: 26 | - type: rowCount 27 | minNumRows: 10294 28 | - type: nullCheck 29 | column: mdse_item_i 30 | - orcFile: LocalFile.orc 31 | condition: "foo < 10" 32 | checks: 33 | - type: nullCheck 34 | column: start_d 35 | - parquetFile: LocFile.parquet 36 | condition: "bar < 10" 37 | checks: 38 | - type: nullCheck 39 | column: end_d 40 | - format: llama 41 | checks: 42 | - type: nullCheck 43 | column: start_d 44 | options: 45 | key: value 46 | loadData: 47 | - data.llama -------------------------------------------------------------------------------- 
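Note: the checks implemented under src/main/scala/com/target/data_validator/validator/ are configured
as YAML stanzas like the ones in test_config.yaml above. The fragment below is an illustrative sketch
(table and column names are hypothetical); the field names follow each check's fromJson decoder and
the type names registered in JsonDecoders.scala.

      checks:
        - type: rangeCheck
          column: price
          minValue: 0
          maxValue: 100
          inclusive: true
        - type: stringRegexCheck
          column: email
          regex: "^.+@.+$"
        - type: columnSumCheck
          column: quantity
          minValue: 1
        - type: uniqueCheck
          columns:
            - item
            - event_d
        - type: negativeCheck
          column: price
          threshold: "5%"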
/src/test/scala/com/target/TestingSparkSession.scala: -------------------------------------------------------------------------------- 1 | package com.target 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.PropertyConfigurator 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.sql.{SparkSession, SQLContext} 8 | import org.scalatest._ 9 | 10 | trait TestingSparkSession extends BeforeAndAfterAll { self: Suite => 11 | 12 | lazy val spark: SparkSession = TestingSparkSession.sparkSingleton 13 | lazy val sc: SparkContext = spark.sparkContext 14 | lazy val sqlContext: SQLContext = spark.sqlContext 15 | 16 | } 17 | 18 | object TestingSparkSession { 19 | 20 | /** config a log4j properties used for testsuite. Copied from org.apache.spark.utils.Util because it private. 21 | */ 22 | def configTestLog4j(levelOther: String, levelMe: String): Unit = { 23 | val pro = new Properties() 24 | pro.put("log4j.rootLogger", s"$levelOther, console") 25 | pro.put("log4j.appender.console", "org.apache.log4j.ConsoleAppender") 26 | pro.put("log4j.appender.console.target", "System.err") 27 | pro.put("log4j.appender.console.layout", "org.apache.log4j.PatternLayout") 28 | pro.put( 29 | "log4j.appender.console.layout.ConversionPattern", 30 | "%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n" 31 | ) // scalastyle:ignore regex 32 | pro.put(s"log4j.logger.${this.getClass.getPackage.getName}", levelMe) 33 | PropertyConfigurator.configure(pro) 34 | } 35 | 36 | lazy val sparkSingleton: SparkSession = { 37 | configTestLog4j("OFF", "OFF") 38 | SparkSession 39 | .builder() 40 | .config("spark.executor.memory", "512mb") 41 | .config("spark.ui.showConsoleProgress", value = false) 42 | .master("local[2]") 43 | .getOrCreate() 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/CliOptionParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class CliOptionParserSpec extends AnyFunSpec with Matchers { 7 | 8 | describe("CliOptionParser") { 9 | describe("parsing") { 10 | it("does not handle var option values with commas") { 11 | val args = Array("--vars", "keyA=value1,value2,keyB=value3") 12 | CliOptionParser.parser.parse(args, CliOptions()) should be(None) 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ConfigParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.validator.{MinNumRows, NullCheck} 5 | import io.circe.Json 6 | import org.scalatest.BeforeAndAfterAll 7 | import org.scalatest.funspec.AnyFunSpec 8 | 9 | class ConfigParserSpec extends AnyFunSpec with BeforeAndAfterAll { 10 | 11 | // Silence is golden! 
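  // (i.e. TestingSparkSession.configTestLog4j turns log4j output off for this suite)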
12 | override def beforeAll(): Unit = TestingSparkSession.configTestLog4j("OFF", "OFF") 13 | 14 | val expectedConfiguration = ValidatorConfig( 15 | 2, 16 | 742, // scalastyle:ignore magic.number 17 | Some(EmailConfig("smtpHost", "subject", "from", List("to"))), 18 | detailedErrors = true, 19 | Some(List(NameValue("foo", Json.fromString("bar")))), 20 | Some( 21 | List[ValidatorOutput]( 22 | FileOutput("/user/home/sample.json", None), 23 | PipeOutput("/apps/dv2kafka.py", Some(true)) 24 | ) 25 | ), 26 | List( 27 | ValidatorHiveTable( 28 | "foo", 29 | "bar", 30 | Some(List("one", "two")), 31 | None, 32 | List(MinNumRows(Json.fromInt(10294)), NullCheck("mdse_item_i", None)) // scalastyle:ignore magic.number 33 | ), 34 | ValidatorOrcFile("LocalFile.orc", None, Some("foo < 10"), List(NullCheck("start_d", None))), 35 | ValidatorParquetFile("LocFile.parquet", None, Some("bar < 10"), List(NullCheck("end_d", None))), 36 | ValidatorSpecifiedFormatLoader( 37 | format = "llama", 38 | keyColumns = None, 39 | condition = None, 40 | checks = List(NullCheck("start_d", None)), 41 | options = Some(Map("key" -> "value")), 42 | loadData = Some(List("data.llama")) 43 | ) 44 | ) 45 | ) 46 | 47 | describe("ConfigParser") { 48 | 49 | describe("parse") { 50 | 51 | it("should correctly parse simple yaml config") { 52 | val config = ConfigParser.parse(""" 53 | | numKeyCols: 2 54 | | numErrorsToReport: 742 55 | | email: 56 | | smtpHost: smtpHost 57 | | subject: subject 58 | | from: from 59 | | to: 60 | | - to 61 | | detailedErrors: true 62 | | vars: 63 | | - name: foo 64 | | value: bar 65 | | 66 | | outputs: 67 | | - filename: /user/home/sample.json 68 | | 69 | | - pipe: /apps/dv2kafka.py 70 | | ignoreError: true 71 | | tables: 72 | | - db: foo 73 | | table: bar 74 | | keyColumns: 75 | | - one 76 | | - two 77 | | checks: 78 | | - type: rowCount 79 | | minNumRows: 10294 80 | | - type: nullCheck 81 | | column: mdse_item_i 82 | | - orcFile: LocalFile.orc 83 | | condition: "foo < 10" 84 | | checks: 85 | | - type: nullCheck 86 | | column: start_d 87 | | - parquetFile: LocFile.parquet 88 | | condition: "bar < 10" 89 | | checks: 90 | | - type: nullCheck 91 | | column: end_d 92 | | - format: llama 93 | | checks: 94 | | - type: nullCheck 95 | | column: start_d 96 | | options: 97 | | key: value 98 | | loadData: 99 | | - data.llama 100 | """.stripMargin) 101 | 102 | assert(config == Right(expectedConfiguration)) 103 | } 104 | 105 | } 106 | 107 | describe("parseFile") { 108 | 109 | it("should support loading config files by path") { 110 | val output = ConfigParser.parseFile("src/test/resources/test_config.yaml", Map.empty) 111 | assert(output == Right(expectedConfiguration)) 112 | } 113 | 114 | it("should support classpath configuration loading with the prefix 'classpath:'") { 115 | val output = ConfigParser.parseFile("classpath:/test_config.yaml", Map.empty) 116 | assert(output == Right(expectedConfiguration)) 117 | } 118 | 119 | it("should not confuse classpath and non classpath file loading") { 120 | val paths = Seq("classpath:src/test/resources/test_config.yaml", "test_config.yaml") 121 | 122 | paths.foreach { path => 123 | val output = ConfigParser.parseFile(path, Map.empty) 124 | assert(output.isLeft) 125 | } 126 | } 127 | 128 | } 129 | 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ConfigVarSubSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 
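// Exercises $name and ${name} variable substitution across the ValidatorTable and check
// implementations; see VarSubstitution.scala for the substitution rules.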
2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.validator.{ColumnMaxCheck, MinNumRows, NegativeCheck, NullCheck} 5 | import io.circe.Json 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class ConfigVarSubSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | val baseMap: Map[String, String] = 12 | Map("one" -> "1", "two" -> "2", "three" -> "3", "four" -> "4", "five" -> "5", "six" -> "6") 13 | 14 | val dict: VarSubstitution = { 15 | val d = new VarSubstitution 16 | d.addMap(baseMap) 17 | d 18 | } 19 | 20 | describe("ConfigVariable Substitutions") { 21 | 22 | describe("ValidatorTable children") { 23 | 24 | it("ValidatorHiveTable var substitution should work") { 25 | val sut = ValidatorHiveTable( 26 | "database$one", 27 | "table$two", 28 | Some(List("Col$three", "Col$four")), 29 | Some("$five == $six"), 30 | List.empty 31 | ) 32 | assert( 33 | sut.substituteVariables(dict) == 34 | ValidatorHiveTable("database1", "table2", Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 35 | ) 36 | } 37 | 38 | it("Validator OrcFile substitution should work") { 39 | val sut = ValidatorOrcFile( 40 | "/$one/$two/orcFile", 41 | Some(List("Col$three", "Col$four")), 42 | Some("$five == $six"), 43 | List.empty 44 | ) 45 | assert( 46 | sut.substituteVariables(dict) == 47 | ValidatorOrcFile("/1/2/orcFile", Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 48 | ) 49 | } 50 | 51 | it("ValidatorDataFrame substitution should work") { 52 | val df = spark.emptyDataFrame 53 | val sut = ValidatorDataFrame(df, Some(List("Col$three", "Col$four")), Some("$five == $six"), List.empty) 54 | assert( 55 | sut.substituteVariables(dict) == 56 | ValidatorDataFrame(df, Some(List("Col3", "Col4")), Some("5 == 6"), List.empty) 57 | ) 58 | } 59 | 60 | } 61 | 62 | describe("ValidatorBase children") { 63 | 64 | describe("ColumnBased children") { 65 | 66 | describe("MinNumRows") { 67 | 68 | it("should substitute variables properly") { 69 | val sut = MinNumRows(Json.fromString("$one")) 70 | assert(sut.substituteVariables(dict) == MinNumRows(Json.fromInt(1))) 71 | } 72 | 73 | } 74 | 75 | describe("ColumnMaxCheck") { 76 | 77 | it("should substitute variables properly") { 78 | val sut = ColumnMaxCheck("Col$six", Json.fromString("$five")) 79 | val newColMaxCheck = sut.substituteVariables(dict).asInstanceOf[ColumnMaxCheck] 80 | assert(newColMaxCheck.column == "Col6") 81 | assert(newColMaxCheck.value == Json.fromInt(5)) // scalastyle:ignore 82 | assert(sut.substituteVariables(dict) == ColumnMaxCheck("Col6", Json.fromInt(5))) // scalastyle:ignore 83 | assert(!sut.failed) 84 | } 85 | 86 | it("should fail on bad variables") { 87 | val check = ColumnMaxCheck("Col$six", Json.fromString("$fivefour")) 88 | val sut = check.substituteVariables(dict) 89 | assert(sut.failed) 90 | } 91 | 92 | } 93 | 94 | } 95 | 96 | describe("RowBased children") { 97 | 98 | describe("NegativeCheck") { 99 | 100 | it("NegativeCheck") { 101 | val sut = NegativeCheck("Col$four", None) 102 | assert(sut.substituteVariables(dict) == NegativeCheck("Col4", None)) 103 | } 104 | 105 | it("NegativeCheck bad variable substitution should fail") { 106 | val check = NegativeCheck("Col$fourfour", None) 107 | val sut = check.substituteVariables(dict) 108 | assert(sut.failed) 109 | } 110 | 111 | } 112 | 113 | describe("NullCheck") { 114 | 115 | it("should substitute variables properly") { 116 | val sut = NullCheck("Col${one}", None) 117 | assert(sut.substituteVariables(dict) == 
NullCheck("Col1", None)) 118 | } 119 | 120 | it("bad variable substitution should fail") { 121 | val check = NullCheck("Col${unknown}", None) 122 | val sut = check.substituteVariables(dict) 123 | assert(sut.failed) 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | } 131 | 132 | } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/EnvironmentVariablesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.data_validator.EnvironmentVariables.{Inaccessible, Present, Unset} 4 | import org.scalatest.matchers.should.Matchers 5 | import org.scalatest.wordspec.AnyWordSpec 6 | 7 | class EnvironmentVariablesSpec extends AnyWordSpec with Matchers { 8 | 9 | "EnvironmentVariables" should { 10 | "get envvars" when { 11 | "an envvar exists" in { 12 | EnvironmentVariables.get("HOME") should be(Present(System.getenv("HOME"))) 13 | } 14 | "an envvar doesn't exist" in { 15 | EnvironmentVariables.get("NOPE") should be(Unset) 16 | } 17 | "an envvar isn't an envvar" in { 18 | EnvironmentVariables.get(null) shouldBe a[Inaccessible] // scalastyle:ignore 19 | } 20 | } 21 | "log envvars" when { 22 | "using get" in { 23 | EnvironmentVariables.get("HOME") 24 | EnvironmentVariables.accessedEnvVars.keySet should contain("HOME") 25 | } 26 | "using tryGet" in { 27 | EnvironmentVariables.tryGet("HOME") 28 | EnvironmentVariables.accessedEnvVars.keySet should contain("HOME") 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ExpressionUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import ExpressionUtils.orFromList 4 | import com.target.data_validator.validator.ValidatorBase._ 5 | import org.apache.spark.sql.catalyst.expressions.{GreaterThan, Or} 6 | import org.scalatest._ 7 | import org.scalatest.funspec.AnyFunSpec 8 | import org.scalatest.matchers.should.Matchers 9 | 10 | class ExpressionUtilsSpec extends AnyFunSpec with Matchers { 11 | 12 | describe("ExpressionUtils") { 13 | 14 | describe("orFromList()") { 15 | 16 | val expr1 = GreaterThan(L0, L1) 17 | 18 | it("Simpler case 1 expression") { 19 | assert(orFromList(expr1 :: Nil) == expr1) 20 | } 21 | 22 | it("Standard case 2 expressions") { 23 | assert(orFromList(expr1 :: expr1 :: Nil) == Or(expr1, expr1)) 24 | } 25 | 26 | it("More then 2 case") { 27 | assert(orFromList(expr1 :: expr1 :: expr1 :: Nil) == Or(expr1, Or(expr1, expr1))) 28 | } 29 | 30 | it("Failure case, empty list.") { 31 | assertThrows[java.lang.IllegalArgumentException](orFromList(Nil)) 32 | } 33 | 34 | } 35 | 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/IOSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import java.nio.file.Files 4 | 5 | import com.target.TestingSparkSession 6 | import io.circe.{parser, Json} 7 | import scala.io.Source.fromFile 8 | import scala.util.Random._ 9 | import scalatags.Text.all._ 10 | import org.scalatest.funspec.AnyFunSpec 11 | import org.scalatest.matchers.should.Matchers 12 | 13 | class IOSpec extends AnyFunSpec with Matchers with TestingSparkSession { 14 | val SAMPLE_HTML = html(h1("H1"), "Sample HTML 
Doc") 15 | val SAMPLE_JSON: Json = parser 16 | .parse("""{ 17 | | "one" : 1, 18 | | "two" : [ 1,2 ], 19 | | "three": { "a":1.0,"b":2.0,"c":3.0 } 20 | |} 21 | """.stripMargin) 22 | .right 23 | .getOrElse(Json.Null) 24 | 25 | def createRandomTempFilename: String = { 26 | val useDefault = null 27 | val file = Files.createTempFile("data-validator_temp", useDefault) 28 | file.toAbsolutePath.toString 29 | } 30 | 31 | def rm(filename: String): Boolean = { 32 | val f = new java.io.File(filename) 33 | f.delete() 34 | } 35 | 36 | describe("Local Disk") { 37 | 38 | it("should write HTML") { 39 | val filename = createRandomTempFilename 40 | assert(!IO.writeHTML(filename, SAMPLE_HTML)(spark)) 41 | fromFile(filename).mkString should be(SAMPLE_HTML.render + "\n") 42 | assert(rm(filename)) 43 | } 44 | 45 | it("should write JSON") { 46 | val filename = createRandomTempFilename 47 | assert(!IO.writeJSON(filename, SAMPLE_JSON)(spark)) 48 | fromFile(filename).mkString should be(SAMPLE_JSON.noSpaces + "\n") 49 | assert(rm(filename)) 50 | } 51 | 52 | it("file:/// should be able to write") { 53 | val baseFilename = createRandomTempFilename 54 | val filename = "file://" + baseFilename 55 | val data = List.fill(128)(nextPrintableChar).mkString + IO.NEW_LINE // scalastyle:ignore 56 | assert(!IO.writeString(filename, data)(spark)) 57 | fromFile(baseFilename).mkString should be(data) 58 | assert(rm(baseFilename)) 59 | } 60 | 61 | describe("canAppendOrWrite") { 62 | 63 | it("returns false when it should") { 64 | val badFilename = "/dir/that/does/not/exist/junk.txt" 65 | assert(!IO.canAppendOrCreate(badFilename, append = false)(spark)) 66 | } 67 | 68 | it("returns true when it should") { 69 | assert(IO.canAppendOrCreate(createRandomTempFilename, append = false)(spark)) 70 | } 71 | 72 | } 73 | 74 | describe("canExecute") { 75 | 76 | it("returns true for local executable") { 77 | assert(IO.canExecute("/usr/bin/wc")(spark)) 78 | } 79 | 80 | it("returns false for hdfs file") { 81 | assert(!IO.canExecute(IO.HDFS_SCHEMA_PREFIX + "foo.bar")(spark)) 82 | } 83 | 84 | it("returns false for local non-executable") { 85 | assert(!IO.canExecute("/etc/passwd")(spark)) 86 | } 87 | 88 | it("returns false for non-existent file") { 89 | assert(!IO.canExecute(createRandomTempFilename)(spark)) 90 | } 91 | 92 | } 93 | 94 | } 95 | 96 | describe("writeStringToPipe") { 97 | 98 | it("Fails for bad path") { 99 | val (fail, out, _) = IO.writeStringToPipe("/bad/path", nextString(200)) // scalastyle:ignore 100 | assert(fail) 101 | assert(out.isEmpty) 102 | } 103 | 104 | it("Works for wc and captures stdout") { 105 | val str = nextString(200) // scalastyle:ignore 106 | val (fail, out, err) = IO.writeStringToPipe("/usr/bin/wc -c", str) 107 | assert(!fail) 108 | assert(out.length == 1) 109 | assert(out.head.dropWhile(_.isWhitespace).toInt == str.getBytes.length) 110 | assert(err.isEmpty) 111 | } 112 | 113 | it("works when program fails") { 114 | val (fail, out, err) = IO.writeStringToPipe("false", "") 115 | assert(fail) 116 | assert(out.isEmpty) 117 | assert(err.isEmpty) 118 | } 119 | 120 | it("captures stderr and doesn't fail") { 121 | val (fail, out, err) = IO.writeStringToPipe("echo ERR >&2", "") 122 | assert(!fail) 123 | assert(out.isEmpty) 124 | assert(err == List("ERR")) 125 | } 126 | 127 | } 128 | // TODO: Add hdfs tests using https://github.com/sakserv/hadoop-mini-clusters 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/JsonUtilsSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.JsonUtils._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | 9 | import scala.util.Random 10 | import org.scalatest.funspec.AnyFunSpec 11 | import org.scalatest.matchers.should.Matchers 12 | 13 | class JsonUtilsSpec extends AnyFunSpec with Matchers with TestingSparkSession { 14 | val TEST_STRING_LENGTH = 10 15 | 16 | describe("JsonUtils") { 17 | 18 | describe("string2Json") { 19 | 20 | it("Simple Int into Json") { 21 | val randInt = Random.nextInt 22 | assert(string2Json(s"$randInt") == Json.fromInt(randInt)) 23 | } 24 | 25 | it("String into Json") { 26 | val randString = Random.nextString(TEST_STRING_LENGTH) 27 | assert(string2Json(randString) == Json.fromString(randString)) 28 | } 29 | 30 | it("garbage doesn't crash") { 31 | val garbageString = "{]2389s0fj2}" 32 | assert(string2Json(garbageString) == Json.fromString(garbageString)) 33 | } 34 | 35 | } 36 | 37 | describe("debugJson") { 38 | 39 | it("Int") { 40 | val randInt = Random.nextInt() 41 | assert(debugJson(Json.fromInt(randInt)) == s"Json NUM: $randInt") 42 | } 43 | 44 | it("Double") { 45 | val randDouble = Random.nextDouble() 46 | assert(debugJson(Json.fromDoubleOrNull(randDouble)) == s"Json NUM: $randDouble") 47 | } 48 | 49 | it("String") { 50 | val randString = Random.nextString(TEST_STRING_LENGTH) 51 | assert(debugJson(Json.fromString(randString)) == s"Json STRING: $randString") 52 | } 53 | 54 | it("Boolean") { 55 | val randBool = Random.nextBoolean() 56 | assert(debugJson(Json.fromBoolean(randBool)) == s"Json BOOLEAN: $randBool") 57 | 58 | } 59 | 60 | it("Array") { 61 | val randArray = Range(0, TEST_STRING_LENGTH).map(_ => Json.fromInt(Random.nextInt)) 62 | assert(debugJson(Json.fromValues(randArray)) contains "Json ARR:") 63 | } 64 | 65 | it("Null") { 66 | assert(debugJson(Json.Null) == "Json NULL") 67 | } 68 | 69 | } 70 | 71 | describe("row2Json") { 72 | 73 | val TEST_STRING = Random.nextString(TEST_STRING_LENGTH) 74 | val TEST_LONG = Random.nextLong 75 | val TEST_INT = Random.nextInt 76 | val TEST_BOOLEAN = Random.nextBoolean 77 | val TEST_DOUBLE = Random.nextDouble 78 | 79 | val schema = StructType( 80 | List( 81 | StructField("string", StringType), 82 | StructField("long", LongType), 83 | StructField("int", IntegerType), 84 | StructField("null", NullType), 85 | StructField("bool", BooleanType), 86 | StructField("double", DoubleType) 87 | ) 88 | ) 89 | 90 | val sampleData = 91 | List(Row(TEST_STRING, TEST_LONG, TEST_INT, null, TEST_BOOLEAN, TEST_DOUBLE)) // scalastyle:ignore 92 | 93 | def mkRow: Row = spark.createDataFrame(sc.parallelize(sampleData), schema).head() 94 | 95 | it("Row with String") { 96 | val sut = mkRow 97 | assert(row2Json(sut, 0) == Json.fromString(TEST_STRING)) 98 | } 99 | 100 | it("Row with long") { 101 | val sut = mkRow 102 | assert(row2Json(sut, 1) == Json.fromLong(TEST_LONG)) 103 | } 104 | 105 | it("Row with int") { 106 | val sut = mkRow 107 | assert(row2Json(sut, 2) == Json.fromInt(TEST_INT)) 108 | } 109 | 110 | it("Row with null") { 111 | val sut = mkRow 112 | assert(row2Json(sut, 3) == Json.Null) 113 | } 114 | 115 | it("Row with bool") { 116 | val sut = mkRow 117 | assert(row2Json(sut, 4) == Json.fromBoolean(TEST_BOOLEAN)) // scalastyle:ignore 118 | } 119 | 120 | it("Row with double") { 121 | val sut = mkRow 122 | assert(row2Json(sut, 5) == 
Json.fromDoubleOrNull(TEST_DOUBLE)) // scalastyle:ignore 123 | } 124 | 125 | it("Full Row") { 126 | val sut = mkRow 127 | assert( 128 | row2Json(sut) == Json.obj( 129 | ("string", Json.fromString(TEST_STRING)), 130 | ("long", Json.fromLong(TEST_LONG)), 131 | ("int", Json.fromInt(TEST_INT)), 132 | ("null", Json.Null), 133 | ("bool", Json.fromBoolean(TEST_BOOLEAN)), 134 | ("double", Json.fromDoubleOrNull(TEST_DOUBLE)) 135 | ) 136 | ) 137 | } 138 | 139 | } 140 | 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/TestHelpers.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import io.circe.Json 4 | import io.circe.yaml.parser 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 7 | import org.apache.spark.sql.types._ 8 | 9 | object TestHelpers { 10 | 11 | def parseYaml(yaml: String): Json = { 12 | parser.parse(yaml).right.getOrElse(Json.Null) 13 | } 14 | 15 | def mkDict(args: (String, String)*): VarSubstitution = { 16 | val ret = new VarSubstitution 17 | args.foreach(kv => ret.addString(kv._1, kv._2)) 18 | ret 19 | } 20 | 21 | def mkDictJson(elems: (String, String)*): VarSubstitution = { 22 | val ret = new VarSubstitution 23 | elems.foreach(e => ret.add(e._1, JsonUtils.string2Json(e._2))) 24 | ret 25 | } 26 | 27 | def mkConfig(tables: List[ValidatorTable]): ValidatorConfig = 28 | ValidatorConfig(2, 10, None, detailedErrors = false, None, None, tables) // scalastyle:ignore 29 | 30 | def mkDataFrame(data: List[Row], schema: StructType)(implicit spark: SparkSession, sc: SparkContext): DataFrame = 31 | spark.createDataFrame(sc.parallelize(data), schema) 32 | 33 | def guessType(v: Any): DataType = v.getClass.getCanonicalName match { 34 | case "java.lang.Short" => ShortType 35 | case "java.lang.String" => StringType 36 | case "java.lang.Integer" => IntegerType 37 | case "java.lang.Double" => DoubleType 38 | case "java.lang.Boolean" => BooleanType 39 | case "java.lang.Long" => LongType 40 | case "java.lang.Byte" => ByteType 41 | case _ => throw new IllegalArgumentException(s"Unknown type '${v.getClass.getCanonicalName}'") 42 | } 43 | 44 | def mkSchema(args: (String, List[Any])*): StructType = { 45 | StructType(args.map(x => StructField(x._1, guessType(x._2.head)))) 46 | } 47 | 48 | def mkRows(args: (String, List[Any])*): List[Row] = { 49 | val len = args.head._2.length 50 | require(args.forall(_._2.length == len)) 51 | (0 until len).map(i => Row(args.map(_._2.apply(i)): _*)).toList 52 | } 53 | 54 | /** creates dataFrame from array of (label, List[Any]) 55 | * @param spark 56 | * @param args 57 | * \- is array of tuple(String, List[Any]) supported types are String, Double, Int, Long 58 | * @return 59 | * DataFrame 60 | * 61 | * ie mkDf(("item" -> List("Eggs", "Milk", "Bread", "Cheese")), ("price" -> List( 5.49, 3.89, 4.50, 6.00), 62 | * ("quantity" -> List( 12, 5, 2, 10))) 63 | * 64 | * will return a dataframe 65 | */ 66 | def mkDf(spark: SparkSession, args: (String, List[Any])*): DataFrame = { 67 | require(args.forall(_._2.length == args.head._2.length)) 68 | val schema = mkSchema(args: _*) 69 | val data = mkRows(args: _*) 70 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ValidatorOutputSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class ValidatorOutputSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | val dict = new VarSubstitution() 10 | 11 | describe("ValidatorOutput") { 12 | 13 | describe("PipeOutput") { 14 | 15 | it("variable substitution") { 16 | dict.addString("TMPDIR", "/tmp") 17 | val sut = PipeOutput("$TMPDIR/foo.sh", None) 18 | assert(sut.substituteVariables(dict) == PipeOutput("/tmp/foo.sh", None)) 19 | } 20 | 21 | } 22 | 23 | describe("FileOutput") { 24 | 25 | it("variable substitution") { 26 | dict.addString("TMPDIR", "/tmp") 27 | val sut = FileOutput("$TMPDIR/foo.json", None) 28 | assert(sut.substituteVariables(dict) == FileOutput("/tmp/foo.json", None)) 29 | } 30 | 31 | } 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/ValidatorSpecifiedFormatLoaderSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers.{mkConfig, mkDict} 5 | import com.target.data_validator.validator.{ColumnMaxCheck, MinNumRows, NegativeCheck, NullCheck} 6 | import org.scalatest.matchers.should.Matchers 7 | import org.scalatest.wordspec.AnyWordSpec 8 | 9 | class ValidatorSpecifiedFormatLoaderSpec extends AnyWordSpec with Matchers with TestingSparkSession { 10 | "ValidatorSpecifiedFormatLoader" should { 11 | "load json" in { 12 | val loader = ValidatorSpecifiedFormatLoader( 13 | format = "json", 14 | keyColumns = Some(List("age")), 15 | condition = None, 16 | checks = List( 17 | NegativeCheck("age", None), 18 | NullCheck("age", None), 19 | ColumnMaxCheck("age", JsonUtils.string2Json("68")), 20 | MinNumRows(JsonUtils.string2Json("9")) 21 | ), 22 | options = None, 23 | loadData = Some(List("src/test/resources/format_test.jsonl")) 24 | ) 25 | 26 | val didFail = loader.quickChecks(spark, mkDict())(mkConfig(List(loader))) 27 | 28 | didFail should be(false) 29 | loader.getEvents should have size 2 30 | loader.getEvents 31 | .collectFirst { case vc: ValidatorCounter => vc } 32 | .get 33 | .value should be(9) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/VarSubstitutionSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class VarSubstitutionSpec extends AnyFunSpec with Matchers { 7 | 8 | describe("VarSubstitution") { 9 | 10 | it("adding var twice fails") { 11 | val dict = new VarSubstitution() 12 | assert(!dict.addString("foo", "bar")) 13 | assert(dict.addString("foo", "baz")) 14 | assert(dict.replaceVars("$foo") == Left("bar")) 15 | } 16 | 17 | it("adding invalid variable fails") { 18 | val dict = new VarSubstitution() 19 | assert(dict.addString("99", "99")) 20 | } 21 | 22 | it("simple var substitution works") { 23 | val dict = new VarSubstitution 24 | assert(!dict.addString("animal", "fox")) 25 | assert(dict.replaceVars("The quick brown $animal.") == Left("The quick brown fox.")) 26 | } 27 | 28 | 
it("simple var substitution works for scala type variables") { 29 | val dict = new VarSubstitution 30 | assert(!dict.addString("animal", "fox")) 31 | assert(dict.replaceVars("The quick brown ${animal}.") == Left("The quick brown fox.")) 32 | } 33 | 34 | it("missing var produces error") { 35 | val dict = new VarSubstitution 36 | assert( 37 | dict.replaceVars("The quick $color fox.") == 38 | Right(ValidatorError("VariableSubstitution: Can't find values for the following keys, color")) 39 | ) 40 | } 41 | 42 | it("missing scala var produces error") { 43 | val dict = new VarSubstitution 44 | assert( 45 | dict.replaceVars("The quick ${color} fox.") == 46 | Right(ValidatorError("VariableSubstitution: Can't find values for the following keys, color")) 47 | ) 48 | } 49 | 50 | it("adding map works") { 51 | val dict = new VarSubstitution 52 | dict.addMap(Map[String, String]("one" -> "1", "two" -> "2")) 53 | assert(dict.dict.size == 2) 54 | assert(dict.replaceVars("$one, $two") == Left("1, 2")) 55 | } 56 | 57 | it("short 1 char variables") { 58 | val dict = new VarSubstitution 59 | dict.addString("f", "foo") 60 | assert(dict.replaceVars("${f}|$f") == Left("foo|foo")) // scalastyle:ignore 61 | } 62 | 63 | describe("VarSubstitution.replaceAll") { 64 | 65 | it("single replacement") { 66 | assert(VarSubstitution.replaceAll("This is a test.", " a ", " not a ") == "This is not a test.") 67 | } 68 | 69 | it("multiple replacements") { 70 | assert(VarSubstitution.replaceAll("$o $o $o", "$o", "one") == "one one one") 71 | } 72 | 73 | it("no replacement") { 74 | val str = "String with nothing to replace." 75 | assert(VarSubstitution.replaceAll(str, "xx", "yy") == str) 76 | } 77 | 78 | } 79 | 80 | describe("VarSubstitution.getVarName") { 81 | 82 | it("normal var") { 83 | assert(VarSubstitution.getVarName("$foo").contains("foo")) 84 | } 85 | 86 | it("scala type var") { 87 | assert(VarSubstitution.getVarName("${foo}").contains("foo")) 88 | } 89 | 90 | it("bad scala type variable") { 91 | assert(VarSubstitution.getVarName("${foo").isEmpty) 92 | } 93 | 94 | } 95 | 96 | describe("VarSubstitution findVars") { 97 | 98 | it("approves of simple var") { 99 | assert(VarSubstitution.findVars("$one, $two, $three") == Set("$one", "$two", "$three")) 100 | } 101 | 102 | it("approves of scala vars") { 103 | assert(VarSubstitution.findVars("${one}, ${two}, ${three}") == Set("${one}", "${two}", "${three}")) 104 | } 105 | 106 | it("does find bad vars") { 107 | assert(VarSubstitution.findVars("$11, $6nop, ${junk") == Set.empty) 108 | } 109 | 110 | } 111 | 112 | describe("VarSubstitution isVar") { 113 | 114 | it("simple var") { 115 | assert(VarSubstitution.isVariable("$foo")) 116 | } 117 | 118 | it("scala var") { 119 | assert(VarSubstitution.isVariable("${foo}")) 120 | } 121 | 122 | it("not a var") { 123 | assert(!VarSubstitution.isVariable("foo")) 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/FirstPassStatsAggregatorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class FirstPassStatsAggregatorSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | describe("FirstPassStatsAggregator") { 10 | 11 | it("should correctly calculate the count, mean, min 
and max values") { 12 | 13 | import spark.implicits._ 14 | val data = NumericData.data.toDS 15 | 16 | val agg1 = new FirstPassStatsAggregator 17 | val stats = data 18 | .select(agg1(data("value1")).as("stats")) 19 | .select( 20 | "stats.count", 21 | "stats.mean", 22 | "stats.min", 23 | "stats.max" 24 | ) 25 | .as[FirstPassStats] 26 | .collect 27 | 28 | stats.headOption match { 29 | case Some(s) => 30 | assert(s.count === NumericData.firstPassStats.count) 31 | assert(s.mean === NumericData.firstPassStats.mean) 32 | assert(s.min === NumericData.firstPassStats.min) 33 | assert(s.max === NumericData.firstPassStats.max) 34 | case None => assert(false) 35 | } 36 | 37 | } 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/NumericData.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | case class NumericData(value1: Double) 4 | 5 | object NumericData { 6 | 7 | val data: Seq[NumericData] = Seq( 8 | NumericData(0.0), 9 | NumericData(1.0), 10 | NumericData(2.0), 11 | NumericData(3.0), 12 | NumericData(4.0), 13 | NumericData(5.0), 14 | NumericData(6.0), 15 | NumericData(7.0), 16 | NumericData(8.0), 17 | NumericData(9.0) 18 | ) 19 | 20 | // scalastyle:off 21 | val firstPassStats = FirstPassStats(10, 4.5, 0, 9) 22 | val secondPassStats = SecondPassStats( 23 | 3.0276503540974917, 24 | Histogram( 25 | Seq( 26 | Bin(0.0, 0.9, 1), 27 | Bin(0.9, 1.8, 1), 28 | Bin(1.8, 2.7, 1), 29 | Bin(2.7, 3.6, 1), 30 | Bin(3.6, 4.5, 1), 31 | Bin(4.5, 5.4, 1), 32 | Bin(5.4, 6.3, 1), 33 | Bin(6.3, 7.2, 1), 34 | Bin(7.2, 8.1, 1), 35 | Bin(8.1, 9.0, 1) 36 | ) 37 | ) 38 | ) 39 | // scalastyle:on 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/stats/SecondPassStatsAggregatorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.stats 2 | 3 | import com.target.TestingSparkSession 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class SecondPassStatsAggregatorSpec extends AnyFunSpec with Matchers with TestingSparkSession { 8 | 9 | describe("SecondPassStatsAggregator") { 10 | 11 | import spark.implicits._ 12 | val data = NumericData.data.toDS 13 | 14 | it("should correctly calculate the standard deviation and histogram") { 15 | val stats1 = NumericData.firstPassStats 16 | val agg2 = new SecondPassStatsAggregator(stats1) 17 | 18 | val stats2 = data 19 | .select(agg2(data("value1")).as("stats")) 20 | .select( 21 | "stats.stdDev", 22 | "stats.histogram" 23 | ) 24 | .as[SecondPassStats] 25 | .collect 26 | 27 | stats2.headOption match { 28 | case Some(s) => 29 | assert(s.stdDev === NumericData.secondPassStats.stdDev) 30 | assert(s.histogram === NumericData.secondPassStats.histogram) 31 | case None => assert(false) 32 | } 33 | 34 | } 35 | 36 | it("should freely convert from spark Row type with the provided companion function") { 37 | val stats1 = NumericData.firstPassStats 38 | val agg2 = new SecondPassStatsAggregator(stats1) 39 | val outputRow = data.select(agg2(data("value1"))).head 40 | val outputStruct = outputRow.getStruct(0) 41 | 42 | SecondPassStats.fromRowRepr(outputStruct) shouldBe NumericData.secondPassStats 43 | } 44 | 45 | } 46 | 47 | } 48 | 
-------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/ColStatsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import com.target.data_validator.stats._ 6 | import io.circe.Json 7 | import org.scalatest._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | // scalastyle:off magic.number 12 | class ColStatsSpec extends AnyFunSpec with Matchers with TestingSparkSession { 13 | import spark.implicits._ 14 | 15 | describe("ColStats + ValidatorDataFrame") { 16 | 17 | val variables = new VarSubstitution 18 | val sampleDS = spark.createDataset(ColStatsSpec.sample) 19 | val validatorTable = ValidatorDataFrame( 20 | df = sampleDS.toDF, 21 | checks = List( 22 | new ColStats("a"), 23 | new ColStats("b"), 24 | new NullCheck("a", None), 25 | new NullCheck("b", None), 26 | new ColumnSumCheck("a", minValue = Some(Json.fromInt(0))), 27 | new ColumnSumCheck("b", minValue = Some(Json.fromInt(0))) 28 | ), 29 | keyColumns = None, 30 | condition = None 31 | ) 32 | val validatorConfig = ValidatorConfig(0, 5, None, true, None, None, List(validatorTable)) 33 | 34 | it("should run ColStats alongside other row and column based checks without error") { 35 | validatorConfig.quickChecks(spark, variables) shouldBe false 36 | validatorConfig.costlyChecks(spark, variables) shouldBe false 37 | } 38 | 39 | it("should generate the appropriate ColStats entries in report.json") { 40 | val report = validatorConfig.genJsonReport(variables)(spark) 41 | val summaries = report \\ "events" flatMap { json => 42 | json.as[Seq[CompleteStats]] match { 43 | case Right(summary) => summary 44 | case _ => Seq.empty 45 | } 46 | } 47 | 48 | summaries.toSet shouldBe Set(ColStatsSpec.statsA, ColStatsSpec.statsB) 49 | } 50 | 51 | } 52 | 53 | } 54 | 55 | object ColStatsSpec { 56 | 57 | case class Sample(a: Long, b: Double) 58 | 59 | val sample = Seq( 60 | Sample(2, 0.3922), 61 | Sample(3, 0.4765), 62 | Sample(4, 0.1918), 63 | Sample(5, 0.0536), 64 | Sample(6, 0.4949), 65 | Sample(7, 0.5810), 66 | Sample(8, 0.2978), 67 | Sample(9, 0.0729), 68 | Sample(10, 0.868), 69 | Sample(11, 0.325), 70 | Sample(12, 0.305), 71 | Sample(13, 0.217), 72 | Sample(14, 0.193), 73 | Sample(15, 0.405), 74 | Sample(16, 0.443), 75 | Sample(17, 0.103), 76 | Sample(18, 0.435), 77 | Sample(19, 0.953), 78 | Sample(20, 0.519), 79 | Sample(21, 0.958) 80 | ) 81 | 82 | val statsA = CompleteStats( 83 | "`a` stats", 84 | "a", 85 | 20, 86 | 11.5, 87 | 2.0, 88 | 21.0, 89 | 5.916079783099616, 90 | Histogram( 91 | Seq( 92 | Bin(2.0, 3.9, 2), 93 | Bin(3.9, 5.8, 2), 94 | Bin(5.8, 7.699999999999999, 2), 95 | Bin(7.699999999999999, 9.6, 2), 96 | Bin(9.6, 11.5, 2), 97 | Bin(11.5, 13.399999999999999, 2), 98 | Bin(13.399999999999999, 15.299999999999999, 2), 99 | Bin(15.299999999999999, 17.2, 2), 100 | Bin(17.2, 19.099999999999998, 2), 101 | Bin(19.099999999999998, 21.0, 2) 102 | ) 103 | ) 104 | ) 105 | 106 | val statsB = CompleteStats( 107 | "`b` stats", 108 | "b", 109 | 20, 110 | 0.414235, 111 | 0.0536, 112 | 0.958, 113 | 0.26725316654123255, 114 | Histogram( 115 | Seq( 116 | Bin(0.0536, 0.14404, 3), 117 | Bin(0.14404, 0.23448, 3), 118 | Bin(0.23448, 0.32492, 2), 119 | Bin(0.32492, 0.41535999999999995, 3), 120 | Bin(0.41535999999999995, 0.5057999999999999, 4), 121 | 
Bin(0.5057999999999999, 0.59624, 2), 122 | Bin(0.59624, 0.68668, 0), 123 | Bin(0.68668, 0.7771199999999999, 0), 124 | Bin(0.7771199999999999, 0.8675599999999999, 0), 125 | Bin(0.8675599999999999, 0.958, 3) 126 | ) 127 | ) 128 | ) 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/ColumnBasedSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import io.circe.Json 6 | import io.circe.parser._ 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.types._ 9 | import org.scalatest._ 10 | 11 | import scala.collection.immutable.ListMap 12 | import org.scalatest.funspec.AnyFunSpec 13 | import org.scalatest.matchers.should.Matchers 14 | 15 | class ColumnBasedSpec extends AnyFunSpec with Matchers with TestingSparkSession { 16 | 17 | describe("columnMaxCheck") { 18 | 19 | val schema = StructType( 20 | List( 21 | StructField("key", StringType), 22 | StructField("data", StringType), 23 | StructField("number", IntegerType), 24 | StructField("byte", ByteType), 25 | StructField("double", DoubleType) 26 | ) 27 | ) 28 | 29 | val sampleData = List( 30 | Row("one", "2018/10/01", 3, 10.toByte, 2.0), 31 | Row("two", "2018/10/02", 2, 20.toByte, 3.5), 32 | Row("three", "2018/10/31", 1, 30.toByte, 1.7) 33 | ) 34 | 35 | def mkValidatorConfig(checks: List[ValidatorBase]): ValidatorConfig = 36 | ValidatorConfig( 37 | 1, 38 | 10, // scalastyle:ignore magic.number 39 | None, 40 | detailedErrors = false, 41 | None, 42 | None, 43 | List(ValidatorDataFrame(spark.createDataFrame(sc.parallelize(sampleData), schema), None, None, checks)) 44 | ) 45 | 46 | it("should be able to be configured from json/YAML") { 47 | val json = """{ "type": "columnMaxCheck", "column": "rel_d", "value": "2018/10/20" }""" 48 | assert( 49 | decode[ValidatorBase](json)(JsonDecoders.decodeChecks) == 50 | Right(ColumnMaxCheck("rel_d", Json.fromString("2018/10/20"))) 51 | ) 52 | } 53 | 54 | it("should fail when column doesn't exist") { 55 | val dict = new VarSubstitution 56 | val sut = mkValidatorConfig(List(ColumnMaxCheck("junk", Json.fromString("2018/10/31")))) 57 | assert(sut.configCheck(spark, dict)) 58 | assert(sut.failed) 59 | } 60 | 61 | it("should not fail when value matches max column value") { 62 | val dict = new VarSubstitution 63 | val sut = mkValidatorConfig(List(ColumnMaxCheck("data", Json.fromString("2018/10/31")))) 64 | assert(!sut.configCheck(spark, dict)) 65 | assert(!sut.quickChecks(spark, dict)) 66 | assert(!sut.failed) 67 | } 68 | 69 | it("should fail when value doesn't match max column value") { 70 | val dict = new VarSubstitution 71 | val columnMaxCheck = ColumnMaxCheck("data", Json.fromString("2018/11/01")) 72 | val sut = mkValidatorConfig(List(columnMaxCheck)) 73 | assert(!sut.configCheck(spark, dict)) 74 | assert(sut.quickChecks(spark, dict)) 75 | assert(sut.failed) 76 | assert( 77 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 78 | failure = true, 79 | ListMap("expected" -> "2018/11/01", "actual" -> "2018/10/31"), 80 | "ColumnMaxCheck data[StringType]: Expected: 2018/11/01 Actual: 2018/10/31" 81 | ) 82 | ) 83 | } 84 | 85 | it("should not fail with numeric column matches max value") { 86 | val dict = new VarSubstitution 87 | val sut = mkValidatorConfig(List(ColumnMaxCheck("number", Json.fromInt(3)))) 88 | 
assert(!sut.configCheck(spark, dict)) 89 | assert(!sut.quickChecks(spark, dict)) 90 | assert(!sut.failed) 91 | } 92 | 93 | it("should fail when numeric column doesn't match max value") { 94 | val dict = new VarSubstitution 95 | val columnMaxCheck = ColumnMaxCheck("number", Json.fromInt(100)) // scalastyle:ignore magic.number 96 | val sut = mkValidatorConfig(List(columnMaxCheck)) 97 | assert(!sut.configCheck(spark, dict)) 98 | assert(sut.quickChecks(spark, dict)) 99 | assert(sut.failed) 100 | assert( 101 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 102 | failure = true, 103 | ListMap("expected" -> "100", "actual" -> "3", "relative_error" -> "97.00%"), 104 | "ColumnMaxCheck number[IntegerType]: Expected: 100 Actual: 3 Relative Error: 97.00%" 105 | ) 106 | ) 107 | } 108 | 109 | it("should fail with undefined error % when numeric column doesn't match max value and expected value is 0") { 110 | val dict = new VarSubstitution 111 | val columnMaxCheck = ColumnMaxCheck("number", Json.fromInt(0)) 112 | val sut = mkValidatorConfig(List(columnMaxCheck)) 113 | assert(!sut.configCheck(spark, dict)) 114 | assert(sut.quickChecks(spark, dict)) 115 | assert(sut.failed) 116 | assert( 117 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 118 | failure = true, 119 | ListMap("expected" -> "0", "actual" -> "3", "relative_error" -> "undefined"), 120 | "ColumnMaxCheck number[IntegerType]: Expected: 0 Actual: 3 Relative Error: undefined" 121 | ) 122 | ) 123 | } 124 | 125 | it("should not fail when double column matches max value") { 126 | val dict = new VarSubstitution 127 | val sut = mkValidatorConfig(List(ColumnMaxCheck("double", Json.fromDouble(3.5).get))) 128 | assert(!sut.configCheck(spark, dict)) 129 | assert(!sut.quickChecks(spark, dict)) 130 | assert(!sut.failed) 131 | } 132 | 133 | it("should fail when double column doesn't match max value") { 134 | val dict = new VarSubstitution 135 | val columnMaxCheck = ColumnMaxCheck("double", Json.fromDouble(5.0).get) 136 | val sut = mkValidatorConfig(List(columnMaxCheck)) 137 | assert(!sut.configCheck(spark, dict)) 138 | assert(sut.quickChecks(spark, dict)) 139 | assert(sut.failed) 140 | assert( 141 | columnMaxCheck.getEvents contains ColumnBasedValidatorCheckEvent( 142 | failure = true, 143 | ListMap("expected" -> "5.0", "actual" -> "3.5", "relative_error" -> "30.00%"), 144 | "ColumnMaxCheck double[DoubleType]: Expected: 5.0 Actual: 3.5 Relative Error: 30.00%" 145 | ) 146 | ) 147 | } 148 | 149 | it("should fail when byte column and value overflows") { 150 | val dict = new VarSubstitution 151 | val sut = mkValidatorConfig(List(ColumnMaxCheck("byte", Json.fromInt(1000)))) // scalastyle:ignore 152 | assert(sut.configCheck(spark, dict)) 153 | assert(sut.failed) 154 | } 155 | 156 | it("should fail when byte column and string value") { 157 | val dict = new VarSubstitution 158 | val sut = mkValidatorConfig(List(ColumnMaxCheck("byte", Json.fromString("bit")))) 159 | assert(sut.configCheck(spark, dict)) 160 | assert(sut.failed) 161 | } 162 | 163 | it("variable substitution should produce VarSubJsonEvent()") { 164 | val vars = new VarSubstitution 165 | vars.addString("col", "byte") 166 | val sut = ColumnMaxCheck("${col}", Json.fromInt(100)).substituteVariables(vars) // scalastyle:ignore 167 | assert(!sut.failed) 168 | assert(sut.getEvents contains VarSubEvent("${col}", "byte")) 169 | } 170 | 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- 
/src/test/scala/com/target/data_validator/validator/ConfigVarSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import cats.syntax.either._ 4 | import com.target.TestingSparkSession 5 | import com.target.data_validator._ 6 | import com.target.data_validator.ConfigVar._ 7 | import io.circe._ 8 | import io.circe.generic.auto._ 9 | import io.circe.parser._ 10 | import io.circe.syntax._ 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.types.{IntegerType, StructField, StructType} 13 | 14 | import scala.util.Random 15 | import org.scalatest.funspec.AnyFunSpec 16 | import org.scalatest.matchers.should.Matchers 17 | 18 | class ConfigVarSpec extends AnyFunSpec with Matchers with TestingSparkSession { 19 | 20 | describe("ConfigVar") { 21 | 22 | describe("NameValue") { 23 | 24 | it("from Json snippet") { 25 | val json: Json = 26 | parse("""{ "name": "foo", "value": "bar" }""").getOrElse(Json.Null) 27 | val sut = json.as[ConfigVar] 28 | assert(sut == Right(NameValue("foo", Json.fromString("bar")))) 29 | } 30 | 31 | it("addEntry works") { 32 | val bar = Json.fromString("bar") 33 | val sut = NameValue("foo", bar) 34 | val varSub = new VarSubstitution 35 | assert(!sut.addEntry(spark, varSub)) 36 | assert(varSub.dict.get("foo") contains bar) 37 | } 38 | 39 | it("asJson works") { 40 | val sut = NameValue("bar", Json.fromString("foo")) 41 | assert(sut.asJson.noSpaces == """{"name":"bar","value":"foo"}""") 42 | } 43 | 44 | it("var sub in value") { 45 | val varSub = new VarSubstitution 46 | assert(!varSub.addString("four", "4")) 47 | val sut = NameValue("foo", Json.fromString("${four} score")) 48 | assert(!sut.addEntry(spark, varSub)) 49 | assert(varSub.dict.get("foo").contains(Json.fromString("4 score"))) 50 | } 51 | 52 | it("var sub fails when value doesn't exist") { 53 | val varSub = new VarSubstitution 54 | val sut = NameValue("foo", Json.fromString("${four} score")) 55 | assert(sut.addEntry(spark, varSub)) 56 | } 57 | 58 | } 59 | 60 | describe("NameEnv") { 61 | 62 | it("from Json snippet") { 63 | val json: Json = parse("""{ "name":"foo", "env":"ENV"}""").getOrElse(Json.Null) 64 | val sut = json.as[ConfigVar] 65 | assert(sut == Right(NameEnv("foo", "ENV"))) 66 | } 67 | 68 | it("addEntry works") { 69 | val sut = NameEnv("java_home", "JAVA_HOME") 70 | val varSub = new VarSubstitution 71 | assert(!sut.addEntry(spark, varSub)) 72 | assert(varSub.dict.get("java_home") contains Json.fromString(System.getenv("JAVA_HOME"))) 73 | } 74 | 75 | it("asJson works") { 76 | val sut = NameEnv("foo", "bar") 77 | assert(sut.asJson.noSpaces == """{"name":"foo","env":"bar"}""") 78 | } 79 | 80 | it("var sub in env value") { 81 | val sut = NameEnv("java_home", "JAVA_${h}") 82 | val varSub = new VarSubstitution 83 | assert(!varSub.addString("h", "HOME")) 84 | assert(!sut.addEntry(spark, varSub)) 85 | assert(varSub.dict("java_home").asString contains System.getenv("JAVA_HOME")) 86 | } 87 | 88 | it("var sub fails when value doesn't exist") { 89 | val sut = NameEnv("java_home", "JAVA_${h}") 90 | val varSub = new VarSubstitution 91 | assert(sut.addEntry(spark, varSub)) 92 | } 93 | 94 | } 95 | 96 | describe("NameShell") { 97 | 98 | it("from Json snippet") { 99 | val json: Json = parse("""{ "name":"foo", "shell":"false"}""").getOrElse(Json.Null) 100 | val sut = json.as[ConfigVar] 101 | assert(sut == Right(NameShell("foo", "false"))) 102 | } 103 | 104 | it("addEntry works as expected") { 105 | val sut = 
NameShell("one", "echo 1") 106 | val varSub = new VarSubstitution 107 | assert(!sut.addEntry(spark, varSub)) 108 | assert(varSub.dict("one") == Json.fromInt(1)) 109 | } 110 | 111 | it("asJson works") { 112 | val sut = NameShell("one", "echo 1") 113 | assert(sut.asJson.noSpaces == """{"name":"one","shell":"echo 1"}""") 114 | } 115 | 116 | it("bad command works as expected") { 117 | val sut = NameShell("one", "/bad/command") 118 | val varSub = new VarSubstitution 119 | assert(sut.addEntry(spark, varSub)) 120 | assert(EventLog.events exists { 121 | case ValidatorError(msg) => 122 | msg.startsWith("NameShell(one, /bad/command) Ran but returned exitCode: 127 stderr:") 123 | case _ => false 124 | }) 125 | } 126 | 127 | it("no output works as expected") { 128 | val sut = NameShell("one", "true") 129 | val varSub = new VarSubstitution 130 | assert(sut.addEntry(spark, varSub)) 131 | assert(!varSub.dict.contains("one")) 132 | } 133 | 134 | it("command failing works as expected") { 135 | val sut = NameShell("one", "echo 1 && false") 136 | val varSub = new VarSubstitution 137 | assert(sut.addEntry(spark, varSub)) 138 | assert(!varSub.dict.contains("one")) 139 | assert( 140 | EventLog.events contains 141 | ValidatorError("NameShell(one, echo 1 && false) Ran but returned exitCode: 1 stderr: ") 142 | ) 143 | } 144 | 145 | it("variable substitution in command works") { 146 | val varSub = new VarSubstitution 147 | val valueJson = Json.fromInt(Random.nextInt) 148 | varSub.add("one", valueJson) 149 | val sut = NameShell("one", "echo $one") 150 | assert(!sut.addEntry(spark, varSub)) 151 | assert(varSub.dict("one") == valueJson) 152 | } 153 | } 154 | 155 | describe("NameSql") { 156 | 157 | it("from Json snippet") { 158 | val json: Json = parse("""{ "name":"foo", "sql":"select 1"}""").getOrElse(Json.Null) 159 | val sut = json.as[ConfigVar] 160 | assert(sut == Right(NameSql("foo", "select 1"))) 161 | } 162 | 163 | it("addEntry works as expected") { 164 | val sut = NameSql("one", "select 1") 165 | val varSub = new VarSubstitution 166 | assert(!sut.addEntry(spark, varSub)) 167 | assert(varSub.dict("one") == Json.fromInt(1)) 168 | } 169 | 170 | it("asJson works") { 171 | val sut = NameSql("one", "select 1") 172 | assert(sut.asJson.noSpaces == """{"name":"one","sql":"select 1"}""") 173 | } 174 | 175 | it("bad sql works as expected") { 176 | val sut = NameSql("one", "bad sql") 177 | val varSub = new VarSubstitution 178 | assert(sut.addEntry(spark, varSub)) 179 | } 180 | 181 | it("empty query") { 182 | val schema = StructType(List(StructField("data", IntegerType))) 183 | val df = spark.createDataFrame(sc.parallelize(List(Row(10))), schema) // scalastyle:ignore 184 | df.createTempView("MyTable") 185 | val sut = NameSql("one", "select data from MyTable where data < 10") 186 | val varSub = new VarSubstitution 187 | assert(sut.addEntry(spark, varSub)) 188 | } 189 | 190 | } 191 | 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/Mocker.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.data_validator._ 4 | import org.apache.spark.sql.types.StructType 5 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 6 | import io.circe.Json 7 | 8 | trait Mocker { 9 | 10 | def mkDataFrame(spark: SparkSession, data: List[Row], schema: StructType): DataFrame = 11 | 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 12 | 13 | def mkParams(params: List[Tuple2[String, Any]] = List.empty): VarSubstitution = { 14 | val dict = new VarSubstitution 15 | params.foreach { pair => 16 | pair._2 match { 17 | case p: Json => dict.add(pair._1, pair._2.asInstanceOf[Json]) 18 | case p: String => dict.addString(pair._1, pair._2.asInstanceOf[String]) 19 | } 20 | } 21 | dict 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/NegativeCheckSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers.{mkDf, mkDict, parseYaml} 5 | import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame, ValidatorError} 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class NegativeCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | describe("NegativeCheck") { 12 | 13 | describe("config parsing") { 14 | it("basic config") { 15 | val json = parseYaml(""" 16 | |type: negativeCheck 17 | |column: foo 18 | """.stripMargin) 19 | 20 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 21 | assert(sut == Right(NegativeCheck("foo", None))) 22 | } 23 | 24 | it("optional threshold") { 25 | val json = parseYaml(""" 26 | |type: negativeCheck 27 | |column: foo 28 | |threshold: 10.0% 29 | """.stripMargin) 30 | 31 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 32 | assert(sut == Right(NegativeCheck("foo", Some("10.0%")))) 33 | } 34 | it("config error") { 35 | val json = parseYaml(""" 36 | |type: negativeCheck 37 | |garbage 38 | """.stripMargin) 39 | 40 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 41 | assert(sut.isLeft) // Todo: Maybe add better check. Left is generic failure.
42 | } 43 | 44 | } 45 | 46 | describe("variable substitution") { 47 | it("success substitution") { 48 | var dict = mkDict("threshold" -> "20%", "column" -> "foo") 49 | var sut = NegativeCheck("$column", Some("$threshold")) 50 | assert(sut.substituteVariables(dict) == NegativeCheck("foo", Some("20%"))) 51 | assert(!sut.failed) 52 | } 53 | 54 | it("error on substitution issues") { 55 | var dict = mkDict() 56 | var sut = NegativeCheck("$column", Some("$threshold")) 57 | assert(sut.substituteVariables(dict) == sut) 58 | assert(sut.failed) 59 | assert( 60 | sut.getEvents contains 61 | ValidatorError( 62 | "VariableSubstitution: Can't find values for the following keys, " 63 | + "column" 64 | ) 65 | ) 66 | assert( 67 | sut.getEvents contains 68 | ValidatorError( 69 | "VariableSubstitution: Can't find values for the following keys, " 70 | + "threshold" 71 | ) 72 | ) 73 | } 74 | } 75 | 76 | describe("check configuration") { 77 | it("Column Exists") { 78 | val df = mkDf(spark = spark, "price" -> List(1.99)) 79 | val sut = NegativeCheck("price", None) 80 | assert(!sut.configCheck(df)) 81 | } 82 | 83 | it("Column doesn't exist") { 84 | val df = mkDf(spark = spark, "price" -> List(1.99)) 85 | val sut = NegativeCheck("junk", None) 86 | assert(sut.configCheck(df)) 87 | assert(sut.failed) 88 | assert(sut.getEvents contains ValidatorError("Column: junk not found in schema.")) 89 | } 90 | 91 | it("Column exists but is wrong type") { 92 | val df = mkDf(spark = spark, "item" -> List("eggs")) 93 | val sut = NegativeCheck("item", None) 94 | assert(sut.configCheck(df)) 95 | assert(sut.failed) 96 | assert(sut.getEvents contains ValidatorError("Column: item found, but not of numericType type: StringType")) 97 | } 98 | 99 | } 100 | 101 | describe("functionality") { 102 | 103 | it("success") { 104 | val df = mkDf(spark, "price" -> List(1.99, 1.50, 2.50)) // scalastyle:ignore 105 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", None))) 106 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 107 | assert(!sut.quickChecks(spark, mkDict())(config)) 108 | assert(!sut.failed) 109 | } 110 | 111 | it("fails") { 112 | val df = mkDf(spark, "price" -> List(1.99, -1.50, 2.50)) // scalastyle:ignore 113 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", None))) 114 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 115 | assert(sut.quickChecks(spark, mkDict())(config)) 116 | assert(sut.failed) 117 | 118 | } 119 | 120 | it("threshold success") { 121 | val df = mkDf(spark, "price" -> List(1.99, -1.50, 2.50)) // scalastyle:ignore 122 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", Some("1")))) 123 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 124 | assert(!sut.quickChecks(spark, mkDict())(config)) 125 | assert(!sut.failed) 126 | 127 | } 128 | 129 | it("threshold failure") { 130 | val df = mkDf(spark, "price" -> List(1.99, -1.50, -2.50)) // scalastyle:ignore 131 | val sut = ValidatorDataFrame(df, None, None, List(NegativeCheck("price", Some("1")))) 132 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 133 | assert(sut.quickChecks(spark, mkDict())(config)) 134 | assert(sut.failed) 135 | } 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/NullCheckSpec.scala: -------------------------------------------------------------------------------- 1 | package 
com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame, ValidatorError} 5 | import com.target.data_validator.TestHelpers._ 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class NullCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 10 | 11 | describe("NullCheck") { 12 | describe("config parsing") { 13 | it("basic config") { 14 | val json = parseYaml(""" 15 | |type: nullCheck 16 | |column: foo 17 | """.stripMargin) 18 | 19 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 20 | assert(sut == Right(NullCheck("foo", None))) 21 | } 22 | 23 | it("optional threshold") { 24 | val json = parseYaml(""" 25 | |type: nullCheck 26 | |column: foo 27 | |threshold: 10.0% 28 | """.stripMargin) 29 | 30 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 31 | assert(sut == Right(NullCheck("foo", Some("10.0%")))) 32 | } 33 | it("config error") { 34 | val json = parseYaml(""" 35 | |type: nullCheck 36 | |garbage 37 | """.stripMargin) 38 | 39 | val sut = JsonDecoders.decodeChecks.decodeJson(json) 40 | assert(sut.isLeft) // Todo: Maybe add better check. Left is generic failure. 41 | } 42 | 43 | } 44 | 45 | describe("variable substitution") { 46 | it("success substitution") { 47 | val dict = mkDict("threshold" -> "20%", "column" -> "foo") 48 | val sut = NullCheck("$column", Some("$threshold")) 49 | assert(sut.substituteVariables(dict) == NullCheck("foo", Some("20%"))) 50 | assert(!sut.failed) 51 | } 52 | 53 | it("error on substitution issues") { 54 | val dict = mkDict() 55 | val sut = NullCheck("$column", Some("$threshold")) 56 | assert(sut.substituteVariables(dict) == sut) 57 | assert(sut.failed) 58 | assert( 59 | sut.getEvents contains 60 | ValidatorError( 61 | "VariableSubstitution: Can't find values for the following keys, " 62 | + "column" 63 | ) 64 | ) 65 | assert( 66 | sut.getEvents contains 67 | ValidatorError( 68 | "VariableSubstitution: Can't find values for the following keys, " 69 | + "threshold" 70 | ) 71 | ) 72 | } 73 | } 74 | 75 | describe("checkconfiguration") { 76 | 77 | it("Column Exists") { 78 | val df = mkDf(spark = spark, "item" -> List("Eggs")) 79 | val sut = NullCheck("item", None) 80 | assert(!sut.configCheck(df)) 81 | } 82 | 83 | it("Column doesn't exist") { 84 | val df = mkDf(spark = spark, "item" -> List("Eggs"), "price" -> List(0.99), "perishable" -> List(true)) 85 | val sut = NullCheck("junk", None) 86 | assert(sut.configCheck(df)) 87 | assert(sut.failed) 88 | assert(sut.getEvents contains ValidatorError("Column: junk not found in schema.")) 89 | } 90 | } 91 | 92 | describe("functionality") { 93 | 94 | it("success") { 95 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3")) 96 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", None))) 97 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 98 | assert(!sut.quickChecks(spark, mkDict())(config)) 99 | assert(!sut.failed) 100 | } 101 | 102 | it("fails") { 103 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null)) // scalastyle:ignore 104 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", None))) 105 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 106 | assert(sut.quickChecks(spark, mkDict())(config)) 107 | assert(sut.failed) 108 | 109 | } 110 | 111 | it("threshold success") { 112 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null)) 
// scalastyle:ignore 113 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", Some("1")))) 114 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 115 | assert(!sut.quickChecks(spark, mkDict())(config)) 116 | assert(!sut.failed) 117 | 118 | } 119 | 120 | it("threshold failure") { 121 | val df = mkDf(spark, "item" -> List("item1", "item2", "item3", null, null)) // scalastyle:ignore 122 | val sut = ValidatorDataFrame(df, None, None, List(NullCheck("item", Some("1")))) 123 | val config = ValidatorConfig(1, 1, None, false, None, None, List.empty) 124 | assert(sut.quickChecks(spark, mkDict())(config)) 125 | assert(sut.failed) 126 | } 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/TestHelpersSpec.scala: -------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator.TestHelpers._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | class TestHelpersSpec extends AnyFunSpec with Matchers with TestingSparkSession { 12 | 13 | val data = List( 14 | "item" -> List("item1", "item2", "item3", null), // scalastyle:ignore 15 | "price" -> List(1.99, 2.99, 3.99, 0.0), 16 | "count" -> List(1, 2, 3, 0), 17 | "instock" -> List(true, false, true, false) 18 | ) 19 | 20 | val expectedSchema = StructType( 21 | List( 22 | StructField("item", StringType), 23 | StructField("price", DoubleType), 24 | StructField("count", IntegerType), 25 | StructField("instock", BooleanType) 26 | ) 27 | ) 28 | 29 | describe("parseYaml") { 30 | it("parses simple yaml") { 31 | val sut = parseYaml(""" 32 | |double: 2.01 33 | |int: 10293 34 | |string: foo 35 | |array: 36 | | - one 37 | | - two 38 | | - three 39 | """.stripMargin) 40 | assert( 41 | sut == Json.obj( 42 | ("double", Json.fromDouble(2.01).get), 43 | ("int", Json.fromInt(10293)), // scalastyle:ignore 44 | ("string", Json.fromString("foo")), 45 | ("array", Json.arr(Seq("one", "two", "three").map(Json.fromString): _*)) 46 | ) 47 | ) 48 | } 49 | } 50 | 51 | describe("mkDict") { 52 | it("simple case") { 53 | val sut = mkDict("key" -> "value") 54 | assert(sut.dict("key") == Json.fromString("value")) 55 | } 56 | } 57 | 58 | describe("guessType") { 59 | it("double") { 60 | assert(guessType(1.99) == DoubleType) // scalastyle:ignore 61 | } 62 | 63 | it("int") { 64 | assert(guessType(1) == IntegerType) // scalastyle:ignore 65 | } 66 | 67 | it("string") { 68 | assert(guessType("string") == StringType) 69 | } 70 | 71 | it("boolean") { 72 | assert(guessType(true) == BooleanType) 73 | } 74 | 75 | } 76 | 77 | describe("mkSchema") { 78 | it("simple") { 79 | assert(mkSchema(data: _*) == expectedSchema) 80 | } 81 | } 82 | 83 | describe("mkRows") { 84 | it("builds the expected rows") { 85 | assert( 86 | mkRows(data: _*) == List( 87 | Row("item1", 1.99, 1, true), 88 | Row("item2", 2.99, 2, false), 89 | Row("item3", 3.99, 3, true), 90 | Row(null, 0.0, 0, false) 91 | ) 92 | ) // scalastyle:ignore 93 | } 94 | } 95 | // mkDf 96 | 97 | describe("mkDf") {} 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/test/scala/com/target/data_validator/validator/UniqueCheckSpec.scala:
-------------------------------------------------------------------------------- 1 | package com.target.data_validator.validator 2 | 3 | import com.target.TestingSparkSession 4 | import com.target.data_validator._ 5 | import io.circe.Json 6 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | class UniqueCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession { 12 | 13 | val schema = StructType( 14 | List(StructField("item", StringType), StructField("location", IntegerType), StructField("price", DoubleType)) 15 | ) 16 | 17 | val defData = List(Row("Eggs", 1, 4.00), Row("Milk", 1, 10.27), Row("Eggs", 1, 5.00), Row("Eggs", 2, 2.00)) 18 | def mkDataFrame(spark: SparkSession, data: List[Row]): DataFrame = 19 | spark.createDataFrame(sc.parallelize(data), schema) 20 | 21 | describe("fromJson") { 22 | it("create fromJson") { 23 | import com.target.data_validator.validator.JsonDecoders.decodeChecks 24 | val yaml = 25 | """--- 26 | |- type: uniqueCheck 27 | | columns: 28 | | - foo 29 | | - bar 30 | """.stripMargin 31 | val json = io.circe.yaml.parser.parse(yaml).right.getOrElse(Json.Null) 32 | val sut = json.as[Array[ValidatorBase]] 33 | assert(sut.isRight) 34 | assert(sut.right.get contains UniqueCheck(Array("foo", "bar"))) 35 | } 36 | } 37 | 38 | describe("substituteVariables") { 39 | it("replaces variables") { 40 | val dict = new VarSubstitution 41 | dict.addString("col1", "foo") 42 | dict.addString("col2", "bar") 43 | val sut = UniqueCheck(List("${col1}", "$col2")) 44 | assert(sut.substituteVariables(dict) == UniqueCheck(List("foo", "bar"))) 45 | assert(!sut.failed) 46 | } 47 | 48 | } 49 | 50 | describe("configCheck") { 51 | it("good columns") { 52 | val sut = UniqueCheck(List("item", "location")) 53 | val df = mkDataFrame(spark, defData) 54 | assert(!sut.configCheck(df)) 55 | assert(!sut.failed) 56 | } 57 | 58 | it("bad column") { 59 | val sut = UniqueCheck(List("item", "city")) 60 | val df = mkDataFrame(spark, defData) 61 | assert(sut.configCheck(df)) 62 | assert(sut.failed) 63 | } 64 | 65 | } 66 | 67 | describe("costlyCheck") { 68 | 69 | it("finds error") { 70 | val sut = UniqueCheck(Seq("item")) 71 | val df = mkDataFrame(spark, defData) 72 | assert(sut.costlyCheck(df)) 73 | assert(sut.failed) 74 | assert(sut.getEvents contains ValidatorError("1 duplicates found!")) 75 | } 76 | 77 | it("finds error with multiple columns") { 78 | val sut = UniqueCheck(Seq("item", "location")) 79 | val df = mkDataFrame(spark, defData) 80 | assert(sut.costlyCheck(df)) 81 | assert(sut.failed) 82 | assert(sut.getEvents contains ValidatorError("1 duplicates found!")) 83 | } 84 | 85 | it("no error") { 86 | val sut = UniqueCheck(Seq("price")) 87 | val df = mkDataFrame(spark, defData) 88 | assert(!sut.costlyCheck(df)) 89 | assert(!sut.failed) 90 | assert(sut.getEvents contains ValidatorGood("no duplicates found.")) 91 | } 92 | } 93 | 94 | describe("toJson") { 95 | 96 | it("generates correct json") { 97 | val sut = UniqueCheck(Seq("item")) 98 | assert( 99 | sut.toJson == Json.fromFields( 100 | Seq( 101 | ("type", Json.fromString("uniqueCheck")), 102 | ("columns", Json.fromValues(List(Json.fromString("item")))), 103 | ("failed", Json.fromBoolean(false)), 104 | ("events", Json.fromValues(Seq.empty)) 105 | ) 106 | ) 107 | ) 108 | } 109 | } 110 | 111 | describe("completeExample") { 112 | it("happy path that finds error") { 113 | val uc = 
UniqueCheck(List("item")) 114 | val dict = new VarSubstitution 115 | val df = mkDataFrame(spark, defData) 116 | val sut = ValidatorConfig( 117 | 1, 118 | 1, 119 | None, 120 | detailedErrors = false, 121 | None, 122 | None, 123 | List(ValidatorDataFrame(df, None, None, List(uc))) 124 | ) 125 | 126 | assert(!sut.configCheck(spark, dict)) 127 | assert(!sut.quickChecks(spark, dict)) 128 | assert(sut.costlyChecks(spark, dict)) 129 | assert(sut.failed) 130 | } 131 | 132 | it("happy path that doesn't find error") { 133 | val uc = UniqueCheck(List("price")) 134 | val dict = new VarSubstitution 135 | val df = mkDataFrame(spark, defData) 136 | val sut = ValidatorConfig( 137 | 1, 138 | 1, 139 | None, 140 | detailedErrors = false, 141 | None, 142 | None, 143 | List(ValidatorDataFrame(df, None, None, List(uc))) 144 | ) 145 | 146 | assert(!sut.configCheck(spark, dict)) 147 | assert(!sut.costlyChecks(spark, dict)) 148 | assert(!sut.failed) 149 | } 150 | } 151 | 152 | } 153 | --------------------------------------------------------------------------------
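Taken together, the check specs above share one shape: decode the check from YAML with TestHelpers.parseYaml and JsonDecoders.decodeChecks, run configCheck against a small DataFrame built with TestHelpers.mkDf, then drive quickChecks through a ValidatorDataFrame wrapped in a ValidatorConfig. A minimal skeleton of that pattern, shown with the existing NullCheck; the class name is hypothetical and this is an illustration only, not part of the repository:

// Hypothetical spec skeleton mirroring NullCheckSpec and NegativeCheckSpec above.
package com.target.data_validator.validator

import com.target.TestingSparkSession
import com.target.data_validator.TestHelpers._
import com.target.data_validator.{ValidatorConfig, ValidatorDataFrame}
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class CheckSpecPatternSpec extends AnyFunSpec with Matchers with TestingSparkSession {

  describe("a typical check spec") {

    it("decodes the check from YAML") {
      val json = parseYaml("type: nullCheck\ncolumn: foo")
      assert(JsonDecoders.decodeChecks.decodeJson(json) == Right(NullCheck("foo", None)))
    }

    it("runs the check as a quick check against a small DataFrame") {
      val df = mkDf(spark, "foo" -> List("a", "b", null)) // scalastyle:ignore
      val table = ValidatorDataFrame(df, None, None, List(NullCheck("foo", None)))
      val config = ValidatorConfig(1, 1, None, detailedErrors = false, None, None, List.empty)
      assert(table.quickChecks(spark, mkDict())(config)) // the null row makes the check fail
    }
  }
}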