├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── dq ├── databases │ └── DQ │ │ ├── dqaggtable.sql │ │ ├── dqfailtable.sql │ │ ├── dqrulewatermark.sql │ │ ├── dqrundetails.sql │ │ ├── dqwatermark.sql │ │ ├── entityrulemetadata.sql │ │ └── orphanedgemetadata.sql ├── pom.xml └── src │ └── main │ ├── resources │ └── application.properties │ └── scala │ └── com │ └── ms │ └── dq │ ├── framework │ ├── DQFramework.scala │ └── README.md │ ├── rules │ ├── Nullcheck.scala │ ├── Orphanedgecheck.scala │ ├── Schemacheck.scala │ └── Uniquecheck.scala │ └── support │ └── SupportTrait.scala ├── images ├── Data Quality Insights.PNG ├── Entityrulemetadata.png ├── OrphanEdgeMetadata.png └── Results.png ├── notebooks └── DQ Tables to Lake.scala └── sample ├── DataQualityInsights.pbix └── DataQualityRuleEngineUsageSample.scala /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | 
*.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Data Quality Rule Engine 2 | MIT License 3 | 4 | Copyright (c) Microsoft Corporation. 
5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Quality Rule Engine 2 | The Data Quality Rule Engine is a library that helps run data quality checks on datasets. 3 | 4 | ### Data Quality Rules implemented: 5 | * Null Check: Checks whether the values of a column are null. 6 | * Unique Check: Checks whether the values of the columns are unique. Optionally, it can also identify the latest row for a particular value of a column. 7 | * Orphan Edge Check: Checks whether the dataframe's values that refer to an id of a parent dataframe exist in the parent dataframe. 8 | * Schema Check: Checks whether the JSON representing the values of the dataframe follows a given schema. 9 | 10 | There are two ways of invoking the Data Quality Rules on a dataframe: 11 | * Metadata driven. 12 | * As a parameter to the function call. 13 | 14 | The details of the Data Quality runs and the erroneous records are maintained in reporting tables. If you do not want these recorded, set DQ_LOG_RESULTS_FLAG to false in the application.properties file. 15 | 16 | ### Requirements: 17 | The Data Quality Rule Engine is compatible with: 18 | * Apache Spark version 3.0.x 19 | * Java 8 20 | * Databricks Cluster with Spark 3.0.x configuration (used only for schema check). 21 | 22 | ### To use the Data Quality Rule Engine in a project: 23 | 1. Install the Data Quality Rule Engine to a repository. 24 | 25 | 2. Add a reference to the repository in the pom.xml of your project under the tag: 26 | `<repositories>` 27 | 28 | 3. Add the framework as a dependency in the pom.xml of your project (make sure you also have a dependency on org.scala-lang): 29 | ``` 30 | <dependency> 31 | <groupId>com.ms.dq</groupId> 32 | <artifactId>DQFramework</artifactId> 33 | <version>LATEST</version> 34 | </dependency> 35 | ``` 36 | 37 | 4. Then, simply import the class in your code: 38 | 39 | `import com.ms.dq.framework.DQFramework` 40 | 41 | ### To use the Data Quality Rule Engine as a library in a Databricks notebook: 42 | 43 | Build the Data Quality Rule Engine jar and upload the library to the Databricks cluster. 44 | Then, simply import the class in the notebook: 45 | 46 | `import com.ms.dq.framework.DQFramework` 47 | 48 | 49 | ### Metadata Driven: 50 | The Data Quality rules to be applied on a data frame are inserted into metadata tables (details below).
The Rule Engine relies on the metadata to execute the rules. 51 | Note that the table names can be configured as per the user's requirements by updating the application.properties file. 52 | * Entityrulemetadata: A delta table with the schema (source, entity, rulename, parameters). For a rule to be executed against a particular entity, an entry for that rule must be present in this table. The parameters field contains a JSON of key-value pairs pertinent to the rule. The parameter keys for the four rules are listed below, and a snippet of the data in the table is attached. 53 | * Null Check 54 | * columnList: String. A comma-separated list of the columns to apply null check on. 55 | * Unique Check 56 | * columnList: Array of Strings. Each string is a comma-separated list of columns, the combination of which should be unique. 57 | * LatestRowIdentifier (Optional): String. The column to be used as the identifier for the latest row. If provided, the framework also returns whether each record is the latest unique row for that column combination. 58 | * Orphan Edge Check 59 | * tableName: String. The delta table name for the source and entity on which DQ will run. 60 | * auditColIntName: String. The audit column which will be used for delta identification (must be the integer form of the audit column). 61 | * cutOffHours (optional): Int. Default 0. The delta window starts this many hours before the run. 62 | * cutOffMinutes (optional): Int. Default 0. The delta window starts this many minutes before the run. 63 | * cutOffSeconds (optional): Int. Default 0. The delta window starts this many seconds before the run. 64 | 65 | ![EntityRuleMetadata](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Entityrulemetadata.png) 66 | 67 | * Orphanedgemetadata: A metadata table for the orphan edge check. For the orphan edge check to run on a particular source and entity, a record for that source and entity must be present in this metadata table. Sample records are provided below: 68 | 69 | ![OrphanEdgeMetadata](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/OrphanEdgeMetadata.png) 70 | 71 | #### To invoke the Rule Engine: 72 | 73 | 1. Create an object of the framework: 74 | 75 | `val dqObj=new DQFramework()` 76 | 77 | 2. Set the Spark session: 78 | 79 | ``` 80 | var spark = SparkSession.builder.appName("").getOrCreate() 81 | dqObj.setSparkSession(spark) 82 | ``` 83 | 84 | 3. Make the required entries in the metadata tables, as explained in section [Metadata Driven](https://github.com/microsoft/Data-Quality-Rule-Engine#metadata-driven). 85 | 4. Invoke the method to apply the rules: 86 | 87 | `val applyDq = dqObj.applyDq(dataframe,source,entity,schema,pipelineId)` 88 | 89 | Please refer to the [DQFramework readme file](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/dq/src/main/scala/com/ms/dq/framework/README.md) for the available overloaded methods for applyDq(). 90 | 91 | 92 | ### Parameter Based: 93 | 94 | Another option is to invoke applyDq() directly, passing the required rules as parameters. 95 | 96 | 1. Define a mapping of the rules to the columns of the data frame. 97 | ``` 98 | val col_rule_map=Map( 99 | "nullcheck" -> """{"columnList":"id,createdDateTime"}""", 100 | "uniquecheck" -> """{"columnList":["id,partitionKey","id"],"latestrowidentifier":"ModifiedDate"}""") 101 | ``` 102 | 103 | 2.
Provide a mapping from the columns on which the rules are applied to their respective source and entity. 104 | 105 | ``` 106 | val map:Map[String,List[String]]=Map("id"->List("source1","entity1"), 107 | "createdDateTime"->List("source1","entity1"), 108 | "id"->List("source1","entity1"), 109 | "partitionKey"->List("source1","entity1")) 110 | ``` 111 | 112 | 3. Invoke the method to apply the rules: 113 | 114 | `val applyDq = dqObj.applyDq(dataframe,col_rule_map,map,pipelineId)` 115 | 116 | 117 | 118 | ### Results: 119 | The dataframe returned consists of the original dataframe with an additional Boolean column for each (rule, column) pair, named columnname_rulename, indicating whether the row is valid for that check. 120 | 121 | For example, if nullcheck is applied on createdDate, a new column named createdDate_nullcheck will be present in the returned dataframe. In rows where this column is true, the createdDate value is not null; where it is false, the createdDate value is null. 122 | 123 | ![Results](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Results.png) 124 | 125 | 126 | [Here](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/sample/DataQualityRuleEngineUsageSample.scala) is a sample notebook that demonstrates the usage of the Data Quality Rule Engine. 127 | 128 | The Data Quality results from the reporting tables can also be used to derive insights. Below is a [sample dashboard](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/sample/DataQualityInsights.pbix): 129 | 130 | ![Data Quality Insights](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Data%20Quality%20Insights.PNG) 131 | 132 | ## Contributing 133 | 134 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 135 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 136 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 137 | 138 | ### Bugs and Feature requests: 139 | Please use the [GitHub issue tracker](https://github.com/microsoft/Data-Quality-Rule-Engine/issues) to file bugs or feature requests with relevant information. 140 | 141 | ### Creating a Pull Request: 142 | 1. [Create a fork](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) of the repository. 143 | 2. Make the required changes, following the existing code conventions. 144 | 3. Ensure the unit test cases pass. 145 | 4. Commit the changes to your fork. 146 | 5. [Create a pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request), with details of the unit tests. 147 | 6. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 148 | 149 | 150 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 151 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 152 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
153 | 154 | ## Trademarks 155 | 156 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 157 | trademarks or logos is subject to and must follow 158 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 159 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 160 | Any use of third-party trademarks or logos are subject to those third-party's policies. 161 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /dq/databases/DQ/dqaggtable.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_AGGREGATE_TABLE_NAME( 5 | DqAggTableKey STRING, 6 | Source STRING, 7 | Entity STRING, 8 | ColumnName STRING, 9 | Rule STRING, 10 | DQModifiedDate TIMESTAMP, 11 | DQModifiedDateInt BIGINT, 12 | RecordCount BIGINT, 13 | FailCount BIGINT, 14 | SuccessCount BIGINT, 15 | PlatformModifiedDate TIMESTAMP, 16 | PlatformModifiedDateInt BIGINT 17 | ) 18 | USING delta 19 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_AGGREGATE_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqfailtable.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_FAIL_TABLE_NAME( 5 | Source STRING, 6 | Entity STRING, 7 | ColumnName STRING, 8 | Rule STRING, 9 | Record STRING, 10 | PlatformModifiedDate TIMESTAMP, 11 | PlatformModifiedDateInt BIGINT, 12 | DqAggTableKey STRING 13 | ) 14 | USING delta 15 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_FAIL_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqrulewatermark.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_RULE_WATERMARK_TABLE_NAME 5 | ( 6 | SubjectArea STRING NOT NULL 7 | ,SourceEntityName STRING NOT NULL 8 | ,RuleName STRING NOT NULL 9 | ,WaterMarkStartValue STRING NOT NULL 10 | ,WaterMarkEndValue STRING NOT NULL 11 | ,PlatformModifiedDate TIMESTAMP 12 | ,PlatformModifiedDateInt BIGINT 13 | ) 14 | USING DELTA 15 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_RULE_WATERMARK_TABLE_NAME' 16 | -------------------------------------------------------------------------------- /dq/databases/DQ/dqrundetails.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_RUNDETAILS_TABLE_NAME( 5 | DqAggTableKey STRING, 6 | Source STRING, 7 | Entity STRING, 8 | ColumnName STRING, 9 | Rule STRING, 10 | RecordCount BIGINT, 11 | PipelineId STRING, 12 | PlatformModifiedDate TIMESTAMP, 13 | PlatformModifiedDateInt BIGINT 14 | ) 15 | USING delta 16 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_RUNDETAILS_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqwatermark.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_METADATA_WATERMARK_TABLE_NAME( 5 | dqtable STRING 6 | ,source STRING 7 | ,entity STRING 8 | ,watermarkendvalue STRING 9 | ,omidqcreateddate STRING 10 | ,omidqmodifieddate STRING 11 | ) 12 | USING DELTA 13 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_METADATA_WATERMARK_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/entityrulemetadata.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_ENTITYRULEMETADATA_TABLE_NAME( 5 | source STRING 6 | ,entity STRING 7 | ,rulename STRING 8 | ,parameters STRING 9 | ) 10 | USING DELTA 11 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_ENTITYRULEMETADATA_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/orphanedgemetadata.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_ORPHANEDGEMETADATA_TABLE_NAME( 5 | source STRING 6 | ,entity STRING 7 | ,fromcolumnname STRING 8 | ,fromlookupentity STRING 9 | ,fromlookupcolumnname STRING 10 | ,tocolumnname STRING 11 | ,tolookupentity STRING 12 | ,tolookupcolumnname STRING 13 | ,filtercondition STRING 14 | ) 15 | USING DELTA 16 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_ORPHANEDGEMETADATA_TABLE_NAME' -------------------------------------------------------------------------------- /dq/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.ms.dq 4 | 0.0.1 5 | DQFramework 6 | DQFramework 7 | Data Quality Framework Module. 
8 | 9 | 1.8 10 | 11 | 12 | 13 | 14 | 15 | org.apache.spark 16 | spark-core_2.12 17 | 3.0.1 18 | 19 | 20 | io.netty 21 | netty-all 22 | 23 | 24 | org.codehaus.jackson 25 | jackson-mapper-asl 26 | 27 | 28 | org.codehaus.jackson 29 | jackson-core-asl 30 | 31 | 32 | commons-codec 33 | commons-codec 34 | 35 | 36 | com.microsoft.azure 37 | azure-storage 38 | 39 | 40 | com.google.guava 41 | guava 42 | 43 | 44 | org.eclipse.jetty 45 | jetty-util 46 | 47 | 48 | org.codehaus.jackson 49 | jackson-jaxrs 50 | 51 | 52 | org.codehaus.jackson 53 | jackson-xc 54 | 55 | 56 | org.eclipse.jetty 57 | jetty-http 58 | 59 | 60 | org.eclipse.jetty 61 | jetty-server 62 | 63 | 64 | org.apache.commons 65 | commons-compress 66 | 67 | 68 | com.nimbusds 69 | nimbus-jose-jwt 70 | 71 | 72 | commons-codec 73 | commons-codec 74 | 75 | 76 | org.apache.zookeeper 77 | zookeeper 78 | 79 | 80 | commons-beanutils 81 | commons-beanutils 82 | 83 | 84 | log4j 85 | log4j 86 | 87 | 88 | 89 | 90 | 91 | org.apache.spark 92 | spark-sql_2.12 93 | 3.0.1 94 | 95 | 96 | io.netty 97 | netty-all 98 | 99 | 100 | org.codehaus.jackson 101 | jackson-mapper-asl 102 | 103 | 104 | org.codehaus.jackson 105 | jackson-core-asl 106 | 107 | 108 | commons-codec 109 | commons-codec 110 | 111 | 112 | com.microsoft.azure 113 | azure-storage 114 | 115 | 116 | com.google.guava 117 | guava 118 | 119 | 120 | org.eclipse.jetty 121 | jetty-util 122 | 123 | 124 | org.codehaus.jackson 125 | jackson-jaxrs 126 | 127 | 128 | org.codehaus.jackson 129 | jackson-xc 130 | 131 | 132 | org.eclipse.jetty 133 | jetty-http 134 | 135 | 136 | org.eclipse.jetty 137 | jetty-server 138 | 139 | 140 | org.apache.commons 141 | commons-compress 142 | 143 | 144 | com.nimbusds 145 | nimbus-jose-jwt 146 | 147 | 148 | commons-codec 149 | commons-codec 150 | 151 | 152 | org.apache.zookeeper 153 | zookeeper 154 | 155 | 156 | commons-beanutils 157 | commons-beanutils 158 | 159 | 160 | 161 | 162 | 163 | 164 | org.apache.spark 165 | spark-network-common_2.12 166 | 3.0.1 167 | 168 | 169 | 170 | 171 | com.fasterxml.jackson.core 172 | jackson-databind 173 | 2.11.2 174 | 175 | 176 | 177 | 178 | com.fasterxml.jackson.core 179 | jackson-core 180 | 2.11.2 181 | 182 | 183 | 184 | 185 | org.scala-lang 186 | scala-compiler 187 | 2.12.12 188 | 189 | 190 | 191 | 192 | org.scala-lang 193 | scala-library 194 | 2.12.12 195 | runtime 196 | 197 | 198 | 199 | com.typesafe 200 | config 201 | 1.3.2 202 | 203 | 204 | 205 | 206 | 207 | net.liftweb 208 | lift-json_2.12 209 | 3.4.2 210 | 211 | 212 | 213 | 214 | com.microsoft.azure 215 | azure-data-lake-store-sdk 216 | 2.3.1 217 | 218 | 219 | 220 | org.apache.hadoop 221 | hadoop-azure 222 | 3.1.1 223 | 224 | 225 | io.netty 226 | netty-all 227 | 228 | 229 | org.codehaus.jackson 230 | jackson-mapper-asl 231 | 232 | 233 | org.codehaus.jackson 234 | jackson-core-asl 235 | 236 | 237 | commons-codec 238 | commons-codec 239 | 240 | 241 | com.microsoft.azure 242 | azure-storage 243 | 244 | 245 | com.google.guava 246 | guava 247 | 248 | 249 | org.eclipse.jetty 250 | jetty-util 251 | 252 | 253 | org.codehaus.jackson 254 | jackson-jaxrs 255 | 256 | 257 | org.codehaus.jackson 258 | jackson-xc 259 | 260 | 261 | org.eclipse.jetty 262 | jetty-http 263 | 264 | 265 | org.eclipse.jetty 266 | jetty-server 267 | 268 | 269 | org.apache.commons 270 | commons-compress 271 | 272 | 273 | com.nimbusds 274 | nimbus-jose-jwt 275 | 276 | 277 | commons-codec 278 | commons-codec 279 | 280 | 281 | org.apache.zookeeper 282 | zookeeper 283 | 284 | 285 | commons-beanutils 286 | commons-beanutils 287 | 288 
| 289 | log4j 290 | log4j 291 | 292 | 293 | 294 | 295 | 296 | com.databricks 297 | dbutils-api_2.11 298 | 0.0.3 299 | 300 | 301 | org.apache.hadoop 302 | hadoop-azure-datalake 303 | 3.1.1 304 | 305 | 306 | io.netty 307 | netty-all 308 | 309 | 310 | org.codehaus.jackson 311 | jackson-mapper-asl 312 | 313 | 314 | org.codehaus.jackson 315 | jackson-core-asl 316 | 317 | 318 | commons-codec 319 | commons-codec 320 | 321 | 322 | com.microsoft.azure 323 | azure-storage 324 | 325 | 326 | com.google.guava 327 | guava 328 | 329 | 330 | org.eclipse.jetty 331 | jetty-util 332 | 333 | 334 | org.codehaus.jackson 335 | jackson-jaxrs 336 | 337 | 338 | org.codehaus.jackson 339 | jackson-xc 340 | 341 | 342 | org.eclipse.jetty 343 | jetty-http 344 | 345 | 346 | org.eclipse.jetty 347 | jetty-server 348 | 349 | 350 | org.apache.commons 351 | commons-compress 352 | 353 | 354 | com.nimbusds 355 | nimbus-jose-jwt 356 | 357 | 358 | commons-codec 359 | commons-codec 360 | 361 | 362 | org.apache.zookeeper 363 | zookeeper 364 | 365 | 366 | commons-beanutils 367 | commons-beanutils 368 | 369 | 370 | log4j 371 | log4j 372 | 373 | 374 | 375 | 376 | 377 | org.apache.hadoop 378 | hadoop-client 379 | 3.1.1 380 | 381 | 382 | io.netty 383 | netty-all 384 | 385 | 386 | org.codehaus.jackson 387 | jackson-mapper-asl 388 | 389 | 390 | org.codehaus.jackson 391 | jackson-core-asl 392 | 393 | 394 | commons-codec 395 | commons-codec 396 | 397 | 398 | com.microsoft.azure 399 | azure-storage 400 | 401 | 402 | com.google.guava 403 | guava 404 | 405 | 406 | org.eclipse.jetty 407 | jetty-util 408 | 409 | 410 | org.codehaus.jackson 411 | jackson-jaxrs 412 | 413 | 414 | org.codehaus.jackson 415 | jackson-xc 416 | 417 | 418 | org.eclipse.jetty 419 | jetty-http 420 | 421 | 422 | org.eclipse.jetty 423 | jetty-server 424 | 425 | 426 | org.apache.commons 427 | commons-compress 428 | 429 | 430 | com.nimbusds 431 | nimbus-jose-jwt 432 | 433 | 434 | commons-codec 435 | commons-codec 436 | 437 | 438 | org.apache.zookeeper 439 | zookeeper 440 | 441 | 442 | commons-beanutils 443 | commons-beanutils 444 | 445 | 446 | log4j 447 | log4j 448 | 449 | 450 | 451 | 452 | 453 | org.apache.hadoop 454 | hadoop-common 455 | 3.1.1 456 | 457 | 458 | io.netty 459 | netty-all 460 | 461 | 462 | org.codehaus.jackson 463 | jackson-mapper-asl 464 | 465 | 466 | org.codehaus.jackson 467 | jackson-core-asl 468 | 469 | 470 | commons-codec 471 | commons-codec 472 | 473 | 474 | com.microsoft.azure 475 | azure-storage 476 | 477 | 478 | com.google.guava 479 | guava 480 | 481 | 482 | org.eclipse.jetty 483 | jetty-util 484 | 485 | 486 | org.codehaus.jackson 487 | jackson-jaxrs 488 | 489 | 490 | org.codehaus.jackson 491 | jackson-xc 492 | 493 | 494 | org.eclipse.jetty 495 | jetty-http 496 | 497 | 498 | org.eclipse.jetty 499 | jetty-webapp 500 | 501 | 502 | org.eclipse.jetty 503 | jetty-server 504 | 505 | 506 | org.apache.commons 507 | commons-compress 508 | 509 | 510 | com.nimbusds 511 | nimbus-jose-jwt 512 | 513 | 514 | commons-codec 515 | commons-codec 516 | 517 | 518 | org.apache.zookeeper 519 | zookeeper 520 | 521 | 522 | commons-beanutils 523 | commons-beanutils 524 | 525 | 526 | log4j 527 | log4j 528 | 529 | 530 | 531 | 532 | 533 | 534 | com.google.code.gson 535 | gson 536 | 2.8.5 537 | 538 | 539 | 540 | 541 | org.apache.commons 542 | commons-compress 543 | 1.19 544 | 545 | 546 | 547 | 548 | org.apache.httpcomponents 549 | httpclient 550 | 4.5.6 551 | 552 | 553 | commons-codec 554 | commons-codec 555 | 556 | 557 | 558 | 559 | 560 | 561 | javax.xml.crypto 562 | jsr105-api 563 
| 1.0.1 564 | 565 | 566 | 567 | 568 | org.apache.commons 569 | commons-configuration2 570 | 2.7 571 | 572 | 573 | 574 | 575 | commons-net 576 | commons-net 577 | 3.6 578 | 579 | 580 | com.databricks 581 | dbutils-api_2.11 582 | 0.0.4 583 | 584 | 585 | 586 | 587 | 588 | src/main/scala 589 | 590 | 591 | src/main/resources 592 | 593 | **/*.properties 594 | 595 | 596 | 597 | 598 | 599 | net.alchim31.maven 600 | scala-maven-plugin 601 | 3.2.1 602 | 603 | src/main/scala 604 | 605 | -Xss4m 606 | -Xms256m 607 | -Xmx4096m 608 | 609 | 610 | 611 | 612 | 613 | compile 614 | testCompile 615 | 616 | 617 | 618 | 619 | 620 | org.apache.maven.plugins 621 | maven-compiler-plugin 622 | 3.6.1 623 | 624 | ${java.version} 625 | 626 | ${java.version} 627 | UTF-8 628 | 629 | 630 | 631 | 632 | org.apache.maven.plugins 633 | maven-install-plugin 634 | 2.5.2 635 | 636 | true 637 | 638 | 639 | 640 | org.apache.maven.plugins 641 | maven-shade-plugin 642 | 3.1.0 643 | 644 | 645 | install 646 | 647 | shade 648 | 649 | 650 | 651 | 652 | *:* 653 | 654 | META-INF/*.DSA 655 | META-INF/*.RSA 656 | META-INF/*.SF 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | DQFramework 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | -------------------------------------------------------------------------------- /dq/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | SUBJECT_AREA = dq 2 | SUBJECT_AREA_DB = dq 3 | ADLS_PATH_GEN2 = 4 | DQ_RUNDETAILS_TABLE_NAME = dqrundetails 5 | DQ_AGGREGATE_TABLE_NAME = dqaggtable 6 | DQ_FAIL_TABLE_NAME = dqfailtable 7 | DQ_RULE_WATERMARK_TABLE_NAME = dqrulewatermark 8 | DQ_METADATA_WATERMARK_TABLE_NAME = dqwatermark 9 | DQ_ENTITYRULEMETADATA_TABLE_NAME = entityrulemetadata 10 | DQ_ORPHANEDGEMETADATA_TABLE_NAME = orphanedgemetadata 11 | DQ_LOG_RESULTS_FLAG = true 12 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/framework/DQFramework.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | package com.ms.dq.framework 5 | 6 | import org.apache.spark.sql.Dataset 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.types.StructType 10 | 11 | import scala.collection.mutable.ListBuffer 12 | import org.apache.spark.sql.expressions.Window 13 | import org.apache.spark.sql.SaveMode 14 | import org.apache.spark.sql.types.LongType 15 | import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} 16 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} 17 | import com.ms.dq.rules._ 18 | 19 | import scala.collection.immutable.Map 20 | import java.util.{Calendar, Date, Properties} 21 | 22 | import com.ms.dq.support.SupportTrait 23 | 24 | import scala.collection.parallel._ 25 | import scala.collection.parallel.immutable.ParMap 26 | import scala.xml.Properties 27 | 28 | class DQFramework extends SupportTrait { 29 | 30 | 31 | var uniquecheck_latestIdCol: String = null; 32 | var spark: SparkSession = null; 33 | 34 | // Set spark session for execution instance 35 | def setSparkSession(_sparkSession: SparkSession) = { 36 | spark = _sparkSession 37 | } 38 | 39 | // Method to invoke Data Quality rules on a dataset by reading the metadata table [METADATA BOUND] 40 | def applyDqInner(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType, pipelineId: String): Dataset[Row] = { 41 | if (spark == null) { 42 | println("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 43 | throw new Exception("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 44 | } 45 | if (sourceName == null) { 46 | println("Source has been sent as null.") 47 | throw new Exception("Source has been sent as null.") 48 | } 49 | if (entityName == null) { 50 | println("Entity has been sent as null.") 51 | throw new Exception("Entity has been sent as null.") 52 | } 53 | 54 | // Read the application.properties 55 | val properties = getProperties() 56 | 57 | // intialise the dataframe to be returned , so that all the existing columns are present 58 | var return_Df = df 59 | if (schema != null) { 60 | //retrieves the schema defined for the particular source and entity in the entityrulemetadata 61 | val metaDataSchema = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")).filter(lower(col("source")) === sourceName.toLowerCase() && lower(col("entity")) === entityName.toLowerCase()).filter(lower(col("rulename")) === "schemacheck") 62 | if (!metaDataSchema.take(1).isEmpty) { 63 | 64 | val body_col = df.columns.toSeq.head 65 | 66 | println("Applying Schemacheck\n______________") 67 | println("Time before Schemacheck--->" + Calendar.getInstance.getTime()) 68 | return_Df = new Schemacheck().apply(df, schema, body_col, entityName, sourceName, spark, pipelineId, properties) 69 | println("Time after Schemacheck-->" + Calendar.getInstance.getTime()) 70 | } 71 | else { 72 | println("Schemacheck not found for source=" + sourceName + " and entity=" + entityName + " in metadata. PLease make the required entry in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " to run schecmacheck") 73 | } 74 | } 75 | else { 76 | val joinExceptionList=List("orphanedgecheck") 77 | val metaDataNonSchema = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")).filter(lower(col("source")) === sourceName.toLowerCase() && lower(col("entity")) === entityName.toLowerCase()).filter(lower(col("rulename")) =!= "schemacheck") 78 | if (!metaDataNonSchema.take(1).isEmpty) { 79 | 80 | val originalDfColumns = df.columns.toSeq 81 | 82 | // getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 83 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 84 | println("Column Name: dq_uniqueID is reserved for internal use of the DQ Framework. Please pass a dataframe without this column.") 85 | return df 86 | } 87 | val dfUniqueId = df.withColumn("dq_uniqueID", monotonically_increasing_id()) 88 | return_Df = dfUniqueId 89 | 90 | // splitting the task into threads to achieve parallel computation if multiple cores are available 91 | val forkJoinPool = new scala.concurrent.forkjoin.ForkJoinPool(Runtime.getRuntime().availableProcessors() * (spark.sparkContext.statusTracker.getExecutorInfos.length - 1)) 92 | val lst = metaDataNonSchema.select("rulename", "parameters").collect().par 93 | lst.tasksupport = new ForkJoinTaskSupport(forkJoinPool) 94 | 95 | lst.map({ 96 | d => 97 | //get relevant params like ruleName, params ,etc 98 | val ruleName = d.getString(0) 99 | val params = d.getString(1) 100 | val className = "com.ms.dq.rules." + ruleName.toLowerCase().capitalize 101 | var classObject: Class[_] = null 102 | 103 | //sanity check to see if the rule is provided correctly or not 104 | try { 105 | classObject = Class.forName(className) 106 | } 107 | catch { 108 | case e: java.lang.ClassNotFoundException => println("Cannot perform " + ruleName + ", as the required class doesn't exist. Please create required class in file " + className) 109 | println(e.printStackTrace()) 110 | } 111 | 112 | if (classObject != null) { 113 | //printing out the line highlighting which rule is being performed currently and at what time 114 | //so we can also get an idea for how long did the DQ check run 115 | val instance = classObject.newInstance().asInstanceOf[ {def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession,pipelineId : String,properties: Properties): Dataset[Row]}] 116 | println("Applying " + ruleName + "\n___________") 117 | println("Time before " + entityName + " " + ruleName + "-->" + Calendar.getInstance.getTime()) 118 | try { 119 | //applying the specific rule to the dataset 120 | val dfAfterRule = instance.apply(dfUniqueId, params, entityName, sourceName, spark, pipelineId, properties) 121 | 122 | //printing out the time after the DQ check in order to know how much time did the run take and returning the resulting dataframe:return_Df 123 | println("Time after " + entityName + " " + ruleName + "-->" + Calendar.getInstance.getTime()) 124 | val addedCols = dfAfterRule.columns.toSeq.diff(originalDfColumns) 125 | if (!joinExceptionList.contains(ruleName)) { 126 | //for thread-safe operation 127 | synchronized { 128 | return_Df = return_Df.join(dfAfterRule.select(addedCols.head, addedCols.tail: _*), "dq_uniqueID") 129 | } 130 | } 131 | } 132 | catch { 133 | case e:java.lang.NoSuchMethodException => println("Cannot perform " + ruleName + ", as the required class " + className +" does not contain required method. 
Please create method def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession) in the class.") 134 | println(e.printStackTrace()) 135 | } 136 | } 137 | }) 138 | return_Df = return_Df.drop("dq_uniqueID") 139 | } 140 | else { 141 | println("No Rulecheck found for source=" + sourceName + " and entity=" + entityName + " in metadata. PLease make the required entry in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " to run the required rule checks.") 142 | } 143 | } 144 | return_Df 145 | } 146 | 147 | // Overloaded methods for optional parameters (schema & pipelineId) to invoke Data Quality rules on a dataset by reading the metadata table 148 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, null, "N/A") 149 | 150 | // Overloaded methods for optional parameters (pipelineId) to invoke Data Quality rules on a dataset by reading the metadata table 151 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType): Dataset[Row] = applyDqInner(df, sourceName, entityName, schema, "N/A") 152 | 153 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType, pipelineId: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, schema, pipelineId) 154 | 155 | // Overloaded methods for optional parameters (schema) to invoke Data Quality rules on a dataset by reading the metadata table 156 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, pipelineId: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, null, pipelineId) 157 | 158 | // Method to invoke Data Quality rules on a dataset by passing the rules as a method parameter [ COMPUTE BOUND] 159 | def applyDq(df:Dataset[Row],rule_param_map: Map[String,String],colEntitySourceMap: Map[String, List[String]],pipelineId: String): Dataset[Row]={ 160 | if (spark == null) { 161 | println("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 162 | throw new Exception("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 163 | } 164 | val originalDfColumns = df.columns.toSeq 165 | 166 | val properties = getProperties() 167 | 168 | //getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 169 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 170 | println("Column Name: dq_uniqueID is reserved for internal use of the DQ Framework. Please pass a dataframe without this column.") 171 | return df 172 | } 173 | val dfUniqueId = df.withColumn("dq_uniqueID", monotonically_increasing_id()) 174 | var ans: Dataset[Row] = dfUniqueId 175 | 176 | // To achieve parallel computation if resources are there 177 | val forkJoinPool = new scala.concurrent.forkjoin.ForkJoinPool(Runtime.getRuntime().availableProcessors() * (spark.sparkContext.statusTracker.getExecutorInfos.length - 1)) 178 | val lst = rule_param_map.par 179 | lst.tasksupport = new ForkJoinTaskSupport(forkJoinPool) 180 | 181 | lst.foreach { case (ruleName, params) => 182 | //get relevant params like ruleName, params ,etc 183 | val className = "com.ms.dq.rules." 
+ ruleName.toLowerCase().capitalize 184 | var classObject: Class[_] = null 185 | 186 | //sanity check to see if the rule is provided correctly or not 187 | try { 188 | classObject = Class.forName(className) 189 | } 190 | catch { 191 | case e: java.lang.ClassNotFoundException => println("Cannot perform " + ruleName + ", as the required class doesn't exist. Please create required class in file " + className) 192 | println(e.printStackTrace()) 193 | } 194 | if (classObject != null) { 195 | //printing out the line highlighting which rule is being performed currently and at what time 196 | //so we can also get an idea for how long did the DQ check run 197 | val instance = classObject.newInstance().asInstanceOf[ {def apply(df: Dataset[Row], params: String, colEntitySourceMap: Map[String, List[String]], originalDfColumns: Seq[String], spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]}] 198 | println("Applying " + ruleName + "\n___________") 199 | println("Time before " + ruleName + "-->" + Calendar.getInstance.getTime()) 200 | try { 201 | //applying the specific rule to the dataset 202 | val dfAfterRule = instance.apply(dfUniqueId, params, colEntitySourceMap, originalDfColumns, spark, pipelineId, properties) 203 | 204 | //printing out the time after the DQ check in order to know how much time did the run take and returning the resulting dataframe:ans 205 | println("Time after " + ruleName + "-->" + Calendar.getInstance.getTime()) 206 | val addedCols = dfAfterRule.columns.toSeq.diff(originalDfColumns) 207 | synchronized { 208 | ans = ans.join(dfAfterRule.select(addedCols.head, addedCols.tail: _*), "dq_uniqueID") 209 | } 210 | } 211 | catch { 212 | case e:java.lang.NoSuchMethodException => println("Cannot perform " + ruleName + ", as the required class " + className +" does not contain required method. Please create method def apply(df: Dataset[Row], params: String, colEntitySourceMap: Map[String, List[String]], originalDfColumns: Seq[String], spark: SparkSession): Dataset[Row] in the class.") 213 | println(e.printStackTrace()) 214 | } 215 | } 216 | } 217 | ans.drop("dq_uniqueID") 218 | } 219 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/framework/README.md: -------------------------------------------------------------------------------- 1 | # Overloaded Methods for Metadata Driven Data Quality Check 2 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String): Dataset[Row] : 3 | * Overloaded method that would read the metadata to fetch the rules to be applied, excluding schema check. 4 | * Optional parameters (schema and pipelineId) would be defaulted. 5 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String, schema: StructType): Dataset[Row] : 6 | * Overloaded method that would read the metadata to fetch the rules to be applied, including schema check. 7 | * Optional parameters (pipelineId) would be defaulted. 8 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String,schema: StructType , pipelineId: String): Dataset[Row] : 9 | * Overloaded method that would read the metadata to fetch the rules to be applied, including schema check. 10 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String, pipelineId: String): Dataset[Row] : 11 | * Overloaded method that would read the metadata to fetch the rules to be applied, excluding schema check. 12 | * Optional parameters (schema) would be defaulted. 
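
For illustration, a minimal sketch of how these overloads might be invoked from a notebook or spark-shell. The input table name, payload schema, and pipeline id below are placeholder assumptions, not part of the framework:

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}
import com.ms.dq.framework.DQFramework

val spark = SparkSession.builder.appName("dq-usage-sketch").getOrCreate()

// Create the framework object and hand it the active Spark session.
val dqObj = new DQFramework()
dqObj.setSparkSession(spark)

// Hypothetical input table; any DataFrame works.
val inputDf = spark.table("source1_db.entity1")

// Metadata-driven rules only (no schema check, default pipelineId).
val checkedDf = dqObj.applyDq(inputDf, "source1", "entity1")

// Metadata-driven rules plus schema check, with an explicit pipeline id.
// The schema below is a placeholder for the expected structure of the JSON payload.
val payloadSchema = new StructType()
  .add("id", StringType)
  .add("createdDateTime", TimestampType)
val checkedWithSchemaDf = dqObj.applyDq(inputDf, "source1", "entity1", payloadSchema, "adf-run-1234")
```

The returned dataframes carry the original columns plus the per-rule Boolean result columns described in the main README.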
13 | 14 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Nullcheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import com.ms.dq.support.SupportTrait 7 | import org.apache.spark.sql.expressions.Window 8 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 12 | import java.util.{Calendar, Date, Properties} 13 | class Nullcheck extends SupportTrait{ 14 | 15 | // Method to invoke null check on a dataset:df based on metadata entry 16 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the null check for each particular record 17 | def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]={ 18 | try { 19 | val ruleName = "nullcheck" 20 | if (params == null) { 21 | println("Skipping "+ruleName+". Please input Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 22 | return df 23 | } 24 | import spark.implicits._ 25 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 26 | val paramsSchema = List( 27 | StructField("params", StringType, false)) 28 | val paramsRow = Seq(Row(params)) 29 | val paramsDf = spark.createDataFrame( 30 | spark.sparkContext.parallelize(paramsRow), 31 | StructType(paramsSchema) 32 | ) 33 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 34 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 35 | val readJsonCols = readJson.columns.toSeq 36 | //sanity check to validate json:params 37 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 38 | println("Skipping "+ruleName+". The Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 39 | return df 40 | } 41 | //sanity check for required column:columnList in params 42 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 43 | println("Skipping "+ruleName+". Mandatory Key \"columnList\" required in Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 44 | return df 45 | } 46 | //Getting list of columns:distinctColList to apply nullcheck on 47 | val columns = readJson.select("columnList").first.getString(0) 48 | val colList = columns.split(",").toList 49 | val distinctColList = colList.distinct 50 | 51 | //columns of the dqfailtable 52 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 53 | var originalDfColumns = df.columns.toSeq 54 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 55 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 56 | } 57 | //view name for the final result 58 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 59 | val resultViewName = "vw_Result_" + view_uid 60 | 61 | //SQL queries for null check will come here 62 | var sqlexpression = "" 63 | //sql expression to get the failed records for nullcheck will come here 64 | var sqllogExpression = "" 65 | //sql expression to log relevant information about the run will come here 66 | var dimaggExpression = "" 67 | 68 | val recordCount = df.count() 69 | val colListAsString = getStringFromSeq(originalDfColumns) 70 | //traversing through all columns to apply null check 71 | for (colName <- distinctColList) { 72 | if (!containsIgnoreCase(originalDfColumns, colName)) { 73 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 74 | } 75 | else { 76 | val dqAggKey = java.util.UUID.randomUUID.toString 77 | sqlexpression = sqlexpression + ",case when " + colName + " is null then false else true end as " + colName + "_" + ruleName 78 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName + "_" + ruleName + "= false" 79 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 80 | } 81 | } 82 | //sanity check if the null check expression was built successfully 83 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 84 | return df 85 | } 86 | //creating resulting dataframe with required DQ columns 87 | val inputViewName = "vw_Input_" + view_uid 88 | df.createOrReplaceTempView(inputViewName) 89 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 90 | dqResultDf.createOrReplaceTempView(resultViewName) 91 | 92 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 93 | //logging results in required tables 94 | val failedresult = spark.sql(sqllogExpression) 95 | 96 | val current_time = current_timestamp() 97 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, 
"yyyyMMddHHmmssSSS").cast(LongType)) 98 | failTable = correctFormat(failTable, orderOfFailTable) 99 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 100 | val failedViewName = "vw_Failed_" + view_uid 101 | failTable.createOrReplaceTempView(failedViewName) 102 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 103 | } 104 | else 105 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 106 | 107 | dqResultDf 108 | } 109 | catch{ 110 | case e: Exception=> println("Return original dataframe. Nullcheck for source->"+sourcename+" and entity->"+entityname+" failed with exception->\n"+e.toString) 111 | df 112 | } 113 | } 114 | 115 | // Method to invoke null check on a dataset based on arguments passed by the user 116 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the null check for each particular record 117 | def apply(df:Dataset[Row],params:String,colEntitySourceMap:Map[String,List[String]],originalDfColumns:Seq[String],spark:SparkSession,pipelineId: String, properties: Properties): Dataset[Row]= { 118 | try { 119 | //applying sanity checks to check the ruleName as "nullcheck" 120 | val ruleName = "nullcheck" 121 | if (params == null) { 122 | println("Skipping " + ruleName + ". Please send Parameters json as string for "+ruleName) 123 | return df 124 | } 125 | import spark.implicits._ 126 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 127 | val paramsSchema = List( 128 | StructField("params", StringType, false)) 129 | val paramsRow = Seq(Row(params)) 130 | val paramsDf = spark.createDataFrame( 131 | spark.sparkContext.parallelize(paramsRow), 132 | StructType(paramsSchema) 133 | ) 134 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 135 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 136 | val readJsonCols = readJson.columns.toSeq 137 | 138 | //sanity check to validate json:params 139 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 140 | println("Skipping " + ruleName + ". The Parameters for "+ruleName+" are not a valid Json. Please provide a valid Json") 141 | return df 142 | } 143 | //sanity check for required column:columnList in params 144 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 145 | println("Skipping " + ruleName + ". 
Mandatory Key \"columnList\" required in Parameters for nullcheck") 146 | return df 147 | } 148 | //Getting list of columns:distinctColList to apply nullcheck on 149 | val columns = readJson.select("columnList").first.getString(0) 150 | val colList = columns.split(",").toList 151 | val distinctColList = colList.distinct 152 | 153 | //view name for the final result 154 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 155 | val resultViewName = "vw_Result_" + view_uid 156 | 157 | ////columns of the dqfailtable 158 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 159 | //sql expression for applying nullcheck will come here 160 | var sqlexpression = "" 161 | //sql expression for failed nullcheck will come here 162 | var sqllogExpression = "" 163 | //sql expression for logging relevant information will come here 164 | var dimaggExpression = "" 165 | val recordCount = df.count() 166 | 167 | val colListAsString = getStringFromSeq(originalDfColumns) 168 | //traversing through all columns to apply null check 169 | for (colName <- distinctColList) { 170 | if (!containsIgnoreCase(originalDfColumns, colName)) { 171 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 172 | } 173 | else { 174 | val dqAggKey = java.util.UUID.randomUUID.toString 175 | val entityname = colEntitySourceMap(colName)(0) 176 | val sourcename = colEntitySourceMap(colName)(1) 177 | sqlexpression = sqlexpression + ",case when " + colName + " is null then false else true end as " + colName + "_" + ruleName 178 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName + "_" + ruleName + "=false" 179 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId ,date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 180 | } 181 | } 182 | 183 | //sanity check if the null check expression was built successfully 184 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 185 | return df 186 | } 187 | 188 | val inputViewName = "vw_Input_" + view_uid 189 | df.createOrReplaceTempView(inputViewName) 190 | //applying null check 191 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 192 | 193 | dqResultDf.createOrReplaceTempView(resultViewName) 194 | 195 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 196 | //logging results in required tables 197 | val failedresult = spark.sql(sqllogExpression) 198 | val current_time = current_timestamp() 199 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 200 | failTable = correctFormat(failTable, 
orderOfFailTable) 201 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 202 | val failedViewName = "vw_Failed_" + view_uid 203 | failTable.createOrReplaceTempView(failedViewName) 204 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 205 | } 206 | else 207 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 208 | 209 | dqResultDf 210 | } 211 | catch { 212 | case e: Exception => println("Returning original Dataframe. Nullcheck failed with Exception-->\n" + e.toString) 213 | df 214 | } 215 | } 216 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Orphanedgecheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | // Checks whether the dataframe’s values referring to ids of its parent dataframes, actually exist in the parent dataframes. 5 | package com.ms.dq.rules 6 | 7 | import java.util.{Calendar, Properties} 8 | 9 | import com.ms.dq.support.SupportTrait 10 | import org.apache.spark.sql.expressions.Window 11 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.Dataset 14 | import org.apache.spark.sql.Row 15 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 16 | class Orphanedgecheck extends SupportTrait{ 17 | // Method to invoke orphan edge check on a dataset:df based on metadata 18 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the orphan edge check for each particular record 19 | def apply(df:Dataset[Row],params:String,entityname:String,sourcename:String,spark:SparkSession, pipelineId: String, properties: Properties):Dataset[Row]= { 20 | try { 21 | val ruleName = "orphanedgecheck" 22 | if (params == null) { 23 | println("Skipping " + ruleName + ". Please input Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 24 | return df 25 | } 26 | //getting the required paramaeters from the JSON:params to apply orphanedge check (eg , tableName) 27 | import spark.implicits._ 28 | val paramsSchema = List( 29 | StructField("params", StringType, false)) 30 | val paramsRow = Seq(Row(params)) 31 | val paramsDf = spark.createDataFrame( 32 | spark.sparkContext.parallelize(paramsRow), 33 | StructType(paramsSchema) 34 | ) 35 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 36 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 37 | val readJsonCols = readJson.columns.toSeq 38 | //sanity check to validate json:params 39 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 40 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 41 | return df 42 | } 43 | //sanity check for required keys in json:params 44 | if (!containsIgnoreCase(readJsonCols, "tableName")) { 45 | println("Skipping " + ruleName + ". Mandatory Key \"tableName\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 46 | return df 47 | } 48 | val tableName = readJson.select("tableName").first.getString(0) 49 | if (!containsIgnoreCase(readJsonCols, "auditColIntName")) { 50 | println("Skipping " + ruleName + ". Mandatory Key \"auditColIntName\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 51 | return df 52 | } 53 | val auditCol = readJson.select("auditColIntName").first.getString(0) 54 | 55 | //obtaining optional keys from json:params 56 | var hours:Long=0 57 | var minutes:Long=0 58 | var seconds:Long=0 59 | if(containsIgnoreCase(readJsonCols,"cutOffHours")) 60 | { 61 | try { 62 | hours=readJson.select("cutOffHours").first.getLong(0) 63 | } 64 | catch { 65 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffHours(must be long)") 66 | throw e 67 | } 68 | } 69 | if(containsIgnoreCase(readJsonCols,"cutOffMinutes")) 70 | { 71 | try { 72 | minutes = readJson.select("cutOffMinutes").first.getLong(0) 73 | } 74 | catch { 75 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffMinutes(must be long)") 76 | throw e 77 | } 78 | } 79 | if(containsIgnoreCase(readJsonCols,"cutOffSeconds")) 80 | { 81 | try { 82 | seconds = readJson.select("cutOffSeconds").first.getLong(0) 83 | } 84 | catch { 85 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffSeconds(must be long)") 86 | throw e 87 | } 88 | } 89 | 90 | //According to the parameters , computing the cutoff time 91 | var cutOffEndVal=Long.MaxValue.toString() 92 | if(!(hours==0 && minutes ==0 && seconds == 0)) 93 | { 94 | val current_ts=spark.sql("select current_timestamp() as current_timestamp").withColumn("SLA",col("current_timestamp") - expr("INTERVAL "+hours+" HOURS")).withColumn("SLA",col("SLA") - expr("INTERVAL "+minutes+" minutes")).withColumn("SLA",col("SLA") - expr("INTERVAL "+seconds+" seconds")) 95 | cutOffEndVal=current_ts.withColumn("CutOffDate", date_format(col("SLA"), "yyyyMMddHHmmssSSS").cast(LongType)).select("CutOffDate").head().getLong(0).toString() 96 | } 97 | 98 | //getting delta records based on watermark start and cutoff time 99 | val waterMarkStart=ruleWaterMarkStart(sourcename,entityname,ruleName,spark,properties) 100 | val deltaDf=dqDeltaIdentifier(tableName,auditCol,waterMarkStart,cutOffEndVal,spark) 101 | var dqResultDf: Dataset[Row] = null 102 | if(deltaDf.isEmpty) 103 | { 104 | println("Skipping Orphanedgecheck as no delta data is found. 
Returning original df ") 105 | return df 106 | } 107 | 108 | // getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 109 | var originalDfColumns = deltaDf.columns.toSeq 110 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 111 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 112 | } 113 | import spark.implicits._ 114 | 115 | // getting required information from metadata along with sanity checks 116 | val edgeMetaData = spark.sql("select *,rank()over( partition by source,entity order by fromlookupentity) as fromlookupentityid,rank()over( partition by source,entity order by tolookupentity) as tolookupentityid from " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " where lower(source)='" + sourcename.toLowerCase() + "' and lower(entity)='" + entityname.toLowerCase() + "'") 117 | val distinctFromId = edgeMetaData.select("fromcolumnname").distinct() 118 | if (distinctFromId.count() >= 2) { 119 | println("Skipping orphanedgecheck.Found Multiple fromcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname + ". Only one should exist.") 120 | return deltaDf 121 | } 122 | val fromIdName = distinctFromId.first.getString(0) 123 | if (!containsIgnoreCase(originalDfColumns, fromIdName)) { 124 | println("Skipping " + ruleName + ". " + fromIdName + " not present in provided frame. Please check fromcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 125 | return deltaDf 126 | } 127 | val distincToId = edgeMetaData.select("tocolumnname").distinct() 128 | if (distincToId.count() >= 2) { 129 | println("Skipping orphanedgecheck..Found Multiple tocolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname + ". Only one should exist.") 130 | return deltaDf 131 | } 132 | val toIdName = distincToId.first.getString(0) 133 | if (!containsIgnoreCase(originalDfColumns, toIdName)) { 134 | println("Skipping " + ruleName + ". " + toIdName + " not present in provided frame. Please check tocolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 135 | return deltaDf 136 | } 137 | 138 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 139 | var failTable = Seq.empty[(String, String, String)].toDF("Record", "ColumnName", "DqAggTableKey") 140 | var sqlcasef = "" 141 | var sqlcaset = "" 142 | var sqlcaseelsef = "" 143 | var sqlcaseelset = "" 144 | var sqljoin = "" 145 | var frmlst = List[Int]() 146 | var tolst = List[Int]() 147 | 148 | //applying orphan edge check 149 | edgeMetaData.select("fromlookupentity", "fromlookupcolumnname", "tolookupentity", "tolookupcolumnname", "filtercondition", "tocolumnname", "fromcolumnname", "fromlookupentityid", "tolookupentityid").sort($"fromlookupentityid", $"tolookupentityid").collect().map( 150 | { 151 | d => 152 | val fromlookupentity = d.getString(0) 153 | val colInFromDf = d.getString(1) 154 | val tolookupentity = d.getString(2) 155 | val colInToDf = d.getString(3) 156 | val filtercondition = d.getString(4) 157 | val tocolumnname = d.getString(5) 158 | val fromcolumnname = d.getString(6) 159 | 160 | val fromlookupentityid = d.get(7).asInstanceOf[Int].toInt 161 | val tolookupentityid = d.get(8).asInstanceOf[Int].toInt 162 | if (!frmlst.contains(fromlookupentityid)) { 163 | sqljoin = sqljoin + s" \nleft join vwinput${fromlookupentity.replace(".", "_")} f${fromlookupentityid} on ${entityname}.${fromcolumnname}=f${fromlookupentityid}.${colInFromDf}" 164 | if (!spark.catalog.tableExists(fromlookupentity)) { 165 | println("Skipping OrphanEdgeCheck. No such Table->" + fromlookupentity) 166 | return deltaDf 167 | } 168 | val fromDf = spark.table(fromlookupentity) 169 | if (!containsIgnoreCase(fromDf.columns.toSeq, colInFromDf)) { 170 | println("Skipping " + ruleName + ". No column " + colInFromDf + " present in table " + fromlookupentity + ". Please check fromlookupentity and fromlookupcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 171 | return deltaDf 172 | } 173 | fromDf.groupBy(colInFromDf).count().select(colInFromDf).createOrReplaceTempView("vwinput" + fromlookupentity.replace(".", "_")) 174 | frmlst = frmlst :+ fromlookupentityid 175 | } 176 | if (!tolst.contains(tolookupentityid)) { 177 | 178 | sqljoin = sqljoin + s" \nleft join vwinput${tolookupentity.replace(".", "_")} t${tolookupentityid} on ${entityname}.${tocolumnname}=t${tolookupentityid}.${colInToDf}" 179 | if (!spark.catalog.tableExists(tolookupentity)) { 180 | println("Skipping OrphanEdgeCheck. No such Table->" + fromlookupentity) 181 | return deltaDf 182 | } 183 | val toDf = spark.table(tolookupentity) 184 | if (!containsIgnoreCase(toDf.columns.toSeq, colInToDf)) { 185 | println("Skipping " + ruleName + ". No column " + colInToDf + " present in table " + tolookupentity + ". Please check tolookupentity and tolookupcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 186 | return deltaDf 187 | } 188 | toDf.groupBy(colInToDf).count().select(colInToDf).createOrReplaceTempView("vwinput" + tolookupentity.replace(".", "_")) 189 | tolst = tolst :+ tolookupentityid 190 | } 191 | sqlcasef = sqlcasef + (if (sqlcasef != "") " or " else "") + s"((f${fromlookupentityid}.${colInToDf} is null ) and ${filtercondition}) " 192 | sqlcaset = sqlcaset + (if (sqlcaset != "") " or " else "") + s"((t${tolookupentityid}.${colInToDf} is null ) and ${filtercondition}) " 193 | sqlcaseelsef = sqlcaseelsef + (if (sqlcaseelsef != "") " or " else "") + s"((f${fromlookupentityid}.${colInToDf} is not null ) and ${filtercondition}) " 194 | sqlcaseelset = sqlcaseelset + (if (sqlcaseelset != "") " or " else "") + s"((t${tolookupentityid}.${colInToDf} is not null ) and ${filtercondition}) " 195 | 196 | }) 197 | val dqAggKey = java.util.UUID.randomUUID.toString 198 | val inputViewName = "vw_Input_" + dqAggKey.replace('-', '_') 199 | deltaDf.createOrReplaceTempView(inputViewName) 200 | var sql = s"\n select ${entityname}.*\n ,case when(${sqlcasef} ) then false when (${sqlcaseelsef}) then true end as ${fromIdName}_${ruleName}\n,case when(${sqlcaset} ) then false when (${sqlcaseelset} ) then true end as ${toIdName}_${ruleName}" 201 | sql = sql + s"\n from ${inputViewName} ${entityname} \n ${sqljoin}" 202 | dqResultDf = spark.sql(sql) 203 | val colListAsString = getStringFromSeq(originalDfColumns) 204 | val resultViewName = "vw_Result_" + dqAggKey.replace('-', '_') 205 | dqResultDf.createOrReplaceTempView(resultViewName) 206 | 207 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 208 | //logging results in required tables 209 | val toDqAggKey = java.util.UUID.randomUUID.toString 210 | val failedsql = s" select to_json(struct(${colListAsString})) as Record,'${sourcename}' as Source,'${entityname}' as Entity,'${fromIdName}' as ColumnName,'${dqAggKey}' as DqAggTableKey from ${resultViewName} where ${fromIdName}_${ruleName} =false union all select to_json(struct(${colListAsString}))as Record,'${sourcename}' as Source,'${entityname}' as Entity,'${toIdName}' as ColumnName,'${toDqAggKey}' as DqAggTableKey from ${resultViewName} where ${toIdName}_${ruleName} =false" 211 | val current_time = current_timestamp() 212 | failTable = spark.sql(failedsql) 213 | failTable = failTable.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 214 | failTable = correctFormat(failTable, orderOfFailTable) 215 | 216 | val recordCount = deltaDf.count() 217 | var dimAggExpression = "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + fromIdName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 218 | dimAggExpression += " union all " 219 | dimAggExpression += "Select '" + toDqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + toIdName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, 
date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 220 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimAggExpression) 221 | 222 | val failedViewName = "vw_Failed_" + dqAggKey.replace('-', '_') 223 | failTable.createOrReplaceTempView(failedViewName) 224 | spark.sql(s"insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + s" select * from ${failedViewName}") 225 | } 226 | else 227 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 228 | 229 | val waterMarkEndVal=deltaDf.agg(max(col(auditCol))).head().getLong(0).toString() 230 | updateRuleWaterMark(sourcename,entityname,ruleName,waterMarkEndVal,spark,properties) 231 | 232 | dqResultDf 233 | } 234 | catch { 235 | case e: Exception=> println("Returning original Dataframe. OrphanEdgeCheck for source->"+sourcename+"and entity->"+entityname+" failed with Exception-->\n"+e.toString) 236 | df 237 | } 238 | } 239 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Schemacheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.sql.expressions.Window 9 | import org.apache.spark.sql.Dataset 10 | import org.apache.spark.sql.Row 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types.StructType 13 | import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} 14 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils 15 | import com.ms.dq.support.SupportTrait 16 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}; 17 | 18 | class Schemacheck extends SupportTrait{ 19 | def check(df:Dataset[Row],ruleName:String,schema:StructType,bodyColumn:String,spark:SparkSession):Dataset[Row]={ 20 | import spark.implicits._ 21 | val dfCount=df.count() 22 | val resultColumn=bodyColumn+"_"+ruleName 23 | val recordDfCols=Seq(bodyColumn) 24 | val dfStringDS=df.select(col(bodyColumn)as bodyColumn).map(_.toString()) 25 | val basePath="DqSchemacheck" 26 | val uuid=java.util.UUID.randomUUID.toString 27 | 28 | // path to store records which are not inline with the schema 29 | val baseDqFolder="/tmp/"+basePath+"/"+uuid 30 | 31 | //imposing schema. 
Records which are inline with the schema are stored in validRecordsTemp , the others are stored in the baseDqFolder path 32 | val validRecordsTemp=spark.read.option("badRecordsPath", baseDqFolder).schema(schema).json(dfStringDS) 33 | 34 | //reconverting records back to jsons 35 | val validRecords=validRecordsTemp.toJSON.asInstanceOf[Dataset[Row]].toDF(recordDfCols: _*).withColumn(resultColumn,lit(true)) 36 | var resultDf=validRecords 37 | val validRecordsCount=validRecords.count() 38 | 39 | //checking if there were any dq failures 40 | if(validRecordsCount != dfCount){ 41 | //getting all failed records and adding it to the resulting dataframe 42 | val colNameTimestamp="name" 43 | val badRecordsName="bad_records" 44 | val recordName="record" 45 | val fileNameList=dbutils.fs.ls(baseDqFolder) 46 | val fileNameWithTS=fileNameList(0).path 47 | val fileName=fileNameWithTS+"/"+badRecordsName 48 | val badRecordsTemp=spark.read.json(fileName).select(recordName) 49 | val badRecords=badRecordsTemp.withColumn(recordName,col(recordName).substr(lit(2),length(col(recordName))-2)).toDF(recordDfCols: _*).withColumn(resultColumn,lit(false)) 50 | resultDf=validRecords.union(badRecords) 51 | } 52 | resultDf 53 | } 54 | 55 | // Method to invoke schema check on a dataset:df based on metadata 56 | // Returns the original dataframe along with additional flags indicating whether the required columns have passed/failed the schema check for each particular record 57 | // Schema checks the schema of the bodyColumn whose values are expected to be Jsons 58 | def apply(df:Dataset[Row],schema:StructType,bodyColumn:String,entityname:String,sourcename:String,spark:SparkSession, pipelineId: String, properties: Properties):Dataset[Row]= { 59 | try { 60 | import spark.implicits._ 61 | val ruleName = "schemacheck" 62 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 63 | val recordDfCols = Seq("Record") 64 | val dqResultDf = check(df, ruleName, schema, bodyColumn, spark) 65 | 66 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 67 | //logging results in required tables 68 | val failed = dqResultDf.filter(col(bodyColumn + "_" + ruleName) === false).select(bodyColumn) 69 | val recordDf = failed.toDF(recordDfCols: _*) 70 | val dqAggKey = java.util.UUID.randomUUID.toString 71 | val current_time = current_timestamp() 72 | 73 | var failTable = recordDf.withColumn("Source", lit(sourcename)).withColumn("Entity", lit(entityname)).withColumn("ColumnName", lit(null)).withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)).withColumn("DqAggTableKey", lit(dqAggKey)) 74 | failTable = correctFormat(failTable, orderOfFailTable) 75 | 76 | val recordCount = df.count() 77 | val dimAggExpression = "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 78 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimAggExpression) 79 | 80 | val failedViewName = "vw_Failed_" + dqAggKey.replace('-', '_') 81 | failTable.createOrReplaceTempView(failedViewName) 82 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 83 | } 84 | else 85 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 86 | 87 | dqResultDf 88 | } 89 | catch { 90 | case e: Exception=> println("Returning original Dataframe. Schemacheck for Source->"+sourcename+" and entity->"+entityname+" failed with Exception-->\n"+e.toString) 91 | df 92 | } 93 | } 94 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Uniquecheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import java.util.Properties 7 | 8 | import com.ms.dq.support.SupportTrait 9 | import org.apache.spark.sql.expressions.Window 10 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 13 | 14 | class Uniquecheck extends SupportTrait { 15 | // Method to invoke unique check on a dataset:df based on metadata entry 16 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the unique check for each particular record 17 | def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]= { 18 | try { 19 | val ruleName = "uniquecheck" 20 | if (params == null) { 21 | println("Skipping " + ruleName + ". Please input Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 22 | return df 23 | } 24 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 25 | import spark.implicits._ 26 | val paramsSchema = List( 27 | StructField("params", StringType, false)) 28 | val paramsRow = Seq(Row(params)) 29 | val paramsDf = spark.createDataFrame( 30 | spark.sparkContext.parallelize(paramsRow), 31 | StructType(paramsSchema) 32 | ) 33 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 34 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 35 | val readJsonCols = readJson.columns.toSeq 36 | 37 | //sanity check to validate json:params 38 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 39 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 40 | return df 41 | } 42 | //Getting list of columns:distinctColList to apply unique check on 43 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 44 | println("Skipping " + ruleName + ". 
Mandatory Key \"columnList\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 45 | return df 46 | } 47 | //get the number of unique checks to be performed on column (be it single or composite) 48 | var len: Integer =readJson.select(size($"columnList")).first.getInt(0).toInt 49 | var i= 0 50 | var listOfCompositeColumns = List[String]() 51 | for( i <- 0 to len-1) 52 | { 53 | val colm = readJson.select($"columnList".getItem(i)).first.getString(0) 54 | listOfCompositeColumns = listOfCompositeColumns :+ colm 55 | } 56 | val distinctColList = listOfCompositeColumns 57 | 58 | var latestIdentifierCol: String = null 59 | if (containsIgnoreCase(readJsonCols, "latestrowidentifier")) { 60 | latestIdentifierCol = readJson.select("latestrowidentifier").first.getString(0) 61 | } 62 | 63 | var originalDfColumns = df.columns.toSeq 64 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 65 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 66 | } 67 | 68 | if (latestIdentifierCol != null && !containsIgnoreCase(originalDfColumns, latestIdentifierCol)) { 69 | println("Value for latestrowidentifier=" + latestIdentifierCol + " in parameters for " + ruleName + " source=" + sourcename + " entity=" + entityname + " is not present in the given dataframe. Please make the required change. Proceeding with " + ruleName + " without identifying latest column.") 70 | latestIdentifierCol = null 71 | } 72 | //columns of the dqfailtable in particular order 73 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 74 | 75 | //sql expression for uniquecheck will come here 76 | var sqlexpression = "" 77 | //sql expression for building the failed records for uniquecheck will come here 78 | var sqllogExpression = "" 79 | //sql expression for logging details of the DQ rule check will come here 80 | var dimaggExpression = "" 81 | val recordCount = df.count() 82 | 83 | //view name for the final result 84 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 85 | val resultViewName = "vw_Result_" + view_uid 86 | 87 | val colListAsString = getStringFromSeq(originalDfColumns) 88 | //traversing the columnlist(single/composite) on which uniquecheck is to applied 89 | //in this loop we build the sql expression for uniquecheck 90 | for (colName <- distinctColList) { 91 | 92 | if (!containsIgnoreCase(originalDfColumns, colName.split(","))) { 93 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in the frame provided. Please check parameters in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " for " + ruleName + " source=" + sourcename + " and entity=" + entityname) 94 | } 95 | else { 96 | //building the column name which will reflect the status of uniquecheck for individual records 97 | //colName = "id,createdDate" will be changed to uniquecheck_id_createdDate which will reflect the status of uniquecheck of a particular recor based on the mentioned columns 98 | val colName_as_col = colName.replaceAll(",", "_") 99 | sqlexpression += ",case when count(*) over (partition by " + colName + ") > 1 then false else true end as " + colName_as_col + "_" + ruleName //replace with function call 100 | if (latestIdentifierCol != null) { 101 | sqlexpression += ",case when row_number() over(partition by " + colName + " order by " + latestIdentifierCol + " desc)=1 then true else false end as LatestRow_" + colName_as_col 102 | } 103 | val dqAggKey = java.util.UUID.randomUUID.toString 104 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName_as_col + "_" + ruleName + "= false" 105 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 106 | } 107 | } 108 | 109 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 110 | return df 111 | } 112 | 113 | //creating resulting dataframe with required DQ columns 114 | val inputViewName = "vw_Input_" + view_uid 115 | df.createOrReplaceTempView(inputViewName) 116 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 117 | dqResultDf.createOrReplaceTempView(resultViewName) 118 | 119 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 120 | //log results into required tables 121 | val failedresult = spark.sql(sqllogExpression) 122 | 123 | val current_time = current_timestamp() 124 | var failTable = failedresult.withColumn("Source", lit(sourcename)).withColumn("Entity", lit(entityname)).withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 125 | failTable = correctFormat(failTable, orderOfFailTable) 126 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 127 | val failedViewName = "vw_Failed_" + view_uid 128 | failTable.createOrReplaceTempView(failedViewName) 129 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 130 | } 131 | else 132 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 133 | 134 | dqResultDf 135 | } 136 | catch { 137 | case e: Exception=> println("Returning original Dataframe. Uniquecheck for Source->"+sourcename+" and entity->"+entityname+" failed with Exception-->\n"+e.toString) 138 | df 139 | } 140 | } 141 | // Method to invoke null check on a dataset based on arguments passed by the user 142 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the unique check for each particular record 143 | def apply(df:Dataset[Row],params:String,colEntitySourceMap:Map[String,List[String]],originalDfColumns:Seq[String],spark:SparkSession,pipelineId: String,properties: Properties):Dataset[Row]={ 144 | try { 145 | //applying sanity checks to check the ruleName as "uniquecheck" 146 | val ruleName = "uniquecheck" 147 | if (params == null) { 148 | println("Skipping " + ruleName + ". Please send Parameters json as string for " + ruleName) 149 | return df 150 | } 151 | 152 | import spark.implicits._ 153 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 154 | val paramsSchema = List( 155 | StructField("params", StringType, false)) 156 | val paramsRow = Seq(Row(params)) 157 | val paramsDf = spark.createDataFrame( 158 | spark.sparkContext.parallelize(paramsRow), 159 | StructType(paramsSchema) 160 | ) 161 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 162 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 163 | val readJsonCols = readJson.columns.toSeq 164 | //sanity check to validate json:params 165 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 166 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " are not a valid Json. Please provide a valid Json") 167 | return df 168 | } 169 | //sanity check for required column:columnList in params 170 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 171 | println("Skipping " + ruleName + ". Mandatory Key \"columnList\" required in Parameters for nullcheck") 172 | return df 173 | } 174 | //get the number of unique checks to be performed on column (be it single or composite) 175 | var len: Integer = readJson.select(size($"columnList")).first.getInt(0).toInt 176 | var i = 0 177 | var listOfCompositeColumns = List[String]() 178 | //traversing each of the columnList provided(single/composite) and storing it in list 179 | for (i <- 0 to len - 1) { 180 | val colm = readJson.select($"columnList".getItem(i)).first.getString(0) 181 | listOfCompositeColumns = listOfCompositeColumns :+ colm 182 | } 183 | val distinctColList = listOfCompositeColumns 184 | 185 | var latestIdentifierCol: String = null 186 | if (containsIgnoreCase(readJsonCols, "latestrowidentifier")) { 187 | latestIdentifierCol = readJson.select("latestrowidentifier").first.getString(0) 188 | } 189 | 190 | if (latestIdentifierCol != null && !containsIgnoreCase(originalDfColumns, latestIdentifierCol)) { 191 | println("Value for latestrowidentifier=" + latestIdentifierCol + " in parameters for " + ruleName + " is not present in the given dataframe. Please make the required change. 
Proceeding with " + ruleName + " without identifying latest column.") 192 | latestIdentifierCol = null 193 | } 194 | //order of the attributes of the failed Table 195 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 196 | val colListAsString = getStringFromSeq(originalDfColumns) 197 | //sql expression for building failed records for uniquecheck will come here 198 | var sqllogExpression = "" 199 | //sql expression for uniquecheck will come here 200 | var sqlexpression = "" 201 | //sql expression for logging the DQ run will come here 202 | var dimaggExpression = "" 203 | val recordCount = df.count() 204 | 205 | // 206 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 207 | val resultViewName = "vw_Result_" + view_uid 208 | // traversing each of the columnlist and applying uniquechek 209 | for (colName <- distinctColList) { 210 | if (!containsIgnoreCase(originalDfColumns, colName.split(","))) { 211 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 212 | } 213 | else { 214 | //building the column name which will reflect the status of uniquecheck for individual records 215 | //colName = "id,createdDate" will be changed to uniquecheck_id_createdDate which will reflect the status of uniquecheck of a particular recor based on the mentioned columns 216 | val colName_as_col = colName.replaceAll(",", "_") 217 | val entityname = colEntitySourceMap(colName)(0) 218 | val sourcename = colEntitySourceMap(colName)(1) 219 | sqlexpression += ",case when count(*) over (partition by " + colName + ") > 1 then false else true end as " + colName_as_col + "_" + ruleName //replace with function call 220 | if (latestIdentifierCol != null) { 221 | sqlexpression += ",case when row_number() over(partition by " + colName + " order by " + latestIdentifierCol + " desc)=1 then true else false end as LatestRow_" + colName_as_col 222 | } 223 | val dqAggKey = java.util.UUID.randomUUID.toString 224 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName_as_col + "_" + ruleName + "= false" 225 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 226 | } 227 | } 228 | 229 | //sanity check if the unique check expression was built successfully 230 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 231 | return df 232 | } 233 | 234 | //creating resulting dataframe with required DQ columns 235 | val inputViewName = "vw_Input_" + view_uid 236 | df.createOrReplaceTempView(inputViewName) 237 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 238 | dqResultDf.createOrReplaceTempView(resultViewName) 239 | 240 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 241 | 
//logging results in required tables 242 | val failedresult = spark.sql(sqllogExpression) 243 | val current_time = current_timestamp() 244 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 245 | failTable = correctFormat(failTable, orderOfFailTable) 246 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 247 | val failedViewName = "vw_Failed_" + view_uid 248 | failTable.createOrReplaceTempView(failedViewName) 249 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 250 | } 251 | else 252 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 253 | 254 | dqResultDf 255 | } 256 | catch { 257 | case e: Exception=> println("Returning original Dataframe. Uniquecheck failed with Exception-->\n"+e.toString) 258 | df 259 | } 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/support/SupportTrait.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.support 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} 9 | import org.apache.spark.sql.functions._ 10 | 11 | trait SupportTrait { 12 | //returns the dataframe:data ordered according to the list:orderedList 13 | def correctFormat(data: Dataset[Row],orderedList:List[String]): Dataset[Row]={ 14 | data.select(orderedList.head,orderedList.tail: _*) 15 | } 16 | //inserts dataframe:writable into a praticular table:tableName 17 | def writeTableAppendAbsolute(writable: Dataset[Row], tableName: String) = { 18 | writable.write.mode(SaveMode.Append).insertInto(tableName) 19 | } 20 | //checks if a string:value is present in a string or not in a list:list irrespective of its case 21 | def containsIgnoreCase(list:Seq[String],value:String): Boolean={ 22 | list.exists(item => item.toLowerCase() == value.toLowerCase()) 23 | } 24 | 25 | def containsIgnoreCase(list:Seq[String],value:Seq[String]): Boolean={ 26 | var inp="" 27 | for (inp <- value) 28 | { 29 | if(containsIgnoreCase(list,inp) == false) 30 | { 31 | println(inp + " is not present as a column in the dataframe") 32 | return false 33 | } 34 | } 35 | return true 36 | } 37 | 38 | def getStringFromSeq(stringList: Seq[String]): String = { 39 | stringList.map(a=>a).mkString(",") 40 | } 41 | //the function helps identifying the last time the table for a particular source and entity was processed , for delta processing 42 | def ruleWaterMarkStart(source: String, entity: String,rule: String, spark:SparkSession, properties: Properties): String = { 43 | import spark.implicits._ 44 | var d: Dataset[Row] = null 45 | try { 46 | d = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_RULE_WATERMARK_TABLE_NAME")).filter($"SubjectArea" === source).filter($"SourceEntityName" === entity).filter($"RuleName" === rule).agg(max("WaterMarkEndValue").alias("startValue")) 47 | if (!d.take(1).isEmpty) { 48 | if (null == d.first().getString(0)) { 49 | "17530101" 50 | } else { 51 | d.first().getString(0) 52 | } 53 | } 54 | else 55 | "17530101" 56 | } 57 | catch { 58 | case ex: Exception => println("Error with Watermark look up. "+ex.toString()) 59 | throw ex 60 | } 61 | } 62 | //function to help in delta processing and returns all records in table name whose watermark value is greater than the start value and less than end value 63 | def dqDeltaIdentifier(tableName: String, auditCol: String,waterMarkStart:String,waterMarkEnd: String,spark:SparkSession):Dataset[Row]={ 64 | try{ 65 | if(!spark.catalog.tableExists(tableName)) 66 | { 67 | println("TableName=>"+tableName+" does not exist. Please check metadata") 68 | throw new Exception("Table Does not exist") 69 | } 70 | val tableDf=spark.table(tableName) 71 | if (!containsIgnoreCase(tableDf.columns.toSeq, auditCol)) { 72 | println("TableName=>"+tableName+" does not contain column-->"+auditCol+". Please check metadata") 73 | throw new Exception(" Column Does not Exist") 74 | } 75 | if(waterMarkEnd.equals(Long.MaxValue.toString())) 76 | { 77 | tableDf.filter(col(auditCol)>waterMarkStart.toLong) 78 | } 79 | else 80 | { 81 | tableDf.filter(col(auditCol)>waterMarkStart.toLong).filter(col(auditCol) println("Error with Delta Identifier. "+ex.toString()) 86 | throw ex 87 | } 88 | } 89 | //function to help in delta processing and returns all records in table name whose watermark value is greater than the start value 90 | def dqDeltaIdentifier(tableName: String, auditCol: String,waterMarkStart:String,spark:SparkSession):Dataset[Row]={ 91 | try{ 92 | if(!spark.catalog.tableExists(tableName)) 93 | { 94 | println("TableName=>"+tableName+" does not exist. Please check metadata") 95 | throw new Exception("Table Does not exist") 96 | } 97 | val tableDf=spark.table(tableName) 98 | if (!containsIgnoreCase(tableDf.columns.toSeq, auditCol)) { 99 | println("TableName=>"+tableName+" does not contain column-->"+auditCol+". Please check metadata") 100 | throw new Exception(" Column Does not Exist") 101 | } 102 | tableDf.filter(col(auditCol)>waterMarkStart.toLong) 103 | } 104 | catch{ 105 | case ex: Exception => println("Error with Delta Identifier. "+ex.toString()) 106 | throw ex 107 | } 108 | } 109 | //the function helps in updating the watermark value of the table for a particular source and entity , for delta processing 110 | def updateRuleWaterMark(source: String, entity: String, rule: String, waterMarkEndVal: String,spark: SparkSession,properties: Properties)={ 111 | try{ 112 | val selectExpression= "Select '" + source + "' as SubjectArea, '" + entity + "' as SourceEntityName, '" + rule + "' as RuleName, '17530101' as WaterMarkStartValue, '" + waterMarkEndVal + "' as WaterMarkEndValue, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 113 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RULE_WATERMARK_TABLE_NAME") + " "+selectExpression) 114 | } 115 | catch{ 116 | case ex: Exception => println("Error with Watermark Update. 
"+ex.toString()) 117 | throw ex 118 | } 119 | } 120 | //get properties from application.properties 121 | def getProperties(): Properties = { 122 | val url = getClass().getResource("/application.properties") 123 | val properties: Properties = new Properties() 124 | 125 | if (url != null) { 126 | val source = scala.io.Source.fromURL(url) 127 | properties.load(source.bufferedReader()) 128 | } 129 | else { 130 | println("Properties file cannot be loaded") 131 | throw new java.io.FileNotFoundException("Properties file cannot be loaded"); 132 | } 133 | 134 | return properties 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /images/Data Quality Insights.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Data Quality Insights.PNG -------------------------------------------------------------------------------- /images/Entityrulemetadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Entityrulemetadata.png -------------------------------------------------------------------------------- /images/OrphanEdgeMetadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/OrphanEdgeMetadata.png -------------------------------------------------------------------------------- /images/Results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Results.png -------------------------------------------------------------------------------- /notebooks/DQ Tables to Lake.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | // Databricks notebook source 5 | // DBTITLE 1,Input Widgets 6 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils 7 | 8 | dbutils.widgets.text("keyAdls", "","keyAdls") 9 | dbutils.widgets.text("credendentialAdls", "","credendentialAdls") 10 | dbutils.widgets.text("databricksScope", "","databricksScope") 11 | dbutils.widgets.text("adlsLoginUrl", "","adlsLoginUrl") 12 | dbutils.widgets.text("datalakeName", "","datalakeName") 13 | dbutils.widgets.text("adlsPath", "","adlsPath") 14 | dbutils.widgets.text("subjectAreaDb", "","subjectAreaDb") 15 | dbutils.widgets.text("dqRunDetailsTableName", "","dqRunDetailsTableName") 16 | dbutils.widgets.text("dqFailTableName", "","dqFailTableName") 17 | dbutils.widgets.text("dqAggTableName", "","dqAggTableName") 18 | dbutils.widgets.text("dqWatermarkTableName", "","dqWatermarkTableName") 19 | 20 | 21 | // COMMAND ---------- 22 | 23 | // DBTITLE 1,ADLS Gen2 Access Config 24 | val keyAdls = dbutils.widgets.get("keyAdls") 25 | val credendentialAdls = dbutils.widgets.get("credendentialAdls") 26 | val databricksScope = dbutils.widgets.get("databricksScope") 27 | 28 | val decryptedADLSId = dbutils.secrets.get(scope = databricksScope, key = keyAdls) 29 | val decryptedADLSCredential = dbutils.secrets.get(scope = databricksScope, key = credendentialAdls) 30 | val adlsLoginUrl = dbutils.widgets.get("adlsLoginUrl") 31 | val datalakeName = dbutils.widgets.get("datalakeName") 32 | 33 | //initializing the spark session with the config 34 | 35 | spark.conf.set(s"fs.azure.account.auth.type.${datalakeName}.dfs.core.windows.net", "OAuth") 36 | spark.conf.set(s"fs.azure.account.oauth.provider.type.${datalakeName}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 37 | spark.conf.set(s"fs.azure.account.oauth2.client.id.${datalakeName}.dfs.core.windows.net", s"${decryptedADLSId}") 38 | spark.conf.set(s"fs.azure.account.oauth2.client.secret.${datalakeName}.dfs.core.windows.net", s"${decryptedADLSCredential}") 39 | spark.conf.set(s"fs.azure.account.oauth2.client.endpoint.${datalakeName}.dfs.core.windows.net", adlsLoginUrl) 40 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true") 41 | 42 | 43 | // COMMAND ---------- 44 | 45 | // DBTITLE 1,Initialize Variables 46 | val subjectAreaDb = dbutils.widgets.get("subjectAreaDb") 47 | val dqRunDetailsTableName = dbutils.widgets.get("dqRunDetailsTableName") 48 | val dqFailTableName = dbutils.widgets.get("dqFailTableName") 49 | val dqAggTableName = dbutils.widgets.get("dqAggTableName") 50 | val dqWatermarkTableName = dbutils.widgets.get("dqWatermarkTableName") 51 | val adlsPath = dbutils.widgets.get("adlsPath") 52 | 53 | // COMMAND ---------- 54 | 55 | // DBTITLE 1,Common Methods 56 | import org.apache.spark.sql.DataFrame 57 | def getwatermarkvalue(dqtablename: String): String ={ 58 | 59 | try{ 60 | val watermarkendvalue = spark.sql("SELECT COALESCE(watermarkendvalue, '1753-01-01') as watermarkvalue FROM " + subjectAreaDb + "." 
+ dqWatermarkTableName + " WHERE lower(dqtable)='" + dqtablename +"'") 61 | 62 | if(watermarkendvalue.head(1).isEmpty) 63 | return "1753-01-01" 64 | else 65 | return watermarkendvalue.first.getString(0) 66 | } 67 | 68 | catch { 69 | case e: Exception => { 70 | println("ERROR : Unable to get the WaterMark value " + e.getMessage) 71 | throw e 72 | } 73 | } 74 | 75 | } 76 | 77 | 78 | def getwatermarkvalue(dqtablename: String, source: String, entity: String): String ={ 79 | 80 | try{ 81 | val watermarkendvalue = spark.sql("SELECT COALESCE(watermarkendvalue, '1753-01-01') as watermarkvalue FROM " + subjectAreaDb + "." + dqWatermarkTableName + " WHERE lower(dqtable)='" + dqtablename +"' and lower(source)='" + source +"' and lower(entity)='" + entity +"'") 82 | 83 | if(watermarkendvalue.head(1).isEmpty) 84 | return "1753-01-01" 85 | else 86 | return watermarkendvalue.first.getString(0) 87 | } 88 | 89 | catch { 90 | case e: Exception => { 91 | println("ERROR : Unable to get the WaterMark value " + e.getMessage) 92 | throw e 93 | } 94 | } 95 | 96 | } 97 | 98 | 99 | def setwatermarkvalue(watermarkendvalue: String, dqtablename: String, source: String, entity: String): Unit ={ 100 | 101 | spark.conf.set("spark.sql.crossJoin.enabled", "true") 102 | val insertQuery = s""" MERGE INTO ${subjectAreaDb}.${dqWatermarkTableName} as Target 103 | USING ( 104 | SELECT '${watermarkendvalue}' AS watermarkendvalue 105 | ,current_timestamp AS omidqcreateddate 106 | ,current_timestamp AS omidqmodifieddate 107 | ,'${dqtablename}' AS dqtable 108 | ,'${source}' AS source 109 | ,'${entity}' AS entity 110 | ) AS Source 111 | ON Target.dqtable ='${dqtablename}' 112 | AND COALESCE(Target.source, '') = COALESCE(Source.source, '') 113 | AND COALESCE(Target.entity, '') = COALESCE(Source.entity, '') 114 | WHEN MATCHED 115 | AND COALESCE(Target.watermarkendvalue, '') <> COALESCE(Source.watermarkendvalue, '') 116 | AND COALESCE(Target.omidqmodifieddate, '') <> COALESCE(Source.omidqmodifieddate, '') 117 | THEN 118 | UPDATE 119 | SET Target.watermarkendvalue = Source.watermarkendvalue 120 | ,Target.omidqmodifieddate = Source.omidqmodifieddate 121 | WHEN NOT MATCHED 122 | THEN 123 | INSERT * """ 124 | 125 | try { 126 | spark.sql(insertQuery) 127 | } 128 | catch { 129 | case e: Exception => { 130 | println("ERROR : Unable to insert the WaterMark value " + e.getMessage) 131 | throw e 132 | } 133 | } 134 | 135 | } 136 | 137 | // COMMAND ---------- 138 | 139 | // DBTITLE 1,Populate dqAggTable 140 | val watermarkvalue_dqfailtable = getwatermarkvalue(dqAggTableName,"dq",dqFailTableName) 141 | println(s"Watermark value for ${dqAggTableName}:${dqFailTableName} is: "+watermarkvalue_dqfailtable) 142 | val watermarkvalue_dqrundetails = getwatermarkvalue(dqAggTableName,"dq",dqRunDetailsTableName) 143 | println(s"Watermark value for ${dqAggTableName}:${dqRunDetailsTableName} is: "+watermarkvalue_dqrundetails) 144 | 145 | val watermarkendvalue_dqfailtable = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." + dqFailTableName) 146 | println(s"New Watermark value for ${dqAggTableName}:${dqFailTableName} is: "+watermarkendvalue_dqfailtable.first.get(0).toString()) 147 | val watermarkendvalue_dqrundetails = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." 
+ dqRunDetailsTableName) 148 | println(s"New Watermark value for ${dqAggTableName}:${dqRunDetailsTableName} is: "+watermarkendvalue_dqrundetails.first.get(0).toString()) 149 | 150 | if (watermarkendvalue_dqfailtable.first.get(0).toString() == watermarkvalue_dqfailtable && watermarkendvalue_dqrundetails.first.get(0).toString() == watermarkvalue_dqrundetails) 151 | { 152 | println("No records to update") 153 | } 154 | else 155 | { 156 | println(s"New Records detected. Existing Watermark for ${dqRunDetailsTableName}: " + watermarkvalue_dqrundetails + s" and for ${dqFailTableName}: "+watermarkvalue_dqfailtable) 157 | 158 | val mergeQuery = s""" MERGE 159 | INTO ${subjectAreaDb}.${dqAggTableName} as Target 160 | USING ( 161 | SELECT R.DqAggTableKey AS DqAggTableKey 162 | ,R.Source AS Source 163 | ,R.Entity AS Entity 164 | ,R.ColumnName AS ColumnName 165 | ,R.Rule AS Rule 166 | ,R.PlatformModifiedDate AS DQModifiedDate 167 | ,R.PlatformModifiedDateInt AS DQModifiedDateInt 168 | ,R.RecordCount AS RecordCount 169 | ,COALESCE(F.FailCount, 0) AS FailCount 170 | ,R.RecordCount - COALESCE(F.FailCount, 0) AS SuccessCount 171 | ,current_timestamp AS PlatformModifiedDate 172 | ,CAST(date_format(current_date(), 'yyyyMMddhhmmssSSS') AS Long) AS PlatformModifiedDateInt 173 | FROM ${subjectAreaDb}.${dqRunDetailsTableName} R 174 | LEFT JOIN ( 175 | SELECT DqAggTableKey 176 | ,Count(1) AS FailCount 177 | ,MIN(PlatformModifiedDate) AS FailTablePlatformModifiedDate 178 | FROM ${subjectAreaDb}.${dqFailTableName} 179 | GROUP BY DqAggTableKey) F 180 | ON R.DqAggTableKey = F.DqAggTableKey 181 | WHERE R.PlatformModifiedDate >= '${watermarkvalue_dqrundetails}' 182 | OR F.FailTablePlatformModifiedDate >= '${watermarkvalue_dqfailtable}' 183 | ) AS Source 184 | ON Target.DqAggTableKey = Source.DqAggTableKey 185 | WHEN MATCHED THEN 186 | UPDATE 187 | SET Target.Source = Source.Source 188 | ,Target.Entity = Source.Entity 189 | ,Target.Rule = Source.Rule 190 | ,Target.ColumnName = Source.ColumnName 191 | ,Target.FailCount = Source.FailCount 192 | ,Target.DQModifiedDate = Source.DQModifiedDate 193 | ,Target.DQModifiedDateInt = Source.DQModifiedDateInt 194 | ,Target.RecordCount = Source.RecordCount 195 | ,Target.SuccessCount = Source.SuccessCount 196 | ,Target.PlatformModifiedDate = Source.PlatformModifiedDate 197 | ,Target.PlatformModifiedDateInt = Source.PlatformModifiedDateInt 198 | WHEN NOT MATCHED 199 | THEN 200 | INSERT * """ 201 | 202 | try { 203 | spark.sql(mergeQuery) 204 | } 205 | catch { 206 | case e: Exception => { 207 | println("ERROR : Unable to merge data in dqAggTable " + e.getMessage) 208 | throw e 209 | } 210 | } 211 | 212 | println(s"Setting Watermark for ${dqAggTableName}:${dqFailTableName}. Value: "+ watermarkendvalue_dqfailtable.first.get(0).toString()) 213 | setwatermarkvalue(watermarkendvalue_dqfailtable.first.get(0).toString(), dqAggTableName, "dq", dqFailTableName) 214 | println(s"Setting Watermark for ${dqAggTableName}:${dqRunDetailsTableName}. 
Value: "+ watermarkendvalue_dqrundetails.first.get(0).toString()) 215 | setwatermarkvalue(watermarkendvalue_dqrundetails.first.get(0).toString(), dqAggTableName , "dq", dqRunDetailsTableName ) 216 | println("Watermark set") 217 | } 218 | 219 | // COMMAND ---------- 220 | 221 | // DBTITLE 1,Write files by Date to cooked folder for dqFailTable 222 | import org.apache.spark.sql.functions._ 223 | import scala.collection.parallel._ 224 | val df = spark.table(s"${subjectAreaDb}.${dqFailTableName}") 225 | val sources = df.select(lower($"Source")).distinct.collect.toList 226 | 227 | val watermarkvalue = getwatermarkvalue(dqFailTableName) 228 | println("Watermark value is: "+watermarkvalue) 229 | val watermarkendvalue = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." + dqFailTableName) 230 | println("New Watermark value is: "+watermarkendvalue.first.get(0).toString()) 231 | 232 | if (watermarkendvalue.first.get(0).toString() == watermarkvalue) 233 | { 234 | println("No records to update") 235 | } 236 | else 237 | { 238 | println("New Records detected. Watermark: "+watermarkvalue) 239 | val dates = df.filter(to_date($"PlatformModifieddate") >= watermarkvalue).select(to_date($"PlatformModifieddate") as "date").distinct 240 | //display(dates) 241 | 242 | for(d <- dates.collect) 243 | { 244 | println("Writing File for Date: " + d(0)) 245 | val filtereddf = df.filter(to_date($"PlatformModifiedDate") === d(0)) 246 | 247 | filtereddf.repartition(1).write 248 | .format("com.databricks.spark.csv") 249 | .option("header", "true") 250 | .option("sep", "\t") 251 | .option("quoteAll", true) 252 | .option("escape","\"") 253 | .mode("overwrite") 254 | .save(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp") 255 | 256 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp").filter(file => file.name.endsWith("csv"))(0).path 257 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tsv") 258 | dbutils.fs.rm(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp", recurse = true) 259 | 260 | println("Completed writing File for Date: " + d(0)) 261 | 262 | } 263 | } 264 | 265 | println("Setting Watermark. 
Value: "+ watermarkendvalue.first.get(0).toString()) 266 | setwatermarkvalue(watermarkendvalue.first.get(0).toString(), dqFailTableName , null, null) 267 | println("Watermark set") 268 | 269 | 270 | 271 | 272 | 273 | // COMMAND ---------- 274 | 275 | // DBTITLE 1,Write dqaggtable to cooked folder 276 | val df = spark.table(s"${subjectAreaDb}.${dqAggTableName}") 277 | df.repartition(1).write 278 | .format("com.databricks.spark.csv") 279 | .option("header", "true") 280 | .option("sep", "\t") 281 | .option("quoteAll", true) 282 | .option("escape","\"") 283 | .mode("overwrite") 284 | .save(adlsPath+"/" + dqAggTableName + ".tmp") 285 | 286 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqAggTableName + ".tmp/").filter(file => file.name.endsWith("csv"))(0).path 287 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqAggTableName + ".tsv") 288 | dbutils.fs.rm(adlsPath+"/" + dqAggTableName + ".tmp/", recurse = true) 289 | 290 | 291 | // COMMAND ---------- 292 | 293 | // DBTITLE 1,Write dqrundetails to cooked folder 294 | val df = spark.table(s"${subjectAreaDb}.${dqRunDetailsTableName}") 295 | df.repartition(1).write 296 | .format("com.databricks.spark.csv") 297 | .option("header", "true") 298 | .option("sep", "\t") 299 | .option("quoteAll", true) 300 | .option("escape","\"") 301 | .mode("overwrite") 302 | .save(adlsPath+"/" + dqRunDetailsTableName + ".tmp") 303 | 304 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqRunDetailsTableName + ".tmp/").filter(file => file.name.endsWith("csv"))(0).path 305 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqRunDetailsTableName + ".tsv") 306 | dbutils.fs.rm(adlsPath+"/" + dqRunDetailsTableName + ".tmp/", recurse = true) 307 | 308 | // COMMAND ---------- 309 | 310 | 311 | -------------------------------------------------------------------------------- /sample/DataQualityInsights.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/sample/DataQualityInsights.pbix -------------------------------------------------------------------------------- /sample/DataQualityRuleEngineUsageSample.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // DBTITLE 1,Create Data Frame 3 | val df=spark.sql(s"""select 1 as id,2 as partitionkey,'John' as Name,CAST('2020-12-01T21:06:20.000+0000' as Timestamp) as EventDateTime 4 | union all 5 | select 2 as id, 3 as partitionkey, 'Jack' as Name,current_date as EventDateTime 6 | union all 7 | select 3 as id, 4 as partitionkey, null as Name,current_date as EventDateTime 8 | union all 9 | select 1 as id,2 as partitionkey,'John' as Name,current_date as EventDateTime""") 10 | display(df) 11 | 12 | // COMMAND ---------- 13 | 14 | // DBTITLE 1,Add Entries for Metadata Drive Data Quality Check 15 | // MAGIC %sql 16 | // MAGIC insert into dq.entityrulemetadata 17 | // MAGIC (select 'sourceName' as source,'entityName' as entity,'nullcheck' as rulename, "{\"columnList\":\"name,eventDateTime\"}" as parameters 18 | // MAGIC union all 19 | // MAGIC select 'sourceName' as source,'entityName' as entity,'uniquecheck' as rulename, "{\"columnList\":[\"id\",\"partitionkey\"],\"LatestRowIdentifier\":\"eventDateTime\"}" as parameters) 20 | 21 | // COMMAND ---------- 22 | 23 | // DBTITLE 1,Metadata Driven Data Quality Check 24 | //metadata driven 25 | import com.ms.jedi.dq.framework.DQFramework 26 | val dqObj=new DQFramework() 27 | 
dqObj.setSparkSession(spark) 28 | val afterDq=dqObj.applyDq(df,"sourceName","entityName") 29 | display(afterDq) 30 | 31 | // COMMAND ---------- 32 | 33 | // DBTITLE 1,Parameter Driven Data Quality Check 34 | //parameter driven 35 | import com.ms.jedi.dq.framework.DQFramework 36 | val dqObj=new DQFramework() 37 | dqObj.setSparkSession(spark) 38 | val col_rule_map=Map("nullcheck"->"""{"columnList":"name,eventDateTime"}""", 39 | "uniquecheck"->"""{"columnList":["id,partitionkey"],"LatestRowIdentifier":"eventDateTime"}""") 40 | val map:Map[String,List[String]]=Map("id,partitionkey"->List("entity1","source1"), 41 | "eventDateTime"->List("entity2","source2"), 42 | "name"->List("entity3","source3")) 43 | val afterDq=dqObj.applyDq(df,col_rule_map,map,"pipelineid") 44 | display(afterDq.select("id","partitionkey","name","eventDateTime","name_nullcheck","eventDateTime_nullcheck","id_partitionkey_uniquecheck","LatestRow_id_partitionkey")) 45 | 46 | // COMMAND ---------- 47 | 48 | // DBTITLE 1,View Reporting data 49 | // MAGIC %sql 50 | // MAGIC select * from dq.dqrundetails where source in ('source1','source2','source3','sourceName') 51 | 52 | // COMMAND ---------- 53 | 54 | // DBTITLE 1,View Reporting data 55 | // MAGIC %sql 56 | // MAGIC select * from dq.dqfailtable where source in ('source1','source2','source3','sourceName') 57 | 58 | // COMMAND ---------- 59 | 60 | // DBTITLE 1,View Reporting data 61 | // MAGIC %sql 62 | // MAGIC select * from dq.dqAggTable where source in ('source1','source2','source3','sourceName') 63 | 64 | // COMMAND ---------- 65 | 66 | 67 | --------------------------------------------------------------------------------
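
As a quick illustration of how the aggregated output above might be consumed, the following is a minimal sketch (not part of the repository) for a Databricks notebook cell that summarises pass rates per rule from dq.dqaggtable. It assumes the columns produced by the "Populate dqAggTable" merge in the notebook (Source, Entity, Rule, RecordCount, FailCount, SuccessCount) and the dq database used in the usage sample; adjust the database and table names to match your own deployment.

// Minimal sketch, assuming the dq.dqaggtable schema written by the "Populate dqAggTable" merge above.
// Summarise total records, failures, and pass-rate percentage per source/entity/rule.
val passRateByRule = spark.sql("""
  SELECT Source,
         Entity,
         Rule,
         SUM(RecordCount)                                        AS TotalRecords,
         SUM(FailCount)                                          AS TotalFailures,
         ROUND(100.0 * SUM(SuccessCount) / SUM(RecordCount), 2)  AS PassRatePct
  FROM dq.dqaggtable
  GROUP BY Source, Entity, Rule
  ORDER BY PassRatePct
""")
// display() is available in Databricks notebooks, as used in the sample above.
display(passRateByRule)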