├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── dq ├── databases │ └── DQ │ │ ├── dqaggtable.sql │ │ ├── dqfailtable.sql │ │ ├── dqrulewatermark.sql │ │ ├── dqrundetails.sql │ │ ├── dqwatermark.sql │ │ ├── entityrulemetadata.sql │ │ └── orphanedgemetadata.sql ├── pom.xml └── src │ └── main │ ├── resources │ └── application.properties │ └── scala │ └── com │ └── ms │ └── dq │ ├── framework │ ├── DQFramework.scala │ └── README.md │ ├── rules │ ├── Nullcheck.scala │ ├── Orphanedgecheck.scala │ ├── Schemacheck.scala │ └── Uniquecheck.scala │ └── support │ └── SupportTrait.scala ├── images ├── Data Quality Insights.PNG ├── Entityrulemetadata.png ├── OrphanEdgeMetadata.png └── Results.png ├── notebooks └── DQ Tables to Lake.scala └── sample ├── DataQualityInsights.pbix └── DataQualityRuleEngineUsageSample.scala /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | 
*.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Data Quality Rule Engine 2 | MIT License 3 | 4 | Copyright (c) Microsoft Corporation. 
5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Quality Rule Engine 2 | The Data Quality Rule Engine is a library that helps run data quality checks on datasets. 3 | 4 | ### Data Quality Rules implemented: 5 | * Null Check: Checks whether the values of a column are null. 6 | * Unique Check: Checks whether the values of the columns are unique. Optionally, it can also identify the latest row for a particular value of a column. 7 | * Orphan Edge Check: Checks whether the dataframe's values that refer to an id of a parent dataframe exist in the parent dataframe. 8 | * Schema Check: Checks whether the JSON representing the values of the dataframe follows a given schema. 9 | 10 | There are two ways of invoking the Data Quality Rules on a dataframe: 11 | * Metadata driven. 12 | * As a parameter to the function call. 13 | 14 | The details of the Data Quality runs and the erroneous records are maintained in reporting tables. If you do not want these recorded, set DQ_LOG_RESULTS_FLAG to false in the application.properties file. 15 | 16 | ### Requirements: 17 | The Data Quality Rule Engine is compatible with: 18 | * Apache Spark version 3.0.x 19 | * Java 8 20 | * Databricks Cluster with Spark 3.0.x configuration (used only for schema check). 21 | 22 | ### To use the Data Quality Rule Engine in a project: 23 | 1. Install the Data Quality Rule Engine to a repository. 24 | 25 | 2. Add a reference to the repository in the pom.xml of your project under the tag: 26 | `<repositories>` 27 | 28 | 3. Add the framework as a dependency in the pom.xml of your project (make sure you also have a dependency on org.scala-lang): 29 | ``` 30 | <dependency> 31 | <groupId>com.ms.dq</groupId> 32 | <artifactId>DQFramework</artifactId> 33 | <version>LATEST</version> 34 | </dependency> 35 | ``` 36 | 37 | 4. Then, simply import the class in your code: 38 | 39 | `import com.ms.dq.framework.DQFramework` 40 | 41 | ### To use the Data Quality Rule Engine as a library in a Databricks notebook: 42 | 43 | Build the Data Quality Rule Engine jar and upload the library to the Databricks cluster. 44 | Then, simply import the class in the notebook: 45 | 46 | `import com.ms.dq.framework.DQFramework` 47 | 48 | 49 | ### Metadata Driven: 50 | The Data Quality rules to be applied on a data frame are inserted into metadata tables (details below).
The Rule Engine relies on the metadata to execute the rules. 51 | Note that the table names can be configured as per the user's requirements by updating the application.properties file. 52 | * Entityrulemetadata: A delta table with the schema (source, entity, rulename, parameters). For a rule to be executed against a particular entity, an entry for that rule must be present in this table. The parameters field contains a JSON of key-value pairs pertinent to the rule. The parameter keys for the four rules are listed below, and a snippet of the data in the table is attached. 53 | * Null Check 54 | * columnList: String. A comma-separated list of the columns to apply null check on. 55 | * Unique Check 56 | * columnList: Array of Strings. Each string is a comma-separated list of columns, the combination of which should be unique. 57 | * LatestRowIdentifier (Optional): String. The column to be used as the identifier for the latest row. If provided, the framework also returns whether each record is the latest unique row for that column combination. 58 | * Orphan Edge Check 59 | * tableName: String. The delta table name for the source and entity on which DQ will run. 60 | * auditColIntName: String. The audit column which will be used for delta identification (must be the integer form of the audit column). 61 | * cutOffHours (optional): Int. Default 0. The delta window starts this many hours before the run. 62 | * cutOffMinutes (optional): Int. Default 0. The delta window starts this many minutes before the run. 63 | * cutOffSeconds (optional): Int. Default 0. The delta window starts this many seconds before the run. 64 | 65 | ![EntityRuleMetadata](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Entityrulemetadata.png) 66 | 67 | * Orphanedgemetadata: A metadata table for the orphan edge check. For the orphan edge check to run on a particular source and entity, a record for that source and entity must be present in this metadata table. Sample records are provided below: 68 | 69 | ![OrphanEdgeMetadata](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/OrphanEdgeMetadata.png) 70 | 71 | #### To invoke the Rule Engine: 72 | 73 | 1. Create an object of the framework: 74 | 75 | `val dqObj=new DQFramework()` 76 | 77 | 2. Set the Spark session: 78 | 79 | ``` 80 | var spark = SparkSession.builder.appName("").getOrCreate() 81 | dqObj.setSparkSession(spark) 82 | ``` 83 | 84 | 3. Make the required entries in the metadata tables, as explained in section [Metadata Driven](https://github.com/microsoft/Data-Quality-Rule-Engine#metadata-driven). 85 | 4. Invoke the method to apply the rules: 86 | 87 | `val applyDq = dqObj.applyDq(dataframe,source,entity,schema,pipelineId)` 88 | 89 | Please refer to the [DQFramework readme file](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/dq/src/main/scala/com/ms/dq/framework/README.md) for the available overloaded methods for applyDq(). 90 | 91 | 92 | ### Parameter Based: 93 | 94 | Another option is to invoke applyDq() directly, passing the required rules as parameters. 95 | 96 | 1. Define a mapping of the rules to the columns of the data frame. 97 | ``` 98 | val col_rule_map=Map( 99 | "nullcheck" -> """{"columnList":"id,createdDateTime"}""", 100 | "uniquecheck" -> """{"columnList":["id,partitionKey","id"],"latestrowidentifier":"ModifiedDate"}""") 101 | ``` 102 | 103 | 2.
Provide a mapping from the columns on which the rules are applied to their respective source and entity. 104 | 105 | ``` 106 | val map:Map[String,List[String]]=Map("id"->List("source1","entity1"), 107 | "createdDateTime"->List("source1","entity1"), 108 | "id"->List("source1","entity1"), 109 | "partitionKey"->List("source1","entity1")) 110 | ``` 111 | 112 | 3. Invoke the method to apply the rules: 113 | 114 | `val applyDq = dqObj.applyDq(dataframe,col_rule_map,map,pipelineId)` 115 | 116 | 117 | 118 | ### Results: 119 | The dataframe returned consists of the original dataframe with an additional Boolean column for each (rule, column) pair, named columnname_rulename, indicating whether the row is valid for that check. 120 | 121 | For example, if nullcheck is applied on createdDate, a new column named createdDate_nullcheck will be present in the returned dataframe. In rows where this column is true, the createdDate value is not null; where it is false, the createdDate value is null. 122 | 123 | ![Results](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Results.png) 124 | 125 | 126 | [Here](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/sample/DataQualityRuleEngineUsageSample.scala) is a sample notebook that demonstrates the usage of the Data Quality Rule Engine. 127 | 128 | The Data Quality results from the reporting tables can also be used to derive insights. Below is a [sample dashboard](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/sample/DataQualityInsights.pbix): 129 | 130 | ![Data Quality Insights](https://github.com/microsoft/Data-Quality-Rule-Engine/blob/main/images/Data%20Quality%20Insights.PNG) 131 | 132 | ## Contributing 133 | 134 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 135 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 136 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 137 | 138 | ### Bugs and Feature requests: 139 | Please use the [GitHub issue tracker](https://github.com/microsoft/Data-Quality-Rule-Engine/issues) to file bugs or feature requests with relevant information. 140 | 141 | ### Creating a Pull Request: 142 | 1. [Create a fork](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) of the repository. 143 | 2. Make the required changes, following the existing code conventions. 144 | 3. Ensure the unit test cases pass. 145 | 4. Commit the changes to your fork. 146 | 5. [Create a pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request), with details of the unit tests. 147 | 6. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 148 | 149 | 150 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 151 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 152 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
153 | 154 | ## Trademarks 155 | 156 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 157 | trademarks or logos is subject to and must follow 158 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 159 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 160 | Any use of third-party trademarks or logos are subject to those third-party's policies. 161 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /dq/databases/DQ/dqaggtable.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_AGGREGATE_TABLE_NAME( 5 | DqAggTableKey STRING, 6 | Source STRING, 7 | Entity STRING, 8 | ColumnName STRING, 9 | Rule STRING, 10 | DQModifiedDate TIMESTAMP, 11 | DQModifiedDateInt BIGINT, 12 | RecordCount BIGINT, 13 | FailCount BIGINT, 14 | SuccessCount BIGINT, 15 | PlatformModifiedDate TIMESTAMP, 16 | PlatformModifiedDateInt BIGINT 17 | ) 18 | USING delta 19 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_AGGREGATE_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqfailtable.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_FAIL_TABLE_NAME( 5 | Source STRING, 6 | Entity STRING, 7 | ColumnName STRING, 8 | Rule STRING, 9 | Record STRING, 10 | PlatformModifiedDate TIMESTAMP, 11 | PlatformModifiedDateInt BIGINT, 12 | DqAggTableKey STRING 13 | ) 14 | USING delta 15 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_FAIL_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqrulewatermark.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_RULE_WATERMARK_TABLE_NAME 5 | ( 6 | SubjectArea STRING NOT NULL 7 | ,SourceEntityName STRING NOT NULL 8 | ,RuleName STRING NOT NULL 9 | ,WaterMarkStartValue STRING NOT NULL 10 | ,WaterMarkEndValue STRING NOT NULL 11 | ,PlatformModifiedDate TIMESTAMP 12 | ,PlatformModifiedDateInt BIGINT 13 | ) 14 | USING DELTA 15 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_RULE_WATERMARK_TABLE_NAME' 16 | -------------------------------------------------------------------------------- /dq/databases/DQ/dqrundetails.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_RUNDETAILS_TABLE_NAME( 5 | DqAggTableKey STRING, 6 | Source STRING, 7 | Entity STRING, 8 | ColumnName STRING, 9 | Rule STRING, 10 | RecordCount BIGINT, 11 | PipelineId STRING, 12 | PlatformModifiedDate TIMESTAMP, 13 | PlatformModifiedDateInt BIGINT 14 | ) 15 | USING delta 16 | location 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_RUNDETAILS_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/dqwatermark.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_METADATA_WATERMARK_TABLE_NAME( 5 | dqtable STRING 6 | ,source STRING 7 | ,entity STRING 8 | ,watermarkendvalue STRING 9 | ,omidqcreateddate STRING 10 | ,omidqmodifieddate STRING 11 | ) 12 | USING DELTA 13 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_METADATA_WATERMARK_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/entityrulemetadata.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_ENTITYRULEMETADATA_TABLE_NAME( 5 | source STRING 6 | ,entity STRING 7 | ,rulename STRING 8 | ,parameters STRING 9 | ) 10 | USING DELTA 11 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_ENTITYRULEMETADATA_TABLE_NAME' -------------------------------------------------------------------------------- /dq/databases/DQ/orphanedgemetadata.sql: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | CREATE TABLE IF NOT EXISTS SUBJECT_AREA_DB.DQ_ORPHANEDGEMETADATA_TABLE_NAME( 5 | source STRING 6 | ,entity STRING 7 | ,fromcolumnname STRING 8 | ,fromlookupentity STRING 9 | ,fromlookupcolumnname STRING 10 | ,tocolumnname STRING 11 | ,tolookupentity STRING 12 | ,tolookupcolumnname STRING 13 | ,filtercondition STRING 14 | ) 15 | USING DELTA 16 | LOCATION 'ADLS_PATH_GEN2/SUBJECT_AREA/DQ_ORPHANEDGEMETADATA_TABLE_NAME' -------------------------------------------------------------------------------- /dq/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.ms.dq 4 | 0.0.1 5 | DQFramework 6 | DQFramework 7 | Data Quality Framework Module. 
8 | 9 | 1.8 10 | 11 | 12 | 13 | 14 | 15 | org.apache.spark 16 | spark-core_2.12 17 | 3.0.1 18 | 19 | 20 | io.netty 21 | netty-all 22 | 23 | 24 | org.codehaus.jackson 25 | jackson-mapper-asl 26 | 27 | 28 | org.codehaus.jackson 29 | jackson-core-asl 30 | 31 | 32 | commons-codec 33 | commons-codec 34 | 35 | 36 | com.microsoft.azure 37 | azure-storage 38 | 39 | 40 | com.google.guava 41 | guava 42 | 43 | 44 | org.eclipse.jetty 45 | jetty-util 46 | 47 | 48 | org.codehaus.jackson 49 | jackson-jaxrs 50 | 51 | 52 | org.codehaus.jackson 53 | jackson-xc 54 | 55 | 56 | org.eclipse.jetty 57 | jetty-http 58 | 59 | 60 | org.eclipse.jetty 61 | jetty-server 62 | 63 | 64 | org.apache.commons 65 | commons-compress 66 | 67 | 68 | com.nimbusds 69 | nimbus-jose-jwt 70 | 71 | 72 | commons-codec 73 | commons-codec 74 | 75 | 76 | org.apache.zookeeper 77 | zookeeper 78 | 79 | 80 | commons-beanutils 81 | commons-beanutils 82 | 83 | 84 | log4j 85 | log4j 86 | 87 | 88 | 89 | 90 | 91 | org.apache.spark 92 | spark-sql_2.12 93 | 3.0.1 94 | 95 | 96 | io.netty 97 | netty-all 98 | 99 | 100 | org.codehaus.jackson 101 | jackson-mapper-asl 102 | 103 | 104 | org.codehaus.jackson 105 | jackson-core-asl 106 | 107 | 108 | commons-codec 109 | commons-codec 110 | 111 | 112 | com.microsoft.azure 113 | azure-storage 114 | 115 | 116 | com.google.guava 117 | guava 118 | 119 | 120 | org.eclipse.jetty 121 | jetty-util 122 | 123 | 124 | org.codehaus.jackson 125 | jackson-jaxrs 126 | 127 | 128 | org.codehaus.jackson 129 | jackson-xc 130 | 131 | 132 | org.eclipse.jetty 133 | jetty-http 134 | 135 | 136 | org.eclipse.jetty 137 | jetty-server 138 | 139 | 140 | org.apache.commons 141 | commons-compress 142 | 143 | 144 | com.nimbusds 145 | nimbus-jose-jwt 146 | 147 | 148 | commons-codec 149 | commons-codec 150 | 151 | 152 | org.apache.zookeeper 153 | zookeeper 154 | 155 | 156 | commons-beanutils 157 | commons-beanutils 158 | 159 | 160 | 161 | 162 | 163 | 164 | org.apache.spark 165 | spark-network-common_2.12 166 | 3.0.1 167 | 168 | 169 | 170 | 171 | com.fasterxml.jackson.core 172 | jackson-databind 173 | 2.11.2 174 | 175 | 176 | 177 | 178 | com.fasterxml.jackson.core 179 | jackson-core 180 | 2.11.2 181 | 182 | 183 | 184 | 185 | org.scala-lang 186 | scala-compiler 187 | 2.12.12 188 | 189 | 190 | 191 | 192 | org.scala-lang 193 | scala-library 194 | 2.12.12 195 | runtime 196 | 197 | 198 | 199 | com.typesafe 200 | config 201 | 1.3.2 202 | 203 | 204 | 205 | 206 | 207 | net.liftweb 208 | lift-json_2.12 209 | 3.4.2 210 | 211 | 212 | 213 | 214 | com.microsoft.azure 215 | azure-data-lake-store-sdk 216 | 2.3.1 217 | 218 | 219 | 220 | org.apache.hadoop 221 | hadoop-azure 222 | 3.1.1 223 | 224 | 225 | io.netty 226 | netty-all 227 | 228 | 229 | org.codehaus.jackson 230 | jackson-mapper-asl 231 | 232 | 233 | org.codehaus.jackson 234 | jackson-core-asl 235 | 236 | 237 | commons-codec 238 | commons-codec 239 | 240 | 241 | com.microsoft.azure 242 | azure-storage 243 | 244 | 245 | com.google.guava 246 | guava 247 | 248 | 249 | org.eclipse.jetty 250 | jetty-util 251 | 252 | 253 | org.codehaus.jackson 254 | jackson-jaxrs 255 | 256 | 257 | org.codehaus.jackson 258 | jackson-xc 259 | 260 | 261 | org.eclipse.jetty 262 | jetty-http 263 | 264 | 265 | org.eclipse.jetty 266 | jetty-server 267 | 268 | 269 | org.apache.commons 270 | commons-compress 271 | 272 | 273 | com.nimbusds 274 | nimbus-jose-jwt 275 | 276 | 277 | commons-codec 278 | commons-codec 279 | 280 | 281 | org.apache.zookeeper 282 | zookeeper 283 | 284 | 285 | commons-beanutils 286 | commons-beanutils 287 | 288 
| 289 | log4j 290 | log4j 291 | 292 | 293 | 294 | 295 | 296 | com.databricks 297 | dbutils-api_2.11 298 | 0.0.3 299 | 300 | 301 | org.apache.hadoop 302 | hadoop-azure-datalake 303 | 3.1.1 304 | 305 | 306 | io.netty 307 | netty-all 308 | 309 | 310 | org.codehaus.jackson 311 | jackson-mapper-asl 312 | 313 | 314 | org.codehaus.jackson 315 | jackson-core-asl 316 | 317 | 318 | commons-codec 319 | commons-codec 320 | 321 | 322 | com.microsoft.azure 323 | azure-storage 324 | 325 | 326 | com.google.guava 327 | guava 328 | 329 | 330 | org.eclipse.jetty 331 | jetty-util 332 | 333 | 334 | org.codehaus.jackson 335 | jackson-jaxrs 336 | 337 | 338 | org.codehaus.jackson 339 | jackson-xc 340 | 341 | 342 | org.eclipse.jetty 343 | jetty-http 344 | 345 | 346 | org.eclipse.jetty 347 | jetty-server 348 | 349 | 350 | org.apache.commons 351 | commons-compress 352 | 353 | 354 | com.nimbusds 355 | nimbus-jose-jwt 356 | 357 | 358 | commons-codec 359 | commons-codec 360 | 361 | 362 | org.apache.zookeeper 363 | zookeeper 364 | 365 | 366 | commons-beanutils 367 | commons-beanutils 368 | 369 | 370 | log4j 371 | log4j 372 | 373 | 374 | 375 | 376 | 377 | org.apache.hadoop 378 | hadoop-client 379 | 3.1.1 380 | 381 | 382 | io.netty 383 | netty-all 384 | 385 | 386 | org.codehaus.jackson 387 | jackson-mapper-asl 388 | 389 | 390 | org.codehaus.jackson 391 | jackson-core-asl 392 | 393 | 394 | commons-codec 395 | commons-codec 396 | 397 | 398 | com.microsoft.azure 399 | azure-storage 400 | 401 | 402 | com.google.guava 403 | guava 404 | 405 | 406 | org.eclipse.jetty 407 | jetty-util 408 | 409 | 410 | org.codehaus.jackson 411 | jackson-jaxrs 412 | 413 | 414 | org.codehaus.jackson 415 | jackson-xc 416 | 417 | 418 | org.eclipse.jetty 419 | jetty-http 420 | 421 | 422 | org.eclipse.jetty 423 | jetty-server 424 | 425 | 426 | org.apache.commons 427 | commons-compress 428 | 429 | 430 | com.nimbusds 431 | nimbus-jose-jwt 432 | 433 | 434 | commons-codec 435 | commons-codec 436 | 437 | 438 | org.apache.zookeeper 439 | zookeeper 440 | 441 | 442 | commons-beanutils 443 | commons-beanutils 444 | 445 | 446 | log4j 447 | log4j 448 | 449 | 450 | 451 | 452 | 453 | org.apache.hadoop 454 | hadoop-common 455 | 3.1.1 456 | 457 | 458 | io.netty 459 | netty-all 460 | 461 | 462 | org.codehaus.jackson 463 | jackson-mapper-asl 464 | 465 | 466 | org.codehaus.jackson 467 | jackson-core-asl 468 | 469 | 470 | commons-codec 471 | commons-codec 472 | 473 | 474 | com.microsoft.azure 475 | azure-storage 476 | 477 | 478 | com.google.guava 479 | guava 480 | 481 | 482 | org.eclipse.jetty 483 | jetty-util 484 | 485 | 486 | org.codehaus.jackson 487 | jackson-jaxrs 488 | 489 | 490 | org.codehaus.jackson 491 | jackson-xc 492 | 493 | 494 | org.eclipse.jetty 495 | jetty-http 496 | 497 | 498 | org.eclipse.jetty 499 | jetty-webapp 500 | 501 | 502 | org.eclipse.jetty 503 | jetty-server 504 | 505 | 506 | org.apache.commons 507 | commons-compress 508 | 509 | 510 | com.nimbusds 511 | nimbus-jose-jwt 512 | 513 | 514 | commons-codec 515 | commons-codec 516 | 517 | 518 | org.apache.zookeeper 519 | zookeeper 520 | 521 | 522 | commons-beanutils 523 | commons-beanutils 524 | 525 | 526 | log4j 527 | log4j 528 | 529 | 530 | 531 | 532 | 533 | 534 | com.google.code.gson 535 | gson 536 | 2.8.5 537 | 538 | 539 | 540 | 541 | org.apache.commons 542 | commons-compress 543 | 1.19 544 | 545 | 546 | 547 | 548 | org.apache.httpcomponents 549 | httpclient 550 | 4.5.6 551 | 552 | 553 | commons-codec 554 | commons-codec 555 | 556 | 557 | 558 | 559 | 560 | 561 | javax.xml.crypto 562 | jsr105-api 563 
| 1.0.1 564 | 565 | 566 | 567 | 568 | org.apache.commons 569 | commons-configuration2 570 | 2.7 571 | 572 | 573 | 574 | 575 | commons-net 576 | commons-net 577 | 3.6 578 | 579 | 580 | com.databricks 581 | dbutils-api_2.11 582 | 0.0.4 583 | 584 | 585 | 586 | 587 | 588 | src/main/scala 589 | 590 | 591 | src/main/resources 592 | 593 | **/*.properties 594 | 595 | 596 | 597 | 598 | 599 | net.alchim31.maven 600 | scala-maven-plugin 601 | 3.2.1 602 | 603 | src/main/scala 604 | 605 | -Xss4m 606 | -Xms256m 607 | -Xmx4096m 608 | 609 | 610 | 611 | 612 | 613 | compile 614 | testCompile 615 | 616 | 617 | 618 | 619 | 620 | org.apache.maven.plugins 621 | maven-compiler-plugin 622 | 3.6.1 623 | 624 | ${java.version} 625 | 626 | ${java.version} 627 | UTF-8 628 | 629 | 630 | 631 | 632 | org.apache.maven.plugins 633 | maven-install-plugin 634 | 2.5.2 635 | 636 | true 637 | 638 | 639 | 640 | org.apache.maven.plugins 641 | maven-shade-plugin 642 | 3.1.0 643 | 644 | 645 | install 646 | 647 | shade 648 | 649 | 650 | 651 | 652 | *:* 653 | 654 | META-INF/*.DSA 655 | META-INF/*.RSA 656 | META-INF/*.SF 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | DQFramework 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | -------------------------------------------------------------------------------- /dq/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | SUBJECT_AREA = dq 2 | SUBJECT_AREA_DB = dq 3 | ADLS_PATH_GEN2 = 4 | DQ_RUNDETAILS_TABLE_NAME = dqrundetails 5 | DQ_AGGREGATE_TABLE_NAME = dqaggtable 6 | DQ_FAIL_TABLE_NAME = dqfailtable 7 | DQ_RULE_WATERMARK_TABLE_NAME = dqrulewatermark 8 | DQ_METADATA_WATERMARK_TABLE_NAME = dqwatermark 9 | DQ_ENTITYRULEMETADATA_TABLE_NAME = entityrulemetadata 10 | DQ_ORPHANEDGEMETADATA_TABLE_NAME = orphanedgemetadata 11 | DQ_LOG_RESULTS_FLAG = true 12 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/framework/DQFramework.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | package com.ms.dq.framework 5 | 6 | import org.apache.spark.sql.Dataset 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.types.StructType 10 | 11 | import scala.collection.mutable.ListBuffer 12 | import org.apache.spark.sql.expressions.Window 13 | import org.apache.spark.sql.SaveMode 14 | import org.apache.spark.sql.types.LongType 15 | import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} 16 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} 17 | import com.ms.dq.rules._ 18 | 19 | import scala.collection.immutable.Map 20 | import java.util.{Calendar, Date, Properties} 21 | 22 | import com.ms.dq.support.SupportTrait 23 | 24 | import scala.collection.parallel._ 25 | import scala.collection.parallel.immutable.ParMap 26 | import scala.xml.Properties 27 | 28 | class DQFramework extends SupportTrait { 29 | 30 | 31 | var uniquecheck_latestIdCol: String = null; 32 | var spark: SparkSession = null; 33 | 34 | // Set spark session for execution instance 35 | def setSparkSession(_sparkSession: SparkSession) = { 36 | spark = _sparkSession 37 | } 38 | 39 | // Method to invoke Data Quality rules on a dataset by reading the metadata table [METADATA BOUND] 40 | def applyDqInner(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType, pipelineId: String): Dataset[Row] = { 41 | if (spark == null) { 42 | println("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 43 | throw new Exception("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 44 | } 45 | if (sourceName == null) { 46 | println("Source has been sent as null.") 47 | throw new Exception("Source has been sent as null.") 48 | } 49 | if (entityName == null) { 50 | println("Entity has been sent as null.") 51 | throw new Exception("Entity has been sent as null.") 52 | } 53 | 54 | // Read the application.properties 55 | val properties = getProperties() 56 | 57 | // intialise the dataframe to be returned , so that all the existing columns are present 58 | var return_Df = df 59 | if (schema != null) { 60 | //retrieves the schema defined for the particular source and entity in the entityrulemetadata 61 | val metaDataSchema = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")).filter(lower(col("source")) === sourceName.toLowerCase() && lower(col("entity")) === entityName.toLowerCase()).filter(lower(col("rulename")) === "schemacheck") 62 | if (!metaDataSchema.take(1).isEmpty) { 63 | 64 | val body_col = df.columns.toSeq.head 65 | 66 | println("Applying Schemacheck\n______________") 67 | println("Time before Schemacheck--->" + Calendar.getInstance.getTime()) 68 | return_Df = new Schemacheck().apply(df, schema, body_col, entityName, sourceName, spark, pipelineId, properties) 69 | println("Time after Schemacheck-->" + Calendar.getInstance.getTime()) 70 | } 71 | else { 72 | println("Schemacheck not found for source=" + sourceName + " and entity=" + entityName + " in metadata. PLease make the required entry in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " to run schecmacheck") 73 | } 74 | } 75 | else { 76 | val joinExceptionList=List("orphanedgecheck") 77 | val metaDataNonSchema = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")).filter(lower(col("source")) === sourceName.toLowerCase() && lower(col("entity")) === entityName.toLowerCase()).filter(lower(col("rulename")) =!= "schemacheck") 78 | if (!metaDataNonSchema.take(1).isEmpty) { 79 | 80 | val originalDfColumns = df.columns.toSeq 81 | 82 | // getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 83 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 84 | println("Column Name: dq_uniqueID is reserved for internal use of the DQ Framework. Please pass a dataframe without this column.") 85 | return df 86 | } 87 | val dfUniqueId = df.withColumn("dq_uniqueID", monotonically_increasing_id()) 88 | return_Df = dfUniqueId 89 | 90 | // splitting the task into threads to achieve parallel computation if multiple cores are available 91 | val forkJoinPool = new scala.concurrent.forkjoin.ForkJoinPool(Runtime.getRuntime().availableProcessors() * (spark.sparkContext.statusTracker.getExecutorInfos.length - 1)) 92 | val lst = metaDataNonSchema.select("rulename", "parameters").collect().par 93 | lst.tasksupport = new ForkJoinTaskSupport(forkJoinPool) 94 | 95 | lst.map({ 96 | d => 97 | //get relevant params like ruleName, params ,etc 98 | val ruleName = d.getString(0) 99 | val params = d.getString(1) 100 | val className = "com.ms.dq.rules." + ruleName.toLowerCase().capitalize 101 | var classObject: Class[_] = null 102 | 103 | //sanity check to see if the rule is provided correctly or not 104 | try { 105 | classObject = Class.forName(className) 106 | } 107 | catch { 108 | case e: java.lang.ClassNotFoundException => println("Cannot perform " + ruleName + ", as the required class doesn't exist. Please create required class in file " + className) 109 | println(e.printStackTrace()) 110 | } 111 | 112 | if (classObject != null) { 113 | //printing out the line highlighting which rule is being performed currently and at what time 114 | //so we can also get an idea for how long did the DQ check run 115 | val instance = classObject.newInstance().asInstanceOf[ {def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession,pipelineId : String,properties: Properties): Dataset[Row]}] 116 | println("Applying " + ruleName + "\n___________") 117 | println("Time before " + entityName + " " + ruleName + "-->" + Calendar.getInstance.getTime()) 118 | try { 119 | //applying the specific rule to the dataset 120 | val dfAfterRule = instance.apply(dfUniqueId, params, entityName, sourceName, spark, pipelineId, properties) 121 | 122 | //printing out the time after the DQ check in order to know how much time did the run take and returning the resulting dataframe:return_Df 123 | println("Time after " + entityName + " " + ruleName + "-->" + Calendar.getInstance.getTime()) 124 | val addedCols = dfAfterRule.columns.toSeq.diff(originalDfColumns) 125 | if (!joinExceptionList.contains(ruleName)) { 126 | //for thread-safe operation 127 | synchronized { 128 | return_Df = return_Df.join(dfAfterRule.select(addedCols.head, addedCols.tail: _*), "dq_uniqueID") 129 | } 130 | } 131 | } 132 | catch { 133 | case e:java.lang.NoSuchMethodException => println("Cannot perform " + ruleName + ", as the required class " + className +" does not contain required method. 
Please create method def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession) in the class.") 134 | println(e.printStackTrace()) 135 | } 136 | } 137 | }) 138 | return_Df = return_Df.drop("dq_uniqueID") 139 | } 140 | else { 141 | println("No Rulecheck found for source=" + sourceName + " and entity=" + entityName + " in metadata. PLease make the required entry in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " to run the required rule checks.") 142 | } 143 | } 144 | return_Df 145 | } 146 | 147 | // Overloaded methods for optional parameters (schema & pipelineId) to invoke Data Quality rules on a dataset by reading the metadata table 148 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, null, "N/A") 149 | 150 | // Overloaded methods for optional parameters (pipelineId) to invoke Data Quality rules on a dataset by reading the metadata table 151 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType): Dataset[Row] = applyDqInner(df, sourceName, entityName, schema, "N/A") 152 | 153 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, schema: StructType, pipelineId: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, schema, pipelineId) 154 | 155 | // Overloaded methods for optional parameters (schema) to invoke Data Quality rules on a dataset by reading the metadata table 156 | def applyDq(df: Dataset[Row], sourceName: String, entityName: String, pipelineId: String): Dataset[Row] = applyDqInner(df, sourceName, entityName, null, pipelineId) 157 | 158 | // Method to invoke Data Quality rules on a dataset by passing the rules as a method parameter [ COMPUTE BOUND] 159 | def applyDq(df:Dataset[Row],rule_param_map: Map[String,String],colEntitySourceMap: Map[String, List[String]],pipelineId: String): Dataset[Row]={ 160 | if (spark == null) { 161 | println("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 162 | throw new Exception("Please set spark session. Use function setSparkSession(_sparkSession:SparkSession)") 163 | } 164 | val originalDfColumns = df.columns.toSeq 165 | 166 | val properties = getProperties() 167 | 168 | //getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 169 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 170 | println("Column Name: dq_uniqueID is reserved for internal use of the DQ Framework. Please pass a dataframe without this column.") 171 | return df 172 | } 173 | val dfUniqueId = df.withColumn("dq_uniqueID", monotonically_increasing_id()) 174 | var ans: Dataset[Row] = dfUniqueId 175 | 176 | // To achieve parallel computation if resources are there 177 | val forkJoinPool = new scala.concurrent.forkjoin.ForkJoinPool(Runtime.getRuntime().availableProcessors() * (spark.sparkContext.statusTracker.getExecutorInfos.length - 1)) 178 | val lst = rule_param_map.par 179 | lst.tasksupport = new ForkJoinTaskSupport(forkJoinPool) 180 | 181 | lst.foreach { case (ruleName, params) => 182 | //get relevant params like ruleName, params ,etc 183 | val className = "com.ms.dq.rules." 
+ ruleName.toLowerCase().capitalize 184 | var classObject: Class[_] = null 185 | 186 | //sanity check to see if the rule is provided correctly or not 187 | try { 188 | classObject = Class.forName(className) 189 | } 190 | catch { 191 | case e: java.lang.ClassNotFoundException => println("Cannot perform " + ruleName + ", as the required class doesn't exist. Please create required class in file " + className) 192 | println(e.printStackTrace()) 193 | } 194 | if (classObject != null) { 195 | //printing out the line highlighting which rule is being performed currently and at what time 196 | //so we can also get an idea for how long did the DQ check run 197 | val instance = classObject.newInstance().asInstanceOf[ {def apply(df: Dataset[Row], params: String, colEntitySourceMap: Map[String, List[String]], originalDfColumns: Seq[String], spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]}] 198 | println("Applying " + ruleName + "\n___________") 199 | println("Time before " + ruleName + "-->" + Calendar.getInstance.getTime()) 200 | try { 201 | //applying the specific rule to the dataset 202 | val dfAfterRule = instance.apply(dfUniqueId, params, colEntitySourceMap, originalDfColumns, spark, pipelineId, properties) 203 | 204 | //printing out the time after the DQ check in order to know how much time did the run take and returning the resulting dataframe:ans 205 | println("Time after " + ruleName + "-->" + Calendar.getInstance.getTime()) 206 | val addedCols = dfAfterRule.columns.toSeq.diff(originalDfColumns) 207 | synchronized { 208 | ans = ans.join(dfAfterRule.select(addedCols.head, addedCols.tail: _*), "dq_uniqueID") 209 | } 210 | } 211 | catch { 212 | case e:java.lang.NoSuchMethodException => println("Cannot perform " + ruleName + ", as the required class " + className +" does not contain required method. Please create method def apply(df: Dataset[Row], params: String, colEntitySourceMap: Map[String, List[String]], originalDfColumns: Seq[String], spark: SparkSession): Dataset[Row] in the class.") 213 | println(e.printStackTrace()) 214 | } 215 | } 216 | } 217 | ans.drop("dq_uniqueID") 218 | } 219 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/framework/README.md: -------------------------------------------------------------------------------- 1 | # Overloaded Methods for Metadata Driven Data Quality Check 2 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String): Dataset[Row] : 3 | * Overloaded method that would read the metadata to fetch the rules to be applied, excluding schema check. 4 | * Optional parameters (schema and pipelineId) would be defaulted. 5 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String, schema: StructType): Dataset[Row] : 6 | * Overloaded method that would read the metadata to fetch the rules to be applied, including schema check. 7 | * Optional parameters (pipelineId) would be defaulted. 8 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String,schema: StructType , pipelineId: String): Dataset[Row] : 9 | * Overloaded method that would read the metadata to fetch the rules to be applied, including schema check. 10 | ### applydq(df: Dataset[Row],sourcename:String,entityname:String, pipelineId: String): Dataset[Row] : 11 | * Overloaded method that would read the metadata to fetch the rules to be applied, excluding schema check. 12 | * Optional parameters (schema) would be defaulted. 
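
For illustration, a minimal sketch of how these overloads might be invoked from a notebook or spark-shell. The input table name, payload schema, and pipeline id below are placeholder assumptions, not part of the framework:

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}
import com.ms.dq.framework.DQFramework

val spark = SparkSession.builder.appName("dq-usage-sketch").getOrCreate()

// Create the framework object and hand it the active Spark session.
val dqObj = new DQFramework()
dqObj.setSparkSession(spark)

// Hypothetical input table; any DataFrame works.
val inputDf = spark.table("source1_db.entity1")

// Metadata-driven rules only (no schema check, default pipelineId).
val checkedDf = dqObj.applyDq(inputDf, "source1", "entity1")

// Metadata-driven rules plus schema check, with an explicit pipeline id.
// The schema below is a placeholder for the expected structure of the JSON payload.
val payloadSchema = new StructType()
  .add("id", StringType)
  .add("createdDateTime", TimestampType)
val checkedWithSchemaDf = dqObj.applyDq(inputDf, "source1", "entity1", payloadSchema, "adf-run-1234")
```

The returned dataframes carry the original columns plus the per-rule Boolean result columns described in the main README.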
13 | 14 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Nullcheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import com.ms.dq.support.SupportTrait 7 | import org.apache.spark.sql.expressions.Window 8 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 12 | import java.util.{Calendar, Date, Properties} 13 | class Nullcheck extends SupportTrait{ 14 | 15 | // Method to invoke null check on a dataset:df based on metadata entry 16 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the null check for each particular record 17 | def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]={ 18 | try { 19 | val ruleName = "nullcheck" 20 | if (params == null) { 21 | println("Skipping "+ruleName+". Please input Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 22 | return df 23 | } 24 | import spark.implicits._ 25 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 26 | val paramsSchema = List( 27 | StructField("params", StringType, false)) 28 | val paramsRow = Seq(Row(params)) 29 | val paramsDf = spark.createDataFrame( 30 | spark.sparkContext.parallelize(paramsRow), 31 | StructType(paramsSchema) 32 | ) 33 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 34 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 35 | val readJsonCols = readJson.columns.toSeq 36 | //sanity check to validate json:params 37 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 38 | println("Skipping "+ruleName+". The Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 39 | return df 40 | } 41 | //sanity check for required column:columnList in params 42 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 43 | println("Skipping "+ruleName+". Mandatory Key \"columnList\" required in Parameters for nullcheck on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 44 | return df 45 | } 46 | //Getting list of columns:distinctColList to apply nullcheck on 47 | val columns = readJson.select("columnList").first.getString(0) 48 | val colList = columns.split(",").toList 49 | val distinctColList = colList.distinct 50 | 51 | //columns of the dqfailtable 52 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 53 | var originalDfColumns = df.columns.toSeq 54 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 55 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 56 | } 57 | //view name for the final result 58 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 59 | val resultViewName = "vw_Result_" + view_uid 60 | 61 | //SQL queries for null check will come here 62 | var sqlexpression = "" 63 | //sql expression to get the failed records for nullcheck will come here 64 | var sqllogExpression = "" 65 | //sql expression to log relevant information about the run will come here 66 | var dimaggExpression = "" 67 | 68 | val recordCount = df.count() 69 | val colListAsString = getStringFromSeq(originalDfColumns) 70 | //traversing through all columns to apply null check 71 | for (colName <- distinctColList) { 72 | if (!containsIgnoreCase(originalDfColumns, colName)) { 73 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 74 | } 75 | else { 76 | val dqAggKey = java.util.UUID.randomUUID.toString 77 | sqlexpression = sqlexpression + ",case when " + colName + " is null then false else true end as " + colName + "_" + ruleName 78 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName + "_" + ruleName + "= false" 79 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 80 | } 81 | } 82 | //sanity check if the null check expression was built successfully 83 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 84 | return df 85 | } 86 | //creating resulting dataframe with required DQ columns 87 | val inputViewName = "vw_Input_" + view_uid 88 | df.createOrReplaceTempView(inputViewName) 89 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 90 | dqResultDf.createOrReplaceTempView(resultViewName) 91 | 92 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 93 | //logging results in required tables 94 | val failedresult = spark.sql(sqllogExpression) 95 | 96 | val current_time = current_timestamp() 97 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, 
"yyyyMMddHHmmssSSS").cast(LongType)) 98 | failTable = correctFormat(failTable, orderOfFailTable) 99 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 100 | val failedViewName = "vw_Failed_" + view_uid 101 | failTable.createOrReplaceTempView(failedViewName) 102 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 103 | } 104 | else 105 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 106 | 107 | dqResultDf 108 | } 109 | catch{ 110 | case e: Exception=> println("Return original dataframe. Nullcheck for source->"+sourcename+" and entity->"+entityname+" failed with exception->\n"+e.toString) 111 | df 112 | } 113 | } 114 | 115 | // Method to invoke null check on a dataset based on arguments passed by the user 116 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the null check for each particular record 117 | def apply(df:Dataset[Row],params:String,colEntitySourceMap:Map[String,List[String]],originalDfColumns:Seq[String],spark:SparkSession,pipelineId: String, properties: Properties): Dataset[Row]= { 118 | try { 119 | //applying sanity checks to check the ruleName as "nullcheck" 120 | val ruleName = "nullcheck" 121 | if (params == null) { 122 | println("Skipping " + ruleName + ". Please send Parameters json as string for "+ruleName) 123 | return df 124 | } 125 | import spark.implicits._ 126 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 127 | val paramsSchema = List( 128 | StructField("params", StringType, false)) 129 | val paramsRow = Seq(Row(params)) 130 | val paramsDf = spark.createDataFrame( 131 | spark.sparkContext.parallelize(paramsRow), 132 | StructType(paramsSchema) 133 | ) 134 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 135 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 136 | val readJsonCols = readJson.columns.toSeq 137 | 138 | //sanity check to validate json:params 139 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 140 | println("Skipping " + ruleName + ". The Parameters for "+ruleName+" are not a valid Json. Please provide a valid Json") 141 | return df 142 | } 143 | //sanity check for required column:columnList in params 144 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 145 | println("Skipping " + ruleName + ". 
Mandatory Key \"columnList\" required in Parameters for nullcheck") 146 | return df 147 | } 148 | //Getting list of columns:distinctColList to apply nullcheck on 149 | val columns = readJson.select("columnList").first.getString(0) 150 | val colList = columns.split(",").toList 151 | val distinctColList = colList.distinct 152 | 153 | //view name for the final result 154 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 155 | val resultViewName = "vw_Result_" + view_uid 156 | 157 | ////columns of the dqfailtable 158 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 159 | //sql expression for applying nullcheck will come here 160 | var sqlexpression = "" 161 | //sql expression for failed nullcheck will come here 162 | var sqllogExpression = "" 163 | //sql expression for logging relevant information will come here 164 | var dimaggExpression = "" 165 | val recordCount = df.count() 166 | 167 | val colListAsString = getStringFromSeq(originalDfColumns) 168 | //traversing through all columns to apply null check 169 | for (colName <- distinctColList) { 170 | if (!containsIgnoreCase(originalDfColumns, colName)) { 171 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 172 | } 173 | else { 174 | val dqAggKey = java.util.UUID.randomUUID.toString 175 | val entityname = colEntitySourceMap(colName)(0) 176 | val sourcename = colEntitySourceMap(colName)(1) 177 | sqlexpression = sqlexpression + ",case when " + colName + " is null then false else true end as " + colName + "_" + ruleName 178 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName + "_" + ruleName + "=false" 179 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId ,date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 180 | } 181 | } 182 | 183 | //sanity check if the null check expression was built successfully 184 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 185 | return df 186 | } 187 | 188 | val inputViewName = "vw_Input_" + view_uid 189 | df.createOrReplaceTempView(inputViewName) 190 | //applying null check 191 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 192 | 193 | dqResultDf.createOrReplaceTempView(resultViewName) 194 | 195 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 196 | //logging results in required tables 197 | val failedresult = spark.sql(sqllogExpression) 198 | val current_time = current_timestamp() 199 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 200 | failTable = correctFormat(failTable, 
orderOfFailTable) 201 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 202 | val failedViewName = "vw_Failed_" + view_uid 203 | failTable.createOrReplaceTempView(failedViewName) 204 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 205 | } 206 | else 207 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 208 | 209 | dqResultDf 210 | } 211 | catch { 212 | case e: Exception => println("Returning original Dataframe. Nullcheck failed with Exception-->\n" + e.toString) 213 | df 214 | } 215 | } 216 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Orphanedgecheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | // Checks whether the dataframe’s values referring to ids of its parent dataframes, actually exist in the parent dataframes. 5 | package com.ms.dq.rules 6 | 7 | import java.util.{Calendar, Properties} 8 | 9 | import com.ms.dq.support.SupportTrait 10 | import org.apache.spark.sql.expressions.Window 11 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.Dataset 14 | import org.apache.spark.sql.Row 15 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 16 | class Orphanedgecheck extends SupportTrait{ 17 | // Method to invoke orphan edge check on a dataset:df based on metadata 18 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the orphan edge check for each particular record 19 | def apply(df:Dataset[Row],params:String,entityname:String,sourcename:String,spark:SparkSession, pipelineId: String, properties: Properties):Dataset[Row]= { 20 | try { 21 | val ruleName = "orphanedgecheck" 22 | if (params == null) { 23 | println("Skipping " + ruleName + ". Please input Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 24 | return df 25 | } 26 | //getting the required paramaeters from the JSON:params to apply orphanedge check (eg , tableName) 27 | import spark.implicits._ 28 | val paramsSchema = List( 29 | StructField("params", StringType, false)) 30 | val paramsRow = Seq(Row(params)) 31 | val paramsDf = spark.createDataFrame( 32 | spark.sparkContext.parallelize(paramsRow), 33 | StructType(paramsSchema) 34 | ) 35 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 36 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 37 | val readJsonCols = readJson.columns.toSeq 38 | //sanity check to validate json:params 39 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 40 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 41 | return df 42 | } 43 | //sanity check for required keys in json:params 44 | if (!containsIgnoreCase(readJsonCols, "tableName")) { 45 | println("Skipping " + ruleName + ". Mandatory Key \"tableName\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 46 | return df 47 | } 48 | val tableName = readJson.select("tableName").first.getString(0) 49 | if (!containsIgnoreCase(readJsonCols, "auditColIntName")) { 50 | println("Skipping " + ruleName + ". Mandatory Key \"auditColIntName\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 51 | return df 52 | } 53 | val auditCol = readJson.select("auditColIntName").first.getString(0) 54 | 55 | //obtaining optional keys from json:params 56 | var hours:Long=0 57 | var minutes:Long=0 58 | var seconds:Long=0 59 | if(containsIgnoreCase(readJsonCols,"cutOffHours")) 60 | { 61 | try { 62 | hours=readJson.select("cutOffHours").first.getLong(0) 63 | } 64 | catch { 65 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffHours(must be long)") 66 | throw e 67 | } 68 | } 69 | if(containsIgnoreCase(readJsonCols,"cutOffMinutes")) 70 | { 71 | try { 72 | minutes = readJson.select("cutOffMinutes").first.getLong(0) 73 | } 74 | catch { 75 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffMinutes(must be long)") 76 | throw e 77 | } 78 | } 79 | if(containsIgnoreCase(readJsonCols,"cutOffSeconds")) 80 | { 81 | try { 82 | seconds = readJson.select("cutOffSeconds").first.getLong(0) 83 | } 84 | catch { 85 | case e: Exception=>println("Error in metadata of source->"+sourcename+" entity->"+entityname+" rule->"+ruleName+" parameter->cutOffSeconds(must be long)") 86 | throw e 87 | } 88 | } 89 | 90 | //According to the parameters , computing the cutoff time 91 | var cutOffEndVal=Long.MaxValue.toString() 92 | if(!(hours==0 && minutes ==0 && seconds == 0)) 93 | { 94 | val current_ts=spark.sql("select current_timestamp() as current_timestamp").withColumn("SLA",col("current_timestamp") - expr("INTERVAL "+hours+" HOURS")).withColumn("SLA",col("SLA") - expr("INTERVAL "+minutes+" minutes")).withColumn("SLA",col("SLA") - expr("INTERVAL "+seconds+" seconds")) 95 | cutOffEndVal=current_ts.withColumn("CutOffDate", date_format(col("SLA"), "yyyyMMddHHmmssSSS").cast(LongType)).select("CutOffDate").head().getLong(0).toString() 96 | } 97 | 98 | //getting delta records based on watermark start and cutoff time 99 | val waterMarkStart=ruleWaterMarkStart(sourcename,entityname,ruleName,spark,properties) 100 | val deltaDf=dqDeltaIdentifier(tableName,auditCol,waterMarkStart,cutOffEndVal,spark) 101 | var dqResultDf: Dataset[Row] = null 102 | if(deltaDf.isEmpty) 103 | { 104 | println("Skipping Orphanedgecheck as no delta data is found. 
Returning original df ") 105 | return df 106 | } 107 | 108 | // getting the original columns of the dataframe (dq_uniqueID is added for internal framework purposes) 109 | var originalDfColumns = deltaDf.columns.toSeq 110 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 111 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 112 | } 113 | import spark.implicits._ 114 | 115 | // getting required information from metadata along with sanity checks 116 | val edgeMetaData = spark.sql("select *,rank()over( partition by source,entity order by fromlookupentity) as fromlookupentityid,rank()over( partition by source,entity order by tolookupentity) as tolookupentityid from " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " where lower(source)='" + sourcename.toLowerCase() + "' and lower(entity)='" + entityname.toLowerCase() + "'") 117 | val distinctFromId = edgeMetaData.select("fromcolumnname").distinct() 118 | if (distinctFromId.count() >= 2) { 119 | println("Skipping orphanedgecheck.Found Multiple fromcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname + ". Only one should exist.") 120 | return deltaDf 121 | } 122 | val fromIdName = distinctFromId.first.getString(0) 123 | if (!containsIgnoreCase(originalDfColumns, fromIdName)) { 124 | println("Skipping " + ruleName + ". " + fromIdName + " not present in provided frame. Please check fromcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 125 | return deltaDf 126 | } 127 | val distincToId = edgeMetaData.select("tocolumnname").distinct() 128 | if (distincToId.count() >= 2) { 129 | println("Skipping orphanedgecheck..Found Multiple tocolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname + ". Only one should exist.") 130 | return deltaDf 131 | } 132 | val toIdName = distincToId.first.getString(0) 133 | if (!containsIgnoreCase(originalDfColumns, toIdName)) { 134 | println("Skipping " + ruleName + ". " + toIdName + " not present in provided frame. Please check tocolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 135 | return deltaDf 136 | } 137 | 138 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 139 | var failTable = Seq.empty[(String, String, String)].toDF("Record", "ColumnName", "DqAggTableKey") 140 | var sqlcasef = "" 141 | var sqlcaset = "" 142 | var sqlcaseelsef = "" 143 | var sqlcaseelset = "" 144 | var sqljoin = "" 145 | var frmlst = List[Int]() 146 | var tolst = List[Int]() 147 | 148 | //applying orphan edge check 149 | edgeMetaData.select("fromlookupentity", "fromlookupcolumnname", "tolookupentity", "tolookupcolumnname", "filtercondition", "tocolumnname", "fromcolumnname", "fromlookupentityid", "tolookupentityid").sort($"fromlookupentityid", $"tolookupentityid").collect().map( 150 | { 151 | d => 152 | val fromlookupentity = d.getString(0) 153 | val colInFromDf = d.getString(1) 154 | val tolookupentity = d.getString(2) 155 | val colInToDf = d.getString(3) 156 | val filtercondition = d.getString(4) 157 | val tocolumnname = d.getString(5) 158 | val fromcolumnname = d.getString(6) 159 | 160 | val fromlookupentityid = d.get(7).asInstanceOf[Int].toInt 161 | val tolookupentityid = d.get(8).asInstanceOf[Int].toInt 162 | if (!frmlst.contains(fromlookupentityid)) { 163 | sqljoin = sqljoin + s" \nleft join vwinput${fromlookupentity.replace(".", "_")} f${fromlookupentityid} on ${entityname}.${fromcolumnname}=f${fromlookupentityid}.${colInFromDf}" 164 | if (!spark.catalog.tableExists(fromlookupentity)) { 165 | println("Skipping OrphanEdgeCheck. No such Table->" + fromlookupentity) 166 | return deltaDf 167 | } 168 | val fromDf = spark.table(fromlookupentity) 169 | if (!containsIgnoreCase(fromDf.columns.toSeq, colInFromDf)) { 170 | println("Skipping " + ruleName + ". No column " + colInFromDf + " present in table " + fromlookupentity + ". Please check fromlookupentity and fromlookupcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 171 | return deltaDf 172 | } 173 | fromDf.groupBy(colInFromDf).count().select(colInFromDf).createOrReplaceTempView("vwinput" + fromlookupentity.replace(".", "_")) 174 | frmlst = frmlst :+ fromlookupentityid 175 | } 176 | if (!tolst.contains(tolookupentityid)) { 177 | 178 | sqljoin = sqljoin + s" \nleft join vwinput${tolookupentity.replace(".", "_")} t${tolookupentityid} on ${entityname}.${tocolumnname}=t${tolookupentityid}.${colInToDf}" 179 | if (!spark.catalog.tableExists(tolookupentity)) { 180 | println("Skipping OrphanEdgeCheck. No such Table->" + fromlookupentity) 181 | return deltaDf 182 | } 183 | val toDf = spark.table(tolookupentity) 184 | if (!containsIgnoreCase(toDf.columns.toSeq, colInToDf)) { 185 | println("Skipping " + ruleName + ". No column " + colInToDf + " present in table " + tolookupentity + ". Please check tolookupentity and tolookupcolumnname in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ORPHANEDGEMETADATA_TABLE_NAME") + " for source=" + sourcename + " and entity=" + entityname) 186 | return deltaDf 187 | } 188 | toDf.groupBy(colInToDf).count().select(colInToDf).createOrReplaceTempView("vwinput" + tolookupentity.replace(".", "_")) 189 | tolst = tolst :+ tolookupentityid 190 | } 191 | sqlcasef = sqlcasef + (if (sqlcasef != "") " or " else "") + s"((f${fromlookupentityid}.${colInToDf} is null ) and ${filtercondition}) " 192 | sqlcaset = sqlcaset + (if (sqlcaset != "") " or " else "") + s"((t${tolookupentityid}.${colInToDf} is null ) and ${filtercondition}) " 193 | sqlcaseelsef = sqlcaseelsef + (if (sqlcaseelsef != "") " or " else "") + s"((f${fromlookupentityid}.${colInToDf} is not null ) and ${filtercondition}) " 194 | sqlcaseelset = sqlcaseelset + (if (sqlcaseelset != "") " or " else "") + s"((t${tolookupentityid}.${colInToDf} is not null ) and ${filtercondition}) " 195 | 196 | }) 197 | val dqAggKey = java.util.UUID.randomUUID.toString 198 | val inputViewName = "vw_Input_" + dqAggKey.replace('-', '_') 199 | deltaDf.createOrReplaceTempView(inputViewName) 200 | var sql = s"\n select ${entityname}.*\n ,case when(${sqlcasef} ) then false when (${sqlcaseelsef}) then true end as ${fromIdName}_${ruleName}\n,case when(${sqlcaset} ) then false when (${sqlcaseelset} ) then true end as ${toIdName}_${ruleName}" 201 | sql = sql + s"\n from ${inputViewName} ${entityname} \n ${sqljoin}" 202 | dqResultDf = spark.sql(sql) 203 | val colListAsString = getStringFromSeq(originalDfColumns) 204 | val resultViewName = "vw_Result_" + dqAggKey.replace('-', '_') 205 | dqResultDf.createOrReplaceTempView(resultViewName) 206 | 207 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 208 | //logging results in required tables 209 | val toDqAggKey = java.util.UUID.randomUUID.toString 210 | val failedsql = s" select to_json(struct(${colListAsString})) as Record,'${sourcename}' as Source,'${entityname}' as Entity,'${fromIdName}' as ColumnName,'${dqAggKey}' as DqAggTableKey from ${resultViewName} where ${fromIdName}_${ruleName} =false union all select to_json(struct(${colListAsString}))as Record,'${sourcename}' as Source,'${entityname}' as Entity,'${toIdName}' as ColumnName,'${toDqAggKey}' as DqAggTableKey from ${resultViewName} where ${toIdName}_${ruleName} =false" 211 | val current_time = current_timestamp() 212 | failTable = spark.sql(failedsql) 213 | failTable = failTable.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 214 | failTable = correctFormat(failTable, orderOfFailTable) 215 | 216 | val recordCount = deltaDf.count() 217 | var dimAggExpression = "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + fromIdName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 218 | dimAggExpression += " union all " 219 | dimAggExpression += "Select '" + toDqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + toIdName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, 
date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 220 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimAggExpression) 221 | 222 | val failedViewName = "vw_Failed_" + dqAggKey.replace('-', '_') 223 | failTable.createOrReplaceTempView(failedViewName) 224 | spark.sql(s"insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + s" select * from ${failedViewName}") 225 | } 226 | else 227 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 228 | 229 | val waterMarkEndVal=deltaDf.agg(max(col(auditCol))).head().getLong(0).toString() 230 | updateRuleWaterMark(sourcename,entityname,ruleName,waterMarkEndVal,spark,properties) 231 | 232 | dqResultDf 233 | } 234 | catch { 235 | case e: Exception=> println("Returning original Dataframe. OrphanEdgeCheck for source->"+sourcename+"and entity->"+entityname+" failed with Exception-->\n"+e.toString) 236 | df 237 | } 238 | } 239 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Schemacheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.sql.expressions.Window 9 | import org.apache.spark.sql.Dataset 10 | import org.apache.spark.sql.Row 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types.StructType 13 | import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} 14 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils 15 | import com.ms.dq.support.SupportTrait 16 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}; 17 | 18 | class Schemacheck extends SupportTrait{ 19 | def check(df:Dataset[Row],ruleName:String,schema:StructType,bodyColumn:String,spark:SparkSession):Dataset[Row]={ 20 | import spark.implicits._ 21 | val dfCount=df.count() 22 | val resultColumn=bodyColumn+"_"+ruleName 23 | val recordDfCols=Seq(bodyColumn) 24 | val dfStringDS=df.select(col(bodyColumn)as bodyColumn).map(_.toString()) 25 | val basePath="DqSchemacheck" 26 | val uuid=java.util.UUID.randomUUID.toString 27 | 28 | // path to store records which are not inline with the schema 29 | val baseDqFolder="/tmp/"+basePath+"/"+uuid 30 | 31 | //imposing schema. 
Records which are inline with the schema are stored in validRecordsTemp , the others are stored in the baseDqFolder path 32 | val validRecordsTemp=spark.read.option("badRecordsPath", baseDqFolder).schema(schema).json(dfStringDS) 33 | 34 | //reconverting records back to jsons 35 | val validRecords=validRecordsTemp.toJSON.asInstanceOf[Dataset[Row]].toDF(recordDfCols: _*).withColumn(resultColumn,lit(true)) 36 | var resultDf=validRecords 37 | val validRecordsCount=validRecords.count() 38 | 39 | //checking if there were any dq failures 40 | if(validRecordsCount != dfCount){ 41 | //getting all failed records and adding it to the resulting dataframe 42 | val colNameTimestamp="name" 43 | val badRecordsName="bad_records" 44 | val recordName="record" 45 | val fileNameList=dbutils.fs.ls(baseDqFolder) 46 | val fileNameWithTS=fileNameList(0).path 47 | val fileName=fileNameWithTS+"/"+badRecordsName 48 | val badRecordsTemp=spark.read.json(fileName).select(recordName) 49 | val badRecords=badRecordsTemp.withColumn(recordName,col(recordName).substr(lit(2),length(col(recordName))-2)).toDF(recordDfCols: _*).withColumn(resultColumn,lit(false)) 50 | resultDf=validRecords.union(badRecords) 51 | } 52 | resultDf 53 | } 54 | 55 | // Method to invoke schema check on a dataset:df based on metadata 56 | // Returns the original dataframe along with additional flags indicating whether the required columns have passed/failed the schema check for each particular record 57 | // Schema checks the schema of the bodyColumn whose values are expected to be Jsons 58 | def apply(df:Dataset[Row],schema:StructType,bodyColumn:String,entityname:String,sourcename:String,spark:SparkSession, pipelineId: String, properties: Properties):Dataset[Row]= { 59 | try { 60 | import spark.implicits._ 61 | val ruleName = "schemacheck" 62 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 63 | val recordDfCols = Seq("Record") 64 | val dqResultDf = check(df, ruleName, schema, bodyColumn, spark) 65 | 66 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 67 | //logging results in required tables 68 | val failed = dqResultDf.filter(col(bodyColumn + "_" + ruleName) === false).select(bodyColumn) 69 | val recordDf = failed.toDF(recordDfCols: _*) 70 | val dqAggKey = java.util.UUID.randomUUID.toString 71 | val current_time = current_timestamp() 72 | 73 | var failTable = recordDf.withColumn("Source", lit(sourcename)).withColumn("Entity", lit(entityname)).withColumn("ColumnName", lit(null)).withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)).withColumn("DqAggTableKey", lit(dqAggKey)) 74 | failTable = correctFormat(failTable, orderOfFailTable) 75 | 76 | val recordCount = df.count() 77 | val dimAggExpression = "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 78 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimAggExpression) 79 | 80 | val failedViewName = "vw_Failed_" + dqAggKey.replace('-', '_') 81 | failTable.createOrReplaceTempView(failedViewName) 82 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 83 | } 84 | else 85 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 86 | 87 | dqResultDf 88 | } 89 | catch { 90 | case e: Exception=> println("Returning original Dataframe. Schemacheck for Source->"+sourcename+" and entity->"+entityname+" failed with Exception-->\n"+e.toString) 91 | df 92 | } 93 | } 94 | } -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/rules/Uniquecheck.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.rules 5 | 6 | import java.util.Properties 7 | 8 | import com.ms.dq.support.SupportTrait 9 | import org.apache.spark.sql.expressions.Window 10 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 13 | 14 | class Uniquecheck extends SupportTrait { 15 | // Method to invoke unique check on a dataset:df based on metadata entry 16 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the unique check for each particular record 17 | def apply(df: Dataset[Row], params: String, entityname: String, sourcename: String, spark: SparkSession, pipelineId: String, properties: Properties): Dataset[Row]= { 18 | try { 19 | val ruleName = "uniquecheck" 20 | if (params == null) { 21 | println("Skipping " + ruleName + ". Please input Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 22 | return df 23 | } 24 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 25 | import spark.implicits._ 26 | val paramsSchema = List( 27 | StructField("params", StringType, false)) 28 | val paramsRow = Seq(Row(params)) 29 | val paramsDf = spark.createDataFrame( 30 | spark.sparkContext.parallelize(paramsRow), 31 | StructType(paramsSchema) 32 | ) 33 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 34 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 35 | val readJsonCols = readJson.columns.toSeq 36 | 37 | //sanity check to validate json:params 38 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 39 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + " are not a valid Json. Please input a valid Json in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 40 | return df 41 | } 42 | //Getting list of columns:distinctColList to apply unique check on 43 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 44 | println("Skipping " + ruleName + ". 
Mandatory Key \"columnList\" required in Parameters for " + ruleName + " on source=" + sourcename + " and entity=" + entityname + ". Please make the required changes in " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME")) 45 | return df 46 | } 47 | //get the number of unique checks to be performed on column (be it single or composite) 48 | var len: Integer =readJson.select(size($"columnList")).first.getInt(0).toInt 49 | var i= 0 50 | var listOfCompositeColumns = List[String]() 51 | for( i <- 0 to len-1) 52 | { 53 | val colm = readJson.select($"columnList".getItem(i)).first.getString(0) 54 | listOfCompositeColumns = listOfCompositeColumns :+ colm 55 | } 56 | val distinctColList = listOfCompositeColumns 57 | 58 | var latestIdentifierCol: String = null 59 | if (containsIgnoreCase(readJsonCols, "latestrowidentifier")) { 60 | latestIdentifierCol = readJson.select("latestrowidentifier").first.getString(0) 61 | } 62 | 63 | var originalDfColumns = df.columns.toSeq 64 | if (containsIgnoreCase(originalDfColumns, "dq_uniqueID")) { 65 | originalDfColumns = originalDfColumns.filter(!_.contains("dq_uniqueID")) 66 | } 67 | 68 | if (latestIdentifierCol != null && !containsIgnoreCase(originalDfColumns, latestIdentifierCol)) { 69 | println("Value for latestrowidentifier=" + latestIdentifierCol + " in parameters for " + ruleName + " source=" + sourcename + " entity=" + entityname + " is not present in the given dataframe. Please make the required change. Proceeding with " + ruleName + " without identifying latest column.") 70 | latestIdentifierCol = null 71 | } 72 | //columns of the dqfailtable in particular order 73 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 74 | 75 | //sql expression for uniquecheck will come here 76 | var sqlexpression = "" 77 | //sql expression for building the failed records for uniquecheck will come here 78 | var sqllogExpression = "" 79 | //sql expression for logging details of the DQ rule check will come here 80 | var dimaggExpression = "" 81 | val recordCount = df.count() 82 | 83 | //view name for the final result 84 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 85 | val resultViewName = "vw_Result_" + view_uid 86 | 87 | val colListAsString = getStringFromSeq(originalDfColumns) 88 | //traversing the columnlist(single/composite) on which uniquecheck is to applied 89 | //in this loop we build the sql expression for uniquecheck 90 | for (colName <- distinctColList) { 91 | 92 | if (!containsIgnoreCase(originalDfColumns, colName.split(","))) { 93 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in the frame provided. Please check parameters in " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_ENTITYRULEMETADATA_TABLE_NAME") + " for " + ruleName + " source=" + sourcename + " and entity=" + entityname) 94 | } 95 | else { 96 | //building the column name which will reflect the status of uniquecheck for individual records 97 | //colName = "id,createdDate" will be changed to uniquecheck_id_createdDate which will reflect the status of uniquecheck of a particular recor based on the mentioned columns 98 | val colName_as_col = colName.replaceAll(",", "_") 99 | sqlexpression += ",case when count(*) over (partition by " + colName + ") > 1 then false else true end as " + colName_as_col + "_" + ruleName //replace with function call 100 | if (latestIdentifierCol != null) { 101 | sqlexpression += ",case when row_number() over(partition by " + colName + " order by " + latestIdentifierCol + " desc)=1 then true else false end as LatestRow_" + colName_as_col 102 | } 103 | val dqAggKey = java.util.UUID.randomUUID.toString 104 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName_as_col + "_" + ruleName + "= false" 105 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId , date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 106 | } 107 | } 108 | 109 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 110 | return df 111 | } 112 | 113 | //creating resulting dataframe with required DQ columns 114 | val inputViewName = "vw_Input_" + view_uid 115 | df.createOrReplaceTempView(inputViewName) 116 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 117 | dqResultDf.createOrReplaceTempView(resultViewName) 118 | 119 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 120 | //log results into required tables 121 | val failedresult = spark.sql(sqllogExpression) 122 | 123 | val current_time = current_timestamp() 124 | var failTable = failedresult.withColumn("Source", lit(sourcename)).withColumn("Entity", lit(entityname)).withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 125 | failTable = correctFormat(failTable, orderOfFailTable) 126 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 127 | val failedViewName = "vw_Failed_" + view_uid 128 | failTable.createOrReplaceTempView(failedViewName) 129 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 130 | } 131 | else 132 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 133 | 134 | dqResultDf 135 | } 136 | catch { 137 | case e: Exception=> println("Returning original Dataframe. Uniquecheck for Source->"+sourcename+" and entity->"+entityname+" failed with Exception-->\n"+e.toString) 138 | df 139 | } 140 | } 141 | // Method to invoke null check on a dataset based on arguments passed by the user 142 | // Returns the orginial dataframe along with additional flags indicating whether the required columns have passed/failed the unique check for each particular record 143 | def apply(df:Dataset[Row],params:String,colEntitySourceMap:Map[String,List[String]],originalDfColumns:Seq[String],spark:SparkSession,pipelineId: String,properties: Properties):Dataset[Row]={ 144 | try { 145 | //applying sanity checks to check the ruleName as "uniquecheck" 146 | val ruleName = "uniquecheck" 147 | if (params == null) { 148 | println("Skipping " + ruleName + ". Please send Parameters json as string for " + ruleName) 149 | return df 150 | } 151 | 152 | import spark.implicits._ 153 | //getting the required paramaeters from the JSON:params to apply nullcheck (eg , columnList) 154 | val paramsSchema = List( 155 | StructField("params", StringType, false)) 156 | val paramsRow = Seq(Row(params)) 157 | val paramsDf = spark.createDataFrame( 158 | spark.sparkContext.parallelize(paramsRow), 159 | StructType(paramsSchema) 160 | ) 161 | val paramsString = paramsDf.select(col("params") as "params").map(_.toString()) 162 | val readJson = spark.read.json(paramsString).asInstanceOf[Dataset[Row]] 163 | val readJsonCols = readJson.columns.toSeq 164 | //sanity check to validate json:params 165 | if (containsIgnoreCase(readJsonCols, "_corrupt_record")) { 166 | println("Skipping " + ruleName + ". The Parameters for " + ruleName + " are not a valid Json. Please provide a valid Json") 167 | return df 168 | } 169 | //sanity check for required column:columnList in params 170 | if (!containsIgnoreCase(readJsonCols, "columnList")) { 171 | println("Skipping " + ruleName + ". Mandatory Key \"columnList\" required in Parameters for nullcheck") 172 | return df 173 | } 174 | //get the number of unique checks to be performed on column (be it single or composite) 175 | var len: Integer = readJson.select(size($"columnList")).first.getInt(0).toInt 176 | var i = 0 177 | var listOfCompositeColumns = List[String]() 178 | //traversing each of the columnList provided(single/composite) and storing it in list 179 | for (i <- 0 to len - 1) { 180 | val colm = readJson.select($"columnList".getItem(i)).first.getString(0) 181 | listOfCompositeColumns = listOfCompositeColumns :+ colm 182 | } 183 | val distinctColList = listOfCompositeColumns 184 | 185 | var latestIdentifierCol: String = null 186 | if (containsIgnoreCase(readJsonCols, "latestrowidentifier")) { 187 | latestIdentifierCol = readJson.select("latestrowidentifier").first.getString(0) 188 | } 189 | 190 | if (latestIdentifierCol != null && !containsIgnoreCase(originalDfColumns, latestIdentifierCol)) { 191 | println("Value for latestrowidentifier=" + latestIdentifierCol + " in parameters for " + ruleName + " is not present in the given dataframe. Please make the required change. 
Proceeding with " + ruleName + " without identifying latest column.") 192 | latestIdentifierCol = null 193 | } 194 | //order of the attributes of the failed Table 195 | val orderOfFailTable: List[String] = List("Source", "Entity", "ColumnName", "Rule", "Record", "PlatformModifiedDate", "PlatformModifiedDateInt", "DqAggTableKey") 196 | val colListAsString = getStringFromSeq(originalDfColumns) 197 | //sql expression for building failed records for uniquecheck will come here 198 | var sqllogExpression = "" 199 | //sql expression for uniquecheck will come here 200 | var sqlexpression = "" 201 | //sql expression for logging the DQ run will come here 202 | var dimaggExpression = "" 203 | val recordCount = df.count() 204 | 205 | // 206 | val view_uid = java.util.UUID.randomUUID.toString.replace('-', '_') 207 | val resultViewName = "vw_Result_" + view_uid 208 | // traversing each of the columnlist and applying uniquechek 209 | for (colName <- distinctColList) { 210 | if (!containsIgnoreCase(originalDfColumns, colName.split(","))) { 211 | println("Skipping " + ruleName + " for column " + colName + " as it does not exist in frame provided") 212 | } 213 | else { 214 | //building the column name which will reflect the status of uniquecheck for individual records 215 | //colName = "id,createdDate" will be changed to uniquecheck_id_createdDate which will reflect the status of uniquecheck of a particular recor based on the mentioned columns 216 | val colName_as_col = colName.replaceAll(",", "_") 217 | val entityname = colEntitySourceMap(colName)(0) 218 | val sourcename = colEntitySourceMap(colName)(1) 219 | sqlexpression += ",case when count(*) over (partition by " + colName + ") > 1 then false else true end as " + colName_as_col + "_" + ruleName //replace with function call 220 | if (latestIdentifierCol != null) { 221 | sqlexpression += ",case when row_number() over(partition by " + colName + " order by " + latestIdentifierCol + " desc)=1 then true else false end as LatestRow_" + colName_as_col 222 | } 223 | val dqAggKey = java.util.UUID.randomUUID.toString 224 | sqllogExpression = sqllogExpression + (if (sqllogExpression != "") " union all " else "") + "Select to_json(struct(" + colListAsString + "))as Record,'" + sourcename + "' as Source,'" + entityname + "' as Entity,'" + colName + "' as ColumnName,'" + dqAggKey + "' as DqAggTableKey from " + resultViewName + " where " + colName_as_col + "_" + ruleName + "= false" 225 | dimaggExpression += (if (dimaggExpression != "") " union all " else "") + "Select '" + dqAggKey + "' as DqAggTableKey, '" + sourcename + "' as Source, '" + entityname + "' as Entity, '" + colName + "' as ColumnName, '" + ruleName + "' as Rule, " + recordCount + " as RecordCount, '" + pipelineId + "' as PipelineId, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 226 | } 227 | } 228 | 229 | //sanity check if the unique check expression was built successfully 230 | if (sqlexpression == "" || sqllogExpression == "" || dimaggExpression == "") { 231 | return df 232 | } 233 | 234 | //creating resulting dataframe with required DQ columns 235 | val inputViewName = "vw_Input_" + view_uid 236 | df.createOrReplaceTempView(inputViewName) 237 | val dqResultDf = spark.sql("select *" + sqlexpression + " from " + inputViewName) 238 | dqResultDf.createOrReplaceTempView(resultViewName) 239 | 240 | if (properties.getProperty("DQ_LOG_RESULTS_FLAG").toBoolean) { 241 | 
//logging results in required tables 242 | val failedresult = spark.sql(sqllogExpression) 243 | val current_time = current_timestamp() 244 | var failTable = failedresult.withColumn("Rule", lit(ruleName)).withColumn("PlatformModifiedDate", date_format(current_time, "y-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn("PlatformModifiedDateInt", date_format(current_time, "yyyyMMddHHmmssSSS").cast(LongType)) 245 | failTable = correctFormat(failTable, orderOfFailTable) 246 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RUNDETAILS_TABLE_NAME") + " " + dimaggExpression) 247 | val failedViewName = "vw_Failed_" + view_uid 248 | failTable.createOrReplaceTempView(failedViewName) 249 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_FAIL_TABLE_NAME") + " select * from " + failedViewName) 250 | } 251 | else 252 | println("Skipping result logging as DQ_LOG_RESULTS_FLAG is set to " + properties.getProperty("DQ_LOG_RESULTS_FLAG")) 253 | 254 | dqResultDf 255 | } 256 | catch { 257 | case e: Exception=> println("Returning original Dataframe. Uniquecheck failed with Exception-->\n"+e.toString) 258 | df 259 | } 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /dq/src/main/scala/com/ms/dq/support/SupportTrait.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | package com.ms.dq.support 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} 9 | import org.apache.spark.sql.functions._ 10 | 11 | trait SupportTrait { 12 | //returns the dataframe:data ordered according to the list:orderedList 13 | def correctFormat(data: Dataset[Row],orderedList:List[String]): Dataset[Row]={ 14 | data.select(orderedList.head,orderedList.tail: _*) 15 | } 16 | //inserts dataframe:writable into a praticular table:tableName 17 | def writeTableAppendAbsolute(writable: Dataset[Row], tableName: String) = { 18 | writable.write.mode(SaveMode.Append).insertInto(tableName) 19 | } 20 | //checks if a string:value is present in a string or not in a list:list irrespective of its case 21 | def containsIgnoreCase(list:Seq[String],value:String): Boolean={ 22 | list.exists(item => item.toLowerCase() == value.toLowerCase()) 23 | } 24 | 25 | def containsIgnoreCase(list:Seq[String],value:Seq[String]): Boolean={ 26 | var inp="" 27 | for (inp <- value) 28 | { 29 | if(containsIgnoreCase(list,inp) == false) 30 | { 31 | println(inp + " is not present as a column in the dataframe") 32 | return false 33 | } 34 | } 35 | return true 36 | } 37 | 38 | def getStringFromSeq(stringList: Seq[String]): String = { 39 | stringList.map(a=>a).mkString(",") 40 | } 41 | //the function helps identifying the last time the table for a particular source and entity was processed , for delta processing 42 | def ruleWaterMarkStart(source: String, entity: String,rule: String, spark:SparkSession, properties: Properties): String = { 43 | import spark.implicits._ 44 | var d: Dataset[Row] = null 45 | try { 46 | d = spark.table(properties.getProperty("SUBJECT_AREA_DB") + "." 
+ properties.getProperty("DQ_RULE_WATERMARK_TABLE_NAME")).filter($"SubjectArea" === source).filter($"SourceEntityName" === entity).filter($"RuleName" === rule).agg(max("WaterMarkEndValue").alias("startValue")) 47 | if (!d.take(1).isEmpty) { 48 | if (null == d.first().getString(0)) { 49 | "17530101" 50 | } else { 51 | d.first().getString(0) 52 | } 53 | } 54 | else 55 | "17530101" 56 | } 57 | catch { 58 | case ex: Exception => println("Error with Watermark look up. "+ex.toString()) 59 | throw ex 60 | } 61 | } 62 | //function to help in delta processing and returns all records in table name whose watermark value is greater than the start value and less than end value 63 | def dqDeltaIdentifier(tableName: String, auditCol: String,waterMarkStart:String,waterMarkEnd: String,spark:SparkSession):Dataset[Row]={ 64 | try{ 65 | if(!spark.catalog.tableExists(tableName)) 66 | { 67 | println("TableName=>"+tableName+" does not exist. Please check metadata") 68 | throw new Exception("Table Does not exist") 69 | } 70 | val tableDf=spark.table(tableName) 71 | if (!containsIgnoreCase(tableDf.columns.toSeq, auditCol)) { 72 | println("TableName=>"+tableName+" does not contain column-->"+auditCol+". Please check metadata") 73 | throw new Exception(" Column Does not Exist") 74 | } 75 | if(waterMarkEnd.equals(Long.MaxValue.toString())) 76 | { 77 | tableDf.filter(col(auditCol)>waterMarkStart.toLong) 78 | } 79 | else 80 | { 81 | tableDf.filter(col(auditCol)>waterMarkStart.toLong).filter(col(auditCol) println("Error with Delta Identifier. "+ex.toString()) 86 | throw ex 87 | } 88 | } 89 | //function to help in delta processing and returns all records in table name whose watermark value is greater than the start value 90 | def dqDeltaIdentifier(tableName: String, auditCol: String,waterMarkStart:String,spark:SparkSession):Dataset[Row]={ 91 | try{ 92 | if(!spark.catalog.tableExists(tableName)) 93 | { 94 | println("TableName=>"+tableName+" does not exist. Please check metadata") 95 | throw new Exception("Table Does not exist") 96 | } 97 | val tableDf=spark.table(tableName) 98 | if (!containsIgnoreCase(tableDf.columns.toSeq, auditCol)) { 99 | println("TableName=>"+tableName+" does not contain column-->"+auditCol+". Please check metadata") 100 | throw new Exception(" Column Does not Exist") 101 | } 102 | tableDf.filter(col(auditCol)>waterMarkStart.toLong) 103 | } 104 | catch{ 105 | case ex: Exception => println("Error with Delta Identifier. "+ex.toString()) 106 | throw ex 107 | } 108 | } 109 | //the function helps in updating the watermark value of the table for a particular source and entity , for delta processing 110 | def updateRuleWaterMark(source: String, entity: String, rule: String, waterMarkEndVal: String,spark: SparkSession,properties: Properties)={ 111 | try{ 112 | val selectExpression= "Select '" + source + "' as SubjectArea, '" + entity + "' as SourceEntityName, '" + rule + "' as RuleName, '17530101' as WaterMarkStartValue, '" + waterMarkEndVal + "' as WaterMarkEndValue, date_format(current_timestamp, \"y-MM-dd'T'HH:mm:ss.SSS'Z'\") as PlatformModifiedDate, cast(date_format(current_timestamp, \"yyyyMMddHHmmssSSS\") as long) as PlatformModifiedDateInt" 113 | spark.sql("insert into " + properties.getProperty("SUBJECT_AREA_DB") + "." + properties.getProperty("DQ_RULE_WATERMARK_TABLE_NAME") + " "+selectExpression) 114 | } 115 | catch{ 116 | case ex: Exception => println("Error with Watermark Update. 
"+ex.toString()) 117 | throw ex 118 | } 119 | } 120 | //get properties from application.properties 121 | def getProperties(): Properties = { 122 | val url = getClass().getResource("/application.properties") 123 | val properties: Properties = new Properties() 124 | 125 | if (url != null) { 126 | val source = scala.io.Source.fromURL(url) 127 | properties.load(source.bufferedReader()) 128 | } 129 | else { 130 | println("Properties file cannot be loaded") 131 | throw new java.io.FileNotFoundException("Properties file cannot be loaded"); 132 | } 133 | 134 | return properties 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /images/Data Quality Insights.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Data Quality Insights.PNG -------------------------------------------------------------------------------- /images/Entityrulemetadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Entityrulemetadata.png -------------------------------------------------------------------------------- /images/OrphanEdgeMetadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/OrphanEdgeMetadata.png -------------------------------------------------------------------------------- /images/Results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/images/Results.png -------------------------------------------------------------------------------- /notebooks/DQ Tables to Lake.scala: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | // Databricks notebook source 5 | // DBTITLE 1,Input Widgets 6 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils 7 | 8 | dbutils.widgets.text("keyAdls", "","keyAdls") 9 | dbutils.widgets.text("credendentialAdls", "","credendentialAdls") 10 | dbutils.widgets.text("databricksScope", "","databricksScope") 11 | dbutils.widgets.text("adlsLoginUrl", "","adlsLoginUrl") 12 | dbutils.widgets.text("datalakeName", "","datalakeName") 13 | dbutils.widgets.text("adlsPath", "","adlsPath") 14 | dbutils.widgets.text("subjectAreaDb", "","subjectAreaDb") 15 | dbutils.widgets.text("dqRunDetailsTableName", "","dqRunDetailsTableName") 16 | dbutils.widgets.text("dqFailTableName", "","dqFailTableName") 17 | dbutils.widgets.text("dqAggTableName", "","dqAggTableName") 18 | dbutils.widgets.text("dqWatermarkTableName", "","dqWatermarkTableName") 19 | 20 | 21 | // COMMAND ---------- 22 | 23 | // DBTITLE 1,ADLS Gen2 Access Config 24 | val keyAdls = dbutils.widgets.get("keyAdls") 25 | val credendentialAdls = dbutils.widgets.get("credendentialAdls") 26 | val databricksScope = dbutils.widgets.get("databricksScope") 27 | 28 | val decryptedADLSId = dbutils.secrets.get(scope = databricksScope, key = keyAdls) 29 | val decryptedADLSCredential = dbutils.secrets.get(scope = databricksScope, key = credendentialAdls) 30 | val adlsLoginUrl = dbutils.widgets.get("adlsLoginUrl") 31 | val datalakeName = dbutils.widgets.get("datalakeName") 32 | 33 | //initializing the spark session with the config 34 | 35 | spark.conf.set(s"fs.azure.account.auth.type.${datalakeName}.dfs.core.windows.net", "OAuth") 36 | spark.conf.set(s"fs.azure.account.oauth.provider.type.${datalakeName}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 37 | spark.conf.set(s"fs.azure.account.oauth2.client.id.${datalakeName}.dfs.core.windows.net", s"${decryptedADLSId}") 38 | spark.conf.set(s"fs.azure.account.oauth2.client.secret.${datalakeName}.dfs.core.windows.net", s"${decryptedADLSCredential}") 39 | spark.conf.set(s"fs.azure.account.oauth2.client.endpoint.${datalakeName}.dfs.core.windows.net", adlsLoginUrl) 40 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true") 41 | 42 | 43 | // COMMAND ---------- 44 | 45 | // DBTITLE 1,Initialize Variables 46 | val subjectAreaDb = dbutils.widgets.get("subjectAreaDb") 47 | val dqRunDetailsTableName = dbutils.widgets.get("dqRunDetailsTableName") 48 | val dqFailTableName = dbutils.widgets.get("dqFailTableName") 49 | val dqAggTableName = dbutils.widgets.get("dqAggTableName") 50 | val dqWatermarkTableName = dbutils.widgets.get("dqWatermarkTableName") 51 | val adlsPath = dbutils.widgets.get("adlsPath") 52 | 53 | // COMMAND ---------- 54 | 55 | // DBTITLE 1,Common Methods 56 | import org.apache.spark.sql.DataFrame 57 | def getwatermarkvalue(dqtablename: String): String ={ 58 | 59 | try{ 60 | val watermarkendvalue = spark.sql("SELECT COALESCE(watermarkendvalue, '1753-01-01') as watermarkvalue FROM " + subjectAreaDb + "." 
+ dqWatermarkTableName + " WHERE lower(dqtable)='" + dqtablename +"'") 61 | 62 | if(watermarkendvalue.head(1).isEmpty) 63 | return "1753-01-01" 64 | else 65 | return watermarkendvalue.first.getString(0) 66 | } 67 | 68 | catch { 69 | case e: Exception => { 70 | println("ERROR : Unable to get the WaterMark value " + e.getMessage) 71 | throw e 72 | } 73 | } 74 | 75 | } 76 | 77 | 78 | def getwatermarkvalue(dqtablename: String, source: String, entity: String): String ={ 79 | 80 | try{ 81 | val watermarkendvalue = spark.sql("SELECT COALESCE(watermarkendvalue, '1753-01-01') as watermarkvalue FROM " + subjectAreaDb + "." + dqWatermarkTableName + " WHERE lower(dqtable)='" + dqtablename +"' and lower(source)='" + source +"' and lower(entity)='" + entity +"'") 82 | 83 | if(watermarkendvalue.head(1).isEmpty) 84 | return "1753-01-01" 85 | else 86 | return watermarkendvalue.first.getString(0) 87 | } 88 | 89 | catch { 90 | case e: Exception => { 91 | println("ERROR : Unable to get the WaterMark value " + e.getMessage) 92 | throw e 93 | } 94 | } 95 | 96 | } 97 | 98 | 99 | def setwatermarkvalue(watermarkendvalue: String, dqtablename: String, source: String, entity: String): Unit ={ 100 | 101 | spark.conf.set("spark.sql.crossJoin.enabled", "true") 102 | val insertQuery = s""" MERGE INTO ${subjectAreaDb}.${dqWatermarkTableName} as Target 103 | USING ( 104 | SELECT '${watermarkendvalue}' AS watermarkendvalue 105 | ,current_timestamp AS omidqcreateddate 106 | ,current_timestamp AS omidqmodifieddate 107 | ,'${dqtablename}' AS dqtable 108 | ,'${source}' AS source 109 | ,'${entity}' AS entity 110 | ) AS Source 111 | ON Target.dqtable ='${dqtablename}' 112 | AND COALESCE(Target.source, '') = COALESCE(Source.source, '') 113 | AND COALESCE(Target.entity, '') = COALESCE(Source.entity, '') 114 | WHEN MATCHED 115 | AND COALESCE(Target.watermarkendvalue, '') <> COALESCE(Source.watermarkendvalue, '') 116 | AND COALESCE(Target.omidqmodifieddate, '') <> COALESCE(Source.omidqmodifieddate, '') 117 | THEN 118 | UPDATE 119 | SET Target.watermarkendvalue = Source.watermarkendvalue 120 | ,Target.omidqmodifieddate = Source.omidqmodifieddate 121 | WHEN NOT MATCHED 122 | THEN 123 | INSERT * """ 124 | 125 | try { 126 | spark.sql(insertQuery) 127 | } 128 | catch { 129 | case e: Exception => { 130 | println("ERROR : Unable to insert the WaterMark value " + e.getMessage) 131 | throw e 132 | } 133 | } 134 | 135 | } 136 | 137 | // COMMAND ---------- 138 | 139 | // DBTITLE 1,Populate dqAggTable 140 | val watermarkvalue_dqfailtable = getwatermarkvalue(dqAggTableName,"dq",dqFailTableName) 141 | println(s"Watermark value for ${dqAggTableName}:${dqFailTableName} is: "+watermarkvalue_dqfailtable) 142 | val watermarkvalue_dqrundetails = getwatermarkvalue(dqAggTableName,"dq",dqRunDetailsTableName) 143 | println(s"Watermark value for ${dqAggTableName}:${dqRunDetailsTableName} is: "+watermarkvalue_dqrundetails) 144 | 145 | val watermarkendvalue_dqfailtable = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." + dqFailTableName) 146 | println(s"New Watermark value for ${dqAggTableName}:${dqFailTableName} is: "+watermarkendvalue_dqfailtable.first.get(0).toString()) 147 | val watermarkendvalue_dqrundetails = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." 
+ dqRunDetailsTableName) 148 | println(s"New Watermark value for ${dqAggTableName}:${dqRunDetailsTableName} is: "+watermarkendvalue_dqrundetails.first.get(0).toString()) 149 | 150 | if (watermarkendvalue_dqfailtable.first.get(0).toString() == watermarkvalue_dqfailtable && watermarkendvalue_dqrundetails.first.get(0).toString() == watermarkvalue_dqrundetails) 151 | { 152 | println("No records to update") 153 | } 154 | else 155 | { 156 | println(s"New Records detected. Existing Watermark for ${dqRunDetailsTableName}: " + watermarkvalue_dqrundetails + s" and for ${dqFailTableName}: "+watermarkvalue_dqfailtable) 157 | 158 | val mergeQuery = s""" MERGE 159 | INTO ${subjectAreaDb}.${dqAggTableName} as Target 160 | USING ( 161 | SELECT R.DqAggTableKey AS DqAggTableKey 162 | ,R.Source AS Source 163 | ,R.Entity AS Entity 164 | ,R.ColumnName AS ColumnName 165 | ,R.Rule AS Rule 166 | ,R.PlatformModifiedDate AS DQModifiedDate 167 | ,R.PlatformModifiedDateInt AS DQModifiedDateInt 168 | ,R.RecordCount AS RecordCount 169 | ,COALESCE(F.FailCount, 0) AS FailCount 170 | ,R.RecordCount - COALESCE(F.FailCount, 0) AS SuccessCount 171 | ,current_timestamp AS PlatformModifiedDate 172 | ,CAST(date_format(current_date(), 'yyyyMMddhhmmssSSS') AS Long) AS PlatformModifiedDateInt 173 | FROM ${subjectAreaDb}.${dqRunDetailsTableName} R 174 | LEFT JOIN ( 175 | SELECT DqAggTableKey 176 | ,Count(1) AS FailCount 177 | ,MIN(PlatformModifiedDate) AS FailTablePlatformModifiedDate 178 | FROM ${subjectAreaDb}.${dqFailTableName} 179 | GROUP BY DqAggTableKey) F 180 | ON R.DqAggTableKey = F.DqAggTableKey 181 | WHERE R.PlatformModifiedDate >= '${watermarkvalue_dqrundetails}' 182 | OR F.FailTablePlatformModifiedDate >= '${watermarkvalue_dqfailtable}' 183 | ) AS Source 184 | ON Target.DqAggTableKey = Source.DqAggTableKey 185 | WHEN MATCHED THEN 186 | UPDATE 187 | SET Target.Source = Source.Source 188 | ,Target.Entity = Source.Entity 189 | ,Target.Rule = Source.Rule 190 | ,Target.ColumnName = Source.ColumnName 191 | ,Target.FailCount = Source.FailCount 192 | ,Target.DQModifiedDate = Source.DQModifiedDate 193 | ,Target.DQModifiedDateInt = Source.DQModifiedDateInt 194 | ,Target.RecordCount = Source.RecordCount 195 | ,Target.SuccessCount = Source.SuccessCount 196 | ,Target.PlatformModifiedDate = Source.PlatformModifiedDate 197 | ,Target.PlatformModifiedDateInt = Source.PlatformModifiedDateInt 198 | WHEN NOT MATCHED 199 | THEN 200 | INSERT * """ 201 | 202 | try { 203 | spark.sql(mergeQuery) 204 | } 205 | catch { 206 | case e: Exception => { 207 | println("ERROR : Unable to merge data in dqAggTable " + e.getMessage) 208 | throw e 209 | } 210 | } 211 | 212 | println(s"Setting Watermark for ${dqAggTableName}:${dqFailTableName}. Value: "+ watermarkendvalue_dqfailtable.first.get(0).toString()) 213 | setwatermarkvalue(watermarkendvalue_dqfailtable.first.get(0).toString(), dqAggTableName, "dq", dqFailTableName) 214 | println(s"Setting Watermark for ${dqAggTableName}:${dqRunDetailsTableName}. 
Value: "+ watermarkendvalue_dqrundetails.first.get(0).toString()) 215 | setwatermarkvalue(watermarkendvalue_dqrundetails.first.get(0).toString(), dqAggTableName , "dq", dqRunDetailsTableName ) 216 | println("Watermark set") 217 | } 218 | 219 | // COMMAND ---------- 220 | 221 | // DBTITLE 1,Write files by Date to cooked folder for dqFailTable 222 | import org.apache.spark.sql.functions._ 223 | import scala.collection.parallel._ 224 | val df = spark.table(s"${subjectAreaDb}.${dqFailTableName}") 225 | val sources = df.select(lower($"Source")).distinct.collect.toList 226 | 227 | val watermarkvalue = getwatermarkvalue(dqFailTableName) 228 | println("Watermark value is: "+watermarkvalue) 229 | val watermarkendvalue = spark.sql("SELECT COALESCE(MAX(PlatformModifieddate), '1753-01-01') FROM " + subjectAreaDb + "." + dqFailTableName) 230 | println("New Watermark value is: "+watermarkendvalue.first.get(0).toString()) 231 | 232 | if (watermarkendvalue.first.get(0).toString() == watermarkvalue) 233 | { 234 | println("No records to update") 235 | } 236 | else 237 | { 238 | println("New Records detected. Watermark: "+watermarkvalue) 239 | val dates = df.filter(to_date($"PlatformModifieddate") >= watermarkvalue).select(to_date($"PlatformModifieddate") as "date").distinct 240 | //display(dates) 241 | 242 | for(d <- dates.collect) 243 | { 244 | println("Writing File for Date: " + d(0)) 245 | val filtereddf = df.filter(to_date($"PlatformModifiedDate") === d(0)) 246 | 247 | filtereddf.repartition(1).write 248 | .format("com.databricks.spark.csv") 249 | .option("header", "true") 250 | .option("sep", "\t") 251 | .option("quoteAll", true) 252 | .option("escape","\"") 253 | .mode("overwrite") 254 | .save(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp") 255 | 256 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp").filter(file => file.name.endsWith("csv"))(0).path 257 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tsv") 258 | dbutils.fs.rm(adlsPath+"/" + dqFailTableName + "/" +d(0)+ ".tmp", recurse = true) 259 | 260 | println("Completed writing File for Date: " + d(0)) 261 | 262 | } 263 | } 264 | 265 | println("Setting Watermark. 
Value: "+ watermarkendvalue.first.get(0).toString()) 266 | setwatermarkvalue(watermarkendvalue.first.get(0).toString(), dqFailTableName , null, null) 267 | println("Watermark set") 268 | 269 | 270 | 271 | 272 | 273 | // COMMAND ---------- 274 | 275 | // DBTITLE 1,Write dqaggtable to cooked folder 276 | val df = spark.table(s"${subjectAreaDb}.${dqAggTableName}") 277 | df.repartition(1).write 278 | .format("com.databricks.spark.csv") 279 | .option("header", "true") 280 | .option("sep", "\t") 281 | .option("quoteAll", true) 282 | .option("escape","\"") 283 | .mode("overwrite") 284 | .save(adlsPath+"/" + dqAggTableName + ".tmp") 285 | 286 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqAggTableName + ".tmp/").filter(file => file.name.endsWith("csv"))(0).path 287 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqAggTableName + ".tsv") 288 | dbutils.fs.rm(adlsPath+"/" + dqAggTableName + ".tmp/", recurse = true) 289 | 290 | 291 | // COMMAND ---------- 292 | 293 | // DBTITLE 1,Write dqrundetails to cooked folder 294 | val df = spark.table(s"${subjectAreaDb}.${dqRunDetailsTableName}") 295 | df.repartition(1).write 296 | .format("com.databricks.spark.csv") 297 | .option("header", "true") 298 | .option("sep", "\t") 299 | .option("quoteAll", true) 300 | .option("escape","\"") 301 | .mode("overwrite") 302 | .save(adlsPath+"/" + dqRunDetailsTableName + ".tmp") 303 | 304 | val partitionPath = dbutils.fs.ls(adlsPath+"/" + dqRunDetailsTableName + ".tmp/").filter(file => file.name.endsWith("csv"))(0).path 305 | dbutils.fs.cp(partitionPath, adlsPath+"/" + dqRunDetailsTableName + ".tsv") 306 | dbutils.fs.rm(adlsPath+"/" + dqRunDetailsTableName + ".tmp/", recurse = true) 307 | 308 | // COMMAND ---------- 309 | 310 | 311 | -------------------------------------------------------------------------------- /sample/DataQualityInsights.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Data-Quality-Rule-Engine/0c3000b1a7d82b9ddbba8f8b9af3e011977c3c5c/sample/DataQualityInsights.pbix -------------------------------------------------------------------------------- /sample/DataQualityRuleEngineUsageSample.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // DBTITLE 1,Create Data Frame 3 | val df=spark.sql(s"""select 1 as id,2 as partitionkey,'John' as Name,CAST('2020-12-01T21:06:20.000+0000' as Timestamp) as EventDateTime 4 | union all 5 | select 2 as id, 3 as partitionkey, 'Jack' as Name,current_date as EventDateTime 6 | union all 7 | select 3 as id, 4 as partitionkey, null as Name,current_date as EventDateTime 8 | union all 9 | select 1 as id,2 as partitionkey,'John' as Name,current_date as EventDateTime""") 10 | display(df) 11 | 12 | // COMMAND ---------- 13 | 14 | // DBTITLE 1,Add Entries for Metadata Drive Data Quality Check 15 | // MAGIC %sql 16 | // MAGIC insert into dq.entityrulemetadata 17 | // MAGIC (select 'sourceName' as source,'entityName' as entity,'nullcheck' as rulename, "{\"columnList\":\"name,eventDateTime\"}" as parameters 18 | // MAGIC union all 19 | // MAGIC select 'sourceName' as source,'entityName' as entity,'uniquecheck' as rulename, "{\"columnList\":[\"id\",\"partitionkey\"],\"LatestRowIdentifier\":\"eventDateTime\"}" as parameters) 20 | 21 | // COMMAND ---------- 22 | 23 | // DBTITLE 1,Metadata Driven Data Quality Check 24 | //metadata driven 25 | import com.ms.jedi.dq.framework.DQFramework 26 | val dqObj=new DQFramework() 27 | 
dqObj.setSparkSession(spark) 28 | val afterDq=dqObj.applyDq(df,"sourceName","entityName") 29 | display(afterDq) 30 | 31 | // COMMAND ---------- 32 | 33 | // DBTITLE 1,Parameter Driven Data Quality Check 34 | //parameter driven 35 | import com.ms.jedi.dq.framework.DQFramework 36 | val dqObj=new DQFramework() 37 | dqObj.setSparkSession(spark) 38 | val col_rule_map=Map("nullcheck"->"""{"columnList":"name,eventDateTime"}""", 39 | "uniquecheck"->"""{"columnList":["id,partitionkey"],"LatestRowIdentifier":"eventDateTime"}""") 40 | val map:Map[String,List[String]]=Map("id,partitionkey"->List("entity1","source1"), 41 | "eventDateTime"->List("entity2","source2"), 42 | "name"->List("entity3","source3")) 43 | val afterDq=dqObj.applyDq(df,col_rule_map,map,"pipelineid") 44 | display(afterDq.select("id","partitionkey","name","eventDateTime","name_nullcheck","eventDateTime_nullcheck","id_partitionkey_uniquecheck","LatestRow_id_partitionkey")) 45 | 46 | // COMMAND ---------- 47 | 48 | // DBTITLE 1,View Reporting data 49 | // MAGIC %sql 50 | // MAGIC select * from dq.dqrundetails where source in ('source1','source2','source3','sourceName') 51 | 52 | // COMMAND ---------- 53 | 54 | // DBTITLE 1,View Reporting data 55 | // MAGIC %sql 56 | // MAGIC select * from dq.dqfailtable where source in ('source1','source2','source3','sourceName') 57 | 58 | // COMMAND ---------- 59 | 60 | // DBTITLE 1,View Reporting data 61 | // MAGIC %sql 62 | // MAGIC select * from dq.dqAggTable where source in ('source1','source2','source3','sourceName') 63 | 64 | // COMMAND ---------- 65 | 66 | 67 | --------------------------------------------------------------------------------
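
As a quick illustration of how the aggregated output above might be consumed, the following is a minimal sketch (not part of the repository) for a Databricks notebook cell that summarises pass rates per rule from dq.dqaggtable. It assumes the columns produced by the "Populate dqAggTable" merge in the notebook (Source, Entity, Rule, RecordCount, FailCount, SuccessCount) and the dq database used in the usage sample; adjust the database and table names to match your own deployment.

// Minimal sketch, assuming the dq.dqaggtable schema written by the "Populate dqAggTable" merge above.
// Summarise total records, failures, and pass-rate percentage per source/entity/rule.
val passRateByRule = spark.sql("""
  SELECT Source,
         Entity,
         Rule,
         SUM(RecordCount)                                        AS TotalRecords,
         SUM(FailCount)                                          AS TotalFailures,
         ROUND(100.0 * SUM(SuccessCount) / SUM(RecordCount), 2)  AS PassRatePct
  FROM dq.dqaggtable
  GROUP BY Source, Entity, Rule
  ORDER BY PassRatePct
""")
// display() is available in Databricks notebooks, as used in the sample above.
display(passRateByRule)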