├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ └── config.yml ├── .gitignore ├── DatabricksTokenProviderMock ├── DatabricksADTokenMock │ └── src │ │ └── main │ │ └── scala │ │ └── Token.scala └── src │ └── main │ └── scala │ └── Provider.scala ├── LICENSE ├── README.md ├── SECURITY.md ├── artifacts └── spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar ├── build.sbt ├── documentation └── overview.md ├── lib ├── SparkCustomEvents-1.0.0.jar ├── cdmstandardsandmodels-1.0.1.jar ├── hdinsight-spark-metrics_2_4-1.2.jar ├── mdsdclient-1.0.jar ├── objectmodel-1.7.3.jar ├── peregrine-tools-0.2.0-SNAPSHOT.jar ├── spark-enhancement_2.11-2.4.2.jar └── tokenlibrary_2.11-1.0.jar ├── project ├── build.properties └── plugins.sbt ├── samples ├── Contacts │ ├── Contacts.manifest.cdm.json │ ├── Customer.cdm.json │ ├── CustomerCategory.cdm.json │ ├── Entity.cdm.json │ ├── NestedExample.cdm.json │ ├── Person.cdm.json │ ├── TrackedEntity.cdm.json │ ├── _salesimports.cdm.json │ └── config.json ├── SparkCDMsample.scala └── SparkCDMsamplePython.ipynb ├── src ├── main │ ├── main.iml │ └── scala │ │ └── com │ │ └── microsoft │ │ └── cdm │ │ ├── CDMCatalog.scala │ │ ├── CDMIdentifier.scala │ │ ├── DefaultSource.scala │ │ ├── HadoopTables.scala │ │ ├── SparkTable.scala │ │ ├── log │ │ └── SparkCDMLogger.scala │ │ ├── read │ │ ├── CDMDataReader.scala │ │ ├── CDMInputPartition.scala │ │ ├── CDMPartitionReader.scala │ │ ├── CDMPartitionReaderFactory.scala │ │ ├── CDMReadOptions.scala │ │ ├── CDMScanBuilder.scala │ │ ├── CDMSimpleScan.scala │ │ ├── CSVReaderConnector.scala │ │ ├── ParquetReaderConnector.scala │ │ └── ReaderConnector.scala │ │ ├── utils │ │ ├── CDMAuthentication.scala │ │ ├── CDMModelCommon.scala │ │ ├── CDMModelReader.scala │ │ ├── CDMModelWriter.scala │ │ ├── CDMOptions.scala │ │ ├── CDMParquetSchemaConverter.scala │ │ ├── CDMSASTokenProvider.scala │ │ ├── CDMTokenProvider.scala │ │ ├── CDMUtils.scala │ │ ├── CdmAdapterProvider.scala │ │ ├── Constants.scala │ │ ├── CsvParserFactory.scala │ │ ├── DataConverter.scala │ │ ├── DateTimeFormatterHelper.scala │ │ ├── Messages.scala │ │ ├── OverridenCdmStandardsAdapter.scala │ │ ├── SerializedABFSHadoopConf.scala │ │ ├── SparkSerializableConfiguration.scala │ │ ├── StructTypeMetadata.scala │ │ └── TimestampFormatter.scala │ │ └── write │ │ ├── CDMBatchWriter.scala │ │ ├── CDMDataWriter.scala │ │ ├── CDMDataWriterFactory.scala │ │ ├── CDMWriteOptions.scala │ │ ├── CDMWriterBuilder.scala │ │ ├── CSVWriterConnector.scala │ │ ├── FileCommitMessage.scala │ │ ├── ParquetWriterConnector.scala │ │ └── WriterConnector.scala └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── microsoft │ └── cdm │ └── test │ ├── CDMADLS.scala │ ├── CDMUnitTests.scala │ └── TestData.scala └── test ├── spark-cdm-connector-assembly-0.18.2.jar ├── spark-cdm-connector-assembly-permissive.jar └── tests /.gitattributes: -------------------------------------------------------------------------------- 1 | .gitattributes text eol=lf 2 | .gitignore text eol=lf 3 | *.build text eol=lf 4 | *.scala 5 | Makefile text eol=lf 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report or report an issue 3 | title: "[Issue] Summary here" 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | # Instructions 10 | 11 | Please follow the 
instructions below. Failure to do so may result in your issue being closed. 12 | 13 | 1. Provide a good title starting with [Bug] or [Issue]. 14 | 2. Fill out all sections of this bug report form. 15 | 16 | - type: dropdown 17 | attributes: 18 | label: Did you read the pinned issues and search the error message? 19 | description: >- 20 | Some users might encounter the same error or the fix is already addressed. You can locate the [pinned issues here](https://github.com/Azure/spark-cdm-connector/issues). 21 | Paste some key words in the error message into the search bar and then click on "In this repository" option to search the repo. 22 | options: 23 | - No, but I will read and search it now before creating an issue. 24 | - Yes, but I didn't find the answer. 25 | validations: 26 | required: true 27 | 28 | - type: markdown 29 | attributes: 30 | value: | 31 | # Details 32 | 33 | - type: textarea 34 | attributes: 35 | label: Summary of issue 36 | description: >- 37 | Describe the issue you faced in this section. 38 | Include the code you tried to execute and enclose with ` ``` ` (on its own line) before and after to make it legible. 39 | Include any details about your dataframe or CDM schema if you think it helps explain the issue. 40 | placeholder: | 41 | I followed these steps and ran into an error. The full error stack trace is included in the next section. 42 | ``` 43 | The code you tried to run. 44 | ``` 45 | validations: 46 | required: true 47 | 48 | - type: textarea 49 | attributes: 50 | label: Error stack trace 51 | description: >- 52 | Add the **full error stack trace** if applicable. The UI should probably show it or you can go into the driver logs and get it. 53 | placeholder: | 54 | ``` 55 | Some error stack trace here. 56 | ``` 57 | validations: 58 | required: false 59 | 60 | - type: markdown 61 | attributes: 62 | value: | 63 | # Platform and Setup 64 | 65 | - type: input 66 | attributes: 67 | label: Platform name 68 | description: "What platform are you using? Azure Synapse?" 69 | validations: 70 | required: true 71 | 72 | - type: input 73 | attributes: 74 | label: Spark version 75 | description: "What Spark version is the platform running?" 76 | validations: 77 | required: true 78 | 79 | - type: input 80 | attributes: 81 | label: CDM jar version 82 | description: | 83 | What jar version are you using? If you don't know, you can also run this scala code: `com.microsoft.cdm.BuildInfo.version` 84 | validations: 85 | required: true 86 | 87 | - type: dropdown 88 | attributes: 89 | label: What is the format of the data you are trying to read/write? 90 | description: >- 91 | If reading, you can look into the storage account folder and it is likely "csv". 92 | If writing, the default is "csv", unless you specify "parquet" in the write options. 
93 | options: 94 | - .csv 95 | - .parquet 96 | validations: 97 | required: true 98 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # disable blank issue creation 2 | blank_issues_enabled: false 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .metals/ 3 | .vscode/ 4 | target/ 5 | sbt.json 6 | build.properties 7 | -------------------------------------------------------------------------------- /DatabricksTokenProviderMock/DatabricksADTokenMock/src/main/scala/Token.scala: -------------------------------------------------------------------------------- 1 | package shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.oauth2 2 | class AzureADToken{ 3 | def getAccessToken(): String = { 4 | throw new Exception("This function should never be called") 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /DatabricksTokenProviderMock/src/main/scala/Provider.scala: -------------------------------------------------------------------------------- 1 | package com.databricks.backend.daemon.data.client.adl 2 | import shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.oauth2.AzureADToken 3 | 4 | class AdlGen2CredentialContextTokenProvider { 5 | def getToken(): AzureADToken = { 6 | throw new Exception("Error - this method should never be called") 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ❗IMPORTANT NOTICE❗ 2 | The CDM library, which this connector is reliant on, is deprecating the CDM Schema Store. Please upgrade your connector version to the latest version [spark3.3-1.19.7](https://github.com/Azure/spark-cdm-connector/releases/tag/spark3.3-1.19.7) to ensure there is no disruption in your workflows. 
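To confirm which connector version a cluster is currently running, you can evaluate the connector's build metadata from a Scala cell (the same check the bug-report template suggests). A minimal sketch:

```scala
// Prints the version compiled into the jar by sbt-buildinfo, e.g. "spark3.3-1.19.7"
println(com.microsoft.cdm.BuildInfo.version)
```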
3 | Full details in: https://github.com/Azure/spark-cdm-connector/issues/162 4 | 5 | # spark-cdm-connector 6 | 7 | The Connector is now Generally Available in **Azure Spark for Azure Synapse**. The connector allows Spark dataframes to read and write entities in a CDM folder format residing on ADLS. To get started, please see [Using the Spark CDM Connector](documentation/overview.md). 8 | 9 | Be sure to check the [issues](https://github.com/Azure/spark-cdm-connector/issues) and search the error message before sending mail to asksparkcdm@microsoft.com for questions or feedback. 10 | 11 | For more information about CDM see: https://docs.microsoft.com/en-us/common-data-model/ 12 | 13 | Samples to use the connector with Python and Scala can be found here: 14 | - [Python sample](samples/SparkCDMsamplePython.ipynb) 15 | - [Scala sample](samples/SparkCDMsample.scala) 16 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /artifacts/spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/artifacts/spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-cdm-connector" 2 | 3 | //the groupid 4 | organization := "com.microsoft.azure" 5 | 6 | // skip all dependencies in the pom file. This is an uber jar 7 | // refernce: https://stackoverflow.com/questions/41670018/how-to-prevent-sbt-to-include-test-dependencies-into-the-pom 8 | 9 | import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} 10 | import scala.xml.transform.{RewriteRule, RuleTransformer} 11 | pomPostProcess := { (node: XmlNode) => 12 | new RuleTransformer(new RewriteRule { 13 | override def transform(node: XmlNode): XmlNodeSeq = node match { 14 | case e: Elem if e.label == "dependency" => scala.xml.NodeSeq.Empty 15 | case _ => node 16 | } 17 | }).transform(node).head 18 | } 19 | 20 | version := "spark3.3-1.19.7" 21 | 22 | crossPaths := false 23 | ThisBuild / scalaVersion := "2.12.15" 24 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.8" % "test" 25 | 26 | libraryDependencies += "com.fasterxml.jackson.datatype" % "jackson-datatype-jdk8" % "2.13.4" 27 | libraryDependencies += "com.fasterxml.jackson.datatype" % "jackson-datatype-jsr310" % "2.13.4" 28 | 29 | //these libraries already exist in spark HDI 2.4.0 - don't include them building the uber jar 30 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.13.4.1" 31 | libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.13.4" 32 | libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.13.4" 33 | libraryDependencies += "com.fasterxml.jackson.core" % "jackson-annotations" % "2.13.4" 34 | libraryDependencies += "org.apache.commons" % "commons-lang3" % "3.12.0" % "provided" 35 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.3.0" % "provided" 36 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.3.0" % "provided" 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.13" % "provided" 38 | libraryDependencies += "com.google.guava" % "guava" % "14.0.1" % "provided" 39 | libraryDependencies += "commons-io" % "commons-io" % "2.11.0" % "provided" 40 | libraryDependencies += "com.microsoft.azure" % "msal4j" % "1.10.1" 41 | libraryDependencies += "com.microsoft.commondatamodel" % "cdmstandards" % "2.8.0" 42 | libraryDependencies += "org.apache.hadoop" % "hadoop-azure" % "3.3.1" % "provided" 43 | libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.3.1" % "provided" 44 | 45 | resolvers += "Maven Twitter Releases" at "https://maven.twttr.com/" 46 | libraryDependencies += "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.20" 
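// Aside: the organization, name, version, and crossPaths settings above determine the artifact
// coordinates used when this build is published. A consuming sbt project that wanted a prebuilt
// connector rather than building this uber jar would, assuming those coordinates are what gets
// published, declare roughly:
// libraryDependencies += "com.microsoft.azure" % "spark-cdm-connector" % "spark3.3-1.19.7"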
47 | 48 | // The main module depends on the mock'd databricks DataricksokenProvider classes. 49 | lazy val root = (project in file(".")). 50 | enablePlugins(BuildInfoPlugin). 51 | settings( 52 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion), 53 | buildInfoPackage := "com.microsoft.cdm" 54 | ).dependsOn(child) 55 | 56 | // Create the Databricks mocking library without including DataBricks jars. Their only purpose is to enable 57 | // compilation. When we deploy to Databricks, these classes will already exist as part of the Databricks runtime. 58 | // Child and grandchild represent the two mocked Databricks libraries, whose classes are removed when building 59 | // an uber jar -- see assemblyMergeStrategy below. 60 | lazy val child = Project("DatabricksTokenProviderMock", file("DatabricksTokenProviderMock")) 61 | .settings().dependsOn(grandchild) 62 | lazy val grandchild = Project("DatabricksADTokenMock", file("DatabricksTokenProviderMock/DatabricksADTokenMock")) 63 | .settings() 64 | 65 | //assembly 66 | assemblyShadeRules in assembly := Seq( 67 | ShadeRule.rename("com.microsoft.aad.msal4j.**" -> "shadeiomsal4j.@1").inAll, 68 | ShadeRule.rename("com.fasterxml.jackson.**" -> "shadeio.@1").inAll, 69 | ShadeRule.rename("com.nimbusds.**" -> "shadeionimbusds.@1").inAll, 70 | ShadeRule.rename("net.minidev.**" -> "shadeiominidev.@1").inAll 71 | ) 72 | 73 | 74 | assemblyExcludedJars in assembly := { 75 | val cp = (fullClasspath in assembly).value 76 | cp filter { f => 77 | f.data.getName.contains("tokenlibrary") || 78 | f.data.getName.contains("SparkCustomEvents") || 79 | f.data.getName.contains("hdinsight") || 80 | f.data.getName.contains("peregrine") || 81 | f.data.getName.contains("mdsdclient") || 82 | f.data.getName.contains("spark-enhancement") 83 | } 84 | } 85 | 86 | // build an uber jar 87 | assemblyMergeStrategy in assembly := { 88 | case PathList("META-INF", xs@_*) => MergeStrategy.discard 89 | //the stubbed-out Databricks jars don't show up in "assemblyExcludedJars" to remove, so manually removing the mocking classes 90 | case "shaded/databricks/v20180920_b33d810/org/apache/hadoop/fs/azurebfs/oauth2/AzureADToken.class" => MergeStrategy.discard 91 | case "com/databricks/backend/daemon/data/client/adl/AdlGen2CredentialContextTokenProvider.class" => MergeStrategy.discard 92 | case x => MergeStrategy.first 93 | } 94 | 95 | // don't bring scala classes into uber jar 96 | assemblyOption in assembly ~= { _.copy(includeScala = false) } 97 | 98 | // don't run tests with "sbt assembly" 99 | test in assembly := {} 100 | 101 | // Below is for publishing 102 | artifact in (Compile, packageBin) := { 103 | val art = (artifact in (Compile, packageBin)).value 104 | art.withClassifier(Some("")) 105 | } 106 | 107 | addArtifact(artifact in (Compile, packageBin), assembly) 108 | 109 | // Your profile name of the sonatype account. The default is the same with the organization value 110 | sonatypeProfileName := "com.microsoft.azure" 111 | 112 | // To sync with Maven central, you need to supply the following information: 113 | publishMavenStyle := true 114 | 115 | // Open-source license of your choice 116 | licenses := Seq("APL2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt")) 117 | 118 | // Where is the source code hosted: GitHub or GitLab? 
119 | import xerial.sbt.Sonatype._ 120 | sonatypeProjectHosting := Some(GitHubHosting("bissont", "Spark-CDM", "tibisso@microsoft.com")) 121 | 122 | // or if you want to set these fields manually 123 | homepage := Some(url("https://github.com/Azure/spark-cdm-connector")) 124 | scmInfo := Some( 125 | ScmInfo( 126 | url("https://github.com/Azure/new-spark-cdm"), 127 | "scm:git@github.com:Azure/new-spark-cdm.git" 128 | ) 129 | ) 130 | developers := List( 131 | Developer(id="tibisso", name="Timothy Bisson", email="tibisso@microsoft.com", url=url("https://github.com/bissont")), 132 | Developer(id="srruj", name="Sricheta Ruj", email="Sricheta.Ruj@microsoft.com", url=url("https://github.com/sricheta92")) 133 | ) 134 | 135 | // Remove all additional repository other than Maven Central from POM 136 | ThisBuild / pomIncludeRepository := { _ => false } 137 | ThisBuild / publishTo := sonatypePublishToBundle.value 138 | 139 | ThisBuild / publishConfiguration := publishConfiguration.value.withOverwrite(true) 140 | ThisBuild / publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true) 141 | -------------------------------------------------------------------------------- /lib/SparkCustomEvents-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/SparkCustomEvents-1.0.0.jar -------------------------------------------------------------------------------- /lib/cdmstandardsandmodels-1.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/cdmstandardsandmodels-1.0.1.jar -------------------------------------------------------------------------------- /lib/hdinsight-spark-metrics_2_4-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/hdinsight-spark-metrics_2_4-1.2.jar -------------------------------------------------------------------------------- /lib/mdsdclient-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/mdsdclient-1.0.jar -------------------------------------------------------------------------------- /lib/objectmodel-1.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/objectmodel-1.7.3.jar -------------------------------------------------------------------------------- /lib/peregrine-tools-0.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/peregrine-tools-0.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /lib/spark-enhancement_2.11-2.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/spark-enhancement_2.11-2.4.2.jar -------------------------------------------------------------------------------- /lib/tokenlibrary_2.11-1.0.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/tokenlibrary_2.11-1.0.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.5 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.4") 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.0-M2") 4 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") 5 | addDependencyTreePlugin 6 | -------------------------------------------------------------------------------- /samples/Contacts/Contacts.manifest.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.manifest.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "manifestName": "Contacts", 10 | "explanation": "A logical model of contacts", 11 | "entities": [ 12 | { 13 | "type": "LocalEntity", 14 | "entityName": "Person", 15 | "entityPath": "Person.cdm.json/Person" 16 | }, 17 | { 18 | "type": "LocalEntity", 19 | "entityName": "Entity", 20 | "entityPath": "Entity.cdm.json/Entity" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /samples/Contacts/Customer.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "_salesimports.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "Customer", 12 | "extendsEntity":"TrackedEntity", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "hasA", 16 | "dataType": "integer", 17 | "name": "CustomerId", 18 | "description": "Customer identifier" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "string", 23 | "name": "CustomerName", 24 | "displayName": "Name ", 25 | "maximumLength": 100 26 | }, 27 | { 28 | "purpose": "hasA", 29 | "dataType": "decimal", 30 | "name": "CreditLimit", 31 | "appliedTraits": [ 32 | { 33 | "traitReference": "is.dataFormat.numeric.shaped", 34 | "arguments": [ 35 | { 36 | "name": "precision", 37 | "value": "18" 38 | }, 39 | { 40 | "name": "scale", 41 | "value": "2" 42 | } 43 | ] 44 | } 45 | ] 46 | } 47 | ] 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /samples/Contacts/CustomerCategory.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "_salesimports.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "CustomerCategory", 12 | "extendsEntity":"TrackedEntity", 13 | "description": "The kind of customer - agent, wholesaler, etc.", 14 | "hasAttributes": [ 15 | { 16 | "dataType": 
"integer", 17 | "name": "CustomerCategoryId", 18 | "displayName": "Category ID" 19 | }, 20 | { 21 | "dataType": "string", 22 | "name": "CustomerCategoryName" 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /samples/Contacts/Entity.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "Entity", 12 | "description": "A base entity type defining common attributes used by other entities", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "hasA", 16 | "dataType": "integer", 17 | "name": "identifier", 18 | "description": "Identifier of the entity" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "dateTime", 23 | "name": "createdTime", 24 | "description": "The UTC time this entity was created" 25 | } 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /samples/Contacts/NestedExample.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsonSchemaSemanticVersion" : "1.0.0", 3 | "imports" : [ 4 | { 5 | "corpusPath" : "cdm:/foundations.cdm.json" 6 | } 7 | ], 8 | "definitions" : [ 9 | { 10 | "entityName" : "NestedExample", 11 | "hasAttributes" : [ 12 | { 13 | "name" : "id", 14 | "dataType" : "integer" 15 | }, 16 | { 17 | "entity" : "details", 18 | "name" : "details", 19 | "resolutionGuidance" : { 20 | "imposedDirectives" : [ 21 | "structured", 22 | "noMaxDepth" 23 | ], 24 | "renameFormat" : "{m}" 25 | } 26 | } 27 | ] 28 | }, 29 | { 30 | "entityName" : "details", 31 | "hasAttributes" : [ 32 | { 33 | "name" : "name", 34 | "dataType" : "string" 35 | }, 36 | { 37 | "name" : "USCitizen", 38 | "dataType" : "boolean" 39 | }, 40 | { 41 | "name" : "salary", 42 | "dataType" : "double" 43 | }, 44 | { 45 | "name" : "phone", 46 | "dataType" : "bigInteger" 47 | }, 48 | { 49 | "name" : "birthDate", 50 | "dataType" : "date" 51 | }, 52 | { 53 | "name" : "bodyMassIndex", 54 | "dataType" : "decimal", 55 | "appliedTraits" : [ 56 | { 57 | "traitReference" : "is.dataFormat.numeric.shaped", 58 | "arguments" : [ 59 | { 60 | "name" : "precision", 61 | "value" : 5 62 | }, 63 | { 64 | "name" : "scale", 65 | "value" : 2 66 | } 67 | ] 68 | } 69 | ] 70 | }, 71 | { 72 | "name" : "createdTime", 73 | "dataType" : "dateTime" 74 | }, 75 | { 76 | "entity" : "address", 77 | "name" : "address", 78 | "resolutionGuidance" : { 79 | "imposedDirectives" : [ 80 | "structured", 81 | "noMaxDepth" 82 | ], 83 | "renameFormat" : "{m}" 84 | } 85 | } 86 | ] 87 | }, 88 | { 89 | "entityName" : "address", 90 | "hasAttributes" : [ 91 | { 92 | "name" : "zipcode", 93 | "dataType" : "string" 94 | }, 95 | { 96 | "entity" : "street", 97 | "name" : "street", 98 | "resolutionGuidance" : { 99 | "imposedDirectives" : [ 100 | "structured", 101 | "noMaxDepth" 102 | ], 103 | "renameFormat" : "{m}" 104 | } 105 | }, 106 | { 107 | "entity" : "songs", 108 | "name" : "songs", 109 | "resolutionGuidance" : { 110 | "imposedDirectives" : [ 111 | "structured", 112 | "noMaxDepth" 113 | ], 114 | "removedDirectives" : [ 115 | "normalized" 116 | ], 117 | "cardinality" : "many", 118 | "renameFormat" : "{m}" 119 | } 120 | } 121 | ] 122 | }, 123 | { 124 | "entityName" : "street", 125 | 
"hasAttributes" : [ 126 | { 127 | "name" : "streetName", 128 | "dataType" : "string" 129 | }, 130 | { 131 | "name" : "streetNumber", 132 | "dataType" : "integer" 133 | } 134 | ] 135 | }, 136 | { 137 | "entityName" : "songs", 138 | "hasAttributes" : [ 139 | { 140 | "name" : "name", 141 | "dataType" : "string" 142 | }, 143 | { 144 | "name" : "number", 145 | "dataType" : "integer" 146 | } 147 | ] 148 | } 149 | ] 150 | } -------------------------------------------------------------------------------- /samples/Contacts/Person.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | }, 8 | { 9 | "corpusPath": "Entity.cdm.json" 10 | } 11 | ], 12 | "definitions": [ 13 | { 14 | "entityName": "Person", 15 | "extendsEntity": "Entity", 16 | "description": "An individual", 17 | "hasAttributes": [ 18 | { 19 | "purpose": "hasA", 20 | "dataType": "string", 21 | "name": "firstName", 22 | "description": "Person's first name", 23 | "maximumLength": 100 24 | }, 25 | { 26 | "purpose": "hasA", 27 | "dataType": "string", 28 | "name": "lastName", 29 | "description": "Person's last name", 30 | "maximumLength": 100 31 | }, 32 | { 33 | "purpose": "hasA", 34 | "dataType": "date", 35 | "name": "birthDate", 36 | "description": "Person's date of birth" 37 | } 38 | ] 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /samples/Contacts/TrackedEntity.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "description": "An entity whose modification is tracked ", 12 | "entityName": "TrackedEntity", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "modifiedOn", 16 | "dataType": "dateTime", 17 | "name": "ValidFrom", 18 | "description": "The date from which this record is valid" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "dateTime", 23 | "name": "ValidTo", 24 | "description": "The date to which this record was valid" } 25 | ] 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /samples/Contacts/_salesimports.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsonSchemaSemanticVersion": "1.0.0", 3 | "imports": [ 4 | { 5 | "corpusPath": "cdm:/foundations.cdm.json" 6 | }, 7 | { 8 | "corpusPath": "Customer.cdm.json" 9 | }, 10 | { 11 | "corpusPath": "CustomerCategory.cdm.json" 12 | }, 13 | { 14 | "corpusPath": "core:/TrackedEntity.cdm.json" 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /samples/Contacts/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultNamespace" : "adls", 3 | "adapters" : [ 4 | { 5 | "type" : "adls", 6 | "config" : { 7 | "hostname" : "srichetastorage.dfs.core.windows.net", 8 | "root" : "/outputsubmanifest/example-public-standards", 9 | "tenant" : "72f988bf-86f1-41af-91ab-2d7cd011db47", 10 | "clientId" : "6c3f525f-bdcb-4677-bed6-24f0b43add13", 11 | "timeout" : 5000, 12 | 
"maximumTimeout" : 20000, 13 | "numberOfRetries" : 2 14 | }, 15 | "namespace" : "core" 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /samples/SparkCDMsample.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.Row 2 | import org.apache.spark.sql.SaveMode 3 | import org.apache.spark.sql.types.{ArrayType, BooleanType, DateType, Decimal, DecimalType, DoubleType, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType, TimestampType} 4 | 5 | // Databricks notebook source 6 | // Specifying appid, appkey and tenanid is optional in spark-cdm-connector-assembly-0.16.jar with Premium Databricks Cluster and Synapse 7 | val appid = "" 8 | val appkey = "" 9 | val tenantid = "" 10 | 11 | val storageAccountName = ".dfs.core.windows.net" 12 | 13 | 14 | // COMMAND ---------- 15 | 16 | // Implicit write case 17 | // Write a CDM entity with Parquet data files, entity definition is derived from the dataframe schema 18 | val date= java.sql.Date.valueOf("2015-03-31"); 19 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 20 | var data = Seq( 21 | Row("a", 1, true, 12.34, 6L, date, timestamp, Decimal(1.4337879), Decimal(999.00), Decimal(18.8)), 22 | Row("b", 1, true, 12.34, 6L, date, timestamp, Decimal(1.4337879), Decimal(999.00), Decimal(18.8)) 23 | ) 24 | 25 | var schema = new StructType() 26 | .add(StructField("name", StringType, true)) 27 | .add(StructField("id", IntegerType, true)) 28 | .add(StructField("flag", BooleanType, true)) 29 | .add(StructField("salary", DoubleType, true)) 30 | .add(StructField("phone", LongType, true)) 31 | .add(StructField("dob", DateType, true)) 32 | .add(StructField("time", TimestampType, true)) 33 | .add(StructField("decimal1", DecimalType(15, 3), true)) 34 | .add(StructField("decimal2", DecimalType(38, 7), true)) 35 | .add(StructField("decimal3", DecimalType(5, 2), true)) 36 | 37 | var df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 38 | 39 | // Creates the CDM manifest and adds the entity to it with gzip'd parquet partitions 40 | // with both physical and logical entity definitions 41 | df.write.format("com.microsoft.cdm") 42 | .option("storage", storageAccountName) 43 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 44 | .option("entity", "TestEntity") 45 | .option("format", "parquet") 46 | .option("compression", "gzip") 47 | .save() // If table already exists, add .mode(SaveMode.Overwrite) or delete the /implicitTest folder 48 | 49 | // Append the same dataframe content to the entity in the default CSV format 50 | df.write.format("com.microsoft.cdm") 51 | .option("storage", storageAccountName) 52 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 53 | .option("entity", "TestEntity") 54 | .option("delimiter", ";") // Specify what delimiter will be set in the CSV file. 
Default is comma 55 | .option("columnHeaders", false) // Specify a boolean value - where column header will be shown or not 56 | .option("dataFolderFormat", "'year'yyyy'/month'MM") // Specify data partitions folder with DateTimeFormatter format 57 | .option("cdmSource", "builtin") // This fetches the foundation definitions from CDM SDK library 58 | .mode(SaveMode.Append) 59 | .save() 60 | 61 | var readDf = spark.read.format("com.microsoft.cdm") 62 | .option("storage", storageAccountName) 63 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 64 | .option("entity", "TestEntity") 65 | .load() 66 | 67 | readDf.select("*").show() 68 | 69 | 70 | // COMMAND ---------- 71 | 72 | // Explicit write, creating an entity in a CDM folder based on a pre-defined model 73 | 74 | // Case 1: Using an entity definition defined in the CDM Github repo 75 | 76 | var data = Seq( 77 | Row("1", "2", "3", 4L), Row("4", "5", "6", 8L),Row("7", "8", "9", 4L),Row("10", "11", "12", 8L),Row("13", "14", "15", 4L)) 78 | var schema = new StructType() 79 | .add(StructField("teamMembershipId", StringType, true)) 80 | .add(StructField("systemUserId", StringType, true)) 81 | .add(StructField("teamId", StringType, true)) 82 | .add(StructField("versionNumber", LongType, true)) 83 | 84 | var df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) 85 | df.write.format("com.microsoft.cdm") 86 | .option("storage", storageAccountName) 87 | .option("manifestPath", container + "/explicitTest/root.manifest.cdm.json") 88 | .option("entity", "TeamMembership") 89 | .option("entityDefinitionPath", "core/applicationCommon/TeamMembership.cdm.json/TeamMembership") 90 | .option("useCdmStandardModelRoot", true) // sets the model root to the CDM CDN schema documents folder 91 | .option("useSubManifest", true) 92 | .save() // If table already exists, add .mode(SaveMode.Overwrite) 93 | 94 | var readDf = spark.read.format("com.microsoft.cdm") 95 | .option("storage", storageAccountName) 96 | .option("manifestPath", container + "/explicitTest/root.manifest.cdm.json") 97 | .option("entity", "TeamMembership") 98 | .load() 99 | readDf.select("*").show() 100 | 101 | 102 | // COMMAND ---------- 103 | 104 | // Explicit write, creating an entity in a CDM folder based on a pre-defined model 105 | 106 | // Case 2: Using an entity definition defined in a CDM model stored in ADLS 107 | 108 | // UPLOAD CDM FILES FIRST 109 | // To run this example, first create a /Models/Contacts folder to your demo container in ADLS gen2, 110 | // then upload the provided Contacts.manifest.cdm.json, Person.cdm.json, Entity.cdm.json files 111 | 112 | val birthdate= java.sql.Date.valueOf("1991-03-31"); 113 | val now = new java.sql.Timestamp(System.currentTimeMillis()); 114 | val data2 = Seq( 115 | Row(1,now,"Donna","Carreras",birthdate), 116 | Row(2,now,"Keith","Harris",birthdate), 117 | Row(2,now,"Carla","McGee",birthdate) 118 | ) 119 | 120 | val schema2 = new StructType() 121 | .add(StructField("identifier", IntegerType)) 122 | .add(StructField("createdTime", TimestampType)) 123 | .add(StructField("firstName", StringType)) 124 | .add(StructField("lastName", StringType)) 125 | .add(StructField("birthDate", DateType)) 126 | 127 | // Create the dataframe that matches the CDM definition of the entity, Person 128 | val df2 = spark.createDataFrame(spark.sparkContext.parallelize(data2, 1), schema2) 129 | df2.write.format("com.microsoft.cdm") 130 | .option("storage", storageAccountName) 131 | .option("manifestPath", container + 
"/Data/Contacts/root.manifest.cdm.json") 132 | .option("entity", "Person") 133 | .option("entityDefinitionModelRoot", container + "/Models") 134 | .option("entityDefinitionPath", "/Contacts/Person.cdm.json/Person") 135 | .save() // If table already exists, add .mode(SaveMode.Overwrite) 136 | 137 | val readDf2 = spark.read.format("com.microsoft.cdm") 138 | .option("storage", storageAccountName) 139 | .option("manifestPath", container + "/Data/Contacts/root.manifest.cdm.json") 140 | .option("entity", "Person") 141 | .load() 142 | readDf2.select("*").show() 143 | 144 | 145 | // COMMAND ---------- 146 | 147 | // Override Config Path 148 | 149 | val timestamp1 = new java.sql.Timestamp(System.currentTimeMillis()); 150 | val timestamp2 = new java.sql.Timestamp(System.currentTimeMillis()); 151 | val cdata = Seq( 152 | Row( timestamp1, timestamp2,1, "A", Decimal(33.5)), 153 | Row( timestamp1, timestamp2, 2, "B", Decimal(42.1)), 154 | Row( timestamp1, timestamp2, 3, "C", Decimal(7.90)) 155 | ) 156 | 157 | val cschema = new StructType() 158 | .add(StructField("ValidFrom", TimestampType, true)) 159 | .add(StructField("ValidTo", TimestampType, true)) 160 | .add(StructField("CustomerId", IntegerType, true)) 161 | .add(StructField("CustomerName", StringType, true)) 162 | .add(StructField("CreditLimit", DecimalType(18, 2), true)) 163 | 164 | val customerdf = spark.createDataFrame(spark.sparkContext.parallelize(cdata), cschema) 165 | 166 | customerdf.write.format("com.microsoft.cdm") 167 | .option("storage", storageAccountName) 168 | .option("manifestPath", outputContainer + "/customer/default.manifest.cdm.json") 169 | .option("entity", "TestEntity") 170 | .option("entityDefinitionPath", "Customer.cdm.json/Customer") // Customer.cdm.json has an alias - "core" 171 | .option("entityDefinitionModelRoot", container+ "Models") // fetches config.json from this location and finds definition of "core" alias, if configPath option is not present 172 | .option("configPath", "/config") // Add your config.json to override the above definition. config is the name of container. 
This will find config.json in container - "config" 173 | .option("entityDefinitionStorage", ".dfs.core.windows.net") // entityDefinitionModelRoot contains in this storage account 174 | .option("format", "parquet") 175 | .save() 176 | 177 | val readDf2 = spark.read.format("com.microsoft.cdm") 178 | .option("storage", storageAccountName) 179 | .option("manifestPath", outputContainer + "/customer/default.manifest.cdm.json") 180 | .option("entity", "TestEntity") 181 | .load() 182 | readDf2.select("*").show() 183 | 184 | // COMMAND ---------- 185 | 186 | // Nested Parquet Implicit & Explicit write 187 | 188 | val birthdate= java.sql.Date.valueOf("1991-03-31"); 189 | val now = new java.sql.Timestamp(System.currentTimeMillis()); 190 | val data = Seq( 191 | 192 | Row(13, Row("Donna Carreras", true, 12.34,6L, birthdate, Decimal(22.7), now, Row("95110", Row("Bose Street", 321), Array(Row("bieber1", 1), Row("bieber2", 2))))) , 193 | Row(24, Row("Keith Harris", false, 12.34,6L, birthdate, Decimal(22.7), now, Row("95134", Row("Estancia Dr", 185), Array(Row("baby1", 3), Row("baby2", 4), Row("baby3", 5), Row("baby4", 6))))) 194 | ) 195 | 196 | val schema = new StructType() 197 | .add(StructField("id", IntegerType, true)) 198 | .add(StructField("details", new StructType() 199 | .add(StructField("name", StringType, true)) 200 | .add(StructField("USCitizen", BooleanType, true)) 201 | .add(StructField("salary", DoubleType, true)) 202 | .add(StructField("phone", LongType, true)) 203 | .add(StructField("birthDate", DateType, true)) 204 | .add(StructField("bodyMassIndex", DecimalType(5,2), true)) 205 | .add(StructField("createdTime", TimestampType, true)) 206 | .add(StructField("address", new StructType() 207 | .add(StructField("zipcode", StringType, true)) 208 | .add(StructField("street", new StructType() 209 | .add(StructField("streetName", StringType, true)) 210 | .add(StructField("streetNumber", IntegerType, true)) 211 | ) 212 | ) 213 | .add(StructField("songs", ArrayType(StructType(List(StructField("name", StringType, true),StructField("number", IntegerType, true))), true), true)) 214 | ) 215 | ))) 216 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 217 | 218 | // Implicit write 219 | df.write.format("com.microsoft.cdm") 220 | .option("storage", storageAccountName) 221 | .option("manifestPath", outputContainer + "/nestedImplicit/default.manifest.cdm.json") 222 | .option("entity", "NestedExampleImplicit") 223 | .option("format", "parquet") 224 | .save() 225 | 226 | // Explicit write 227 | // To run this example, first create a /Models/Contacts folder to your demo container in ADLS gen2, 228 | // then upload the provided NestedExample.cdm.json file 229 | df.write.format("com.microsoft.cdm") 230 | .option("storage", storageAccountName) 231 | .option("manifestPath", outputContainer + "/nestedExplicit/default.manifest.cdm.json") 232 | .option("entity", "NestedExampleExplicit") 233 | .option("entityDefinitionPath", "/Contacts/NestedExample.cdm.json/NestedExample") 234 | .option("entityDefinitionModelRoot", container + "/Models") 235 | .option("format", "parquet") 236 | .save() 237 | 238 | val readImplicit = spark.read.format("com.microsoft.cdm") 239 | .option("storage", storageAccountName) 240 | .option("manifestPath", outputContainer + "/nestedImplicit/default.manifest.cdm.json") 241 | .option("entity", "NestedExampleImplicit") 242 | .load() 243 | 244 | val readExplicit = spark.read.format("com.microsoft.cdm") 245 | .option("storage", storageAccountName) 246 | 
.option("manifestPath", outputContainer + "/nestedExplicit/default.manifest.cdm.json") 247 | .option("entity", "NestedExampleExplicit") 248 | .load() 249 | 250 | readImplicit.show(false) 251 | readExplicit.show(false) 252 | 253 | -------------------------------------------------------------------------------- /src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/CDMCatalog.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import java.util 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CDMOptions, CdmAuthType, EntityNotFoundException, ManifestNotFoundException} 7 | import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException} 8 | import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableChange} 9 | import org.apache.spark.sql.connector.expressions.Transform 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.slf4j.LoggerFactory 13 | import org.slf4j.event.Level 14 | 15 | class CDMCatalog extends CatalogPlugin with TableCatalog with SupportsNamespaces { 16 | val logger = LoggerFactory.getLogger(classOf[CDMCatalog]) 17 | var cdmOptions: CDMOptions = _ 18 | var tables: HadoopTables = _ 19 | private var options: CaseInsensitiveStringMap = _ 20 | 21 | override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = { 22 | logger.info("Initializing CDM Catalog...") 23 | this.tables = new HadoopTables() 24 | } 25 | 26 | def setupOptions(options: CaseInsensitiveStringMap): Unit = { 27 | this.options = options 28 | } 29 | 30 | @throws(classOf[NoSuchTableException]) 31 | override def loadTable(ident: Identifier): SparkTable = { 32 | try { 33 | val cdmEntity = tables.load(new CDMOptions(options)) 34 | new SparkTable(cdmEntity.schema, options) 35 | } catch { 36 | case e: EntityNotFoundException => throw new NoSuchTableException(e.getMessage) 37 | case e: ManifestNotFoundException => throw new NoSuchTableException(e.getMessage) 38 | } 39 | } 40 | 41 | @throws(classOf[TableAlreadyExistsException]) 42 | override def createTable(ident: Identifier, schema: StructType, partitions: Array[Transform], properties: util.Map[String, String]): Table = { 43 | new SparkTable(schema, options) //make it write options 44 | } 45 | 46 | override def alterTable(ident: Identifier, changes: TableChange*): Table = { 47 | throw new UnsupportedOperationException("Not supported") 48 | } 49 | 50 | override def dropTable(ident: Identifier): Boolean = throw new UnsupportedOperationException("Not supported") 51 | 52 | override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = throw new UnsupportedOperationException("Not supported") 53 | 54 | override def listNamespaces(): Array[Array[String]] = throw new UnsupportedOperationException("Not supported") 55 | 56 | override def listNamespaces(namespace: Array[String]): Array[Array[String]] = throw new UnsupportedOperationException("Not supported") 57 | 58 | override def loadNamespaceMetadata(namespace: Array[String]): util.Map[String, String] = throw new UnsupportedOperationException("Not supported") 59 | 60 | override def 
createNamespace(namespace: Array[String], metadata: util.Map[String, String]): Unit = throw new UnsupportedOperationException("Not supported") 61 | 62 | override def alterNamespace(namespace: Array[String], changes: NamespaceChange*): Unit = throw new UnsupportedOperationException("Not supported") 63 | 64 | override def dropNamespace(namespace: Array[String], cascade: Boolean): Boolean = throw new UnsupportedOperationException("Not supported") 65 | 66 | override def listTables(namespace: Array[String]): Array[Identifier] = throw new UnsupportedOperationException("Not supported") 67 | 68 | override def toString = s"${this.getClass.getCanonicalName}($name)" 69 | 70 | override def name(): String = "cdm" 71 | 72 | private def getRequiredArgument(options: CaseInsensitiveStringMap, arg: String): String = { 73 | val result = if (options.containsKey(arg)) options.get(arg) else { 74 | throw new Exception(arg + "argument required") 75 | } 76 | result 77 | } 78 | 79 | def getAuthType(options: CaseInsensitiveStringMap): String = { 80 | val appIdPresent = options.containsKey("appId") 81 | val appKeyPresent = options.containsKey("appKey") 82 | val tenantIdPresent = options.containsKey("tenantId") 83 | val sasTokenPresent = options.containsKey("sasToken") 84 | val result = if (appIdPresent || appKeyPresent|| tenantIdPresent) { 85 | //make sure all creds are present 86 | if (!appIdPresent || !appKeyPresent || !tenantIdPresent) { 87 | throw new Exception("All creds must exist") 88 | } 89 | SparkCDMLogger.log(Level.INFO,"Using app registration for authentication", logger) 90 | CdmAuthType.AppReg.toString() 91 | } else if (sasTokenPresent) { 92 | SparkCDMLogger.log(Level.INFO,"Using SAS token for authentication", logger) 93 | CdmAuthType.Sas.toString() 94 | } else { 95 | SparkCDMLogger.log(Level.INFO, "Using managed identities for authentication", logger) 96 | CdmAuthType.Token.toString() 97 | } 98 | result 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/CDMIdentifier.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import com.microsoft.cdm.utils.CDMOptions 4 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 5 | import org.apache.spark.sql.connector.catalog.Identifier 6 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 7 | 8 | class CDMIdentifier(options: CaseInsensitiveStringMap) extends Identifier{ 9 | private val cdmOptions = new CDMOptions(options) // used to do option validation 10 | 11 | override def namespace(): Array[String] = Array(cdmOptions.storage, cdmOptions.container, cdmOptions.manifestFileName) 12 | 13 | override def name(): String = cdmOptions.entity 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.connector.catalog.{Identifier, SupportsCatalogOptions, Table} 5 | import org.apache.spark.sql.connector.expressions.Transform 6 | import org.apache.spark.sql.types.StructType 7 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 8 | import org.slf4j.LoggerFactory 9 | 10 | 11 | class DefaultSource extends SupportsCatalogOptions{ 12 | 13 | val logger = LoggerFactory.getLogger(classOf[DefaultSource]) 14 | 15 | 
override def inferSchema(options: CaseInsensitiveStringMap): StructType = { 16 | null 17 | } 18 | 19 | override def getTable(structType: StructType, transforms: Array[Transform], map: java.util.Map[String, String]): Table = { 20 | try{ 21 | val caseInsensitiveStringMap = new CaseInsensitiveStringMap(map) 22 | val schema = if (structType != null) { 23 | structType 24 | } else { 25 | inferSchema(caseInsensitiveStringMap) 26 | } 27 | new SparkTable(schema, caseInsensitiveStringMap) 28 | } catch { 29 | case _ : Exception => { 30 | null 31 | } 32 | } 33 | } 34 | 35 | override def supportsExternalMetadata(): Boolean = true 36 | 37 | override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = { 38 | val spark = SparkSession.active; 39 | spark.conf.set("spark.sql.catalog.cdm", "com.microsoft.cdm.CDMCatalog") 40 | val cdmcatalog = spark.sessionState.catalogManager.catalog("cdm") 41 | cdmcatalog.asInstanceOf[CDMCatalog].setupOptions(options) 42 | new CDMIdentifier(options) 43 | } 44 | 45 | override def extractCatalog(options: CaseInsensitiveStringMap): String = { 46 | "cdm" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/HadoopTables.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import com.microsoft.cdm.utils.{CDMEntity, CDMModelCommon, CDMOptions, CDMTokenProvider, CdmAuthType, EntityNotFoundException, ManifestNotFoundException, SerializedABFSHadoopConf, SparkSerializableConfiguration} 4 | 5 | class HadoopTables() { 6 | 7 | 8 | def load(cdmOptions: CDMOptions): CDMEntity = { 9 | val serializedHadoopConf = SerializedABFSHadoopConf.getConfiguration(cdmOptions.storage, cdmOptions.container, cdmOptions.auth, cdmOptions.conf) 10 | 11 | val tokenProvider = if (cdmOptions.auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopConf, cdmOptions.storage)) else None 12 | 13 | val cdmModel = new CDMModelCommon(cdmOptions.storage, 14 | cdmOptions.container, 15 | cdmOptions.manifestPath, 16 | cdmOptions.manifestFileName, 17 | cdmOptions.entity, 18 | "", 19 | "", 20 | cdmOptions.auth, tokenProvider, 21 | cdmOptions.overrideConfigPath, 22 | cdmOptions.cdmSource, 23 | "", 24 | cdmOptions.maxCDMThreads) 25 | 26 | val cdmEntity = cdmModel.entityExists(cdmOptions.entity, serializedHadoopConf) 27 | 28 | if(cdmEntity.rootManifest == null) { 29 | throw ManifestNotFoundException("Manifest doesn't exist: " + cdmOptions.manifestFileName) 30 | } 31 | if (cdmEntity.entityDec != null ) { 32 | cdmEntity.schema = cdmModel.getSchema(cdmEntity.parentManifest, cdmEntity.entityDec) 33 | cdmEntity 34 | } else { 35 | throw EntityNotFoundException("Entity " + cdmOptions.entity + " not found in manifest - " + cdmOptions.manifestFileName) 36 | } 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/SparkTable.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import java.util 4 | 5 | import com.microsoft.cdm.read.{CDMReadOptions, CDMScanBuilder} 6 | import com.microsoft.cdm.write.{CDMWriteOptions, CDMWriterBuilder} 7 | import org.apache.spark.sql.SaveMode 8 | import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} 9 | import org.apache.spark.sql.connector.read.ScanBuilder 10 | import 
org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | class SparkTable(schema: StructType, options: CaseInsensitiveStringMap) extends Table 17 | with SupportsRead 18 | with SupportsWrite { 19 | 20 | 21 | override def name(): String = this.getClass.toString 22 | 23 | override def schema(): StructType = schema 24 | 25 | override def capabilities(): util.Set[TableCapability] = Set( 26 | TableCapability.ACCEPT_ANY_SCHEMA, 27 | TableCapability.BATCH_WRITE, 28 | TableCapability.BATCH_READ, 29 | TableCapability.OVERWRITE_BY_FILTER, 30 | TableCapability.OVERWRITE_DYNAMIC, 31 | TableCapability.TRUNCATE).asJava 32 | 33 | 34 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { 35 | new CDMScanBuilder(new CDMReadOptions(options)) 36 | } 37 | 38 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { 39 | new CDMWriterBuilder(info.queryId(), info.schema(), SaveMode.Append, new CDMWriteOptions(options)) 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/log/SparkCDMLogger.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.log 2 | 3 | import com.microsoft.cdm.utils.{Constants, Environment, SparkPlatform} 4 | import com.microsoft.spark.metricevents.{ComponentEventPublisher, ComponentSparkEvent} 5 | import org.slf4j.Logger 6 | import org.slf4j.event.Level 7 | 8 | object SparkCDMLogger { 9 | var APPNAME: String = "Spark-CDM Connector" 10 | 11 | // Application log 12 | def log(loglevel: Level, message: String, logger: Logger) = { 13 | loglevel match { 14 | case Level.ERROR => logger.error(message) 15 | case Level.INFO => logger.info(message) 16 | case Level.DEBUG => logger.debug(message) 17 | case Level.WARN => logger.warn(message) 18 | case _ => 19 | } 20 | } 21 | 22 | def logEventToKusto(className: String, methodName: String, loglevel: Level, message: String, logger: Option[Logger] = None): Unit = { 23 | if(logger.getOrElse(null) != null) { 24 | log(loglevel, message, logger.get) 25 | } 26 | // Log to Kusto only on Synapse 27 | if(SparkPlatform.Synapse == Environment.sparkPlatform && Constants.KUSTO_ENABLED) { 28 | val event = ComponentSparkEvent(APPNAME, className, methodName, None, None, None, Some(message), loglevel) 29 | ComponentEventPublisher.publishComponentEvent(event) 30 | } 31 | } 32 | 33 | /* Log event to kusto to know performance of @param code */ 34 | def logEventToKustoForPerf[T](code: => T, className: String, methodName: String, loglevel: Level, message: String, logger: Option[Logger] = None): T = { 35 | if(logger.getOrElse(null) != null) { 36 | log(loglevel, message, logger.get) 37 | } 38 | // Log to Kusto only on Synapse 39 | if(SparkPlatform.Synapse == Environment.sparkPlatform && Constants.KUSTO_ENABLED) { 40 | val event = ComponentSparkEvent(APPNAME, className, methodName, None, None, None, Some(message), loglevel) 41 | ComponentEventPublisher.publishComponentEventFor(code, event) 42 | }else{ 43 | code 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMDataReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | 4 | import 
com.microsoft.cdm.utils.{Constants, DataConverter, Messages} 5 | import org.apache.commons.io.IOUtils 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.read.PartitionReader 8 | import org.apache.spark.sql.types.StructType 9 | 10 | import java.io.{File, FileOutputStream, InputStream} 11 | 12 | /** 13 | * Reads a single partition of CDM data. 14 | * @param csvPath ADLSgen2 URI of partition data in CSV format. 15 | * @param schema Spark schema of the data in the CSV file 16 | * 17 | */ 18 | 19 | @SerialVersionUID(100L) 20 | class CDMDataReader(val storage: String, 21 | val container: String, 22 | val fileReader: ReaderConnector, 23 | val header: Boolean, 24 | var schema: StructType, 25 | var dataConverter: DataConverter, 26 | val mode: String) extends PartitionReader[InternalRow] with Serializable { 27 | var cnt = 0 28 | var row: Array[Any] = _ 29 | var stream: InputStream =_ 30 | var headerRead = false 31 | fileReader.build 32 | /** 33 | * Called by the Spark runtime. 34 | * @return Boolean indicating whether there is any data left to read. 35 | */ 36 | def next: Boolean = { 37 | if (header && !headerRead) { 38 | fileReader.readRow 39 | 40 | //TODO: verify header names match with what we have in CDM 41 | println("TODO: Verify header names match") 42 | headerRead = true 43 | } 44 | 45 | row = fileReader.readRow 46 | row != null 47 | } 48 | 49 | /** 50 | * Called by the Spark runtime if there is data left to read. 51 | * @return The next row of data. 52 | */ 53 | def get: InternalRow = { 54 | if(mode == Constants.FAILFAST && row.length != schema.fields.length) { 55 | throw new Exception(Messages.incompatibleFileWithDataframe) 56 | } 57 | var seq: Seq[Any] = null 58 | var setRowToNull = false 59 | // When there are more columns in the CSV file than the # of attributes in the cdm entity file. 60 | if (row.length > schema.fields.length) { 61 | seq = schema.zipWithIndex.map{ case (col, index) => 62 | val dataType = schema.fields(index).dataType 63 | fileReader.jsonToData(dataType, row.apply(index), mode) 64 | } 65 | } else if (row.length < schema.fields.length) { 66 | // When there are fewer columns in the CSV file the # of attributes in cdm entity file at the end 67 | seq = schema.zipWithIndex.map{ case (col, index) => 68 | if (index >= row.length) { 69 | null 70 | } else { 71 | val dataType = schema.fields(index).dataType 72 | fileReader.jsonToData(dataType, row.apply(index), mode) 73 | } 74 | } 75 | } else { 76 | seq = row.zipWithIndex.map { case (col, index) => 77 | val dataType = schema.fields(index).dataType 78 | fileReader.jsonToData(dataType, row.apply(index), mode) 79 | } 80 | } 81 | 82 | /* 83 | * If we want to return null for the entire row if any entity fails to be converted, uncomment 84 | */ 85 | /* 86 | if (!fileReader.isValidRow) { 87 | seq = schema.zipWithIndex.map { _ =>null} 88 | } 89 | */ 90 | 91 | InternalRow.fromSeq(seq) 92 | } 93 | 94 | /** 95 | * Called by the Spark runtime. 
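   * Invoked once the partition has been fully consumed; delegates to the underlying ReaderConnector's close().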
96 | */ 97 | def close(): Unit = { 98 | fileReader.close 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMInputPartition.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.utils.DataConverter 4 | import org.apache.spark.sql.connector.read.InputPartition 5 | import org.apache.spark.sql.types.StructType 6 | 7 | /** 8 | * Factory class for creating a CDMDataReader responsible for reading a single partition of CDM data. 9 | * @param remoteCSVPath ADLSgen2 URI of partition data in CSV format. 10 | * @param schema Spark schema of the data in the CSV file 11 | * @param adlProvider Provider for ADLSgen2 data 12 | * @param dataConverter Converts CSV data into types according to schema 13 | */ 14 | case class CDMInputPartition(val storage: String, 15 | val container: String, 16 | val fileReader: ReaderConnector, 17 | val header: Boolean, 18 | var schema: StructType, 19 | var dataConverter: DataConverter, 20 | val mode: String) extends InputPartition { 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMPartitionReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.InternalRow 5 | import org.apache.spark.sql.connector.read.PartitionReader 6 | import org.apache.spark.unsafe.types.UTF8String 7 | 8 | class CDMPartitionReader(inputPartition: CDMInputPartition) extends PartitionReader[InternalRow]{ 9 | 10 | var index = 0 11 | val values = Array("1", "2", "3", "4", "5") 12 | 13 | var iterator: Iterator[String] = null 14 | 15 | @transient 16 | def next: Boolean = index < values.length 17 | 18 | def get = { 19 | val stringValue = values(index) 20 | val stringUtf = UTF8String.fromString(stringValue) 21 | val row = InternalRow(stringUtf) 22 | index = index + 1 23 | row 24 | } 25 | 26 | def close() = Unit 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import org.apache.spark.sql.catalyst.InternalRow 4 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} 5 | import org.apache.spark.sql.types.StructType 6 | 7 | class CDMPartitionReaderFactory() extends PartitionReaderFactory { 8 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { 9 | val p = partition.asInstanceOf[CDMInputPartition] 10 | new CDMDataReader(p.storage, 11 | p.container, 12 | p.fileReader, 13 | p.header, 14 | p.schema, 15 | p.dataConverter, 16 | p.mode) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMReadOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.utils.{CDMOptions, Constants, Messages} 4 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 5 | 6 | class CDMReadOptions(options: CaseInsensitiveStringMap) extends CDMOptions(options) { 7 | 8 | Constants.MODE = "read" 
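  /*
   * Reader "mode" semantics (validated just below): "failfast" — the default — throws when a
   * row's column count differs from the entity schema or a date/time value cannot be parsed,
   * "permissive" turns such values into nulls instead, and "dropmalformed" is rejected as
   * unsupported. Illustrative usage: add .option("mode", "permissive") to the read.
   */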
9 | 10 | val mode = if(options.containsKey("mode")) options.get("mode") else Constants.FAILFAST; 11 | // if mode is specified, it needs to either failfast or permissive 12 | if(Constants.DROPMALFORMED.equalsIgnoreCase(mode)) { 13 | throw new IllegalArgumentException(String.format(Messages.dropMalformedNotSupported)) 14 | } else if(!Constants.PERMISSIVE.equalsIgnoreCase(mode) && !Constants.FAILFAST.equalsIgnoreCase(mode)) { 15 | throw new IllegalArgumentException(String.format(Messages.invalidMode)) 16 | } 17 | 18 | var entDefContAndPath = "" 19 | var entityDefinitionStorage = "" 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMScanBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | 4 | import com.microsoft.cdm.utils.{DataConverter} 5 | import org.apache.spark.sql.connector.read.{Scan, ScanBuilder} 6 | import org.slf4j.LoggerFactory 7 | 8 | class CDMScanBuilder (cdmOptions: CDMReadOptions) extends ScanBuilder { 9 | val logger = LoggerFactory.getLogger(classOf[CDMScanBuilder]) 10 | 11 | override def build(): Scan = new CDMSimpleScan(cdmOptions.storage, 12 | cdmOptions.container, 13 | cdmOptions.manifestPath, 14 | cdmOptions.manifestFileName, 15 | cdmOptions.entity, 16 | cdmOptions.entDefContAndPath, 17 | cdmOptions.auth, 18 | cdmOptions.conf, 19 | new DataConverter(), 20 | cdmOptions.cdmSource, 21 | cdmOptions.entityDefinitionStorage, 22 | cdmOptions.maxCDMThreads, 23 | cdmOptions.mode) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMSimpleScan.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.log.SparkCDMLogger 4 | import com.microsoft.cdm.utils.{Auth, CDMModelReader, CDMSource, CDMTokenProvider, CdmAuthType, Constants, DataConverter, Messages, SerializedABFSHadoopConf, SparkSerializableConfiguration} 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan} 7 | import org.apache.spark.sql.types.StructType 8 | import org.slf4j.LoggerFactory 9 | import org.slf4j.event.Level 10 | import java.net.URLDecoder 11 | import com.microsoft.commondatamodel.objectmodel.cdm.CdmTraitReference 12 | import scala.collection.JavaConverters._ 13 | 14 | 15 | case class CDMPartition(partitionNumber: Int, header: Boolean=true) extends InputPartition 16 | 17 | class CDMSimpleScan(val storage: String, 18 | val container: String, 19 | val manifestPath: String, 20 | val manifestFileName: String, 21 | val entityName: String, 22 | val entDefContAndPath: String, 23 | val auth: Auth, 24 | val conf:Configuration, 25 | val dataConverter: DataConverter, 26 | val cdmSource: CDMSource.Value, 27 | val entityDefinitionStorage: String, 28 | val maxCDMThreads: Int, 29 | val mode: String) extends Scan with Batch{ 30 | 31 | val logger = LoggerFactory.getLogger(classOf[CDMSimpleScan]) 32 | 33 | val serializedHadoopOConf= SerializedABFSHadoopConf.getConfiguration(storage, container, auth, conf) 34 | 35 | val tokenProvider = if (auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopOConf, storage)) else None 36 | 37 | val cdmModel = new CDMModelReader(storage, container, manifestPath, manifestFileName, entityName, entDefContAndPath, auth, 
tokenProvider, cdmSource, entityDefinitionStorage, 38 | maxCDMThreads) 39 | 40 | //TODO: Make this a accessor class to retrieve tuple items 41 | var entity = cdmModel.entityDecHandleError(entityName, serializedHadoopOConf) 42 | 43 | override def readSchema() = { 44 | SparkCDMLogger.logEventToKustoForPerf({ 45 | cdmModel.getSchema(entity.parentManifest, entity.entityDec) 46 | },this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Reading CDM entity and convert it to spark schema", Some(logger)) 47 | } 48 | 49 | override def toBatch: Batch = this 50 | 51 | def getReader(fType: String, uriPath: String, filePath: String, schema: StructType, serializedHadoopConf: SparkSerializableConfiguration, delimiter: Char): ReaderConnector ={ 52 | return fType match { 53 | case "is.partition.format.CSV" => new CSVReaderConnector(uriPath, filePath, serializedHadoopConf, delimiter, mode) 54 | case "is.partition.format.parquet" => new ParquetReaderConnector(uriPath, filePath, schema, serializedHadoopConf) 55 | } 56 | } 57 | override def planInputPartitions(): Array[InputPartition] = { 58 | /* Fetch the partitions and their names from the CDMModel*/ 59 | val factoryList = new java.util.ArrayList[InputPartition] 60 | val man = entity.parentManifest 61 | val eDec = entity.entityDec 62 | 63 | // Calling fileStatusCheckAsync() on model.json breaks because the CDM library 64 | // assumes a "entity".cdm.json file exists 65 | if (manifestFileName != Constants.MODEL_JSON) { 66 | entity.entityDec.fileStatusCheckAsync().get() 67 | } 68 | 69 | for(partition <- eDec.getDataPartitions.asScala) { 70 | // the relative location of the path 71 | val loc = cdmModel.getRelPath(partition.getLocation) 72 | assert(!loc.startsWith("https:/")) 73 | val absPath = cdmModel.cdmCorpus.getStorage.createAbsoluteCorpusPath(loc,eDec) 74 | //The full path to data partition with the adapter stripped off 75 | val relPath= cdmModel.getRelPath(absPath) 76 | 77 | val uriPrefix = "https://"+storage+container 78 | 79 | // Decode strings because hadoop cannot parse URI-encoded strings 80 | val decodedFilePath = URLDecoder.decode(manifestPath + relPath, "UTF-8") 81 | 82 | //we track the header and pass it in to the reader so that we know if the first line is a header row 83 | var header = false 84 | val schema = readSchema(); 85 | var delimiter = Constants.DEFAULT_DELIMITER 86 | val fileReader = { 87 | 88 | if (partition.getExhibitsTraits.asScala.size > 0 && 89 | partition.getExhibitsTraits.asScala.find(_.getNamedReference.startsWith("is.partition.format")) != None) { 90 | val traits = partition.getExhibitsTraits.asScala.find(_.getNamedReference.startsWith("is.partition.format")).get 91 | assert(traits.getNamedReference == "is.partition.format.CSV" || 92 | traits.getNamedReference == "is.partition.format.parquet") 93 | // if arguments are defined, determine whether the the files have headers prepended to them 94 | val arguments = traits.asInstanceOf[CdmTraitReference].getArguments().asScala 95 | val headerArg = arguments.find(_.getName() == "columnHeaders") 96 | if (headerArg != None) { 97 | header = headerArg.get.getValue().toString.toBoolean 98 | } 99 | val delimiterArg = arguments.find(_.getName() == "delimiter") 100 | if (delimiterArg != None) { 101 | val strDelimiter = delimiterArg.get.getValue.toString; 102 | if(strDelimiter.length > 1) throw new IllegalArgumentException(String.format(Messages.invalidDelimiterCharacter, strDelimiter)) 103 | delimiter = strDelimiter.charAt(0) 104 | } 105 | val reader = 
getReader(traits.getNamedReference, uriPrefix, decodedFilePath, schema, serializedHadoopOConf, delimiter) 106 | if (reader.isInstanceOf[ParquetReaderConnector] && Constants.PERMISSIVE.equalsIgnoreCase(mode)) { 107 | throw new IllegalArgumentException(String.format(Messages.invalidPermissiveMode)) 108 | } 109 | reader 110 | } else { 111 | SparkCDMLogger.log(Level.DEBUG, "No Named Reference Trait \"is.partition.format\" (CSV/Parquet", logger) 112 | new CSVReaderConnector(uriPrefix, decodedFilePath, serializedHadoopOConf, delimiter, mode) 113 | } 114 | } 115 | 116 | factoryList.add(new CDMInputPartition(storage, container, fileReader, header, readSchema(), dataConverter, mode)) 117 | } 118 | SparkCDMLogger.log(Level.DEBUG, "Count of partitions - "+eDec.getDataPartitions.size() + " Entity - " + eDec.getEntityName + " Manifest -"+ man.getManifestName, logger) 119 | factoryList.asScala.toArray 120 | } 121 | 122 | override def createReaderFactory(): PartitionReaderFactory = { 123 | new CDMPartitionReaderFactory( ) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CSVReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.net.URLDecoder 4 | import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZoneId} 5 | import java.time.format.{DateTimeFormatter, DateTimeParseException} 6 | import java.time.temporal.ChronoUnit 7 | import com.microsoft.cdm.utils.{Constants, CsvParserFactory, SparkSerializableConfiguration} 8 | import com.microsoft.cdm.log.SparkCDMLogger 9 | import com.univocity.parsers.csv.CsvParser 10 | import org.apache.hadoop.fs.Path 11 | import org.apache.parquet.hadoop.util.HadoopInputFile 12 | import org.apache.spark.sql.types.{BooleanType, ByteType, DataType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, TimestampType} 13 | import org.apache.spark.unsafe.types.UTF8String 14 | import org.slf4j.LoggerFactory 15 | import org.slf4j.event.Level 16 | 17 | class CSVReaderConnector(httpPrefix:String, filePath: String, serConf:SparkSerializableConfiguration, delimiter: Char, mode: String) extends ReaderConnector { 18 | val logger = LoggerFactory.getLogger(classOf[CSVReaderConnector]) 19 | SparkCDMLogger.log(Level.DEBUG, "CSV Reader for partition at path: " + httpPrefix + filePath, logger) 20 | 21 | private var goodRow:Boolean = true 22 | private var parser: CsvParser = _ 23 | private val dateFormatStrings = List( 24 | "yyyy-MM-dd", 25 | "M/d/yyyy" ) 26 | 27 | private val localTimeFormatsNonStandard= List( 28 | "M/d/yyyy H:mm", 29 | "M/d/yyyy h:mm:ss a", 30 | "M/d/yyyy H:mm:ss", 31 | "yyyy-MM-dd H:mm:ss.S", 32 | "yyyy-MM-dd H:mm:ss.SS", 33 | "yyyy-MM-dd H:mm:ss.SSS", 34 | "yyyy-MM-dd H:mm:ss.SSSS", 35 | "yyyy-MM-dd H:mm:ss.SSSSS", 36 | "yyyy-MM-dd H:mm:ss.SSSSSS", 37 | "yyyy-MM-dd H:mm:ss", 38 | "MMM d yyyy h:mma") 39 | 40 | private val timeFormatStrings = List( 41 | "HH:mm:ss", 42 | "HH:mm:ss.S", 43 | "HH:mm:ss.SS", 44 | "HH:mm:ss.SSS", 45 | "HH:mm:ss.SSSS", 46 | "HH:mm:ss.SSSSS", 47 | "HH:mm:ss.SSSSSS") 48 | 49 | def build: Unit = { 50 | try { 51 | val path = new Path(filePath) 52 | val inputFile = HadoopInputFile.fromPath(path, serConf.value) 53 | val inputStream = inputFile.newStream() 54 | parser = CsvParserFactory.build(delimiter) 55 | parser.beginParsing { 56 | inputStream 57 | } 58 | } catch { 59 | case e: Throwable => 
SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 60 | } 61 | } 62 | 63 | def close(): Unit = { 64 | } 65 | 66 | def readRow(): Array[Any] = { 67 | val arr = parser.parseNext() 68 | arr.asInstanceOf[Array[Any]] 69 | } 70 | 71 | /* if the conversion failed, value is null, return false to indicate that whole row should be null */ 72 | def checkResult(ret: Any): (Any, Boolean) = { 73 | if (ret == null) { 74 | (ret, false) 75 | } else { 76 | (ret, true) 77 | } 78 | } 79 | 80 | def isValidRow(): Boolean = goodRow 81 | 82 | def jsonToData(dt: DataType, value: Any, mode: String): Any= { 83 | /* null is a valid value */ 84 | if (value == null) { 85 | null 86 | } else { 87 | 88 | val result = dt match { 89 | case ByteType => util.Try(value.toString.toByte).getOrElse(null) 90 | case ShortType => util.Try(value.toString.toShort).getOrElse(null) 91 | case IntegerType => util.Try(value.toString.toInt).getOrElse(null) 92 | case LongType => util.Try(value.toString.toLong).getOrElse(null) 93 | case DoubleType => util.Try(value.toString.toDouble).getOrElse(null) 94 | case FloatType => util.Try(value.toString.toFloat).getOrElse(null) 95 | case DecimalType() => util.Try(Decimal(value.toString)).getOrElse(null) 96 | case BooleanType => util.Try(value.toString.toBoolean).getOrElse(null) 97 | case DateType => { 98 | if (value != None && value != null) { 99 | val date = tryParseDate(value.toString, mode) 100 | 101 | /* If we can't parse the date we return a null. This enables permissive mode to work*/ 102 | if (date == null) { 103 | null 104 | } else { 105 | date.toEpochDay.toInt 106 | } 107 | } else { 108 | null 109 | } 110 | } 111 | case StringType => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 112 | case TimestampType => { 113 | if (value != None && value != null) { 114 | val date = tryParseDateTime(value.toString, mode) 115 | 116 | /* If we can't parse the date we return a null. This enables permissive mode to work*/ 117 | if (date == null) { 118 | null 119 | } else { 120 | date 121 | } 122 | } else { 123 | null 124 | } 125 | } 126 | case _ => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 127 | } 128 | if (result == null) { 129 | val msg = "Mode: " + mode + ". Could not parse " + value + " as " + dt.simpleString + ", converting to null" 130 | SparkCDMLogger.log(Level.ERROR, msg, logger) 131 | 132 | /* 133 | *If we want pure fail-fast, we should add below exception 134 | */ 135 | /* 136 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 137 | throw new IllegalArgumentException(msg) 138 | }*/ 139 | goodRow = false 140 | } 141 | result 142 | } 143 | } 144 | 145 | def tryParseDate(dateString: String, mode: String): LocalDate= { 146 | for (formatString <- dateFormatStrings) { 147 | try { 148 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 149 | val localDate= LocalDate.parse(dateString, dateTimeFormatter) 150 | return localDate 151 | } catch { 152 | case e: DateTimeParseException=> 153 | } 154 | } 155 | 156 | val msg = "Mode: " + mode + ". 
Could not parse " + dateString + " using any possible format" 157 | SparkCDMLogger.log(Level.ERROR, msg, logger) 158 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 159 | throw new IllegalArgumentException(msg) 160 | } 161 | null 162 | } 163 | 164 | def tryParseDateTime(dateString: String, mode: String): java.lang.Long = { 165 | 166 | val localTimeFormats = List(DateTimeFormatter.ISO_OFFSET_DATE_TIME, 167 | DateTimeFormatter.ISO_INSTANT) 168 | 169 | /* Conversions that to local time first */ 170 | for (format <- localTimeFormats) { 171 | var instant: Instant = null; 172 | try { 173 | val i = Instant.from(format.parse(dateString)) 174 | val zt = i.atZone(ZoneId.systemDefault()) 175 | instant = zt.toLocalDateTime.atZone(ZoneId.systemDefault()).toInstant(); 176 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 177 | } catch { 178 | case e: ArithmeticException => { 179 | return instant.toEpochMilli()*1000 180 | } 181 | case e: DateTimeParseException=> 182 | } 183 | } 184 | 185 | /* Local Time formatting */ 186 | for (format <- List(DateTimeFormatter.ISO_LOCAL_DATE_TIME)) { 187 | var instant: Instant = null 188 | try { 189 | val localDateTime = LocalDateTime.parse(dateString, format) 190 | instant = localDateTime.atZone(ZoneId.systemDefault()).toInstant(); 191 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 192 | } catch { 193 | case e: ArithmeticException => { 194 | return instant.toEpochMilli()*1000 195 | } 196 | case e: DateTimeParseException => 197 | } 198 | } 199 | 200 | /* Non-common formats in local time */ 201 | for (formatString <- localTimeFormatsNonStandard) { 202 | var instant: Instant = null 203 | try { 204 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 205 | val localDateTime = LocalDateTime.parse(dateString, dateTimeFormatter) 206 | /* Assume non-standard times are in UTC */ 207 | instant = localDateTime.atZone(ZoneId.of("UTC")).toInstant(); 208 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 209 | } catch { 210 | case e: ArithmeticException => { 211 | return instant.toEpochMilli()*1000 212 | } 213 | case e: DateTimeParseException => 214 | } 215 | } 216 | 217 | /* Just Dates (no-time element) formats formatting */ 218 | for (formatString <- dateFormatStrings) { 219 | var instant: Instant = null 220 | try { 221 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 222 | val localDate = LocalDate.parse(dateString, dateTimeFormatter) 223 | val localDateTime1 = localDate.atStartOfDay(); 224 | instant = localDateTime1.atZone(ZoneId.of("UTC")).toInstant(); 225 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 226 | } catch { 227 | case e: ArithmeticException => { 228 | return instant.toEpochMilli()*1000 229 | } 230 | case e: DateTimeParseException => 231 | } 232 | } 233 | 234 | /* Finally, this could just be a Time - Try that */ 235 | for (formatString <- timeFormatStrings) { 236 | var instant: Instant = null 237 | try { 238 | val formatterTime1 = DateTimeFormatter.ofPattern(formatString) 239 | val ls = LocalTime.parse(dateString, formatterTime1) 240 | instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.of("UTC")).toInstant 241 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 242 | } catch { 243 | case e: ArithmeticException => { 244 | return instant.toEpochMilli()*1000 245 | } 246 | case e: DateTimeParseException => 247 | } 248 | } 249 | 250 | 251 | val msg = "Mode: " + mode + ". 
Could not parse " + dateString + " using any possible format" 252 | SparkCDMLogger.log(Level.ERROR, msg, logger) 253 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 254 | throw new IllegalArgumentException(msg) 255 | } 256 | null 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/ParquetReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectOutputStream} 4 | import java.math.BigInteger 5 | import java.text.SimpleDateFormat 6 | import java.time.temporal.{ChronoField} 7 | import java.time.{Instant, LocalDate, ZoneId} 8 | import java.util.Base64 9 | import java.nio.charset.StandardCharsets.UTF_8 10 | import com.microsoft.cdm.utils.{Constants, SparkSerializableConfiguration} 11 | import com.microsoft.cdm.log.SparkCDMLogger 12 | import org.apache.commons.io.output.ByteArrayOutputStream 13 | import org.apache.hadoop.fs.Path 14 | import org.apache.parquet.column.page.PageReadStore 15 | import org.apache.parquet.example.data.Group 16 | import org.apache.parquet.example.data.simple.{NanoTime, SimpleGroup} 17 | import org.apache.parquet.example.data.simple.convert.GroupRecordConverter 18 | import org.apache.parquet.hadoop.ParquetFileReader 19 | import org.apache.parquet.io.{ColumnIOFactory, MessageColumnIO, RecordReader} 20 | import org.apache.parquet.schema.{MessageType, OriginalType} 21 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.types.{ArrayType, BooleanType, ByteType, DataType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType, TimestampType} 24 | import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} 25 | import org.apache.spark.unsafe.types.UTF8String 26 | import org.slf4j.LoggerFactory 27 | import org.slf4j.event.Level 28 | 29 | 30 | class ParquetReaderConnector(httpPrefix: String, 31 | filePath: String, 32 | sparkSchema: StructType, 33 | serializedHadoopConf:SparkSerializableConfiguration) extends ReaderConnector { 34 | 35 | /* 36 | * Note. If these variables are initialized in the constructor, we get the "objects are not serializable" 37 | * for all of the classes below. Therefore, we need to wait to initialize the class until it is on the worker. 38 | */ 39 | var i = 0 40 | var rows = 0 41 | var pages: PageReadStore = _ 42 | var schema: MessageType = _ 43 | var reader: ParquetFileReader = _ 44 | var recordReader: RecordReader[Group] = _ 45 | var columnIO: MessageColumnIO = _ 46 | var path:Path = _ 47 | var thisSparkSchema: StructType = _ 48 | 49 | val logger = LoggerFactory.getLogger(classOf[ParquetReaderConnector]) 50 | SparkCDMLogger.log(Level.DEBUG, "Parquet Reader for partition at path: " + httpPrefix + filePath, logger) 51 | 52 | def build() { 53 | try { 54 | val path = new Path(filePath) 55 | val readFooter= ParquetFileReader.readFooter(serializedHadoopConf.value, path) 56 | schema = readFooter.getFileMetaData.getSchema 57 | columnIO = new ColumnIOFactory().getColumnIO(schema); 58 | reader = new ParquetFileReader(serializedHadoopConf.value, path, readFooter) 59 | //does this have to be in a loop? 
60 | pages = reader.readNextRowGroup() 61 | if(pages != null) { 62 | rows = pages.getRowCount().toInt; 63 | recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)) 64 | } 65 | this.thisSparkSchema = sparkSchema 66 | } catch { 67 | case e: Throwable => SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 68 | } 69 | } 70 | 71 | def close (): Unit = { 72 | reader.close() 73 | } 74 | 75 | def readRow(): Array[Any] = { 76 | if (i < rows) { 77 | i += 1 78 | getRowAsString(recordReader.read(), thisSparkSchema) 79 | } else { 80 | pages = reader.readNextRowGroup() 81 | if (pages == null) { 82 | //No more 83 | null 84 | } else { 85 | recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)) 86 | assert(i 164 | arr(field) = Decimal.apply(g.getInteger(field, index), parquetDecimal.getPrecision, parquetDecimal.getScale).toString() 165 | case PrimitiveTypeName.INT64 => 166 | arr(field) = Decimal.apply(g.getLong(field, index), parquetDecimal.getPrecision, parquetDecimal.getScale).toString() 167 | case PrimitiveTypeName.BINARY => 168 | arr(field) = new java.math.BigDecimal(new BigInteger(g.getBinary(field, index).getBytes), parquetDecimal.getScale).toString 169 | case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => 170 | arr(field) = new java.math.BigDecimal(new BigInteger(g.getBinary(field, index).getBytes), parquetDecimal.getScale).toString 171 | } 172 | } else if (fieldType.getOriginalType == OriginalType.TIME_MICROS 173 | && ptype == PrimitiveTypeName.INT64) { 174 | arr(field) = g.getLong(field, index) 175 | } else { 176 | arr(field) = g.getValueToString(field, index) 177 | } 178 | } 179 | }else{ 180 | val listgroup = g.asInstanceOf[SimpleGroup] 181 | if (fieldType.getOriginalType == OriginalType.LIST) { 182 | val elementGroup = listgroup.getGroup(field, index) 183 | /* get how many structs are present in the array */ 184 | val repeatedListSize = elementGroup.getFieldRepetitionCount("list") 185 | val serializedCombinedStruct = new StringBuilder 186 | for(i <- 0 until repeatedListSize) { 187 | /* check if its an empty array*/ 188 | if (elementGroup.getGroup(index, i).toString != "") { 189 | val rowAsVal = getRowAsString(elementGroup.getGroup(index, i).getGroup(0, 0), struct.fields(field).dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]) 190 | serializedCombinedStruct.append(serializeObject(rowAsVal)) 191 | serializedCombinedStruct.append(" ") 192 | } 193 | } 194 | if ( serializedCombinedStruct.length > 0) serializedCombinedStruct.setLength(serializedCombinedStruct.length -1) 195 | arr(field) = serializedCombinedStruct.toString() 196 | } else { 197 | val rowAsVal = getRowAsString(listgroup.getGroup(field, index), struct.fields(field).dataType.asInstanceOf[StructType]); 198 | /* Serializing is necessary because we want to encode the rowAsVal object as a single string. 
199 | rowAsVal is an array of string */ 200 | arr(field) = serializeObject(rowAsVal) 201 | } 202 | } 203 | } 204 | } 205 | arr 206 | } 207 | 208 | def isValidRow(): Boolean = true 209 | 210 | def jsonToData(dt: DataType, value: Any, mode: String): Any = { 211 | return dt match { 212 | case ar: ArrayType => { 213 | util.Try({ 214 | val structs = value.toString.split(" ") 215 | val seq = structs.zipWithIndex.map{ case (col, index) => 216 | val dataType = ar.elementType 217 | jsonToData(dataType, col, mode) 218 | } 219 | ArrayData.toArrayData(seq) 220 | }).getOrElse(null) 221 | } 222 | case BooleanType => util.Try(value.toString.toBoolean).getOrElse(null) 223 | case ByteType => util.Try(value.toString.toByte).getOrElse(null) 224 | case ShortType => util.Try(value.toString.toShort).getOrElse(null) 225 | case DateType => util.Try(getNumberOfDaysFromEpoch(value.toString)).getOrElse(null) 226 | case DecimalType() => util.Try(Decimal(value.toString)).getOrElse(null) 227 | case DoubleType => util.Try(value.toString.toDouble).getOrElse(null) 228 | case FloatType => util.Try(value.toString.toFloat).getOrElse(null) 229 | case IntegerType => util.Try(value.toString.toInt).getOrElse(null) 230 | case LongType => util.Try(value.toString.toLong).getOrElse(null) 231 | case StringType => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 232 | case TimestampType => { 233 | if (value != None && value != null) { 234 | return value.asInstanceOf[Long] 235 | } else { 236 | null 237 | } 238 | } 239 | case st: StructType => { 240 | util.Try({ 241 | /* we decode the binary string to an array of string containing nested values */ 242 | val arr = deSerializeObject(value.toString.getBytes()); 243 | val seq = arr.zipWithIndex.map { case (col, index) => 244 | val dataType = st.fields(index).dataType 245 | jsonToData(dataType, col, mode) 246 | } 247 | val isAllNull = arr.forall(x => x == null) 248 | if (isAllNull) null else InternalRow.fromSeq(seq) 249 | }).getOrElse(null) 250 | } 251 | case _ => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/ReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.TimeZone 5 | import com.microsoft.cdm.utils.Constants 6 | import com.microsoft.cdm.utils.TimestampFormatter 7 | import org.apache.spark.sql.types.DataType 8 | 9 | @SerialVersionUID(100L) 10 | trait ReaderConnector extends Serializable { 11 | 12 | val dateFormatter = new SimpleDateFormat(Constants.SINGLE_DATE_FORMAT) 13 | val timestampFormatter = TimestampFormatter(Constants.TIMESTAMP_FORMAT, TimeZone.getDefault) 14 | 15 | /** 16 | * build() is used as a constructor, to initialize local variables 17 | */ 18 | def build 19 | 20 | /** 21 | * Close any open streams if they exist 22 | */ 23 | def close 24 | 25 | /** 26 | * This method is to used to convert to Spark/CDM data types 27 | * @param dataType 28 | * @param col 29 | * @return 30 | */ 31 | def jsonToData(dataType: DataType, col: Any, mode: String): Any 32 | 33 | 34 | def isValidRow(): Boolean 35 | 36 | /** 37 | * Read a Row as a string. 
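   * Implementations return null once the partition is exhausted; CDMDataReader.next() relies on that as the end-of-data signal.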
38 | * XXX: This is not sufficient for complex types 39 | */ 40 | def readRow: Array[Any] 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMAuthentication.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | object CdmAuthType extends Enumeration { 4 | val AppReg, Sas, Token = Value 5 | } 6 | 7 | trait Auth { 8 | def getAppId: String 9 | def getAppKey: String 10 | def getTenantId: String 11 | def getSASToken: String 12 | def getAuthType: String 13 | } 14 | 15 | case class SasAuth(sasToken: String) extends Auth { 16 | override def getAuthType: String = CdmAuthType.Sas.toString() 17 | override def getSASToken: String = sasToken 18 | override def getAppId: String = "" 19 | override def getAppKey: String = "" 20 | override def getTenantId: String = "" 21 | } 22 | 23 | case class AppRegAuth(appId: String, appKey: String, tenantId: String) extends Auth { 24 | override def getAuthType: String = CdmAuthType.AppReg.toString() 25 | override def getAppId: String = appId 26 | override def getAppKey: String = appKey 27 | override def getTenantId: String = tenantId 28 | override def getSASToken: String = "" 29 | } 30 | 31 | case class TokenAuth() extends Auth { 32 | override def getAuthType: String = CdmAuthType.Token.toString() 33 | override def getAppId: String = "" 34 | override def getAppKey: String = "" 35 | override def getTenantId: String = "" 36 | override def getSASToken: String = "" 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMModelReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | 4 | class CDMModelReader(storage: String, 5 | container: String, 6 | manifestPath: String, 7 | manifestFileName: String, 8 | entityName: String, 9 | entDefContAndPath: String, 10 | auth: Auth, 11 | tokenProvider: Option[CDMTokenProvider], 12 | cdmSource: CDMSource.Value, 13 | entityDefinitionStorage: String, 14 | maxCDMThreads: Int) extends CDMModelCommon (storage, container, manifestPath, manifestFileName, 15 | entityName, "", entDefContAndPath, auth, 16 | tokenProvider, "/", cdmSource, entityDefinitionStorage, 17 | maxCDMThreads){ 18 | 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.cdm.log.SparkCDMLogger 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 7 | import org.slf4j.LoggerFactory 8 | import org.slf4j.event.Level 9 | 10 | class CDMOptions(options: CaseInsensitiveStringMap) { 11 | 12 | val logger = LoggerFactory.getLogger(classOf[CDMOptions]) 13 | 14 | var appId: String = "" 15 | var appKey: String = "" 16 | var tenantId : String = "" 17 | var sasToken: String = "" 18 | var auth: Auth = null 19 | 20 | val storage = getRequiredArgument(options, "storage") 21 | val entity= getRequiredArgument(options,"entity") 22 | val newManifestPath= getRequiredArgument(options,"manifestPath") 23 | 24 | val manipathPathInput = getContainerManifestPathAndFile(newManifestPath) 25 | var manifestPath= manipathPathInput.manifestPath 26 | 
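  /* getContainerManifestPathAndFile (defined below) splits the manifestPath option into its three parts,
   * e.g. "/mycontainer/sales/default.manifest.cdm.json" (placeholder names) yields
   * container = "/mycontainer", manifestPath = "/sales/", manifestFileName = "default.manifest.cdm.json". */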
val manifestFileName = manipathPathInput.manifestFileName 27 | val container = manipathPathInput.container 28 | 29 | val maxCDMThreadsString = if (options.containsKey("maxCDMThreads")) options.get("maxCDMThreads") else "100" 30 | if (!isNumeric(maxCDMThreadsString)) throw new Exception(String.format("%s - %s", Messages.invalidThreadCount, maxCDMThreadsString)) 31 | val maxCDMThreads = maxCDMThreadsString.toInt 32 | if (maxCDMThreads < 1 ) throw new Exception(String.format("%s - %s", Messages.invalidThreadCount, maxCDMThreadsString)) 33 | 34 | val cdmSource = 35 | if (options.containsKey("cdmSource")) { 36 | val cdmSourceValue = options.get("cdmSource") 37 | CDMSource.getValue(cdmSourceValue) 38 | } else 39 | CDMSource.REFERENCED 40 | 41 | 42 | 43 | var conf : Configuration = SparkSession.builder.getOrCreate.sessionState.newHadoopConf() 44 | Environment.sparkPlatform = SparkPlatform.getPlatform(conf) 45 | if (getAuthType(options) == CdmAuthType.AppReg.toString()) { 46 | appId = getRequiredArgument(options,"appId") 47 | appKey = getRequiredArgument(options,"appKey") 48 | tenantId = getRequiredArgument(options,"tenantId") 49 | auth = AppRegAuth(appId, appKey, tenantId) 50 | } else if (getAuthType(options) == CdmAuthType.Sas.toString()) { 51 | sasToken = getRequiredArgument(options,"sasToken") 52 | auth = SasAuth(sasToken) 53 | } else if (getAuthType(options) == CdmAuthType.Token.toString()) { 54 | auth = TokenAuth() 55 | } else { 56 | if (Environment.sparkPlatform == SparkPlatform.Other){ 57 | throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly) 58 | } 59 | } 60 | 61 | def isNumeric(input: String): Boolean = input.forall(_.isDigit) 62 | 63 | private def getRequiredArgument(options: CaseInsensitiveStringMap, arg: String): String = { 64 | val result = if (options.containsKey(arg)) options.get(arg) else { 65 | throw new Exception(s"'$arg' is a required argument!") 66 | } 67 | result 68 | } 69 | 70 | def getAuthType(options: CaseInsensitiveStringMap): String = { 71 | val appIdPresent = options.containsKey("appId") 72 | val appKeyPresent = options.containsKey("appKey") 73 | val tenantIdPresent = options.containsKey("tenantId") 74 | val sasTokenPresent = options.containsKey("sasToken") 75 | val result = if (appIdPresent || appKeyPresent|| tenantIdPresent) { 76 | //make sure all creds are present 77 | if (!appIdPresent || !appKeyPresent || !tenantIdPresent) { 78 | throw new Exception("All creds must exist") 79 | } 80 | SparkCDMLogger.log(Level.INFO,"Using app registration for authentication", logger) 81 | CdmAuthType.AppReg.toString() 82 | } else if (sasTokenPresent) { 83 | SparkCDMLogger.log(Level.INFO,"Using SAS token for authentication", logger) 84 | CdmAuthType.Sas.toString() 85 | } else { 86 | SparkCDMLogger.log(Level.INFO, "Using managed identities for authentication", logger) 87 | CdmAuthType.Token.toString() 88 | } 89 | result 90 | } 91 | 92 | def checkValidFileName(manifestFileName: String) = { 93 | if(manifestFileName != Constants.MODEL_JSON && !manifestFileName.contains(".manifest.cdm.json")) { 94 | throw new Exception(String.format("Invalid manifest filename provided - %s", manifestFileName)) 95 | } 96 | } 97 | 98 | def getContainerManifestPathAndFile(manifestContainerPath: String) = { 99 | 100 | var manifestContainerPathTemp = manifestContainerPath 101 | if(manifestContainerPath.startsWith("/") && manifestContainerPath.length > 1) { 102 | manifestContainerPathTemp = manifestContainerPath.substring(1) 103 | } 104 | val manifestFileNameStartIndex = 
manifestContainerPathTemp.lastIndexOf("/") + 1 105 | val manifestFileName = manifestContainerPathTemp.substring(manifestFileNameStartIndex) 106 | 107 | checkValidFileName(manifestFileName) 108 | 109 | val containerEndIndex = manifestContainerPathTemp.indexOf("/") 110 | if(containerEndIndex == -1) { 111 | throw new Exception("Container is not specified in the manifestPath") 112 | } 113 | var container = manifestContainerPathTemp.substring(0, containerEndIndex) 114 | container = if(container.startsWith("/")) container else "/" + container 115 | 116 | var manifestPath = manifestContainerPathTemp.substring(containerEndIndex, manifestFileNameStartIndex) 117 | manifestPath = if(manifestPath.startsWith("/")) manifestPath else "/" + manifestPath 118 | 119 | ManifestPath(container, manifestPath, manifestFileName) 120 | } 121 | 122 | val fileFormatType = if(options.containsKey("format")) options.get("format") else "csv" 123 | val overrideConfigPathIn = if (options.containsKey("configPath")) options.get("configPath") else "" 124 | val overrideConfigPath = if (overrideConfigPathIn.startsWith("/")) overrideConfigPathIn else "/" + overrideConfigPathIn 125 | 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMParquetSchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.parquet.schema._ 22 | import org.apache.parquet.schema.OriginalType._ 23 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ 24 | import org.apache.parquet.schema.Type.Repetition._ 25 | import org.apache.parquet.schema.Types.MessageTypeBuilder 26 | import org.apache.spark.sql.internal.SQLConf 27 | import org.apache.spark.sql.types._ 28 | 29 | 30 | /** 31 | * This converter class is used to convert Spark SQL [[StructType]] to Parquet [[MessageType]]. 32 | * 33 | * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 34 | * and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. 35 | * When set to false, use standard format defined in parquet-format spec. This argument only 36 | * affects Parquet write path. 37 | * @param outputTimestampType which parquet timestamp type to use when writing. 
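 * The cdmSchema argument passed to convert() carries the CDM type name of each attribute (for
 * example "Time"), which can override the default Spark-to-Parquet mapping for TimestampType fields.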
38 | */ 39 | class CDMSparkToParquetSchemaConverter( 40 | writeLegacyParquetFormat: Boolean = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get, 41 | outputTimestampType: SQLConf.ParquetOutputTimestampType.Value = 42 | SQLConf.ParquetOutputTimestampType.INT96) { 43 | 44 | def this(conf: SQLConf) = this( 45 | writeLegacyParquetFormat = conf.writeLegacyParquetFormat, 46 | outputTimestampType = conf.parquetOutputTimestampType) 47 | 48 | def this(conf: Configuration) = this( 49 | writeLegacyParquetFormat = conf.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean, 50 | outputTimestampType = SQLConf.ParquetOutputTimestampType.withName( 51 | conf.get(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key))) 52 | 53 | def convert(catalystSchema: StructType, cdmSchema: Iterable[Any]): MessageType = { 54 | val bm = Types .buildMessage() 55 | convert(catalystSchema, cdmSchema, bm) 56 | bm.named(SPARK_PARQUET_SCHEMA_NAME) 57 | } 58 | 59 | def convert(schema: StructType, cdmSchema: Iterable[Any], bm : MessageTypeBuilder):Unit = { 60 | val arr = cdmSchema.toArray 61 | schema.fields.zipWithIndex.foreach { case (field, i) => 62 | bm.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED, arr(i))) 63 | } 64 | } 65 | 66 | def convertField(field: StructField, repetition: Type.Repetition, cdmType: Any): Type = { 67 | checkFieldName(field.name) 68 | 69 | field.dataType match { 70 | // =================== 71 | // Simple atomic types 72 | // =================== 73 | 74 | case BooleanType => 75 | Types.primitive(BOOLEAN, repetition).named(field.name) 76 | 77 | case ByteType => 78 | Types.primitive(INT32, repetition).as(INT_8).named(field.name) 79 | 80 | case ShortType => 81 | Types.primitive(INT32, repetition).as(INT_16).named(field.name) 82 | 83 | case IntegerType => 84 | Types.primitive(INT32, repetition).named(field.name) 85 | 86 | case LongType => 87 | Types.primitive(INT64, repetition).named(field.name) 88 | 89 | case FloatType => 90 | Types.primitive(FLOAT, repetition).named(field.name) 91 | 92 | case DoubleType => 93 | Types.primitive(DOUBLE, repetition).named(field.name) 94 | 95 | case StringType => 96 | Types.primitive(BINARY, repetition).as(UTF8).named(field.name) 97 | 98 | case DateType => 99 | Types.primitive(INT32, repetition).as(DATE).named(field.name) 100 | 101 | // NOTE: Spark SQL can write timestamp values to Parquet using INT96, TIMESTAMP_MICROS or 102 | // TIMESTAMP_MILLIS. TIMESTAMP_MICROS is recommended but INT96 is the default to keep the 103 | // behavior same as before. 104 | // 105 | // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond 106 | // timestamp in Impala for some historical reasons. It's not recommended to be used for any 107 | // other types and will probably be deprecated in some future version of parquet-format spec. 108 | // That's the reason why parquet-format spec only defines `TIMESTAMP_MILLIS` and 109 | // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`. 110 | // 111 | // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting 112 | // from Spark 1.5.0, we resort to a timestamp type with microsecond precision so that we can 113 | // store a timestamp into a `Long`. This design decision is subject to change though, for 114 | // example, we may resort to nanosecond precision in the future. 
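      // For example, with TIMESTAMP_MICROS an instant such as 2020-01-01T00:00:00Z is written as
      // the INT64 value 1577836800000000 (microseconds since the Unix epoch).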
115 | case TimestampType => { 116 | /* Spark-CDM: 117 | * * If there is a metadata field overwrite to type Time (implicit write) set the parquet field to type Time 118 | * * IF there is an explicit cdmType type of Type "Time", also set the field to type Time */ 119 | if (field.metadata.contains(Constants.MD_DATATYPE_OVERRIDE) && 120 | field.metadata.getString(Constants.MD_DATATYPE_OVERRIDE).equals(Constants.MD_DATATYPE_OVERRIDE_TIME) 121 | || cdmType.equals("Time")) { 122 | Types.primitive(INT64, repetition).as(OriginalType.TIME_MICROS).named(field.name) 123 | } else { 124 | outputTimestampType match { 125 | case SQLConf.ParquetOutputTimestampType.INT96 => 126 | Types.primitive(INT96, repetition).named(field.name) 127 | case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS => 128 | Types.primitive(INT64, repetition).as(TIMESTAMP_MICROS).named(field.name) 129 | case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MILLIS => 130 | Types.primitive(INT64, repetition).as(TIMESTAMP_MILLIS).named(field.name) 131 | } 132 | } 133 | } 134 | 135 | case BinaryType => 136 | Types.primitive(BINARY, repetition).named(field.name) 137 | 138 | case DecimalType() => { 139 | val decimal = field.dataType.asInstanceOf[DecimalType] 140 | val precision = decimal.precision 141 | val scale = decimal.scale 142 | if (writeLegacyParquetFormat) { 143 | // ====================== 144 | // Decimals (legacy mode) 145 | // ====================== 146 | // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and 147 | // always store decimals in fixed-length byte arrays. To keep compatibility with these older 148 | // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated 149 | // by `DECIMAL`. 150 | Types 151 | .primitive(FIXED_LEN_BYTE_ARRAY, repetition) 152 | .as(DECIMAL) 153 | .precision(precision) 154 | .scale(scale) 155 | .length(Decimal.minBytesForPrecision(precision)) 156 | .named(field.name) 157 | 158 | // ======================== 159 | // Decimals (standard mode) 160 | // ======================== 161 | } else if (precision <= Decimal.MAX_INT_DIGITS) { 162 | Types 163 | .primitive(INT32, repetition) 164 | .as(DECIMAL) 165 | .precision(precision) 166 | .scale(scale) 167 | .named(field.name) 168 | } else if (precision <= Decimal.MAX_LONG_DIGITS) { 169 | Types 170 | .primitive(INT64, repetition) 171 | .as(DECIMAL) 172 | .precision(precision) 173 | .scale(scale) 174 | .named(field.name) 175 | } else { 176 | Types 177 | .primitive(FIXED_LEN_BYTE_ARRAY, repetition) 178 | .as(DECIMAL) 179 | .precision(precision) 180 | .scale(scale) 181 | .length(Decimal.minBytesForPrecision(precision)) 182 | .named(field.name) 183 | } 184 | } 185 | 186 | // =================================== 187 | // ArrayType and MapType (legacy mode) 188 | // =================================== 189 | 190 | // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level 191 | // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro 192 | // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element 193 | // field name "array" is borrowed from parquet-avro. 
194 | case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => 195 | // group (LIST) { 196 | // optional group bag { 197 | // repeated array; 198 | // } 199 | // } 200 | 201 | // This should not use `listOfElements` here because this new method checks if the 202 | // element name is `element` in the `GroupType` and throws an exception if not. 203 | // As mentioned above, Spark prior to 1.4.x writes `ArrayType` as `LIST` but with 204 | // `array` as its element name as below. Therefore, we build manually 205 | // the correct group type here via the builder. (See SPARK-16777) 206 | Types 207 | .buildGroup(repetition).as(LIST) 208 | .addField(Types 209 | .buildGroup(REPEATED) 210 | // "array" is the name chosen by parquet-hive (1.7.0 and prior version) 211 | .addField(convertField(StructField("array", elementType, nullable),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 212 | .named("bag")) 213 | .named(field.name) 214 | 215 | // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level 216 | // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is 217 | // covered by the backwards-compatibility rules implemented in `isElementType()`. 218 | case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => 219 | // group (LIST) { 220 | // repeated element; 221 | // } 222 | 223 | // Here too, we should not use `listOfElements`. (See SPARK-16777) 224 | Types 225 | .buildGroup(repetition).as(LIST) 226 | // "array" is the name chosen by parquet-avro (1.7.0 and prior version) 227 | .addField(convertField(StructField("array", elementType, nullable), REPEATED, cdmType.asInstanceOf[Iterable[Any]])) 228 | .named(field.name) 229 | 230 | // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by 231 | // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
232 | case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => 233 | // group (MAP) { 234 | // repeated group map (MAP_KEY_VALUE) { 235 | // required key; 236 | // value; 237 | // } 238 | // } 239 | ConversionPatterns.mapType( 240 | repetition, 241 | field.name, 242 | convertField(StructField("key", keyType, nullable = false),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]]), 243 | convertField(StructField("value", valueType, valueContainsNull),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 244 | 245 | // ===================================== 246 | // ArrayType and MapType (standard mode) 247 | // ===================================== 248 | 249 | case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => 250 | // group (LIST) { 251 | // repeated group list { 252 | // element; 253 | // } 254 | // } 255 | Types 256 | .buildGroup(repetition).as(LIST) 257 | .addField( 258 | Types.repeatedGroup() 259 | .addField(convertField(StructField("element", elementType, containsNull), if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 260 | .named("list")) 261 | .named(field.name) 262 | 263 | case MapType(keyType, valueType, valueContainsNull) => 264 | // group (MAP) { 265 | // repeated group key_value { 266 | // required key; 267 | // value; 268 | // } 269 | // } 270 | Types 271 | .buildGroup(repetition).as(MAP) 272 | .addField( 273 | Types 274 | .repeatedGroup() 275 | .addField(convertField(StructField("key", keyType, nullable = false),if (field.nullable) OPTIONAL else REQUIRED, cdmType.asInstanceOf[Iterable[Any]])) 276 | .addField(convertField(StructField("value", valueType, valueContainsNull),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 277 | .named("key_value")) 278 | .named(field.name) 279 | 280 | // =========== 281 | // Other types 282 | // =========== 283 | 284 | case StructType(fields) => { 285 | val bg = Types.buildGroup(repetition) 286 | fields.zipWithIndex.foreach{ 287 | case (field, fieldIndex) => { 288 | val cdmStruct = cdmType.asInstanceOf[List[Any]] 289 | bg.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED, 290 | cdmStruct(fieldIndex))) 291 | } 292 | } 293 | bg.named(field.name) 294 | } 295 | /* fields.foldLeft(Types.buildGroup(repetition)) {(builder, field) => { 296 | builder.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 297 | } 298 | }.named(field.name) 299 | 300 | */ 301 | 302 | //case udt: UserDefinedType[_] => 303 | // convertField(field.copy(dataType = udt.sqlType)) 304 | 305 | case _ => 306 | throw new Exception(s"Unsupported data type ${field.dataType.catalogString}") 307 | } 308 | } 309 | private val SPARK_PARQUET_SCHEMA_NAME = "spark_schema" 310 | 311 | private val EMPTY_MESSAGE: MessageType = 312 | Types.buildMessage().named(SPARK_PARQUET_SCHEMA_NAME) 313 | 314 | private def checkFieldName(name: String): Unit = { 315 | // ,;{}()\n\t= and space are special characters in Parquet schema 316 | checkConversionRequirement( 317 | !name.matches(".*[ ,;{}()\n\t=].*"), 318 | s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=". 319 | |Please use alias to rename it. 
320 | """.stripMargin.split("\n").mkString(" ").trim) 321 | } 322 | 323 | private def checkFieldNames(names: Seq[String]): Unit = { 324 | names.foreach(checkFieldName) 325 | } 326 | 327 | private def checkConversionRequirement(f: => Boolean, message: String): Unit = { 328 | if (!f) { 329 | throw new Exception(message) 330 | } 331 | } 332 | } 333 | 334 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMSASTokenProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | import com.microsoft.cdm.utils.Constants.SASTOKEN_CONF_SETTING 3 | import org.apache.hadoop.conf.Configuration 4 | 5 | class CDMSASTokenProvider extends org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider { 6 | var sasToken = "" 7 | override def getSASToken(account: String, fileSystem: String, path: String, operation: String): String = { 8 | sasToken 9 | } 10 | 11 | override def initialize(configuration: Configuration, accountName: String): Unit = { 12 | sasToken = configuration.get(SASTOKEN_CONF_SETTING) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMTokenProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.azure.synapse.tokenlibrary.TokenLibrary 4 | import com.microsoft.commondatamodel.objectmodel.utilities.network.TokenProvider 5 | import com.databricks.backend.daemon.data.client.adl.AdlGen2CredentialContextTokenProvider 6 | 7 | class CDMTokenProvider(serConf: SparkSerializableConfiguration, accountName: String) extends TokenProvider { 8 | val platform = SparkPlatform.getPlatform(serConf.value) 9 | var startTime = System.currentTimeMillis() 10 | 11 | 12 | var curToken:String = 13 | if (platform == SparkPlatform.DataBricks) { 14 | val adpProvider = new AdlGen2CredentialContextTokenProvider() 15 | val dbToken = adpProvider.getToken().getAccessToken() 16 | dbToken 17 | } else if (platform == SparkPlatform.Synapse) { 18 | getSynapseToken 19 | } else { 20 | throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly) 21 | } 22 | 23 | def isTokenValid(): Boolean = { 24 | var validToken = true 25 | if (platform == SparkPlatform.DataBricks) { 26 | val endTime = System.currentTimeMillis(); 27 | if ((endTime - startTime) > Constants.MILLIS_PER_HOUR) { 28 | validToken = false 29 | } 30 | } 31 | validToken 32 | } 33 | 34 | private def getSynapseToken: String = { 35 | val resource = s"""{"audience": "storage", "name": "$accountName"}""" 36 | val token = TokenLibrary.getAccessToken(resource) 37 | token.token 38 | } 39 | 40 | private def getCachedSynapseToken: String = { 41 | if (!TokenLibrary.isValid(curToken)) { 42 | curToken = getSynapseToken 43 | } 44 | "Bearer " + curToken 45 | } 46 | 47 | /* 48 | * Databricks token cannot be called from an asynchronous context without an ExecutorService, which is 49 | * not what the CDM-SDK does. Their getToken method (below) is called asynchronously without an 50 | * executorService. However, since the Databricks token cannot be refreshed inside of a job, we cannot 51 | * get a new token even if a job lasts longer than one hour. Therefore, we can simply grab the token 52 | * during initialization (synchronous context call) and always return the same cached token. 
We
53 | * need to error out if we try to use the same token on a request that spans over one hour. This is what
54 | * the isTokenValid() method is responsible for.
55 | *
56 | * If the Databricks team implements a refreshToken mechanism, we need the CDM-SDK to implement an
57 | * executorService for their asynchronous calls so that we can call Databricks' refreshToken()
58 | * method at runtime.
59 | */
60 | private def getDataBricksToken: String = {
61 | "Bearer " + curToken
62 | }
63 | 
64 | @Override
65 | override def getToken: String = {
66 | platform match {
67 | case SparkPlatform.DataBricks => getDataBricksToken
68 | case SparkPlatform.Synapse => getCachedSynapseToken
69 | case SparkPlatform.Other => throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly)
70 | }
71 | }
72 | }
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/CDMUtils.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.cdm.log.SparkCDMLogger
4 | import java.util.Calendar
5 | 
6 | import com.microsoft.commondatamodel.objectmodel.cdm.{CdmCorpusDefinition, CdmEntityDeclarationDefinition, CdmEntityDefinition, CdmManifestDefinition}
7 | import com.microsoft.commondatamodel.objectmodel.enums.CdmStatusLevel
8 | import com.microsoft.commondatamodel.objectmodel.utilities.EventCallback
9 | import org.apache.hadoop.conf.Configuration
10 | import org.apache.spark.sql.types.StructType
11 | import org.slf4j.LoggerFactory
12 | import org.slf4j.event.Level
13 | 
14 | import util.control.Breaks._
15 | import scala.collection.mutable.ArrayBuffer
16 | 
17 | case class CDMEntity(rootManifest: CdmManifestDefinition, parentManifest: CdmManifestDefinition, entityDec:CdmEntityDeclarationDefinition, var schema: StructType)
18 | case class SchemaDiffOutput(isSame: Boolean, diffIndex: Int, path: ArrayBuffer[String])
19 | case class CDMDecimalType(precision: Int, scale: Int) {
20 | override def toString() : String = {
21 | this.getClass.getName + precision + scale;
22 | }
23 | }
24 | case class FileFormatSettings(fileFormat: String, delimiter: Char, showHeader: Boolean)
25 | case class ManifestPath(container: String , manifestPath: String, manifestFileName: String)
26 | 
27 | /**
28 | * Enum containing possible data types for CDM data.
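 * For example (illustrative only, based on DataConverter later in this repo):
 *   new DataConverter().toCdmDataType(IntegerType)   // CDMDataType.integer
 *   new DataConverter().toCdmDataType(LongType)      // CDMDataType.bigInteger
 *   new DataConverter().toCdmDataType(TimestampType) // CDMDataType.dateTime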
29 | */ 30 | 31 | object CDMDataType extends Enumeration { 32 | val byte, bigInteger, smallInteger, integer, date, dateTime, guid, float, string, double, decimal, boolean, time, entity= Value 33 | } 34 | 35 | object CDMDataFormat extends Enumeration { 36 | val Byte, Int64, Int32, Int16, Date, DateTime, Guid, Float, String, Double, Decimal, Boolean, DateTimeOffset, Time, entity = Value 37 | } 38 | 39 | 40 | /** 41 | * Platform the connector is running on 42 | */ 43 | object SparkPlatform extends Enumeration { 44 | val DataBricks, Synapse, Local, Other = Value 45 | 46 | def getPlatform(conf: Configuration): Value = { 47 | val host = conf.get("spark.driver.host") 48 | // Use these conf settings to determine the platform 49 | if (conf.get("spark.databricks.preemption.enabled") != null) { 50 | SparkPlatform.DataBricks 51 | } else if (conf.get("spark.synapse.session.token") != null){ 52 | SparkPlatform.Synapse 53 | } else if (host.equals("localhost")){ 54 | SparkPlatform.Local 55 | } else { 56 | SparkPlatform.Other 57 | } 58 | } 59 | } 60 | 61 | // callback implementation to fetch Logs from CDM SDK 62 | object CDMCallback extends EventCallback { 63 | val fromCDMSDK = "CDM-SDK Library" 64 | val logger = LoggerFactory.getLogger(fromCDMSDK) 65 | 66 | override def apply(cdmStatusLevel: CdmStatusLevel, message: String): Unit = { 67 | // Dev debug 68 | // println(s"[${cdmStatusLevel}] ${message}") 69 | 70 | if(cdmStatusLevel == CdmStatusLevel.Error) { 71 | SparkCDMLogger.log(Level.ERROR, message, logger) 72 | SparkCDMLogger.logEventToKusto(fromCDMSDK, "", Level.ERROR, message) 73 | if (message.contains("saveDocumentAsAsync")) { 74 | throw new Exception(message) 75 | } 76 | if (message.contains("Adapter not found for the namespace") && Constants.MODE.equals("write")) { 77 | throw new Exception(String.format(Messages.overrideConfigJson, message)) 78 | } 79 | if (Constants.MODE.equals("write")) { 80 | throw new Exception(message) 81 | } 82 | } 83 | } 84 | } 85 | 86 | // Singleton to retrieve the current date as a folder for partitions to be written to 87 | object CDMDataFolder { 88 | def getDataFolderWithDate ():String = { 89 | val cal = Calendar.getInstance() 90 | val year= cal.get(Calendar.YEAR) 91 | val month = "%02d".format((cal.get(Calendar.MONTH ) + 1)) 92 | val day = "%02d".format(cal.get(Calendar.DATE)) 93 | Constants.PARTITION_DIR_PATTERN.format(year, month, day) 94 | } 95 | } 96 | 97 | object CDMSource extends Enumeration { 98 | val REFERENCED, BUILTIN = Value 99 | 100 | def getValue(input: String) : CDMSource.Value ={ 101 | var result : CDMSource.Value = null 102 | val cdmSource = CDMSource.values; 103 | breakable { 104 | for (cdm <- cdmSource) { 105 | if (cdm.toString.equalsIgnoreCase(input)) { 106 | result = cdm 107 | break 108 | } 109 | } 110 | } 111 | if(result == null) throw new IllegalArgumentException(String.format(Messages.invalidCDMSourceName, input)) else result 112 | } 113 | } 114 | 115 | 116 | 117 | case class EntityNotFoundException(private val message: String = "") extends Exception(message) 118 | case class ManifestNotFoundException(private val message: String = "") extends Exception(message) 119 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CdmAdapterProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.commondatamodel.objectmodel.storage.AdlsAdapter 4 | trait CdmAdapterProvider { 5 | def 
getAdlsAdapter : AdlsAdapter 6 | } 7 | 8 | object CdmAdapterProvider { 9 | 10 | private class CdmTokenAuthAdapter(val storage: String, val root: String, val tokenProvider: CDMTokenProvider) extends CdmAdapterProvider { 11 | override def getAdlsAdapter : AdlsAdapter = { 12 | new AdlsAdapter(storage, root, tokenProvider) 13 | } 14 | } 15 | 16 | private class CdmAppRegAdapter(val storage: String, val root: String, val auth: Auth) extends CdmAdapterProvider { 17 | override def getAdlsAdapter: AdlsAdapter = { 18 | new AdlsAdapter(storage, root, auth.getTenantId, auth.getAppId, auth.getAppKey) 19 | } 20 | } 21 | 22 | private class CdmSASAuthAdapter(val storage: String, val root: String, val auth: Auth) extends CdmAdapterProvider { 23 | override def getAdlsAdapter: AdlsAdapter = { 24 | val adapter = new AdlsAdapter(storage, root) 25 | adapter.setSasToken(auth.getSASToken) 26 | adapter 27 | } 28 | } 29 | 30 | def apply(storage: String, rootPath: String, auth: Auth, token: Option[CDMTokenProvider]): AdlsAdapter = { 31 | if(auth.getAuthType == CdmAuthType.AppReg.toString()) { 32 | new CdmAppRegAdapter(storage, rootPath, auth).getAdlsAdapter 33 | } else if(auth.getAuthType == CdmAuthType.Sas.toString()){ 34 | new CdmSASAuthAdapter(storage, rootPath, auth).getAdlsAdapter 35 | } else { 36 | new CdmTokenAuthAdapter(storage, rootPath, token.get).getAdlsAdapter 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/Constants.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import java.math.MathContext 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | 7 | /** 8 | * Various constants for spark-csv. 9 | */ 10 | object Constants { 11 | 12 | // TODO: ensure these match the data provided 13 | //val DATE_FORMATS = Array("MM/dd/yyyy", "MM/dd/yyyy hh:mm:ss a") 14 | //val OUTPUT_FORMAT = "MM/dd/yyyy hh:mm:ss a" 15 | 16 | val MILLIS_PER_HOUR = 3600000 17 | val DECIMAL_PRECISION = 37 18 | val MATH_CONTEXT = new MathContext(28) 19 | val SINGLE_DATE_FORMAT = "M/d/yyyy" 20 | val TIMESTAMP_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS" 21 | 22 | val MD_TRAITS = "traits" 23 | val MD_DATATYPE_OVERRIDE = "datatype" 24 | val MD_DATATYPE_OVERRIDE_TIME= "Time" 25 | 26 | val CDM_ARRAY_TRAIT = "is.linkedEntity.array" 27 | val CDM_DECIMAL_TRAIT = "is.dataFormat.numeric.shaped" 28 | 29 | val LOGICAL_ENTITY_DIR = "LogicalDefinition" 30 | val defaultCompressionFormat = "snappy" 31 | val SPARK_MODELROOT_NAMESPACE = "SparkModelRoot" 32 | val CDM_DEFAULT_PRECISION = 18 33 | val CDM_DEFAULT_SCALE = 4 34 | var PRODUCTION = true 35 | val DEFAULT_DELIMITER = ',' 36 | val SPARK_NAMESPACE = "SparkManifestLocation" 37 | val PARTITION_DIR_PATTERN ="%s-%s-%s" 38 | val GLOB_PATTERN="%s-%s-*.%s" 39 | var EXCEPTION_TEST = false 40 | var SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS = "%s.rename.manifest.cdm.json" 41 | var MODE = "" 42 | var MODEL_JSON = "model.json" 43 | var KUSTO_ENABLED = true 44 | var SASTOKEN_CONF_SETTING = "com.microsoft.cdm.sastoken" 45 | 46 | // For permissive/fail fast CSV reading mode. 
47 | // Could be an Enumeration, but strings enable case-insensitive comparison
48 | val PERMISSIVE = "permissive"
49 | val FAILFAST = "failfast"
50 | val DROPMALFORMED= "dropmalformed"
51 | }
52 | 
53 | object Environment {
54 | var sparkPlatform: SparkPlatform.Value = _
55 | }
56 | class Constants {
57 | 
58 | }
59 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/CsvParserFactory.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.univocity.parsers.csv.{CsvParser, CsvParserSettings, CsvWriter, CsvWriterSettings}
4 | 
5 | import java.io.OutputStreamWriter
6 | /**
7 | * Builds Univocity CsvParser instances.
8 | */
9 | object CsvParserFactory {
10 | def build(delimiter: Char): CsvParser = {
11 | val settings = new CsvParserSettings()
12 | val format = settings.getFormat
13 | format.setDelimiter(delimiter)
14 | settings.setLineSeparatorDetectionEnabled(true)
15 | settings.setMaxCharsPerColumn(-1)
16 | settings.setMaxColumns(512 * 4)
17 | new CsvParser(settings)
18 | }
19 | 
20 | def buildWriter(outputWriter: OutputStreamWriter, delimiter: Char): CsvWriter = {
21 | val settings = new CsvWriterSettings()
22 | settings.getFormat.setDelimiter(delimiter)
23 | settings.getFormat.setLineSeparator("\n")
24 | settings.setMaxCharsPerColumn(-1)
25 | new CsvWriter(outputWriter, settings)
26 | }
27 | }
28 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/DataConverter.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import java.text.SimpleDateFormat
4 | import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder}
5 | import java.time.temporal.ChronoField
6 | import java.time.{Instant, LocalDate, ZoneId}
7 | import java.util.TimeZone
8 | import java.util.concurrent.TimeUnit
9 | 
10 | import org.apache.spark.sql.types._
11 | import org.slf4j.LoggerFactory
12 | 
13 | /**
14 | * Converts between CSV/CDM data and Spark data types.
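 * A minimal illustrative sketch of the mapping (not exhaustive):
 *   new DataConverter().toSparkType(CDMDataFormat.Int64, 0, 0)    // LongType
 *   new DataConverter().toSparkType(CDMDataFormat.Decimal, 18, 4) // DecimalType(18, 4)
 *   new DataConverter().toSparkType(CDMDataFormat.Guid, 0, 0)     // StringType (there is no UuidType in Spark)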
15 | */ 16 | @SerialVersionUID(100L) 17 | class DataConverter extends Serializable{ 18 | 19 | val logger = LoggerFactory.getLogger(classOf[DataConverter]) 20 | val dateFormatter = new SimpleDateFormat(Constants.SINGLE_DATE_FORMAT) 21 | 22 | 23 | def toSparkType(dt: CDMDataFormat.Value, precision: Int, scale: Int) = { 24 | val map = Map( 25 | CDMDataFormat.Byte -> ByteType, 26 | CDMDataFormat.Int16 -> ShortType, 27 | CDMDataFormat.Int32 -> IntegerType, 28 | CDMDataFormat.Int64 -> LongType, 29 | CDMDataFormat.Date -> DateType, 30 | CDMDataFormat.DateTime -> TimestampType, 31 | CDMDataFormat.String -> StringType, 32 | CDMDataFormat.Double -> DoubleType, 33 | CDMDataFormat.Decimal -> DecimalType(precision, scale), 34 | CDMDataFormat.Boolean -> BooleanType, 35 | CDMDataFormat.DateTimeOffset -> TimestampType, 36 | CDMDataFormat.Guid -> StringType, //There is no UuidType in Spark 37 | CDMDataFormat.Time -> TimestampType, 38 | CDMDataFormat.Float -> FloatType 39 | ) 40 | map(dt) 41 | } 42 | 43 | def toParquet(dataType: DataType, data: Any): Any= { 44 | (dataType, data) match { 45 | case (_, null) => null 46 | case (IntegerType, _) => data.asInstanceOf[Int] 47 | case (StringType, _) => data.asInstanceOf[String] 48 | case _ => data.toString 49 | } 50 | } 51 | 52 | def toCdmDataFormat(dt: DataType): CDMDataFormat.Value = { 53 | return dt match { 54 | case ByteType => CDMDataFormat.Byte 55 | case ShortType => CDMDataFormat.Int16 56 | case IntegerType => CDMDataFormat.Int32 57 | case LongType => CDMDataFormat.Int64 58 | case DateType => CDMDataFormat.Date 59 | case StringType => CDMDataFormat.String 60 | case DoubleType => CDMDataFormat.Double 61 | case DecimalType() => CDMDataFormat.Decimal 62 | case BooleanType => CDMDataFormat.Boolean 63 | case TimestampType => CDMDataFormat.DateTime 64 | case structType: StructType => CDMDataFormat.entity 65 | case FloatType => CDMDataFormat.Float 66 | } 67 | } 68 | def toCdmDataFormatOverride(dt: String): CDMDataFormat.Value = { 69 | return dt match { 70 | case Constants.MD_DATATYPE_OVERRIDE_TIME=> CDMDataFormat.Time 71 | } 72 | } 73 | 74 | def toCdmDataType(dt: DataType): CDMDataType.Value = { 75 | return dt match { 76 | case ByteType => CDMDataType.byte 77 | case ShortType => CDMDataType.smallInteger 78 | case IntegerType => CDMDataType.integer 79 | case LongType => CDMDataType.bigInteger 80 | case DateType => CDMDataType.date 81 | case StringType => CDMDataType.string 82 | case DoubleType => CDMDataType.double 83 | case DecimalType() => CDMDataType.decimal 84 | case BooleanType => CDMDataType.boolean 85 | case TimestampType => CDMDataType.dateTime 86 | case structType: StructType => CDMDataType.entity 87 | case FloatType => CDMDataType.float 88 | } 89 | } 90 | 91 | def toCdmDataTypeOverride(dt: String): CDMDataType.Value = { 92 | return dt match { 93 | case Constants.MD_DATATYPE_OVERRIDE_TIME => CDMDataType.time 94 | } 95 | } 96 | 97 | def dataToString(data: Any, dataType: DataType, cdmType:Any): String = { 98 | (dataType, data, cdmType) match { 99 | case (_, null, _) => null 100 | case (DateType, v: Number, _) => { 101 | LocalDate.ofEpochDay(v.intValue()).toString 102 | } 103 | case (TimestampType, v: Number, "DateTimeOffset") => { 104 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 105 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 106 | val date = instant.atZone(ZoneId.systemDefault()) 107 | /* 108 | * Using this format 
rather than ISO_OFFSET_DATE_TIME forces the format 109 | * to use +00:00 when the local time is UTC 110 | */ 111 | val formatter = new DateTimeFormatterBuilder() 112 | .appendPattern("yyyy-MM-dd'T'HH:mm:ss") 113 | .appendFraction(ChronoField.NANO_OF_SECOND, 0, 6, true) 114 | .appendOffset("+HH:MM","+00:00").toFormatter // min 0 max 6 115 | date.format(formatter) 116 | } 117 | case (TimestampType, v: Number, "DateTime") => { 118 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 119 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 120 | val date = instant.atZone(ZoneId.systemDefault()) 121 | date.format(DateTimeFormatter.ISO_INSTANT) 122 | } 123 | case (TimestampType, v: Long, "Time") => { 124 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 125 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 126 | val localTime= instant.atZone(ZoneId.of("UTC")).toLocalTime 127 | localTime.format(DateTimeFormatter.ofPattern("HH:mm:ss.SSSSSS")).toString 128 | } 129 | case _ => { 130 | data.toString 131 | } 132 | } 133 | } 134 | 135 | } 136 | 137 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/DateTimeFormatterHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | import com.google.common.cache.CacheBuilder 21 | import com.microsoft.cdm.utils.DateTimeFormatterHelper._ 22 | 23 | import java.time._ 24 | import java.time.chrono.IsoChronology 25 | import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle} 26 | import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries} 27 | import java.util.Locale 28 | 29 | trait DateTimeFormatterHelper { 30 | protected def toZonedDateTime( 31 | temporalAccessor: TemporalAccessor, 32 | zoneId: ZoneId): ZonedDateTime = { 33 | // Parsed input might not have time related part. In that case, time component is set to zeros. 34 | val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime) 35 | val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime 36 | // Parsed input must have date component. At least, year must present in temporalAccessor. 
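// Illustrative example: a date-only input such as "2020-03-01" parsed with pattern "yyyy-MM-dd"
// carries no local time, so LocalTime.MIDNIGHT is used and the result is 2020-03-01T00:00 at the supplied zoneId.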
37 | val localDate = temporalAccessor.query(TemporalQueries.localDate) 38 | 39 | ZonedDateTime.of(localDate, localTime, zoneId) 40 | } 41 | protected def toInstantWithZoneId(temporalAccessor: TemporalAccessor, zoneId: ZoneId): Instant = { 42 | val localTime = if (temporalAccessor.query(TemporalQueries.localTime) == null) { 43 | LocalTime.ofNanoOfDay(0) 44 | } else { 45 | LocalTime.from(temporalAccessor) 46 | } 47 | val localDate = LocalDate.from(temporalAccessor) 48 | val localDateTime = LocalDateTime.of(localDate, localTime) 49 | val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) 50 | Instant.from(zonedDateTime) 51 | } 52 | 53 | // Gets a formatter from the cache or creates new one. The buildFormatter method can be called 54 | // a few times with the same parameters in parallel if the cache does not contain values 55 | // associated to those parameters. Since the formatter is immutable, it does not matter. 56 | // In this way, synchronised is intentionally omitted in this method to make parallel calls 57 | // less synchronised. 58 | // The Cache.get method is not used here to avoid creation of additional instances of Callable. 59 | protected def getOrCreateFormatter(pattern: String, locale: Locale): DateTimeFormatter = { 60 | val key = (pattern, locale) 61 | var formatter = cache.getIfPresent(key) 62 | if (formatter == null) { 63 | formatter = buildFormatter(pattern, locale) 64 | cache.put(key, formatter) 65 | } 66 | formatter 67 | } 68 | } 69 | 70 | private object DateTimeFormatterHelper { 71 | val cache = CacheBuilder.newBuilder() 72 | .maximumSize(128) 73 | .build[(String, Locale), DateTimeFormatter]() 74 | 75 | def buildFormatter(pattern: String, locale: Locale): DateTimeFormatter = { 76 | new DateTimeFormatterBuilder() 77 | .parseCaseInsensitive() 78 | .appendPattern(pattern) 79 | .parseDefaulting(ChronoField.ERA, 1) 80 | .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1) 81 | .parseDefaulting(ChronoField.DAY_OF_MONTH, 1) 82 | .parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0) 83 | .parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0) 84 | .toFormatter(locale) 85 | .withChronology(IsoChronology.INSTANCE) 86 | .withResolverStyle(ResolverStyle.STRICT) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/Messages.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import java.math.MathContext 4 | 5 | object Messages { 6 | val invalidThreadCount= "The maxCDMThreads parameter is invalid" 7 | val invalidIndexSchema = "The dataframe schema does not match the cdm schema for field \"%s\". Path : \"%s\"" 8 | val mismatchedSizeSchema = "The dataframe schema and cdm schema don't have an equal number of fields. Entity Path: \"%s\"" 9 | val invalidCompressionFormat = "Invalid Compression format specified - %s" 10 | val entityDefinitionModelFileNotFound= "Entity definition model file \"%s\" not found" 11 | val invalidDecimalFormat = "Invalid Decimal(%s,%s)" 12 | val onlyStructsInArraySupported = "Arrays with primitive types/MapType/ArrayType not yet supported" 13 | val cdmDataFormatNotYetSupported = "CDM dataformat for %s not yet supported" 14 | val nestedTypesNotSupported = "Cannot write nested types to csv file. Please change the format to parquet." 15 | val characterNotInRange = "Invalid Delimiter - %s. 
The provided delimiter should be in a valid char range : 0-65535"
16 | val managedIdentitiesSynapseDataBricksOnly ="Managed identities only supported on Synapse or Databricks"
17 | val managedIdentitiesDatabricksTimeout = "Databricks jobs must not last more than 1 hour (they have no refresh mechanism)"
18 | val invalidDelimiterCharacter = "Invalid Delimiter - %s. Only one character is allowed in delimiter. Input should be a character."
19 | val overrideConfigJson = "%s. Please override config.json by option \"configPath\""
20 | val configJsonPathNotFound = "Config.json not found in %s."
21 | val incorrectDataFolderFormat = "Incorrect Data Folder format %s - follow DateTimeFormatter format"
22 | val invalidCDMSourceName = "Invalid cdmSource provided - %s. cdmSource can either be - builtin or referenced"
23 | val invalidBothStandardAndEntityDefCont = "Specifying CdmStandard and entityDefinitionModelRoot is not valid"
24 | val entityDefStorageAppCredError = "entityDefinitionStorage option is supported only with managed identities."
25 | val incompatibleFileWithDataframe = "The number of columns in CSV/parquet file is not equal to the number of fields in Spark StructType. Either modify the attributes in manifest to make it equal to the number of columns in CSV/parquet files or modify the csv/parquet file"
26 | val invalidMode = "Invalid mode provided. Supports - permissive or failfast"
27 | val invalidPermissiveMode = "Permissive Mode not supported with Parquet files"
28 | val dropMalformedNotSupported ="DropMalformed mode is not supported"
29 | }
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/OverridenCdmStandardsAdapter.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.commondatamodel.objectmodel.storage.CdmStandardsAdapter
4 | 
5 | class OverridenCdmStandardsAdapter @throws[ClassNotFoundException] extends CdmStandardsAdapter {
6 | override def fetchConfig(): String = {
7 | "{\"config\":{},\"type\": \"cdm-standards\"}"
8 | }
9 | }
10 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/SerializedABFSHadoopConf.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.cdm.utils.Constants.SASTOKEN_CONF_SETTING
4 | import org.apache.hadoop.conf.Configuration
5 | 
6 | object SerializedABFSHadoopConf {
7 | def getConfiguration(storage: String,
8 | container: String,
9 | auth: Auth,
10 | conf: Configuration): SparkSerializableConfiguration = {
11 | conf.set("fs.defaultFS", "abfss:/" + container + "@" + storage + "/")
12 | if (auth.getAuthType == CdmAuthType.AppReg.toString()) {
13 | conf.set("fs.azure.account.auth.type", "OAuth")
14 | conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
15 | conf.set("fs.azure.account.oauth2.client.id", auth.getAppId)
16 | conf.set("fs.azure.account.oauth2.client.secret", auth.getAppKey)
17 | conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/" + auth.getTenantId + "/oauth2/token")
18 | } else if (auth.getAuthType == CdmAuthType.Sas.toString()) {
19 | conf.set("fs.azure.account.auth.type", "SAS")
20 | conf.set("fs.azure.sas.token.provider.type", "com.microsoft.cdm.utils.CDMSASTokenProvider")
21 |
conf.set("fs.azure.account.hns.enabled", "true") 22 | conf.set("fs.abfss.impl.disable.cache", "true") // disable cache for abfss creation so that the sas tokens for different folders don't conflict. 23 | conf.set(SASTOKEN_CONF_SETTING, auth.getSASToken) // setting to store the sas token 24 | } 25 | new SparkSerializableConfiguration(new Configuration(conf)) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/SparkSerializableConfiguration.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.FileSystem 5 | 6 | import java.io.{IOException, ObjectInputStream, ObjectOutputStream} 7 | import scala.util.control.NonFatal 8 | 9 | @SerialVersionUID(100L) 10 | class SparkSerializableConfiguration(@transient var value: Configuration) extends Serializable { 11 | 12 | def getFileSystem() : FileSystem = { 13 | FileSystem.get(value) 14 | } 15 | 16 | private def writeObject(out: ObjectOutputStream): Unit = tryOrIOException { 17 | out.defaultWriteObject() 18 | value.write(out) 19 | } 20 | 21 | private def readObject(in: ObjectInputStream): Unit = tryOrIOException { 22 | value = new Configuration(false) 23 | value.readFields(in) 24 | } 25 | 26 | def tryOrIOException(block: => Unit) { 27 | try { 28 | block 29 | } catch { 30 | case e: IOException => { 31 | throw e 32 | } 33 | case NonFatal(t) => { 34 | throw new IOException(t) 35 | } 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/StructTypeMetadata.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import org.apache.spark.sql.types.{Metadata, StructType} 4 | 5 | import scala.collection.mutable.HashMap 6 | 7 | object StructTypeMetadata{ 8 | object StructTypeMetadataMap{ 9 | lazy val metadataMap = HashMap.empty[String, Metadata] 10 | def setMD(s: String, md: Metadata): Unit = metadataMap(s) = md 11 | def getMD(s: String): Metadata = metadataMap.getOrElse(s, Metadata.empty) 12 | } 13 | 14 | // Since StructTypeMetadataMap is a singleton, it's the key that determines which 15 | // metadata object is returned. 16 | implicit class StructTypeMetadataMap(s: StructType) { 17 | def getMetadata(s: String): Metadata = StructTypeMetadataMap.getMD(s) 18 | def setMetadata(s:String, name: Metadata) = StructTypeMetadataMap.setMD(s, name) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/TimestampFormatter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | 21 | import org.apache.spark.sql.catalyst.util.{DateTimeConstants, DateTimeUtils} 22 | 23 | import java.text.ParseException 24 | import java.time._ 25 | import java.time.format.DateTimeParseException 26 | import java.time.temporal.{TemporalAccessor, TemporalQueries} 27 | import java.util.concurrent.TimeUnit.NANOSECONDS 28 | import java.util.{Locale, TimeZone} 29 | import scala.util.control.NonFatal 30 | 31 | @SerialVersionUID(100L) 32 | sealed trait TimestampFormatter extends Serializable { 33 | /** 34 | * Parses a timestamp in a string and converts it to microseconds. 35 | * 36 | * @param s - string with timestamp to parse 37 | * @return microseconds since epoch. 38 | * @throws ParseException can be thrown by legacy parser 39 | * @throws DateTimeParseException can be thrown by new parser 40 | * @throws DateTimeException unable to obtain local date or time 41 | */ 42 | @throws(classOf[ParseException]) 43 | @throws(classOf[DateTimeParseException]) 44 | @throws(classOf[DateTimeException]) 45 | def parse(s: String): Long 46 | def format(us: Long): String 47 | } 48 | 49 | class Iso8601TimestampFormatter( 50 | pattern: String, 51 | zoneId: ZoneId, 52 | locale: Locale) extends TimestampFormatter with DateTimeFormatterHelper { 53 | @transient 54 | private lazy val formatter = getOrCreateFormatter(pattern, locale) 55 | //private val NANOSECONDS = 1000L 56 | private def toInstant(s: String): Instant = { 57 | val temporalAccessor = formatter.parse(s) 58 | if (temporalAccessor.query(TemporalQueries.offset()) == null) { 59 | toInstantWithZoneId(temporalAccessor, zoneId) 60 | } else { 61 | Instant.from(temporalAccessor) 62 | } 63 | } 64 | def instantToMicros(instant: Instant): Long = { 65 | val us = Math.multiplyExact(instant.getEpochSecond, DateTimeConstants.MICROS_PER_SECOND) 66 | val result = Math.addExact(us, NANOSECONDS.toMicros(instant.getNano)) 67 | result 68 | } 69 | 70 | private val specialValueRe = """(\p{Alpha}+)\p{Blank}*(.*)""".r 71 | private def today(zoneId: ZoneId): ZonedDateTime = { 72 | Instant.now().atZone(zoneId).`with`(LocalTime.MIDNIGHT) 73 | } 74 | def getZoneId(timeZoneId: String): ZoneId = ZoneId.of(timeZoneId, ZoneId.SHORT_IDS) 75 | /** 76 | * Extracts special values from an input string ignoring case. 77 | * @param input - a trimmed string 78 | * @param zoneId - zone identifier used to get the current date. 79 | * @return some special value in lower case or None. 
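 * Illustrative behaviour (following the checks below): "EPOCH" -> Some("epoch");
 * "now UTC" -> None, since "now" must not carry a time zone; "2019-01-01" -> None,
 * because the first character is not a letter.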
80 | */ 81 | private def extractSpecialValue(input: String, zoneId: ZoneId): Option[String] = { 82 | def isValid(value: String, timeZoneId: String): Boolean = { 83 | // Special value can be without any time zone 84 | if (timeZoneId.isEmpty) return true 85 | // "now" must not have the time zone field 86 | if (value.compareToIgnoreCase("now") == 0) return false 87 | // If the time zone field presents in the input, it must be resolvable 88 | try { 89 | getZoneId(timeZoneId) 90 | true 91 | } catch { 92 | case NonFatal(_) => false 93 | } 94 | } 95 | assert(input.trim.length == input.length) 96 | if (input.length < 3 || !input(0).isLetter) return None 97 | input match { 98 | case specialValueRe(v, z) if isValid(v, z) => Some(v.toLowerCase(Locale.US)) 99 | case _ => None 100 | } 101 | } 102 | 103 | 104 | 105 | override protected def toZonedDateTime( 106 | temporalAccessor: TemporalAccessor, 107 | zoneId: ZoneId): ZonedDateTime = { 108 | // Parsed input might not have time related part. In that case, time component is set to zeros. 109 | val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime) 110 | val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime 111 | // Parsed input must have date component. At least, year must present in temporalAccessor. 112 | val localDate = temporalAccessor.query(TemporalQueries.localDate) 113 | 114 | ZonedDateTime.of(localDate, localTime, zoneId) 115 | } 116 | 117 | override def parse(s: String): Long = instantToMicros(toInstant(s)) 118 | 119 | override def format(us: Long): String = { 120 | val secs = Math.floorDiv(us, DateTimeConstants.MICROS_PER_SECOND) 121 | val mos = Math.floorMod(us, DateTimeConstants.MICROS_PER_SECOND) 122 | val instant = Instant.ofEpochSecond(secs, mos * 1000L) 123 | 124 | formatter.withZone(zoneId).format(instant) 125 | } 126 | } 127 | 128 | object TimestampFormatter { 129 | val defaultPattern: String = "yyyy-MM-dd HH:mm:ss" 130 | val defaultLocale: Locale = Locale.US 131 | 132 | def apply(format: String, timeZone: TimeZone, locale: Locale): TimestampFormatter = { 133 | new Iso8601TimestampFormatter(format, timeZone.toZoneId, locale) 134 | } 135 | 136 | def apply(format: String, timeZone: TimeZone): TimestampFormatter = { 137 | apply(format, timeZone, defaultLocale) 138 | } 139 | 140 | def apply(timeZone: TimeZone): TimestampFormatter = { 141 | apply(defaultPattern, timeZone, defaultLocale) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMBatchWriter.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io.IOException 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CDMDataFormat, CDMDecimalType, CDMModelWriter, CDMTokenProvider, CdmAuthType, Constants, DataConverter, Messages, SchemaDiffOutput, SerializedABFSHadoopConf} 7 | import org.apache.hadoop.fs.{FileSystem, Path} 8 | import org.apache.parquet.hadoop.util.HadoopOutputFile 9 | import org.apache.spark.sql.SaveMode 10 | import org.apache.spark.sql.connector.write.{BatchWrite, DataWriterFactory, PhysicalWriteInfo, WriterCommitMessage} 11 | import org.apache.spark.sql.types.{ArrayType, DecimalType, StringType, StructType} 12 | import org.slf4j.LoggerFactory 13 | import org.slf4j.event.Level 14 | 15 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 16 | 17 | class CDMBatchWriter(jobId: String, writeMode: SaveMode, 
schema: StructType, cdmOptions: CDMWriteOptions) extends BatchWrite { 18 | 19 | val logger = LoggerFactory.getLogger(classOf[CDMBatchWriter]) 20 | 21 | val serializedHadoopConf = SerializedABFSHadoopConf.getConfiguration(cdmOptions.storage, cdmOptions.container, cdmOptions.auth, cdmOptions.conf) 22 | 23 | val tokenProvider = if (cdmOptions.auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopConf, cdmOptions.storage)) else None 24 | 25 | val cdmModel = new CDMModelWriter(cdmOptions.storage, 26 | cdmOptions.container, 27 | cdmOptions.manifestPath, 28 | cdmOptions.manifestFileName, 29 | cdmOptions.manifestName, 30 | cdmOptions.entity, 31 | cdmOptions.useSubManifest, 32 | cdmOptions.entityDefinition, 33 | cdmOptions.entDefContAndPath, 34 | jobId, 35 | cdmOptions.fileFormatSettings, 36 | cdmOptions.auth, tokenProvider, 37 | cdmOptions.overrideConfigPath, 38 | cdmOptions.cdmSource, 39 | cdmOptions.entityDefinitionStorage, 40 | cdmOptions.maxCDMThreads) 41 | 42 | val cdmEntity = cdmModel.entityExists(cdmOptions.entity, serializedHadoopConf) 43 | 44 | /* This list is used track the list of partitions in order to know which files to delete in case of an abort */ 45 | val partitionList = ListBuffer[FileCommitMessage]() 46 | 47 | if (Constants.PRODUCTION && cdmOptions.manifestFileName.equals(Constants.MODEL_JSON)) { 48 | throw new Exception("Writing model.json is not supported.") 49 | } 50 | 51 | def compare(cdmSchema: Iterable[Any], schema: StructType, path: ArrayBuffer[String]): SchemaDiffOutput = { 52 | if(cdmSchema.size != schema.fields.length) return SchemaDiffOutput(false, -1, path) 53 | val dv = new DataConverter; 54 | val arr = cdmSchema.toArray 55 | schema.fields.zipWithIndex.foreach{ case (field, i) => 56 | path.append(field.name) 57 | if (field.dataType.isInstanceOf[StructType]) { 58 | val diff = compare(arr(i).asInstanceOf[Iterable[Any]], field.dataType.asInstanceOf[StructType], path) 59 | if(!diff.isSame) return diff 60 | } 61 | else if(field.dataType.isInstanceOf[ArrayType]){ 62 | val arrayElementType = field.dataType.asInstanceOf[ArrayType].elementType 63 | val diff = compare(arr(i).asInstanceOf[Iterable[Any]], arrayElementType.asInstanceOf[StructType], path) 64 | if(!diff.isSame) return diff 65 | } 66 | else if (field.dataType.isInstanceOf[DecimalType]) { 67 | if(arr(i).isInstanceOf[CDMDecimalType]) { 68 | val cdmDecimal = arr(i).asInstanceOf[CDMDecimalType] 69 | val sparkDecimal = field.dataType.asInstanceOf[DecimalType] 70 | if(cdmDecimal.precision != sparkDecimal.precision || cdmDecimal.scale != sparkDecimal.scale) { 71 | return SchemaDiffOutput(false, i, path) 72 | } 73 | }else { 74 | return SchemaDiffOutput(false, i, path) 75 | } 76 | } 77 | else if (arr(i).equals("Guid")) { 78 | if (!field.dataType.equals(StringType)) { 79 | return SchemaDiffOutput(false, i, path) 80 | } 81 | } 82 | else { 83 | try { 84 | if (!dv.toSparkType(CDMDataFormat.withName(arr(i).toString), 0, 0).getClass.equals(field.dataType.getClass)) { 85 | return SchemaDiffOutput(false, i, path) 86 | } 87 | } 88 | /*NoSuchElementException will thrown when toSparkType functions doesn't get the specified CDMDataType. 
This can happen for array, structs and CDMDecimaltype */ 89 | catch{ 90 | case e : java.util.NoSuchElementException => { 91 | return SchemaDiffOutput(false, i, path) 92 | } 93 | } 94 | } 95 | path.remove(path.length - 1) //backtrack 96 | } 97 | SchemaDiffOutput(true, -1, path) 98 | } 99 | 100 | def printPath(path: ArrayBuffer[String]): String = { 101 | var result = new StringBuilder() 102 | for (i <- path) { 103 | result.append(i) 104 | result.append(" > ") 105 | } 106 | if (result.length > 0) result.setLength(result.length - 3) 107 | result.toString() 108 | } 109 | 110 | def isNestedTypePresent() : Boolean = { 111 | val structFieldtype = schema.fields.find(each => (each.dataType.isInstanceOf[StructType] || each.dataType.isInstanceOf[ArrayType])).getOrElse(null) 112 | if(structFieldtype != null) true else false 113 | } 114 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = { 115 | 116 | if(isNestedTypePresent() && "csv".equals(cdmOptions.fileFormatSettings.fileFormat)) { 117 | throw new IllegalArgumentException(Messages.nestedTypesNotSupported) 118 | } 119 | 120 | var cdmSchema: Iterable[Any] = null 121 | // Get the time taken to run this block of code in Kusto 122 | SparkCDMLogger.logEventToKustoForPerf( 123 | { 124 | cdmSchema = if (cdmEntity.entityDec != null) { 125 | if (writeMode == SaveMode.ErrorIfExists) { 126 | throw new IOException("Entity " + cdmOptions.entity + " exists with SaveMode.ErrorIfExists set") 127 | } else if (writeMode == SaveMode.Overwrite) { 128 | if (cdmOptions.entityDefinition == "") { 129 | // cdm schema will be processed from the dataframe schema in case of overwrite and entity exists 130 | cdmModel.getCDMSchemaFromStructType(schema) 131 | } else { 132 | cdmModel.getCDMSchemaTypesAsSeqFromPredefinedSource(cdmOptions.entityDefinition) 133 | } 134 | cdmModel.getCDMSchemaFromStructType(schema) 135 | } else { 136 | cdmModel.getCDMSchemaTypesAsSeq(cdmOptions.entity, serializedHadoopConf) 137 | } 138 | } else { 139 | if (cdmOptions.entityDefinition == "") { 140 | /* create types from scratch*/ 141 | cdmModel.getCDMSchemaFromStructType(schema) 142 | } else { 143 | cdmModel.getCDMSchemaTypesAsSeqFromPredefinedSource(cdmOptions.entityDefinition) 144 | } 145 | } 146 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Get CDM Schema for entityDefinition - " + (if (cdmOptions.entityDefinition.isEmpty) cdmOptions.entity else cdmOptions.entityDefinition), Some(logger)) 147 | 148 | val resultPath = ArrayBuffer[String]() 149 | resultPath.append(cdmOptions.entity) 150 | val compareResult = compare(cdmSchema, schema, resultPath) 151 | val prettyPath = printPath(compareResult.path) 152 | 153 | compareResult match { 154 | case SchemaDiffOutput(false, -1, path) => throw new Exception(String.format(Messages.mismatchedSizeSchema, prettyPath)) 155 | case SchemaDiffOutput(false, i, path) => throw new Exception(String.format(Messages.invalidIndexSchema, prettyPath.substring(prettyPath.lastIndexOf(" ") + 1), prettyPath)) 156 | case _ => 157 | } 158 | 159 | new CDMDataWriterFactory(cdmOptions.storage, 160 | cdmOptions.container, 161 | cdmOptions.entity, 162 | schema, 163 | cdmOptions.manifestPath, 164 | cdmOptions.useSubManifest, 165 | cdmSchema.toList, 166 | 167 | cdmOptions.dataFolder, cdmOptions.fileFormatSettings, jobId, cdmOptions.compressionCodec, serializedHadoopConf) 168 | } 169 | 170 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 171 | /* 172 | * The list of partitions to we will 
write to
173 | */
174 | messages.foreach { e =>
175 | val message = e.asInstanceOf[FileCommitMessage]
176 | partitionList.append(message)
177 | }
178 | 
179 | /*
180 | * If we are using managed identities and Databricks, verify that the token has not expired. If it has expired,
181 | * notify the user with an exception
182 | */
183 | if (tokenProvider != None && !tokenProvider.get.isTokenValid()) {
184 | throw new Exception(Messages.managedIdentitiesDatabricksTimeout)
185 | }
186 | 
187 | if (cdmEntity.entityDec != null) {
188 | // val cdmEntity = cdmModel.getEntityDec(entity, serializedHadoopConf)
189 | writeMode match {
190 | case SaveMode.ErrorIfExists => throw new Exception("Entity " + cdmOptions.entity + " exists with SaveMode.ErrorIfExists set")
191 | case SaveMode.Overwrite => {
192 | val fs= serializedHadoopConf.getFileSystem()
193 | val oldPartitions = cdmModel.getOldPartitions(fs, cdmEntity) // Gets the old partitions that need to be deleted after the overwritten entity is created.
194 | SparkCDMLogger.logEventToKustoForPerf(
195 | {
196 | if(cdmModel.createEntity(schema, partitionList, true, cdmOptions.dataFolder, serializedHadoopConf, cdmEntity) && cdmOptions.useSubManifest) {
197 | deleteOldSubManifest(fs)
198 | }
199 | deleteOldPartitionsFromDisk(cdmOptions.useSubManifest, fs, oldPartitions)
200 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "SaveMode - Overwrite : delete partition and create entity.." + logInfo, Some(logger))
201 | 
202 | 
203 | }
204 | case SaveMode.Append => {
205 | SparkCDMLogger.logEventToKustoForPerf(
206 | {
207 | cdmModel.updateEntity(cdmEntity, partitionList, cdmOptions.dataFolder)
208 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "SaveMode - Append : update entity.." + logInfo, Some(logger))
209 | }
210 | case _ => {
211 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "Other SaveMode." + writeMode, Some(logger))
212 | }
213 | }
214 | } else {
215 | SparkCDMLogger.logEventToKustoForPerf(
216 | {
217 | // Entity does not exist, so create it
218 | cdmModel.createEntity(schema, partitionList, false, cdmOptions.dataFolder, serializedHadoopConf, cdmEntity)
219 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Entity does not exist. Creating entity.." + logInfo, Some(logger))
220 | }
221 | }
222 | 
223 | def logInfo() : String = {
224 | 
225 | return "[Entity : " + cdmOptions.entity +
226 | ", Manifest Path : " + cdmOptions.storage + cdmOptions.container + cdmOptions.manifestPath +
227 | ", entDefContAndPath : " + cdmOptions.entDefContAndPath + cdmOptions.entityDefinition +"]";
228 | }
229 | 
230 | // When the Spark job is aborted, delete the partitions that were created.
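// For example (illustrative), a committed CSV partition written to
//   <manifestPath><entity>/<dataFolder>/<entity>-<jobId>-<partitionId>.csv
// is deleted here if present, together with the temporary "<entity>.rename.manifest.cdm.json"
// submanifest that only exists for overwrites with useSubManifest enabled.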
231 | def cleanupOnAbort() = { 232 | val fs= serializedHadoopConf.getFileSystem() 233 | var path: Path = null 234 | for (file <- partitionList) { 235 | path = new Path(cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.dataFolder + "/" + file.name + file.extension) 236 | if (fs.exists(path)) { 237 | SparkCDMLogger.logEventToKusto(this.getClass.getName, 238 | Thread.currentThread.getStackTrace()(1).getMethodName, 239 | Level.ERROR, "CDMDataSourceWriter abort - Deleting partition- " + HadoopOutputFile.fromPath(path, serializedHadoopConf.value).toString, 240 | Some(logger)) 241 | fs.delete(path) 242 | } 243 | } 244 | // Only in case of overwrite with submanifests 245 | path = new Path(String.format(Constants.SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS, cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity)) 246 | if(fs.exists(path)) { 247 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "CDMDataSourceWriter abort - Deleting submanifest with overwritten partition locations - " + HadoopOutputFile.fromPath(path, serializedHadoopConf.value).toString, Some(logger)) 248 | fs.delete(path) 249 | } 250 | } 251 | 252 | 253 | def deleteOldPartitionsFromDisk(useSubManifest: Boolean, fs: FileSystem, oldPartitions: ArrayBuffer[Path]) = { 254 | for( oldPartition <- oldPartitions) { 255 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.INFO, "deleting partitions from disk- " + oldPartition.toString, Some(logger)) 256 | fs.delete(oldPartition) 257 | } 258 | } 259 | 260 | def deleteOldSubManifest(fs: FileSystem) = { 261 | val oldPath = new Path(cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity +".manifest.cdm.json") 262 | val newPath = new Path(String.format(Constants.SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS, cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity)) 263 | fs.delete(oldPath) 264 | SparkCDMLogger.log(Level.INFO, "Renaming " + newPath.getName + " to " + oldPath.getName , logger) 265 | fs.rename(newPath, oldPath) // renames .rename.manifest.cdm.json to .manifest.cdm.json 266 | } 267 | 268 | override def abort(messages: Array[WriterCommitMessage]): Unit = { 269 | cleanupOnAbort() 270 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "CDMBatchWriter abort " + logInfo, Some(logger)) 271 | 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMDataWriter.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io._ 4 | 5 | import com.microsoft.cdm.utils.{CsvParserFactory, DataConverter} 6 | import org.apache.commons.io.FilenameUtils 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | 12 | /** 13 | * Writes a single partition of CDM data to a single CSV in ADLSgen2. 14 | * @param schema schema of the data to write. 15 | * @param fileWriter Type of file writer CSV/parquet 16 | * @param dataConverter converter between Spark and CDM data. 
17 | */ 18 | class CDMDataWriter( var schema: StructType, 19 | var fileWriter: WriterConnector, 20 | var dataConverter: DataConverter) extends DataWriter[InternalRow] { 21 | 22 | fileWriter.build(schema) 23 | /** 24 | * Called by Spark runtime. Writes a row of data to an in-memory csv file. 25 | * @param row row of data to write. 26 | */ 27 | def write(row: InternalRow): Unit = { 28 | 29 | // TODO: periodically dump buffer when it gets to a certain size 30 | fileWriter.writeRow(row, dataConverter) 31 | } 32 | 33 | /** 34 | * Called by Spark runtime when all data has been written. Uploads the in-memory buffer to the output CSV/parquet file. 35 | * @return commit message specifying location of the output csv/parquet file. 36 | */ 37 | def commit: WriterCommitMessage = { 38 | 39 | fileWriter.upload 40 | 41 | // Pass the file path back so we can add it as a file to the CDM model 42 | val path = fileWriter.getPath.stripPrefix("/") 43 | val name = FilenameUtils.getBaseName(path) 44 | val extension = FilenameUtils.EXTENSION_SEPARATOR_STR + FilenameUtils.getExtension(path) 45 | new FileCommitMessage(name=name, fileLocation = path, extension) 46 | } 47 | 48 | /** 49 | * Called by spark runtime. 50 | */ 51 | def abort(): Unit = { 52 | /* TODO: Closing is not aborting*/ 53 | fileWriter.abort 54 | } 55 | 56 | override def close(): Unit = { 57 | fileWriter.close 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMDataWriterFactory.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.net.URLDecoder 4 | 5 | import com.microsoft.cdm.utils.{Constants, DataConverter, FileFormatSettings, SparkSerializableConfiguration} 6 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | /** 12 | * Factory class. Creates a CDMDataWriter instance for a single partition of data to write. 13 | * @param storage The storage account 14 | * @param container The container name in the storage account. 15 | * @param entity The name of the entity that will be written. 16 | * @param schema Spark schema 17 | * @param manifestPath path relative to storage and container where manifest will be written 18 | * @param useSubManifest indicates if subManifest can be used (Boolean) 19 | * @param cdmSchema cdmSchema 20 | * @param fileFormatSettings file settings including - header, delimiter, file type [parquet or csv] 21 | * @param jobId id of the write job. 22 | * @param compression compression codec name 23 | * @param serConf Spark serialization configuration 24 | */ 25 | @SerialVersionUID(100L) 26 | class CDMDataWriterFactory(var storage: String, 27 | var container: String, 28 | var entity: String, 29 | var schema: StructType, 30 | var manifestPath: String, 31 | var useSubManifest: Boolean, 32 | var cdmSchema: List[Any], 33 | val dataDir: String, 34 | var fileFormatSettings: FileFormatSettings, 35 | var jobId: String, 36 | var compression: CompressionCodecName, 37 | var serConf: SparkSerializableConfiguration) extends DataWriterFactory { 38 | 39 | // TODO: error handling. we're basically assuming successful writes. Need to add logic to remove/rewrite files on failure. 
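// Illustrative example of the partition file naming used below (entity "Person" and jobId "j1" are
// hypothetical): for partitionId 3 and csv output, the writer targets
//   <manifestPath>Person/<dataDir>/Person-j1-3.csv
// while parquet with the default snappy codec would produce Person-j1-3.snappy.parquet.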
40 | 41 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = { 42 | var path = manifestPath + entity + "/" + dataDir 43 | path = URLDecoder.decode(path, "UTF-8") 44 | val fileWriter = fileFormatSettings.fileFormat match{ 45 | case "csv" => { 46 | val prefix = "https://" + storage + container 47 | val filename = entity + "-" + jobId + "-" +partitionId + ".csv" 48 | new CSVWriterConnector( prefix, path + "/" + filename, cdmSchema, serConf, fileFormatSettings) 49 | } 50 | case "parquet" => { 51 | val prefix ="https://" +storage + container 52 | val filename = entity + "-" + jobId + "-" + partitionId + compression.getExtension + ".parquet" 53 | new ParquetWriterConnector(prefix, path + "/" + filename, cdmSchema, compression, serConf) 54 | } 55 | } 56 | new CDMDataWriter(schema, fileWriter, new DataConverter()) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMWriteOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.time.LocalDateTime 4 | import java.time.format.DateTimeFormatter 5 | 6 | import com.microsoft.cdm.utils.{CDMDataFolder, CDMOptions, CdmAuthType, Constants, FileFormatSettings, Messages} 7 | 8 | import sys.process._ 9 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | 12 | class CDMWriteOptions(options: CaseInsensitiveStringMap) extends CDMOptions(options) { 13 | 14 | Constants.MODE = "write" 15 | val manifestName = if(options.containsKey("manifestName")) options.get("manifestName") else "default" 16 | val useSubManifest = if(options.containsKey("useSubManifest")) options.get("useSubManifest").toBoolean else false 17 | val entDefIn = if (options.containsKey("entityDefinitionPath")) options.get("entityDefinitionPath") else "" 18 | val useCdmStandard = if (options.containsKey("useCdmStandardModelRoot")) options.get("useCdmStandardModelRoot").toBoolean else false 19 | val entDefModelRootIn= if (options.containsKey("entityDefinitionModelRoot")) options.get("entityDefinitionModelRoot") else "" 20 | val compressionFormat= if (options.containsKey("compression")) options.get("compression") else "snappy" 21 | val customDataFolderPattern = if (options.containsKey("dataFolderFormat")) options.get("dataFolderFormat") else "" 22 | var entityDefinitionStorage = if (options.containsKey("entityDefinitionStorage")) options.get("entityDefinitionStorage") else storage 23 | 24 | if (useCdmStandard && !entDefModelRootIn.isEmpty) { 25 | throw new Exception(Messages.invalidBothStandardAndEntityDefCont) 26 | } 27 | 28 | if (Constants.PRODUCTION && manifestFileName.equals(Constants.MODEL_JSON)) { 29 | throw new Exception("Writing model.json is not supported.") 30 | } 31 | 32 | var entDefContAndPath = getEntityDefinitionPath(useCdmStandard, container, manifestPath, entDefModelRootIn) 33 | val compressionCodec = getCompression(compressionFormat) 34 | 35 | import com.microsoft.cdm.utils.Environment 36 | import com.microsoft.cdm.utils.SparkPlatform 37 | 38 | if ((Environment.sparkPlatform eq SparkPlatform.DataBricks) && compressionFormat.equals("lzo")) checkLzo(compressionCodec) 39 | 40 | if (getAuthType(options) != CdmAuthType.Token.toString()) { 41 | if(!entityDefinitionStorage.equals(storage)) { 42 | throw new IllegalArgumentException(Messages.entityDefStorageAppCredError) 43 | } 44 | } 45 | 46 | // 
if there is no entity definition model root, use the output CDM storage account 47 | if(entDefModelRootIn.isEmpty) { 48 | entityDefinitionStorage = storage 49 | } 50 | 51 | val dataFolder = 52 | if (customDataFolderPattern == ""){ 53 | CDMDataFolder.getDataFolderWithDate() 54 | } else { 55 | try{ 56 | val dataFormatter = DateTimeFormatter.ofPattern(customDataFolderPattern) 57 | dataFormatter.format(LocalDateTime.now) 58 | } catch { 59 | case e: Exception => throw new IllegalArgumentException(String.format(Messages.incorrectDataFolderFormat, customDataFolderPattern)) 60 | } 61 | } 62 | 63 | var entityDefinition = if (entDefIn.isEmpty || entDefIn.startsWith("/")) entDefIn else "/"+ entDefIn 64 | 65 | val delimiter = getDelimiterChar(options.get("delimiter")) 66 | val showHeader = if(options.containsKey("columnHeaders")) options.get("columnHeaders").toBoolean else true 67 | 68 | val fileFormatSettings = FileFormatSettings(fileFormatType, delimiter, showHeader) 69 | 70 | 71 | 72 | // Return the container + path to use for the CDM adapter. If useCdmStandard is set, set it to an empty string, which is the flag 73 | // in the CDMCommonModel to use the CdmStandard adapter 74 | private def getEntityDefinitionPath(useCdmStandard: Boolean, origContainer:String, origPath: String, entDefModelRootIn: String) = { 75 | 76 | val entDefPath = if(entDefModelRootIn.isEmpty || entDefModelRootIn.startsWith("/")) entDefModelRootIn else "/"+ entDefModelRootIn 77 | 78 | if (useCdmStandard) { 79 | "" 80 | } else { 81 | //If "entityDefnitionModelRoot" was empty, use the CDM metadata container -> which would be container plus the origPath 82 | if (entDefModelRootIn.isEmpty()) { 83 | origContainer + origPath.dropRight(1) 84 | } else { 85 | entDefPath 86 | } 87 | } 88 | } 89 | 90 | def getCompression(compressionFormat: String): CompressionCodecName = { 91 | try { 92 | CompressionCodecName.fromConf(compressionFormat) 93 | }catch { 94 | case e: IllegalArgumentException => throw new IllegalArgumentException(String.format(Messages.invalidCompressionFormat, compressionFormat)) 95 | } 96 | } 97 | 98 | def checkLzo(compression: CompressionCodecName) = { 99 | try { 100 | val version = "which lzop".!! 101 | Class.forName(compression.getHadoopCompressionCodecClassName) 102 | } 103 | catch { 104 | case _ => throw new UnsupportedOperationException("Codec class " + compression.getHadoopCompressionCodecClassName + " is not available. 
" + 105 | "If running on databricks, please read: https://docs.databricks.com/data/data-sources/read-lzo.html ") 106 | } 107 | } 108 | 109 | def getDelimiterChar(value: String): Char = { 110 | val delimiter = if(value != null) value else ","; 111 | if (delimiter.length > 1) { 112 | throw new IllegalArgumentException(String.format(Messages.invalidDelimiterCharacter, delimiter)) 113 | } 114 | delimiter.charAt(0) 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMWriterBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import com.microsoft.cdm.utils.{CDMOptions, Constants} 4 | import org.apache.spark.sql.SaveMode 5 | import org.apache.spark.sql.connector.write.{BatchWrite, SupportsOverwrite, SupportsTruncate, WriteBuilder} 6 | import org.apache.spark.sql.sources.Filter 7 | import org.apache.spark.sql.types.StructType 8 | 9 | class CDMWriterBuilder(queryId: String, schema: StructType, writeMode: SaveMode, options: CDMWriteOptions) extends WriteBuilder 10 | with SupportsOverwrite 11 | with SupportsTruncate { 12 | 13 | Constants.MODE = "write" 14 | 15 | override def buildForBatch(): BatchWrite = new CDMBatchWriter(queryId, writeMode, schema, options) 16 | 17 | override def overwrite(filters: Array[Filter]): WriteBuilder = new CDMWriterBuilder(queryId, schema, SaveMode.Overwrite, options) 18 | 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CSVWriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io.OutputStreamWriter 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CsvParserFactory, DataConverter, FileFormatSettings, SparkSerializableConfiguration} 7 | import com.univocity.parsers.csv.CsvWriter 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.parquet.hadoop.util.HadoopOutputFile 10 | import org.apache.parquet.io.PositionOutputStream 11 | import org.apache.spark.sql.catalyst.InternalRow 12 | import org.apache.spark.sql.types.StructType 13 | import org.slf4j.LoggerFactory 14 | import org.slf4j.event.Level 15 | 16 | import scala.collection.JavaConversions 17 | 18 | class CSVWriterConnector(prefix: String, 19 | filePath: String, 20 | cdmSchema: List[Any], 21 | var serConf:SparkSerializableConfiguration, 22 | fileFormatSettings: FileFormatSettings) extends WriterConnector { 23 | val logger = LoggerFactory.getLogger(classOf[CSVWriterConnector]) 24 | private var stream:PositionOutputStream = _ 25 | private var writer:CsvWriter = _ 26 | private var schema: StructType = _ 27 | private val httpPath= prefix + filePath 28 | SparkCDMLogger.log(Level.INFO, "CSV Writer for partition at path: " + prefix + filePath, logger) 29 | 30 | def getPath(): String = httpPath 31 | 32 | def build(inSchema: StructType): Unit = { 33 | try { 34 | schema = inSchema 35 | val path = new Path(filePath) 36 | val fs = path.getFileSystem(serConf.value) 37 | val oFile = HadoopOutputFile.fromPath(path, serConf.value) 38 | stream = oFile.create(fs.getDefaultBlockSize(path)) 39 | writer = CsvParserFactory.buildWriter(new OutputStreamWriter(stream), fileFormatSettings.delimiter) 40 | if(fileFormatSettings.showHeader) { 41 | val headers = schema.fields.map(_.name) 42 | writer.writeHeaders(headers: _*) 43 | } 44 
| } catch { 45 | case e: Throwable => SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 46 | } 47 | } 48 | 49 | def upload() = { 50 | writer.close() 51 | } 52 | 53 | 54 | def writeRow(row: InternalRow, dataConverter: DataConverter): Unit = { 55 | val strings = JavaConversions.seqAsJavaList(row.toSeq(schema).zipWithIndex.map{ case(col, index) => 56 | dataConverter.dataToString(col, schema.fields(index).dataType, cdmSchema(index)) 57 | }) 58 | 59 | var strArray = new Array[String](strings.size) 60 | strArray = strings.toArray(strArray) 61 | writer.writeRow(strArray) 62 | } 63 | 64 | def abort(): Unit = { 65 | SparkCDMLogger.log(Level.ERROR, "CSV Writer aborting.." + prefix + filePath, logger) 66 | writer.close() 67 | } 68 | 69 | def close (): Unit = { 70 | writer.close() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/FileCommitMessage.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import org.apache.spark.sql.connector.write.WriterCommitMessage 4 | 5 | // TODO: there's a better scala idiom for this than class 6 | /** 7 | * Commit message returned from CDMDataWriter on successful write. One for each partition gets returned to 8 | * CDMDataSourceWriter. 9 | * @param name name of the partition. 10 | * @param fileLocation output csv/parquet file for the partition. 11 | */ 12 | class FileCommitMessage(val name: String, val fileLocation: String, val extension: String) extends WriterCommitMessage { 13 | name + fileLocation 14 | 15 | // return the partition name 16 | def getPartition(): String = { 17 | name 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/ParquetWriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.nio.{ByteBuffer, ByteOrder} 4 | import java.time.{Instant, ZoneId} 5 | import java.util.TimeZone 6 | 7 | import com.microsoft.cdm.utils.{CDMSparkToParquetSchemaConverter, DataConverter, Messages, SparkSerializableConfiguration} 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.parquet.column.ParquetProperties 10 | import org.apache.parquet.example.data.Group 11 | import org.apache.parquet.example.data.simple.{NanoTime, SimpleGroup, SimpleGroupFactory} 12 | import org.apache.parquet.hadoop.ParquetWriter 13 | import org.apache.parquet.hadoop.example.GroupWriteSupport 14 | import org.apache.parquet.io.api.Binary 15 | import org.apache.parquet.schema.{MessageType, Type} 16 | import org.apache.spark.sql.catalyst.InternalRow 17 | import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.slf4j.LoggerFactory 21 | import java.util.concurrent.TimeUnit 22 | 23 | import com.microsoft.cdm.log.SparkCDMLogger 24 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 25 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 26 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} 27 | import org.slf4j.event.Level 28 | 29 | import scala.collection.JavaConverters._ 30 | 31 | 32 | class ParquetWriterConnector(httpPrefix:String, 33 | filePath: String, 34 | cdmSchema: List[Any], 35 | compression: CompressionCodecName, 36 | var seriazliedHadoopConf: 
SparkSerializableConfiguration) extends WriterConnector { 37 | val logger = LoggerFactory.getLogger(classOf[ParquetWriterConnector]) 38 | private var structType: StructType= _ 39 | private var schema: MessageType = _ 40 | private var writer: ParquetWriter[Group]= _ 41 | private var groupFactory:SimpleGroupFactory = _ 42 | private var path:Path=_ 43 | private val httpPath = httpPrefix + filePath 44 | def getPath: String = httpPath 45 | private var converter: CDMSparkToParquetSchemaConverter=_ 46 | 47 | val NANOS_PER_HOUR: Long = TimeUnit.HOURS.toNanos (1) 48 | val NANOS_PER_MINUTE: Long = TimeUnit.MINUTES.toNanos (1) 49 | val NANOS_PER_SECOND: Long = TimeUnit.SECONDS.toNanos (1) 50 | val NANOS_PER_MILLISECOND: Long = TimeUnit.MILLISECONDS.toNanos (1) 51 | 52 | SparkCDMLogger.log(Level.INFO, "Parquet Writer for partition at path: " + httpPrefix + filePath, logger) 53 | 54 | def build(inStructType: StructType): Unit = { 55 | try { 56 | /* 57 | * CCDMSparkToParquetSchemaConverter is a modified version of SparkToParquetSchemaConverter. 58 | * Since Spark does not support the TIME type, We use this to tell Parquet that the field type should be TIME. 59 | * We do this ine one of two ways: 60 | * * Set a metadata overwrite field set to Time on implicit write 61 | * * the CDM type is of type Time 62 | * In either case, we will set the column to be of type TIME and not type Timestamp. 63 | * If Spark supported a Time type, we would not need to do this. However, they do not. See: 64 | * https://github.com/apache/spark/pull/25678 65 | */ 66 | converter = new CDMSparkToParquetSchemaConverter(writeLegacyParquetFormat = false) 67 | structType = inStructType 68 | schema = converter.convert(inStructType, cdmSchema) 69 | groupFactory = new SimpleGroupFactory(schema) 70 | GroupWriteSupport.setSchema(schema, seriazliedHadoopConf.value) 71 | val writeSupport = new GroupWriteSupport() 72 | path = new Path(filePath) 73 | writer = new ParquetWriter[Group]( 74 | path, 75 | writeSupport, 76 | compression, 77 | ParquetWriter.DEFAULT_BLOCK_SIZE, 78 | ParquetWriter.DEFAULT_PAGE_SIZE, 79 | ParquetWriter.DEFAULT_PAGE_SIZE, 80 | ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, 81 | ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, 82 | ParquetProperties.DEFAULT_WRITER_VERSION, 83 | seriazliedHadoopConf.value 84 | ) 85 | } 86 | catch { 87 | case e : Exception => { 88 | SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 89 | } 90 | } 91 | } 92 | 93 | def upload() = { 94 | writer.close() 95 | } 96 | 97 | 98 | def parseDateToBinary(value: Long) = { 99 | val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(value.toLong) 100 | 101 | // Write INT96 timestamp 102 | val timestampBuffer = new Array[Byte](12) 103 | val buf = ByteBuffer.wrap(timestampBuffer) 104 | buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) 105 | 106 | // This is the properly encoded INT96 timestamp 107 | val tsValue = Binary.fromReusedByteArray(timestampBuffer); 108 | tsValue; 109 | } 110 | 111 | /** 112 | * Converts Decimal to FIX_LEN_BYTE_ARRAY of len @param typeLength 113 | * @param decimal 114 | * @param typeLength 115 | * @return 116 | */ 117 | def decimaltoBytes(decimal: Decimal, typeLength: Int): Array[Byte] = { 118 | val bigDecimal = decimal.toJavaBigDecimal 119 | val bytes = new Array[Byte](typeLength) 120 | val fillByte: Byte = if (bigDecimal.signum < 0) 0xFF.toByte else 0x00.toByte 121 | val unscaled: Array[Byte] = bigDecimal.unscaledValue.toByteArray 122 | 123 | // If unscaled.length > typeLength. 
it means we cannot accommodate it in the `bytes` array, because the FIXED_LEN_BYTE_ARRAY has size = typeLength 124 | if (unscaled.length > typeLength) { 125 | throw new UnsupportedOperationException("Decimal size greater than "+ typeLength+" bytes") 126 | } 127 | // Fill each byte with either fillByte (sign padding) or the corresponding unscaled byte 128 | val offset = typeLength - unscaled.length 129 | for( i <- 0 until bytes.length) 130 | { 131 | if (i < offset) bytes(i) = fillByte else bytes(i) = unscaled(i - offset) 132 | } 133 | bytes 134 | } 135 | 136 | def writeDecimal(group: Group, index: Int, decimal: Decimal): Unit = { 137 | val primitive = group.getType.getType(index).asPrimitiveType() 138 | primitive.getPrimitiveTypeName match { 139 | case PrimitiveTypeName.INT32 => group.add(index, decimal.toUnscaledLong.asInstanceOf[Int]) 140 | case PrimitiveTypeName.INT64 => group.add(index, decimal.toUnscaledLong) 141 | case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => { 142 | val typeLength = primitive.getTypeLength 143 | val byteArray = decimaltoBytes(decimal, typeLength) 144 | group.add(index, Binary.fromReusedByteArray(byteArray)) 145 | } 146 | case PrimitiveTypeName.BINARY => { 147 | val typeLength = primitive.getTypeLength 148 | val byteArray = decimaltoBytes(decimal, typeLength) 149 | group.add(index, Binary.fromReusedByteArray(byteArray)) 150 | } 151 | case _ => throw new UnsupportedOperationException("Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); 152 | } 153 | } 154 | 155 | def writeRowUtil(row: InternalRow, group: Group, structType: StructType, cdmSchemaLocal: List[Any]): Unit = { 156 | if (row != null) { 157 | row.toSeq(structType).zipWithIndex.foreach { 158 | case (field, index) => { 159 | (structType.fields(index).dataType, field) match { 160 | case (_, null) => { 161 | // If precision <= scale, Spark stores a null value; we don't want that, so we guard against it here. 162 | if (structType.fields(index).dataType.isInstanceOf[DecimalType]) { 163 | val primitive = group.getType.getType(index).asPrimitiveType() 164 | val precision = primitive.getDecimalMetadata.getPrecision 165 | val scale = primitive.getDecimalMetadata.getScale 166 | if (precision <= scale) { 167 | throw new IllegalArgumentException(String.format(Messages.invalidDecimalFormat, new Integer(precision), new Integer(scale))) 168 | } 169 | } 170 | } 171 | case (ByteType, _) => group.add(index, field.asInstanceOf[Byte]) 172 | case (ShortType, _) => group.add(index, field.asInstanceOf[Short]) 173 | case (ar: ArrayType, _) => { 174 | val arrayData = field.asInstanceOf[ArrayData] 175 | val arrayElementType = structType.fields(index).dataType.asInstanceOf[ArrayType].elementType 176 | /* Convert the Spark type to a Parquet type schema. Here a Spark ArrayType gets converted to a Parquet GroupType. 177 | This is the converted parquetType schema: 178 | `optional group field.name (LIST) { 179 | repeated group list { 180 | optional group element { 181 | optional <type> <field1>; 182 | optional <type> <field2>; 183 | } 184 | } 185 | }` */ 186 | val parquetType = converter.convertField(structType.fields(index), Type.Repetition.REPEATED, cdmSchemaLocal(index)) 187 | 188 | /* Create a Row group from the converted schema above to insert data */ 189 | val mainGroup = new SimpleGroup(parquetType.asGroupType()) 190 | 191 | /* `repeated group list` is represented as the 0th index in mainGroup. 192 | Adding that as a group here because later we will insert the StructType data inside this group. 193 | Refer: https://github.com/apache/parquet-mr/blob/b2d366a83f293914195f9de86d918f8ddd944374/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java#L80 */ 194 | mainGroup.addGroup(0) 195 | 196 | /* parquetType will always have the field name "list". 
Get this group to add the array of objects 197 | This is repeatedGroup structure 198 | repeated group list { 199 | optional group element { 200 | optional ; 201 | optional ; 202 | } 203 | } */ 204 | val repeatedGroup = mainGroup.getGroup("list", 0) 205 | val subGroupFactory = new SimpleGroupFactory(converter.convert(arrayElementType.asInstanceOf[StructType], cdmSchemaLocal(index).asInstanceOf[List[Any]])) 206 | val iterator = arrayData.toObjectArray(arrayElementType).iterator 207 | 208 | /* Iterate through the `arrayData` */ 209 | while (iterator.hasNext) { 210 | val subgroup = subGroupFactory.newGroup(); 211 | val itemType = iterator.next().asInstanceOf[InternalRow] 212 | writeRowUtil(itemType, subgroup, arrayElementType.asInstanceOf[StructType],cdmSchemaLocal(index).asInstanceOf[List[Any]]); 213 | repeatedGroup.add(0, subgroup) 214 | } 215 | group.add(index, mainGroup); 216 | } 217 | case (BooleanType, _) => group.add(index, field.asInstanceOf[Boolean]) 218 | case (DateType, _) => group.add(index, field.asInstanceOf[Integer]) 219 | case (DoubleType, _) => group.add(index, field.asInstanceOf[Double]) 220 | case (DecimalType(), _) => { 221 | val decimal = field.asInstanceOf[Decimal] 222 | writeDecimal(group, index, decimal) 223 | } 224 | case (FloatType, _) => group.add(index, field.asInstanceOf[Float]) 225 | case (IntegerType, _) => group.add(index, field.asInstanceOf[Int]) 226 | case (LongType, _) => group.add(index, field.asInstanceOf[Long]) 227 | case (StringType, _) => { 228 | val string = field.asInstanceOf[UTF8String].toString 229 | group.add(index, string) 230 | } 231 | case (TimestampType, _) => { 232 | if (cdmSchemaLocal(index).equals("Time")) { 233 | val value = field.asInstanceOf[Long]; 234 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(value, TimeUnit.SECONDS.toMicros(1))) 235 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(value.asInstanceOf[Long]), nanoAdjustment); 236 | val localTime= instant.atZone(ZoneId.of("UTC")).toLocalTime 237 | group.add(index, localTime.toNanoOfDay / 1000) 238 | } else { 239 | val value = field.asInstanceOf[Long]; 240 | val binary = parseDateToBinary(value) 241 | group.add(index, NanoTime.fromBinary(binary)) 242 | } 243 | } 244 | case _ => { 245 | if (structType.fields(index).dataType.isInstanceOf[StructType]) { 246 | val subSchema = structType.fields(index).dataType.asInstanceOf[StructType]; 247 | val subGroupFactory = new SimpleGroupFactory(converter.convert(subSchema, cdmSchemaLocal(index).asInstanceOf[List[Any]])) 248 | val subgroup = subGroupFactory.newGroup(); 249 | writeRowUtil(field.asInstanceOf[InternalRow], subgroup, structType.fields(index).dataType.asInstanceOf[StructType], cdmSchemaLocal(index).asInstanceOf[List[Any]]) 250 | group.add(index, subgroup); 251 | } else { 252 | group.add(index, field.toString) 253 | } 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | 261 | def writeRow(row: InternalRow, dataConverter: DataConverter) { 262 | val group = groupFactory.newGroup() 263 | writeRowUtil(row, group, structType, cdmSchema); 264 | writer.write(group) 265 | } 266 | 267 | 268 | def abort(): Unit = { 269 | SparkCDMLogger.log(Level.ERROR, "ParquetWriter aborting.." 
+ httpPrefix + filePath, logger) 270 | writer.close() 271 | } 272 | 273 | def close (): Unit = { 274 | writer.close() 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/WriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import com.microsoft.cdm.utils.DataConverter 4 | import org.apache.spark.sql.catalyst.InternalRow 5 | import org.apache.spark.sql.types.StructType 6 | 7 | @SerialVersionUID(100L) 8 | trait WriterConnector extends Serializable { 9 | def getPath: String 10 | 11 | def build(schema: StructType) 12 | 13 | def writeRow(row: InternalRow, dataConverter: DataConverter) 14 | 15 | def upload 16 | 17 | def abort 18 | 19 | def close 20 | } 21 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, consoleAppender 2 | log4j.logger.com.microsoft.cdm=INFO, consoleAppender 3 | log4j.logger.com.microsoft.commondatamodel=WARN, consoleAppender 4 | 5 | log4j.appender.consoleAppender=org.apache.log4j.ConsoleAppender 6 | log4j.appender.consoleAppender.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.consoleAppender.layout.ConversionPattern=[%t] %-5p %c %x - %m%n 8 | -------------------------------------------------------------------------------- /src/test/scala/com/microsoft/cdm/test/TestData.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.test 2 | 3 | import java.sql.Timestamp 4 | import java.time.{LocalDate, LocalTime, ZoneId} 5 | import java.time.format.DateTimeFormatter 6 | 7 | import com.microsoft.cdm.utils.Constants 8 | import com.microsoft.cdm.utils.Constants.DECIMAL_PRECISION 9 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 10 | import org.apache.spark.sql.types.{ArrayType, BooleanType, ByteType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, MetadataBuilder, ShortType, StringType, StructField, StructType, TimestampType} 11 | 12 | class TestData(val spark: SparkSession) { 13 | 14 | val date= java.sql.Date.valueOf("2015-03-31"); 15 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 16 | 17 | def prepareDataWithAllTypes(): DataFrame = { 18 | val byteVal = 2.toByte 19 | val shortVal = 129.toShort 20 | val data = Seq( 21 | Row("tim", 1, true, 12.34,6L, date, Decimal(999.00), timestamp, 2f, byteVal, shortVal), 22 | Row("tddim", 1, false, 13.34,7L, date, Decimal(434.3), timestamp, 3.59f, byteVal, shortVal), 23 | Row("tddim", 1, false, 13.34,7L, date, Decimal(100.0), timestamp, 3.59f, byteVal, shortVal), 24 | Row("tddim", 1, false, 13.34,7L, date, Decimal(99.898), timestamp, 3.59f, byteVal, shortVal), 25 | Row("tim", 1, true, 12.34,6L, date, Decimal(1.3), timestamp, 3.59f, byteVal, shortVal), 26 | Row("tddim", 1, false, 13.34,7L, date, Decimal(99999.3), timestamp, 3590.9f, byteVal, shortVal), 27 | Row("tddim", 1, false, 13.34,7L, date, Decimal(4324.4324324), timestamp, 359.8f, byteVal, shortVal), 28 | Row("tddim", 1, false, 13.34,7L, date, Decimal(42.4), timestamp, 3.593f, byteVal, shortVal), 29 | Row("tddim", 1, false, 13.34,7L, date, Decimal(1.43434), timestamp, 3.59f, byteVal, shortVal), 30 | Row("tddim", 1, false, 13.34,7L, date, Decimal(0.0167), timestamp, 3.59f, byteVal, shortVal), 31 | 
Row("tddim", 1, false, 13.34,7L, date, Decimal(0.00032), timestamp, 332.33f, byteVal, shortVal), 32 | Row("tddim", 1, false, 13.34,7L, date, Decimal(78.5), timestamp, 3.53232f, byteVal, shortVal) 33 | ) 34 | 35 | val schema = new StructType() 36 | .add(StructField("name", StringType, true)) 37 | .add(StructField("id", IntegerType, true)) 38 | .add(StructField("flag", BooleanType, true)) 39 | .add(StructField("salary", DoubleType, true)) 40 | .add(StructField("phone", LongType, true)) 41 | .add(StructField("dob", DateType, true)) 42 | .add(StructField("weight", DecimalType(Constants.DECIMAL_PRECISION,7), true)) 43 | .add(StructField("time", TimestampType, true)) 44 | .add(StructField("float", FloatType, true)) 45 | .add(StructField("byte", ByteType, true)) 46 | .add(StructField("short", ShortType, true)) 47 | 48 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 49 | } 50 | 51 | 52 | def prepareNullAndEmptyArrays() = { 53 | val data = Seq( 54 | Row(Array(null, null)) , 55 | Row(Array()), 56 | Row(null), 57 | Row(Array(null, Row(null))) 58 | ) 59 | val schema = new StructType() 60 | .add(StructField("name", ArrayType(StructType(List(StructField("name", StringType, true)))), true) 61 | ) 62 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 63 | df 64 | } 65 | 66 | def prepareSimpleDataArrayWithTime() : DataFrame = { 67 | 68 | val formatterTime1 = DateTimeFormatter.ofPattern("HH:mm:ss") 69 | val ls = LocalTime.parse("10:09:08", formatterTime1) 70 | val instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.systemDefault()).toInstant 71 | val timestamp = Timestamp.from(instant) 72 | 73 | val md = new MetadataBuilder().putString(Constants.MD_DATATYPE_OVERRIDE, Constants.MD_DATATYPE_OVERRIDE_TIME).build() 74 | val data = Seq( 75 | Row(Array(Row("RowOneArray1", 1, timestamp))), 76 | Row(Array(Row("RowTwoArray1", 3, timestamp), Row("RowTwoArray2", 4, timestamp))) 77 | ) 78 | val schema = new StructType() 79 | .add(StructField("name", ArrayType(StructType( 80 | List(StructField("name", StringType, true), 81 | StructField("number", IntegerType, true), 82 | StructField("aTime", TimestampType, true,md)))), 83 | true) 84 | ) 85 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data,1), schema) 86 | df 87 | } 88 | 89 | def prepareNestedDataArrays() : DataFrame = { 90 | val date= java.sql.Date.valueOf("2015-03-31") 91 | 92 | val formatterTime1 = DateTimeFormatter.ofPattern("HH:mm:ss") 93 | val ls = LocalTime.parse("10:09:08", formatterTime1) 94 | val instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.systemDefault()).toInstant 95 | val timestamp = Timestamp.from(instant) 96 | 97 | val data = Seq( Row(13, Row("Str1", true, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub1", Row(timestamp), Array(Row("RowOneArray1", 1, timestamp), Row("RowOneArray2", 2, timestamp))))) , 98 | Row(24, Row("Str2", false, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub2", Row(timestamp), Array(Row("RowTwoArray1", 3, timestamp), Row("RowTwoArray2", 4, timestamp), Row("RowTwoArray3", 5, timestamp), Row("RowTwoArray4", 6, timestamp))))) 99 | ) 100 | 101 | val schema = new StructType() 102 | .add(StructField("id", IntegerType, true)) 103 | .add(StructField("details", new StructType() 104 | .add(StructField("name", StringType, true)) 105 | .add(StructField("flag", BooleanType, true)) 106 | .add(StructField("salary", DoubleType, true)) 107 | .add(StructField("phone", LongType, true)) 108 | .add(StructField("dob", DateType, true)) 109 | 
.add(StructField("weight", DecimalType(DECIMAL_PRECISION,1), true)) 110 | .add(StructField("time", TimestampType, true)) 111 | .add(StructField("subRow", new StructType() 112 | .add(StructField("name", StringType, true)) 113 | .add(StructField("level3", new StructType() 114 | .add(StructField("time1", TimestampType, true)) 115 | ) 116 | ) 117 | .add(StructField("hit_songs", ArrayType(StructType(List(StructField("name", StringType, true), 118 | StructField("number", IntegerType, true), 119 | StructField("aTime", TimestampType, true))), true), true)) 120 | ) 121 | ))) 122 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data,1), schema) 123 | df 124 | } 125 | 126 | def prepareNestedData(): DataFrame = { 127 | val date= java.sql.Date.valueOf("2015-03-31") 128 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 129 | val data = Seq( 130 | Row(13, Row("Str1", true, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub1", Row(timestamp)))) , 131 | Row(24, Row("Str2", false, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub2", Row(timestamp)))) 132 | ) 133 | 134 | val schema = new StructType() 135 | .add(StructField("id", IntegerType, true)) 136 | .add(StructField("details", new StructType() 137 | .add(StructField("name", StringType, true)) 138 | .add(StructField("flag", BooleanType, true)) 139 | .add(StructField("salary", DoubleType, true)) 140 | .add(StructField("phone", LongType, true)) 141 | .add(StructField("dob", DateType, true)) 142 | .add(StructField("weight", DecimalType(DECIMAL_PRECISION,1), true)) 143 | .add(StructField("time", TimestampType, true)) 144 | .add(StructField("subRow", new StructType() 145 | .add(StructField("name", StringType, true)) 146 | .add(StructField("level3", new StructType() 147 | .add(StructField("time1", TimestampType, true)) 148 | ) 149 | ) 150 | ) 151 | ))) 152 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 153 | df 154 | } 155 | 156 | def prepareNullData(): DataFrame = { 157 | 158 | val data = Seq( 159 | Row(null, null, null, null,null, null, null, null, null, null, null) 160 | ) 161 | 162 | val schema = new StructType() 163 | .add(StructField("name", StringType, true)) 164 | .add(StructField("id", IntegerType, true)) 165 | .add(StructField("flag", BooleanType, true)) 166 | .add(StructField("salary", DoubleType, true)) 167 | .add(StructField("phone", LongType, true)) 168 | .add(StructField("dob", DateType, true)) 169 | .add(StructField("weight", DecimalType(Constants.DECIMAL_PRECISION,7), true)) 170 | .add(StructField("time", TimestampType, true)) 171 | .add(StructField("float", FloatType, true)) 172 | .add(StructField("byte", ByteType, true)) 173 | .add(StructField("short", ShortType, true)) 174 | 175 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 176 | } 177 | 178 | } 179 | -------------------------------------------------------------------------------- /test/spark-cdm-connector-assembly-0.18.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/test/spark-cdm-connector-assembly-0.18.2.jar -------------------------------------------------------------------------------- /test/spark-cdm-connector-assembly-permissive.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/test/spark-cdm-connector-assembly-permissive.jar -------------------------------------------------------------------------------- /test/tests: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
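A minimal end-to-end sketch of how the write path in this section is driven from Spark, reusing TestData from the tests above. The option keys compression, dataFolderFormat and useSubManifest are taken from CDMWriteOptions; storage, manifestPath, entity and format, as well as the short format name "com.microsoft.cdm", are assumptions about the base CDMOptions/DefaultSource classes, which are not part of this excerpt, and all account, container and entity values are placeholders.

import org.apache.spark.sql.SparkSession
import com.microsoft.cdm.test.TestData

object CDMWriteSample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cdm-write-sample").getOrCreate()
    // Build a dataframe covering all primitive types supported by the connector
    val df = new TestData(spark).prepareDataWithAllTypes()

    df.write
      .format("com.microsoft.cdm")                          // assumed short name registered by DefaultSource
      .option("storage", "myaccount.dfs.core.windows.net")  // placeholder account (parsed in CDMOptions, not shown)
      .option("manifestPath", "mycontainer/cdmdata/default.manifest.cdm.json") // placeholder manifest path
      .option("entity", "AllTypes")                         // placeholder entity name
      .option("format", "parquet")                          // selects the ParquetWriterConnector branch
      .option("compression", "gzip")                        // validated via getCompression / CompressionCodecName.fromConf
      .option("dataFolderFormat", "'year'yyyy'/month'MM")   // formatted with DateTimeFormatter.ofPattern
      .option("useSubManifest", "true")
      .mode("overwrite")                                    // routed through CDMWriterBuilder.overwrite
      .save()

    spark.stop()
  }
}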