├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ └── config.yml ├── .gitignore ├── DatabricksTokenProviderMock ├── DatabricksADTokenMock │ └── src │ │ └── main │ │ └── scala │ │ └── Token.scala └── src │ └── main │ └── scala │ └── Provider.scala ├── LICENSE ├── README.md ├── SECURITY.md ├── artifacts └── spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar ├── build.sbt ├── documentation └── overview.md ├── lib ├── SparkCustomEvents-1.0.0.jar ├── cdmstandardsandmodels-1.0.1.jar ├── hdinsight-spark-metrics_2_4-1.2.jar ├── mdsdclient-1.0.jar ├── objectmodel-1.7.3.jar ├── peregrine-tools-0.2.0-SNAPSHOT.jar ├── spark-enhancement_2.11-2.4.2.jar └── tokenlibrary_2.11-1.0.jar ├── project ├── build.properties └── plugins.sbt ├── samples ├── Contacts │ ├── Contacts.manifest.cdm.json │ ├── Customer.cdm.json │ ├── CustomerCategory.cdm.json │ ├── Entity.cdm.json │ ├── NestedExample.cdm.json │ ├── Person.cdm.json │ ├── TrackedEntity.cdm.json │ ├── _salesimports.cdm.json │ └── config.json ├── SparkCDMsample.scala └── SparkCDMsamplePython.ipynb ├── src ├── main │ ├── main.iml │ └── scala │ │ └── com │ │ └── microsoft │ │ └── cdm │ │ ├── CDMCatalog.scala │ │ ├── CDMIdentifier.scala │ │ ├── DefaultSource.scala │ │ ├── HadoopTables.scala │ │ ├── SparkTable.scala │ │ ├── log │ │ └── SparkCDMLogger.scala │ │ ├── read │ │ ├── CDMDataReader.scala │ │ ├── CDMInputPartition.scala │ │ ├── CDMPartitionReader.scala │ │ ├── CDMPartitionReaderFactory.scala │ │ ├── CDMReadOptions.scala │ │ ├── CDMScanBuilder.scala │ │ ├── CDMSimpleScan.scala │ │ ├── CSVReaderConnector.scala │ │ ├── ParquetReaderConnector.scala │ │ └── ReaderConnector.scala │ │ ├── utils │ │ ├── CDMAuthentication.scala │ │ ├── CDMModelCommon.scala │ │ ├── CDMModelReader.scala │ │ ├── CDMModelWriter.scala │ │ ├── CDMOptions.scala │ │ ├── CDMParquetSchemaConverter.scala │ │ ├── CDMSASTokenProvider.scala │ │ ├── CDMTokenProvider.scala │ │ ├── CDMUtils.scala │ │ ├── CdmAdapterProvider.scala │ │ ├── Constants.scala │ │ ├── CsvParserFactory.scala │ │ ├── DataConverter.scala │ │ ├── DateTimeFormatterHelper.scala │ │ ├── Messages.scala │ │ ├── OverridenCdmStandardsAdapter.scala │ │ ├── SerializedABFSHadoopConf.scala │ │ ├── SparkSerializableConfiguration.scala │ │ ├── StructTypeMetadata.scala │ │ └── TimestampFormatter.scala │ │ └── write │ │ ├── CDMBatchWriter.scala │ │ ├── CDMDataWriter.scala │ │ ├── CDMDataWriterFactory.scala │ │ ├── CDMWriteOptions.scala │ │ ├── CDMWriterBuilder.scala │ │ ├── CSVWriterConnector.scala │ │ ├── FileCommitMessage.scala │ │ ├── ParquetWriterConnector.scala │ │ └── WriterConnector.scala └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── microsoft │ └── cdm │ └── test │ ├── CDMADLS.scala │ ├── CDMUnitTests.scala │ └── TestData.scala └── test ├── spark-cdm-connector-assembly-0.18.2.jar ├── spark-cdm-connector-assembly-permissive.jar └── tests /.gitattributes: -------------------------------------------------------------------------------- 1 | .gitattributes text eol=lf 2 | .gitignore text eol=lf 3 | *.build text eol=lf 4 | *.scala 5 | Makefile text eol=lf 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report or report an issue 3 | title: "[Issue] Summary here" 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | # Instructions 10 | 11 | Please follow the 
instructions below. Failure to do so may result in your issue being closed. 12 | 13 | 1. Provide a good title starting with [Bug] or [Issue]. 14 | 2. Fill out all sections of this bug report form. 15 | 16 | - type: dropdown 17 | attributes: 18 | label: Did you read the pinned issues and search the error message? 19 | description: >- 20 | Some users might encounter the same error or the fix is already addressed. You can locate the [pinned issues here](https://github.com/Azure/spark-cdm-connector/issues). 21 | Paste some key words in the error message into the search bar and then click on "In this repository" option to search the repo. 22 | options: 23 | - No, but I will read and search it now before creating an issue. 24 | - Yes, but I didn't find the answer. 25 | validations: 26 | required: true 27 | 28 | - type: markdown 29 | attributes: 30 | value: | 31 | # Details 32 | 33 | - type: textarea 34 | attributes: 35 | label: Summary of issue 36 | description: >- 37 | Describe the issue you faced in this section. 38 | Include the code you tried to execute and enclose with ` ``` ` (on its own line) before and after to make it legible. 39 | Include any details about your dataframe or CDM schema if you think it helps explain the issue. 40 | placeholder: | 41 | I followed these steps and ran into an error. The full error stack trace is included in the next section. 42 | ``` 43 | The code you tried to run. 44 | ``` 45 | validations: 46 | required: true 47 | 48 | - type: textarea 49 | attributes: 50 | label: Error stack trace 51 | description: >- 52 | Add the **full error stack trace** if applicable. The UI should probably show it or you can go into the driver logs and get it. 53 | placeholder: | 54 | ``` 55 | Some error stack trace here. 56 | ``` 57 | validations: 58 | required: false 59 | 60 | - type: markdown 61 | attributes: 62 | value: | 63 | # Platform and Setup 64 | 65 | - type: input 66 | attributes: 67 | label: Platform name 68 | description: "What platform are you using? Azure Synapse?" 69 | validations: 70 | required: true 71 | 72 | - type: input 73 | attributes: 74 | label: Spark version 75 | description: "What Spark version is the platform running?" 76 | validations: 77 | required: true 78 | 79 | - type: input 80 | attributes: 81 | label: CDM jar version 82 | description: | 83 | What jar version are you using? If you don't know, you can also run this scala code: `com.microsoft.cdm.BuildInfo.version` 84 | validations: 85 | required: true 86 | 87 | - type: dropdown 88 | attributes: 89 | label: What is the format of the data you are trying to read/write? 90 | description: >- 91 | If reading, you can look into the storage account folder and it is likely "csv". 92 | If writing, the default is "csv", unless you specify "parquet" in the write options. 
93 | options: 94 | - .csv 95 | - .parquet 96 | validations: 97 | required: true 98 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # disable blank issue creation 2 | blank_issues_enabled: false 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .metals/ 3 | .vscode/ 4 | target/ 5 | sbt.json 6 | build.properties 7 | -------------------------------------------------------------------------------- /DatabricksTokenProviderMock/DatabricksADTokenMock/src/main/scala/Token.scala: -------------------------------------------------------------------------------- 1 | package shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.oauth2 2 | class AzureADToken{ 3 | def getAccessToken(): String = { 4 | throw new Exception("This function should never be called") 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /DatabricksTokenProviderMock/src/main/scala/Provider.scala: -------------------------------------------------------------------------------- 1 | package com.databricks.backend.daemon.data.client.adl 2 | import shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.oauth2.AzureADToken 3 | 4 | class AdlGen2CredentialContextTokenProvider { 5 | def getToken(): AzureADToken = { 6 | throw new Exception("Error - this method should never be called") 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ❗IMPORTANT NOTICE❗ 2 | The CDM library, which this connector is reliant on, is deprecating the CDM Schema Store. Please upgrade your connector version to the latest version [spark3.3-1.19.7](https://github.com/Azure/spark-cdm-connector/releases/tag/spark3.3-1.19.7) to ensure there is no disruption in your workflows. 
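To confirm which connector version a cluster is currently running, you can evaluate the connector's build metadata from a Scala cell (the same check the bug-report template suggests). A minimal sketch:

```scala
// Prints the version compiled into the jar by sbt-buildinfo, e.g. "spark3.3-1.19.7"
println(com.microsoft.cdm.BuildInfo.version)
```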
3 | Full details in: https://github.com/Azure/spark-cdm-connector/issues/162 4 | 5 | # spark-cdm-connector 6 | 7 | The Connector is now Generally Available in **Azure Spark for Azure Synapse**. The connector allows Spark dataframes to read and write entities in a CDM folder format residing on ADLS. To get started, please see [Using the Spark CDM Connector](documentation/overview.md). 8 | 9 | Be sure to check the [issues](https://github.com/Azure/spark-cdm-connector/issues) and search the error message before sending mail to asksparkcdm@microsoft.com for questions or feedback. 10 | 11 | For more information about CDM see: https://docs.microsoft.com/en-us/common-data-model/ 12 | 13 | Samples to use the connector with Python and Scala can be found here: 14 | - [Python sample](samples/SparkCDMsamplePython.ipynb) 15 | - [Scala sample](samples/SparkCDMsample.scala) 16 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /artifacts/spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/artifacts/spark-cdm-connector-assembly-synapse-spark3.3-1.19.7.jar -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-cdm-connector" 2 | 3 | //the groupid 4 | organization := "com.microsoft.azure" 5 | 6 | // skip all dependencies in the pom file. This is an uber jar 7 | // refernce: https://stackoverflow.com/questions/41670018/how-to-prevent-sbt-to-include-test-dependencies-into-the-pom 8 | 9 | import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} 10 | import scala.xml.transform.{RewriteRule, RuleTransformer} 11 | pomPostProcess := { (node: XmlNode) => 12 | new RuleTransformer(new RewriteRule { 13 | override def transform(node: XmlNode): XmlNodeSeq = node match { 14 | case e: Elem if e.label == "dependency" => scala.xml.NodeSeq.Empty 15 | case _ => node 16 | } 17 | }).transform(node).head 18 | } 19 | 20 | version := "spark3.3-1.19.7" 21 | 22 | crossPaths := false 23 | ThisBuild / scalaVersion := "2.12.15" 24 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.8" % "test" 25 | 26 | libraryDependencies += "com.fasterxml.jackson.datatype" % "jackson-datatype-jdk8" % "2.13.4" 27 | libraryDependencies += "com.fasterxml.jackson.datatype" % "jackson-datatype-jsr310" % "2.13.4" 28 | 29 | //these libraries already exist in spark HDI 2.4.0 - don't include them building the uber jar 30 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.13.4.1" 31 | libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.13.4" 32 | libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.13.4" 33 | libraryDependencies += "com.fasterxml.jackson.core" % "jackson-annotations" % "2.13.4" 34 | libraryDependencies += "org.apache.commons" % "commons-lang3" % "3.12.0" % "provided" 35 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.3.0" % "provided" 36 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.3.0" % "provided" 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.13" % "provided" 38 | libraryDependencies += "com.google.guava" % "guava" % "14.0.1" % "provided" 39 | libraryDependencies += "commons-io" % "commons-io" % "2.11.0" % "provided" 40 | libraryDependencies += "com.microsoft.azure" % "msal4j" % "1.10.1" 41 | libraryDependencies += "com.microsoft.commondatamodel" % "cdmstandards" % "2.8.0" 42 | libraryDependencies += "org.apache.hadoop" % "hadoop-azure" % "3.3.1" % "provided" 43 | libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.3.1" % "provided" 44 | 45 | resolvers += "Maven Twitter Releases" at "https://maven.twttr.com/" 46 | libraryDependencies += "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.20" 
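// Aside: the organization, name, version, and crossPaths settings above determine the artifact
// coordinates used when this build is published. A consuming sbt project that wanted a prebuilt
// connector rather than building this uber jar would, assuming those coordinates are what gets
// published, declare roughly:
// libraryDependencies += "com.microsoft.azure" % "spark-cdm-connector" % "spark3.3-1.19.7"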
47 | 48 | // The main module depends on the mock'd databricks DataricksokenProvider classes. 49 | lazy val root = (project in file(".")). 50 | enablePlugins(BuildInfoPlugin). 51 | settings( 52 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion), 53 | buildInfoPackage := "com.microsoft.cdm" 54 | ).dependsOn(child) 55 | 56 | // Create the Databricks mocking library without including DataBricks jars. Their only purpose is to enable 57 | // compilation. When we deploy to Databricks, these classes will already exist as part of the Databricks runtime. 58 | // Child and grandchild represent the two mocked Databricks libraries, whose classes are removed when building 59 | // an uber jar -- see assemblyMergeStrategy below. 60 | lazy val child = Project("DatabricksTokenProviderMock", file("DatabricksTokenProviderMock")) 61 | .settings().dependsOn(grandchild) 62 | lazy val grandchild = Project("DatabricksADTokenMock", file("DatabricksTokenProviderMock/DatabricksADTokenMock")) 63 | .settings() 64 | 65 | //assembly 66 | assemblyShadeRules in assembly := Seq( 67 | ShadeRule.rename("com.microsoft.aad.msal4j.**" -> "shadeiomsal4j.@1").inAll, 68 | ShadeRule.rename("com.fasterxml.jackson.**" -> "shadeio.@1").inAll, 69 | ShadeRule.rename("com.nimbusds.**" -> "shadeionimbusds.@1").inAll, 70 | ShadeRule.rename("net.minidev.**" -> "shadeiominidev.@1").inAll 71 | ) 72 | 73 | 74 | assemblyExcludedJars in assembly := { 75 | val cp = (fullClasspath in assembly).value 76 | cp filter { f => 77 | f.data.getName.contains("tokenlibrary") || 78 | f.data.getName.contains("SparkCustomEvents") || 79 | f.data.getName.contains("hdinsight") || 80 | f.data.getName.contains("peregrine") || 81 | f.data.getName.contains("mdsdclient") || 82 | f.data.getName.contains("spark-enhancement") 83 | } 84 | } 85 | 86 | // build an uber jar 87 | assemblyMergeStrategy in assembly := { 88 | case PathList("META-INF", xs@_*) => MergeStrategy.discard 89 | //the stubbed-out Databricks jars don't show up in "assemblyExcludedJars" to remove, so manually removing the mocking classes 90 | case "shaded/databricks/v20180920_b33d810/org/apache/hadoop/fs/azurebfs/oauth2/AzureADToken.class" => MergeStrategy.discard 91 | case "com/databricks/backend/daemon/data/client/adl/AdlGen2CredentialContextTokenProvider.class" => MergeStrategy.discard 92 | case x => MergeStrategy.first 93 | } 94 | 95 | // don't bring scala classes into uber jar 96 | assemblyOption in assembly ~= { _.copy(includeScala = false) } 97 | 98 | // don't run tests with "sbt assembly" 99 | test in assembly := {} 100 | 101 | // Below is for publishing 102 | artifact in (Compile, packageBin) := { 103 | val art = (artifact in (Compile, packageBin)).value 104 | art.withClassifier(Some("")) 105 | } 106 | 107 | addArtifact(artifact in (Compile, packageBin), assembly) 108 | 109 | // Your profile name of the sonatype account. The default is the same with the organization value 110 | sonatypeProfileName := "com.microsoft.azure" 111 | 112 | // To sync with Maven central, you need to supply the following information: 113 | publishMavenStyle := true 114 | 115 | // Open-source license of your choice 116 | licenses := Seq("APL2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt")) 117 | 118 | // Where is the source code hosted: GitHub or GitLab? 
119 | import xerial.sbt.Sonatype._ 120 | sonatypeProjectHosting := Some(GitHubHosting("bissont", "Spark-CDM", "tibisso@microsoft.com")) 121 | 122 | // or if you want to set these fields manually 123 | homepage := Some(url("https://github.com/Azure/spark-cdm-connector")) 124 | scmInfo := Some( 125 | ScmInfo( 126 | url("https://github.com/Azure/new-spark-cdm"), 127 | "scm:git@github.com:Azure/new-spark-cdm.git" 128 | ) 129 | ) 130 | developers := List( 131 | Developer(id="tibisso", name="Timothy Bisson", email="tibisso@microsoft.com", url=url("https://github.com/bissont")), 132 | Developer(id="srruj", name="Sricheta Ruj", email="Sricheta.Ruj@microsoft.com", url=url("https://github.com/sricheta92")) 133 | ) 134 | 135 | // Remove all additional repository other than Maven Central from POM 136 | ThisBuild / pomIncludeRepository := { _ => false } 137 | ThisBuild / publishTo := sonatypePublishToBundle.value 138 | 139 | ThisBuild / publishConfiguration := publishConfiguration.value.withOverwrite(true) 140 | ThisBuild / publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true) 141 | -------------------------------------------------------------------------------- /lib/SparkCustomEvents-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/SparkCustomEvents-1.0.0.jar -------------------------------------------------------------------------------- /lib/cdmstandardsandmodels-1.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/cdmstandardsandmodels-1.0.1.jar -------------------------------------------------------------------------------- /lib/hdinsight-spark-metrics_2_4-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/hdinsight-spark-metrics_2_4-1.2.jar -------------------------------------------------------------------------------- /lib/mdsdclient-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/mdsdclient-1.0.jar -------------------------------------------------------------------------------- /lib/objectmodel-1.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/objectmodel-1.7.3.jar -------------------------------------------------------------------------------- /lib/peregrine-tools-0.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/peregrine-tools-0.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /lib/spark-enhancement_2.11-2.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/spark-enhancement_2.11-2.4.2.jar -------------------------------------------------------------------------------- /lib/tokenlibrary_2.11-1.0.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/lib/tokenlibrary_2.11-1.0.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.5 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.4") 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.0-M2") 4 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") 5 | addDependencyTreePlugin 6 | -------------------------------------------------------------------------------- /samples/Contacts/Contacts.manifest.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.manifest.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "manifestName": "Contacts", 10 | "explanation": "A logical model of contacts", 11 | "entities": [ 12 | { 13 | "type": "LocalEntity", 14 | "entityName": "Person", 15 | "entityPath": "Person.cdm.json/Person" 16 | }, 17 | { 18 | "type": "LocalEntity", 19 | "entityName": "Entity", 20 | "entityPath": "Entity.cdm.json/Entity" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /samples/Contacts/Customer.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "_salesimports.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "Customer", 12 | "extendsEntity":"TrackedEntity", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "hasA", 16 | "dataType": "integer", 17 | "name": "CustomerId", 18 | "description": "Customer identifier" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "string", 23 | "name": "CustomerName", 24 | "displayName": "Name ", 25 | "maximumLength": 100 26 | }, 27 | { 28 | "purpose": "hasA", 29 | "dataType": "decimal", 30 | "name": "CreditLimit", 31 | "appliedTraits": [ 32 | { 33 | "traitReference": "is.dataFormat.numeric.shaped", 34 | "arguments": [ 35 | { 36 | "name": "precision", 37 | "value": "18" 38 | }, 39 | { 40 | "name": "scale", 41 | "value": "2" 42 | } 43 | ] 44 | } 45 | ] 46 | } 47 | ] 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /samples/Contacts/CustomerCategory.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "_salesimports.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "CustomerCategory", 12 | "extendsEntity":"TrackedEntity", 13 | "description": "The kind of customer - agent, wholesaler, etc.", 14 | "hasAttributes": [ 15 | { 16 | "dataType": 
"integer", 17 | "name": "CustomerCategoryId", 18 | "displayName": "Category ID" 19 | }, 20 | { 21 | "dataType": "string", 22 | "name": "CustomerCategoryName" 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /samples/Contacts/Entity.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "entityName": "Entity", 12 | "description": "A base entity type defining common attributes used by other entities", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "hasA", 16 | "dataType": "integer", 17 | "name": "identifier", 18 | "description": "Identifier of the entity" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "dateTime", 23 | "name": "createdTime", 24 | "description": "The UTC time this entity was created" 25 | } 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /samples/Contacts/NestedExample.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsonSchemaSemanticVersion" : "1.0.0", 3 | "imports" : [ 4 | { 5 | "corpusPath" : "cdm:/foundations.cdm.json" 6 | } 7 | ], 8 | "definitions" : [ 9 | { 10 | "entityName" : "NestedExample", 11 | "hasAttributes" : [ 12 | { 13 | "name" : "id", 14 | "dataType" : "integer" 15 | }, 16 | { 17 | "entity" : "details", 18 | "name" : "details", 19 | "resolutionGuidance" : { 20 | "imposedDirectives" : [ 21 | "structured", 22 | "noMaxDepth" 23 | ], 24 | "renameFormat" : "{m}" 25 | } 26 | } 27 | ] 28 | }, 29 | { 30 | "entityName" : "details", 31 | "hasAttributes" : [ 32 | { 33 | "name" : "name", 34 | "dataType" : "string" 35 | }, 36 | { 37 | "name" : "USCitizen", 38 | "dataType" : "boolean" 39 | }, 40 | { 41 | "name" : "salary", 42 | "dataType" : "double" 43 | }, 44 | { 45 | "name" : "phone", 46 | "dataType" : "bigInteger" 47 | }, 48 | { 49 | "name" : "birthDate", 50 | "dataType" : "date" 51 | }, 52 | { 53 | "name" : "bodyMassIndex", 54 | "dataType" : "decimal", 55 | "appliedTraits" : [ 56 | { 57 | "traitReference" : "is.dataFormat.numeric.shaped", 58 | "arguments" : [ 59 | { 60 | "name" : "precision", 61 | "value" : 5 62 | }, 63 | { 64 | "name" : "scale", 65 | "value" : 2 66 | } 67 | ] 68 | } 69 | ] 70 | }, 71 | { 72 | "name" : "createdTime", 73 | "dataType" : "dateTime" 74 | }, 75 | { 76 | "entity" : "address", 77 | "name" : "address", 78 | "resolutionGuidance" : { 79 | "imposedDirectives" : [ 80 | "structured", 81 | "noMaxDepth" 82 | ], 83 | "renameFormat" : "{m}" 84 | } 85 | } 86 | ] 87 | }, 88 | { 89 | "entityName" : "address", 90 | "hasAttributes" : [ 91 | { 92 | "name" : "zipcode", 93 | "dataType" : "string" 94 | }, 95 | { 96 | "entity" : "street", 97 | "name" : "street", 98 | "resolutionGuidance" : { 99 | "imposedDirectives" : [ 100 | "structured", 101 | "noMaxDepth" 102 | ], 103 | "renameFormat" : "{m}" 104 | } 105 | }, 106 | { 107 | "entity" : "songs", 108 | "name" : "songs", 109 | "resolutionGuidance" : { 110 | "imposedDirectives" : [ 111 | "structured", 112 | "noMaxDepth" 113 | ], 114 | "removedDirectives" : [ 115 | "normalized" 116 | ], 117 | "cardinality" : "many", 118 | "renameFormat" : "{m}" 119 | } 120 | } 121 | ] 122 | }, 123 | { 124 | "entityName" : "street", 125 | 
"hasAttributes" : [ 126 | { 127 | "name" : "streetName", 128 | "dataType" : "string" 129 | }, 130 | { 131 | "name" : "streetNumber", 132 | "dataType" : "integer" 133 | } 134 | ] 135 | }, 136 | { 137 | "entityName" : "songs", 138 | "hasAttributes" : [ 139 | { 140 | "name" : "name", 141 | "dataType" : "string" 142 | }, 143 | { 144 | "name" : "number", 145 | "dataType" : "integer" 146 | } 147 | ] 148 | } 149 | ] 150 | } -------------------------------------------------------------------------------- /samples/Contacts/Person.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | }, 8 | { 9 | "corpusPath": "Entity.cdm.json" 10 | } 11 | ], 12 | "definitions": [ 13 | { 14 | "entityName": "Person", 15 | "extendsEntity": "Entity", 16 | "description": "An individual", 17 | "hasAttributes": [ 18 | { 19 | "purpose": "hasA", 20 | "dataType": "string", 21 | "name": "firstName", 22 | "description": "Person's first name", 23 | "maximumLength": 100 24 | }, 25 | { 26 | "purpose": "hasA", 27 | "dataType": "string", 28 | "name": "lastName", 29 | "description": "Person's last name", 30 | "maximumLength": 100 31 | }, 32 | { 33 | "purpose": "hasA", 34 | "dataType": "date", 35 | "name": "birthDate", 36 | "description": "Person's date of birth" 37 | } 38 | ] 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /samples/Contacts/TrackedEntity.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/CDM/master/schemaDocuments/schema.cdm.json", 3 | "jsonSchemaSemanticVersion": "1.0.0", 4 | "imports": [ 5 | { 6 | "corpusPath": "cdm:/foundations.cdm.json" 7 | } 8 | ], 9 | "definitions": [ 10 | { 11 | "description": "An entity whose modification is tracked ", 12 | "entityName": "TrackedEntity", 13 | "hasAttributes": [ 14 | { 15 | "purpose": "modifiedOn", 16 | "dataType": "dateTime", 17 | "name": "ValidFrom", 18 | "description": "The date from which this record is valid" 19 | }, 20 | { 21 | "purpose": "hasA", 22 | "dataType": "dateTime", 23 | "name": "ValidTo", 24 | "description": "The date to which this record was valid" } 25 | ] 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /samples/Contacts/_salesimports.cdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsonSchemaSemanticVersion": "1.0.0", 3 | "imports": [ 4 | { 5 | "corpusPath": "cdm:/foundations.cdm.json" 6 | }, 7 | { 8 | "corpusPath": "Customer.cdm.json" 9 | }, 10 | { 11 | "corpusPath": "CustomerCategory.cdm.json" 12 | }, 13 | { 14 | "corpusPath": "core:/TrackedEntity.cdm.json" 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /samples/Contacts/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultNamespace" : "adls", 3 | "adapters" : [ 4 | { 5 | "type" : "adls", 6 | "config" : { 7 | "hostname" : "srichetastorage.dfs.core.windows.net", 8 | "root" : "/outputsubmanifest/example-public-standards", 9 | "tenant" : "72f988bf-86f1-41af-91ab-2d7cd011db47", 10 | "clientId" : "6c3f525f-bdcb-4677-bed6-24f0b43add13", 11 | "timeout" : 5000, 12 | 
"maximumTimeout" : 20000, 13 | "numberOfRetries" : 2 14 | }, 15 | "namespace" : "core" 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /samples/SparkCDMsample.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.Row 2 | import org.apache.spark.sql.SaveMode 3 | import org.apache.spark.sql.types.{ArrayType, BooleanType, DateType, Decimal, DecimalType, DoubleType, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType, TimestampType} 4 | 5 | // Databricks notebook source 6 | // Specifying appid, appkey and tenanid is optional in spark-cdm-connector-assembly-0.16.jar with Premium Databricks Cluster and Synapse 7 | val appid = "" 8 | val appkey = "" 9 | val tenantid = "" 10 | 11 | val storageAccountName = ".dfs.core.windows.net" 12 | 13 | 14 | // COMMAND ---------- 15 | 16 | // Implicit write case 17 | // Write a CDM entity with Parquet data files, entity definition is derived from the dataframe schema 18 | val date= java.sql.Date.valueOf("2015-03-31"); 19 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 20 | var data = Seq( 21 | Row("a", 1, true, 12.34, 6L, date, timestamp, Decimal(1.4337879), Decimal(999.00), Decimal(18.8)), 22 | Row("b", 1, true, 12.34, 6L, date, timestamp, Decimal(1.4337879), Decimal(999.00), Decimal(18.8)) 23 | ) 24 | 25 | var schema = new StructType() 26 | .add(StructField("name", StringType, true)) 27 | .add(StructField("id", IntegerType, true)) 28 | .add(StructField("flag", BooleanType, true)) 29 | .add(StructField("salary", DoubleType, true)) 30 | .add(StructField("phone", LongType, true)) 31 | .add(StructField("dob", DateType, true)) 32 | .add(StructField("time", TimestampType, true)) 33 | .add(StructField("decimal1", DecimalType(15, 3), true)) 34 | .add(StructField("decimal2", DecimalType(38, 7), true)) 35 | .add(StructField("decimal3", DecimalType(5, 2), true)) 36 | 37 | var df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 38 | 39 | // Creates the CDM manifest and adds the entity to it with gzip'd parquet partitions 40 | // with both physical and logical entity definitions 41 | df.write.format("com.microsoft.cdm") 42 | .option("storage", storageAccountName) 43 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 44 | .option("entity", "TestEntity") 45 | .option("format", "parquet") 46 | .option("compression", "gzip") 47 | .save() // If table already exists, add .mode(SaveMode.Overwrite) or delete the /implicitTest folder 48 | 49 | // Append the same dataframe content to the entity in the default CSV format 50 | df.write.format("com.microsoft.cdm") 51 | .option("storage", storageAccountName) 52 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 53 | .option("entity", "TestEntity") 54 | .option("delimiter", ";") // Specify what delimiter will be set in the CSV file. 
Default is comma 55 | .option("columnHeaders", false) // Specify a boolean value - where column header will be shown or not 56 | .option("dataFolderFormat", "'year'yyyy'/month'MM") // Specify data partitions folder with DateTimeFormatter format 57 | .option("cdmSource", "builtin") // This fetches the foundation definitions from CDM SDK library 58 | .mode(SaveMode.Append) 59 | .save() 60 | 61 | var readDf = spark.read.format("com.microsoft.cdm") 62 | .option("storage", storageAccountName) 63 | .option("manifestPath", container + "/implicitTest/default.manifest.cdm.json") 64 | .option("entity", "TestEntity") 65 | .load() 66 | 67 | readDf.select("*").show() 68 | 69 | 70 | // COMMAND ---------- 71 | 72 | // Explicit write, creating an entity in a CDM folder based on a pre-defined model 73 | 74 | // Case 1: Using an entity definition defined in the CDM Github repo 75 | 76 | var data = Seq( 77 | Row("1", "2", "3", 4L), Row("4", "5", "6", 8L),Row("7", "8", "9", 4L),Row("10", "11", "12", 8L),Row("13", "14", "15", 4L)) 78 | var schema = new StructType() 79 | .add(StructField("teamMembershipId", StringType, true)) 80 | .add(StructField("systemUserId", StringType, true)) 81 | .add(StructField("teamId", StringType, true)) 82 | .add(StructField("versionNumber", LongType, true)) 83 | 84 | var df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) 85 | df.write.format("com.microsoft.cdm") 86 | .option("storage", storageAccountName) 87 | .option("manifestPath", container + "/explicitTest/root.manifest.cdm.json") 88 | .option("entity", "TeamMembership") 89 | .option("entityDefinitionPath", "core/applicationCommon/TeamMembership.cdm.json/TeamMembership") 90 | .option("useCdmStandardModelRoot", true) // sets the model root to the CDM CDN schema documents folder 91 | .option("useSubManifest", true) 92 | .save() // If table already exists, add .mode(SaveMode.Overwrite) 93 | 94 | var readDf = spark.read.format("com.microsoft.cdm") 95 | .option("storage", storageAccountName) 96 | .option("manifestPath", container + "/explicitTest/root.manifest.cdm.json") 97 | .option("entity", "TeamMembership") 98 | .load() 99 | readDf.select("*").show() 100 | 101 | 102 | // COMMAND ---------- 103 | 104 | // Explicit write, creating an entity in a CDM folder based on a pre-defined model 105 | 106 | // Case 2: Using an entity definition defined in a CDM model stored in ADLS 107 | 108 | // UPLOAD CDM FILES FIRST 109 | // To run this example, first create a /Models/Contacts folder to your demo container in ADLS gen2, 110 | // then upload the provided Contacts.manifest.cdm.json, Person.cdm.json, Entity.cdm.json files 111 | 112 | val birthdate= java.sql.Date.valueOf("1991-03-31"); 113 | val now = new java.sql.Timestamp(System.currentTimeMillis()); 114 | val data2 = Seq( 115 | Row(1,now,"Donna","Carreras",birthdate), 116 | Row(2,now,"Keith","Harris",birthdate), 117 | Row(2,now,"Carla","McGee",birthdate) 118 | ) 119 | 120 | val schema2 = new StructType() 121 | .add(StructField("identifier", IntegerType)) 122 | .add(StructField("createdTime", TimestampType)) 123 | .add(StructField("firstName", StringType)) 124 | .add(StructField("lastName", StringType)) 125 | .add(StructField("birthDate", DateType)) 126 | 127 | // Create the dataframe that matches the CDM definition of the entity, Person 128 | val df2 = spark.createDataFrame(spark.sparkContext.parallelize(data2, 1), schema2) 129 | df2.write.format("com.microsoft.cdm") 130 | .option("storage", storageAccountName) 131 | .option("manifestPath", container + 
"/Data/Contacts/root.manifest.cdm.json") 132 | .option("entity", "Person") 133 | .option("entityDefinitionModelRoot", container + "/Models") 134 | .option("entityDefinitionPath", "/Contacts/Person.cdm.json/Person") 135 | .save() // If table already exists, add .mode(SaveMode.Overwrite) 136 | 137 | val readDf2 = spark.read.format("com.microsoft.cdm") 138 | .option("storage", storageAccountName) 139 | .option("manifestPath", container + "/Data/Contacts/root.manifest.cdm.json") 140 | .option("entity", "Person") 141 | .load() 142 | readDf2.select("*").show() 143 | 144 | 145 | // COMMAND ---------- 146 | 147 | // Override Config Path 148 | 149 | val timestamp1 = new java.sql.Timestamp(System.currentTimeMillis()); 150 | val timestamp2 = new java.sql.Timestamp(System.currentTimeMillis()); 151 | val cdata = Seq( 152 | Row( timestamp1, timestamp2,1, "A", Decimal(33.5)), 153 | Row( timestamp1, timestamp2, 2, "B", Decimal(42.1)), 154 | Row( timestamp1, timestamp2, 3, "C", Decimal(7.90)) 155 | ) 156 | 157 | val cschema = new StructType() 158 | .add(StructField("ValidFrom", TimestampType, true)) 159 | .add(StructField("ValidTo", TimestampType, true)) 160 | .add(StructField("CustomerId", IntegerType, true)) 161 | .add(StructField("CustomerName", StringType, true)) 162 | .add(StructField("CreditLimit", DecimalType(18, 2), true)) 163 | 164 | val customerdf = spark.createDataFrame(spark.sparkContext.parallelize(cdata), cschema) 165 | 166 | customerdf.write.format("com.microsoft.cdm") 167 | .option("storage", storageAccountName) 168 | .option("manifestPath", outputContainer + "/customer/default.manifest.cdm.json") 169 | .option("entity", "TestEntity") 170 | .option("entityDefinitionPath", "Customer.cdm.json/Customer") // Customer.cdm.json has an alias - "core" 171 | .option("entityDefinitionModelRoot", container+ "Models") // fetches config.json from this location and finds definition of "core" alias, if configPath option is not present 172 | .option("configPath", "/config") // Add your config.json to override the above definition. config is the name of container. 
This will find config.json in container - "config" 173 | .option("entityDefinitionStorage", ".dfs.core.windows.net") // entityDefinitionModelRoot contains in this storage account 174 | .option("format", "parquet") 175 | .save() 176 | 177 | val readDf2 = spark.read.format("com.microsoft.cdm") 178 | .option("storage", storageAccountName) 179 | .option("manifestPath", outputContainer + "/customer/default.manifest.cdm.json") 180 | .option("entity", "TestEntity") 181 | .load() 182 | readDf2.select("*").show() 183 | 184 | // COMMAND ---------- 185 | 186 | // Nested Parquet Implicit & Explicit write 187 | 188 | val birthdate= java.sql.Date.valueOf("1991-03-31"); 189 | val now = new java.sql.Timestamp(System.currentTimeMillis()); 190 | val data = Seq( 191 | 192 | Row(13, Row("Donna Carreras", true, 12.34,6L, birthdate, Decimal(22.7), now, Row("95110", Row("Bose Street", 321), Array(Row("bieber1", 1), Row("bieber2", 2))))) , 193 | Row(24, Row("Keith Harris", false, 12.34,6L, birthdate, Decimal(22.7), now, Row("95134", Row("Estancia Dr", 185), Array(Row("baby1", 3), Row("baby2", 4), Row("baby3", 5), Row("baby4", 6))))) 194 | ) 195 | 196 | val schema = new StructType() 197 | .add(StructField("id", IntegerType, true)) 198 | .add(StructField("details", new StructType() 199 | .add(StructField("name", StringType, true)) 200 | .add(StructField("USCitizen", BooleanType, true)) 201 | .add(StructField("salary", DoubleType, true)) 202 | .add(StructField("phone", LongType, true)) 203 | .add(StructField("birthDate", DateType, true)) 204 | .add(StructField("bodyMassIndex", DecimalType(5,2), true)) 205 | .add(StructField("createdTime", TimestampType, true)) 206 | .add(StructField("address", new StructType() 207 | .add(StructField("zipcode", StringType, true)) 208 | .add(StructField("street", new StructType() 209 | .add(StructField("streetName", StringType, true)) 210 | .add(StructField("streetNumber", IntegerType, true)) 211 | ) 212 | ) 213 | .add(StructField("songs", ArrayType(StructType(List(StructField("name", StringType, true),StructField("number", IntegerType, true))), true), true)) 214 | ) 215 | ))) 216 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 217 | 218 | // Implicit write 219 | df.write.format("com.microsoft.cdm") 220 | .option("storage", storageAccountName) 221 | .option("manifestPath", outputContainer + "/nestedImplicit/default.manifest.cdm.json") 222 | .option("entity", "NestedExampleImplicit") 223 | .option("format", "parquet") 224 | .save() 225 | 226 | // Explicit write 227 | // To run this example, first create a /Models/Contacts folder to your demo container in ADLS gen2, 228 | // then upload the provided NestedExample.cdm.json file 229 | df.write.format("com.microsoft.cdm") 230 | .option("storage", storageAccountName) 231 | .option("manifestPath", outputContainer + "/nestedExplicit/default.manifest.cdm.json") 232 | .option("entity", "NestedExampleExplicit") 233 | .option("entityDefinitionPath", "/Contacts/NestedExample.cdm.json/NestedExample") 234 | .option("entityDefinitionModelRoot", container + "/Models") 235 | .option("format", "parquet") 236 | .save() 237 | 238 | val readImplicit = spark.read.format("com.microsoft.cdm") 239 | .option("storage", storageAccountName) 240 | .option("manifestPath", outputContainer + "/nestedImplicit/default.manifest.cdm.json") 241 | .option("entity", "NestedExampleImplicit") 242 | .load() 243 | 244 | val readExplicit = spark.read.format("com.microsoft.cdm") 245 | .option("storage", storageAccountName) 246 | 
.option("manifestPath", outputContainer + "/nestedExplicit/default.manifest.cdm.json") 247 | .option("entity", "NestedExampleExplicit") 248 | .load() 249 | 250 | readImplicit.show(false) 251 | readExplicit.show(false) 252 | 253 | -------------------------------------------------------------------------------- /src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/CDMCatalog.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import java.util 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CDMOptions, CdmAuthType, EntityNotFoundException, ManifestNotFoundException} 7 | import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException} 8 | import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableChange} 9 | import org.apache.spark.sql.connector.expressions.Transform 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.slf4j.LoggerFactory 13 | import org.slf4j.event.Level 14 | 15 | class CDMCatalog extends CatalogPlugin with TableCatalog with SupportsNamespaces { 16 | val logger = LoggerFactory.getLogger(classOf[CDMCatalog]) 17 | var cdmOptions: CDMOptions = _ 18 | var tables: HadoopTables = _ 19 | private var options: CaseInsensitiveStringMap = _ 20 | 21 | override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = { 22 | logger.info("Initializing CDM Catalog...") 23 | this.tables = new HadoopTables() 24 | } 25 | 26 | def setupOptions(options: CaseInsensitiveStringMap): Unit = { 27 | this.options = options 28 | } 29 | 30 | @throws(classOf[NoSuchTableException]) 31 | override def loadTable(ident: Identifier): SparkTable = { 32 | try { 33 | val cdmEntity = tables.load(new CDMOptions(options)) 34 | new SparkTable(cdmEntity.schema, options) 35 | } catch { 36 | case e: EntityNotFoundException => throw new NoSuchTableException(e.getMessage) 37 | case e: ManifestNotFoundException => throw new NoSuchTableException(e.getMessage) 38 | } 39 | } 40 | 41 | @throws(classOf[TableAlreadyExistsException]) 42 | override def createTable(ident: Identifier, schema: StructType, partitions: Array[Transform], properties: util.Map[String, String]): Table = { 43 | new SparkTable(schema, options) //make it write options 44 | } 45 | 46 | override def alterTable(ident: Identifier, changes: TableChange*): Table = { 47 | throw new UnsupportedOperationException("Not supported") 48 | } 49 | 50 | override def dropTable(ident: Identifier): Boolean = throw new UnsupportedOperationException("Not supported") 51 | 52 | override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = throw new UnsupportedOperationException("Not supported") 53 | 54 | override def listNamespaces(): Array[Array[String]] = throw new UnsupportedOperationException("Not supported") 55 | 56 | override def listNamespaces(namespace: Array[String]): Array[Array[String]] = throw new UnsupportedOperationException("Not supported") 57 | 58 | override def loadNamespaceMetadata(namespace: Array[String]): util.Map[String, String] = throw new UnsupportedOperationException("Not supported") 59 | 60 | override def 
createNamespace(namespace: Array[String], metadata: util.Map[String, String]): Unit = throw new UnsupportedOperationException("Not supported") 61 | 62 | override def alterNamespace(namespace: Array[String], changes: NamespaceChange*): Unit = throw new UnsupportedOperationException("Not supported") 63 | 64 | override def dropNamespace(namespace: Array[String], cascade: Boolean): Boolean = throw new UnsupportedOperationException("Not supported") 65 | 66 | override def listTables(namespace: Array[String]): Array[Identifier] = throw new UnsupportedOperationException("Not supported") 67 | 68 | override def toString = s"${this.getClass.getCanonicalName}($name)" 69 | 70 | override def name(): String = "cdm" 71 | 72 | private def getRequiredArgument(options: CaseInsensitiveStringMap, arg: String): String = { 73 | val result = if (options.containsKey(arg)) options.get(arg) else { 74 | throw new Exception(arg + "argument required") 75 | } 76 | result 77 | } 78 | 79 | def getAuthType(options: CaseInsensitiveStringMap): String = { 80 | val appIdPresent = options.containsKey("appId") 81 | val appKeyPresent = options.containsKey("appKey") 82 | val tenantIdPresent = options.containsKey("tenantId") 83 | val sasTokenPresent = options.containsKey("sasToken") 84 | val result = if (appIdPresent || appKeyPresent|| tenantIdPresent) { 85 | //make sure all creds are present 86 | if (!appIdPresent || !appKeyPresent || !tenantIdPresent) { 87 | throw new Exception("All creds must exist") 88 | } 89 | SparkCDMLogger.log(Level.INFO,"Using app registration for authentication", logger) 90 | CdmAuthType.AppReg.toString() 91 | } else if (sasTokenPresent) { 92 | SparkCDMLogger.log(Level.INFO,"Using SAS token for authentication", logger) 93 | CdmAuthType.Sas.toString() 94 | } else { 95 | SparkCDMLogger.log(Level.INFO, "Using managed identities for authentication", logger) 96 | CdmAuthType.Token.toString() 97 | } 98 | result 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/CDMIdentifier.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import com.microsoft.cdm.utils.CDMOptions 4 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 5 | import org.apache.spark.sql.connector.catalog.Identifier 6 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 7 | 8 | class CDMIdentifier(options: CaseInsensitiveStringMap) extends Identifier{ 9 | private val cdmOptions = new CDMOptions(options) // used to do option validation 10 | 11 | override def namespace(): Array[String] = Array(cdmOptions.storage, cdmOptions.container, cdmOptions.manifestFileName) 12 | 13 | override def name(): String = cdmOptions.entity 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.connector.catalog.{Identifier, SupportsCatalogOptions, Table} 5 | import org.apache.spark.sql.connector.expressions.Transform 6 | import org.apache.spark.sql.types.StructType 7 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 8 | import org.slf4j.LoggerFactory 9 | 10 | 11 | class DefaultSource extends SupportsCatalogOptions{ 12 | 13 | val logger = LoggerFactory.getLogger(classOf[DefaultSource]) 14 | 15 | 
override def inferSchema(options: CaseInsensitiveStringMap): StructType = { 16 | null 17 | } 18 | 19 | override def getTable(structType: StructType, transforms: Array[Transform], map: java.util.Map[String, String]): Table = { 20 | try{ 21 | val caseInsensitiveStringMap = new CaseInsensitiveStringMap(map) 22 | val schema = if (structType != null) { 23 | structType 24 | } else { 25 | inferSchema(caseInsensitiveStringMap) 26 | } 27 | new SparkTable(schema, caseInsensitiveStringMap) 28 | } catch { 29 | case _ : Exception => { 30 | null 31 | } 32 | } 33 | } 34 | 35 | override def supportsExternalMetadata(): Boolean = true 36 | 37 | override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = { 38 | val spark = SparkSession.active; 39 | spark.conf.set("spark.sql.catalog.cdm", "com.microsoft.cdm.CDMCatalog") 40 | val cdmcatalog = spark.sessionState.catalogManager.catalog("cdm") 41 | cdmcatalog.asInstanceOf[CDMCatalog].setupOptions(options) 42 | new CDMIdentifier(options) 43 | } 44 | 45 | override def extractCatalog(options: CaseInsensitiveStringMap): String = { 46 | "cdm" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/HadoopTables.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import com.microsoft.cdm.utils.{CDMEntity, CDMModelCommon, CDMOptions, CDMTokenProvider, CdmAuthType, EntityNotFoundException, ManifestNotFoundException, SerializedABFSHadoopConf, SparkSerializableConfiguration} 4 | 5 | class HadoopTables() { 6 | 7 | 8 | def load(cdmOptions: CDMOptions): CDMEntity = { 9 | val serializedHadoopConf = SerializedABFSHadoopConf.getConfiguration(cdmOptions.storage, cdmOptions.container, cdmOptions.auth, cdmOptions.conf) 10 | 11 | val tokenProvider = if (cdmOptions.auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopConf, cdmOptions.storage)) else None 12 | 13 | val cdmModel = new CDMModelCommon(cdmOptions.storage, 14 | cdmOptions.container, 15 | cdmOptions.manifestPath, 16 | cdmOptions.manifestFileName, 17 | cdmOptions.entity, 18 | "", 19 | "", 20 | cdmOptions.auth, tokenProvider, 21 | cdmOptions.overrideConfigPath, 22 | cdmOptions.cdmSource, 23 | "", 24 | cdmOptions.maxCDMThreads) 25 | 26 | val cdmEntity = cdmModel.entityExists(cdmOptions.entity, serializedHadoopConf) 27 | 28 | if(cdmEntity.rootManifest == null) { 29 | throw ManifestNotFoundException("Manifest doesn't exist: " + cdmOptions.manifestFileName) 30 | } 31 | if (cdmEntity.entityDec != null ) { 32 | cdmEntity.schema = cdmModel.getSchema(cdmEntity.parentManifest, cdmEntity.entityDec) 33 | cdmEntity 34 | } else { 35 | throw EntityNotFoundException("Entity " + cdmOptions.entity + " not found in manifest - " + cdmOptions.manifestFileName) 36 | } 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/SparkTable.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm 2 | 3 | import java.util 4 | 5 | import com.microsoft.cdm.read.{CDMReadOptions, CDMScanBuilder} 6 | import com.microsoft.cdm.write.{CDMWriteOptions, CDMWriterBuilder} 7 | import org.apache.spark.sql.SaveMode 8 | import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} 9 | import org.apache.spark.sql.connector.read.ScanBuilder 10 | import 
org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | class SparkTable(schema: StructType, options: CaseInsensitiveStringMap) extends Table 17 | with SupportsRead 18 | with SupportsWrite { 19 | 20 | 21 | override def name(): String = this.getClass.toString 22 | 23 | override def schema(): StructType = schema 24 | 25 | override def capabilities(): util.Set[TableCapability] = Set( 26 | TableCapability.ACCEPT_ANY_SCHEMA, 27 | TableCapability.BATCH_WRITE, 28 | TableCapability.BATCH_READ, 29 | TableCapability.OVERWRITE_BY_FILTER, 30 | TableCapability.OVERWRITE_DYNAMIC, 31 | TableCapability.TRUNCATE).asJava 32 | 33 | 34 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { 35 | new CDMScanBuilder(new CDMReadOptions(options)) 36 | } 37 | 38 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { 39 | new CDMWriterBuilder(info.queryId(), info.schema(), SaveMode.Append, new CDMWriteOptions(options)) 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/log/SparkCDMLogger.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.log 2 | 3 | import com.microsoft.cdm.utils.{Constants, Environment, SparkPlatform} 4 | import com.microsoft.spark.metricevents.{ComponentEventPublisher, ComponentSparkEvent} 5 | import org.slf4j.Logger 6 | import org.slf4j.event.Level 7 | 8 | object SparkCDMLogger { 9 | var APPNAME: String = "Spark-CDM Connector" 10 | 11 | // Application log 12 | def log(loglevel: Level, message: String, logger: Logger) = { 13 | loglevel match { 14 | case Level.ERROR => logger.error(message) 15 | case Level.INFO => logger.info(message) 16 | case Level.DEBUG => logger.debug(message) 17 | case Level.WARN => logger.warn(message) 18 | case _ => 19 | } 20 | } 21 | 22 | def logEventToKusto(className: String, methodName: String, loglevel: Level, message: String, logger: Option[Logger] = None): Unit = { 23 | if(logger.getOrElse(null) != null) { 24 | log(loglevel, message, logger.get) 25 | } 26 | // Log to Kusto only on Synapse 27 | if(SparkPlatform.Synapse == Environment.sparkPlatform && Constants.KUSTO_ENABLED) { 28 | val event = ComponentSparkEvent(APPNAME, className, methodName, None, None, None, Some(message), loglevel) 29 | ComponentEventPublisher.publishComponentEvent(event) 30 | } 31 | } 32 | 33 | /* Log event to kusto to know performance of @param code */ 34 | def logEventToKustoForPerf[T](code: => T, className: String, methodName: String, loglevel: Level, message: String, logger: Option[Logger] = None): T = { 35 | if(logger.getOrElse(null) != null) { 36 | log(loglevel, message, logger.get) 37 | } 38 | // Log to Kusto only on Synapse 39 | if(SparkPlatform.Synapse == Environment.sparkPlatform && Constants.KUSTO_ENABLED) { 40 | val event = ComponentSparkEvent(APPNAME, className, methodName, None, None, None, Some(message), loglevel) 41 | ComponentEventPublisher.publishComponentEventFor(code, event) 42 | }else{ 43 | code 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMDataReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | 4 | import 
com.microsoft.cdm.utils.{Constants, DataConverter, Messages} 5 | import org.apache.commons.io.IOUtils 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.read.PartitionReader 8 | import org.apache.spark.sql.types.StructType 9 | 10 | import java.io.{File, FileOutputStream, InputStream} 11 | 12 | /** 13 | * Reads a single partition of CDM data. 14 | * @param csvPath ADLSgen2 URI of partition data in CSV format. 15 | * @param schema Spark schema of the data in the CSV file 16 | * 17 | */ 18 | 19 | @SerialVersionUID(100L) 20 | class CDMDataReader(val storage: String, 21 | val container: String, 22 | val fileReader: ReaderConnector, 23 | val header: Boolean, 24 | var schema: StructType, 25 | var dataConverter: DataConverter, 26 | val mode: String) extends PartitionReader[InternalRow] with Serializable { 27 | var cnt = 0 28 | var row: Array[Any] = _ 29 | var stream: InputStream =_ 30 | var headerRead = false 31 | fileReader.build 32 | /** 33 | * Called by the Spark runtime. 34 | * @return Boolean indicating whether there is any data left to read. 35 | */ 36 | def next: Boolean = { 37 | if (header && !headerRead) { 38 | fileReader.readRow 39 | 40 | //TODO: verify header names match with what we have in CDM 41 | println("TODO: Verify header names match") 42 | headerRead = true 43 | } 44 | 45 | row = fileReader.readRow 46 | row != null 47 | } 48 | 49 | /** 50 | * Called by the Spark runtime if there is data left to read. 51 | * @return The next row of data. 52 | */ 53 | def get: InternalRow = { 54 | if(mode == Constants.FAILFAST && row.length != schema.fields.length) { 55 | throw new Exception(Messages.incompatibleFileWithDataframe) 56 | } 57 | var seq: Seq[Any] = null 58 | var setRowToNull = false 59 | // When there are more columns in the CSV file than the # of attributes in the cdm entity file. 60 | if (row.length > schema.fields.length) { 61 | seq = schema.zipWithIndex.map{ case (col, index) => 62 | val dataType = schema.fields(index).dataType 63 | fileReader.jsonToData(dataType, row.apply(index), mode) 64 | } 65 | } else if (row.length < schema.fields.length) { 66 | // When there are fewer columns in the CSV file the # of attributes in cdm entity file at the end 67 | seq = schema.zipWithIndex.map{ case (col, index) => 68 | if (index >= row.length) { 69 | null 70 | } else { 71 | val dataType = schema.fields(index).dataType 72 | fileReader.jsonToData(dataType, row.apply(index), mode) 73 | } 74 | } 75 | } else { 76 | seq = row.zipWithIndex.map { case (col, index) => 77 | val dataType = schema.fields(index).dataType 78 | fileReader.jsonToData(dataType, row.apply(index), mode) 79 | } 80 | } 81 | 82 | /* 83 | * If we want to return null for the entire row if any entity fails to be converted, uncomment 84 | */ 85 | /* 86 | if (!fileReader.isValidRow) { 87 | seq = schema.zipWithIndex.map { _ =>null} 88 | } 89 | */ 90 | 91 | InternalRow.fromSeq(seq) 92 | } 93 | 94 | /** 95 | * Called by the Spark runtime. 
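   * Invoked once the partition has been fully consumed; delegates to the underlying ReaderConnector's close().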
96 | */ 97 | def close(): Unit = { 98 | fileReader.close 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMInputPartition.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.utils.DataConverter 4 | import org.apache.spark.sql.connector.read.InputPartition 5 | import org.apache.spark.sql.types.StructType 6 | 7 | /** 8 | * Factory class for creating a CDMDataReader responsible for reading a single partition of CDM data. 9 | * @param remoteCSVPath ADLSgen2 URI of partition data in CSV format. 10 | * @param schema Spark schema of the data in the CSV file 11 | * @param adlProvider Provider for ADLSgen2 data 12 | * @param dataConverter Converts CSV data into types according to schema 13 | */ 14 | case class CDMInputPartition(val storage: String, 15 | val container: String, 16 | val fileReader: ReaderConnector, 17 | val header: Boolean, 18 | var schema: StructType, 19 | var dataConverter: DataConverter, 20 | val mode: String) extends InputPartition { 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMPartitionReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.InternalRow 5 | import org.apache.spark.sql.connector.read.PartitionReader 6 | import org.apache.spark.unsafe.types.UTF8String 7 | 8 | class CDMPartitionReader(inputPartition: CDMInputPartition) extends PartitionReader[InternalRow]{ 9 | 10 | var index = 0 11 | val values = Array("1", "2", "3", "4", "5") 12 | 13 | var iterator: Iterator[String] = null 14 | 15 | @transient 16 | def next: Boolean = index < values.length 17 | 18 | def get = { 19 | val stringValue = values(index) 20 | val stringUtf = UTF8String.fromString(stringValue) 21 | val row = InternalRow(stringUtf) 22 | index = index + 1 23 | row 24 | } 25 | 26 | def close() = Unit 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import org.apache.spark.sql.catalyst.InternalRow 4 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} 5 | import org.apache.spark.sql.types.StructType 6 | 7 | class CDMPartitionReaderFactory() extends PartitionReaderFactory { 8 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { 9 | val p = partition.asInstanceOf[CDMInputPartition] 10 | new CDMDataReader(p.storage, 11 | p.container, 12 | p.fileReader, 13 | p.header, 14 | p.schema, 15 | p.dataConverter, 16 | p.mode) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMReadOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.utils.{CDMOptions, Constants, Messages} 4 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 5 | 6 | class CDMReadOptions(options: CaseInsensitiveStringMap) extends CDMOptions(options) { 7 | 8 | Constants.MODE = "read" 
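  /*
   * Reader "mode" semantics (validated just below): "failfast" — the default — throws when a
   * row's column count differs from the entity schema or a date/time value cannot be parsed,
   * "permissive" turns such values into nulls instead, and "dropmalformed" is rejected as
   * unsupported. Illustrative usage: add .option("mode", "permissive") to the read.
   */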
9 | 10 | val mode = if(options.containsKey("mode")) options.get("mode") else Constants.FAILFAST; 11 | // if mode is specified, it needs to either failfast or permissive 12 | if(Constants.DROPMALFORMED.equalsIgnoreCase(mode)) { 13 | throw new IllegalArgumentException(String.format(Messages.dropMalformedNotSupported)) 14 | } else if(!Constants.PERMISSIVE.equalsIgnoreCase(mode) && !Constants.FAILFAST.equalsIgnoreCase(mode)) { 15 | throw new IllegalArgumentException(String.format(Messages.invalidMode)) 16 | } 17 | 18 | var entDefContAndPath = "" 19 | var entityDefinitionStorage = "" 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMScanBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | 4 | import com.microsoft.cdm.utils.{DataConverter} 5 | import org.apache.spark.sql.connector.read.{Scan, ScanBuilder} 6 | import org.slf4j.LoggerFactory 7 | 8 | class CDMScanBuilder (cdmOptions: CDMReadOptions) extends ScanBuilder { 9 | val logger = LoggerFactory.getLogger(classOf[CDMScanBuilder]) 10 | 11 | override def build(): Scan = new CDMSimpleScan(cdmOptions.storage, 12 | cdmOptions.container, 13 | cdmOptions.manifestPath, 14 | cdmOptions.manifestFileName, 15 | cdmOptions.entity, 16 | cdmOptions.entDefContAndPath, 17 | cdmOptions.auth, 18 | cdmOptions.conf, 19 | new DataConverter(), 20 | cdmOptions.cdmSource, 21 | cdmOptions.entityDefinitionStorage, 22 | cdmOptions.maxCDMThreads, 23 | cdmOptions.mode) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CDMSimpleScan.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import com.microsoft.cdm.log.SparkCDMLogger 4 | import com.microsoft.cdm.utils.{Auth, CDMModelReader, CDMSource, CDMTokenProvider, CdmAuthType, Constants, DataConverter, Messages, SerializedABFSHadoopConf, SparkSerializableConfiguration} 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan} 7 | import org.apache.spark.sql.types.StructType 8 | import org.slf4j.LoggerFactory 9 | import org.slf4j.event.Level 10 | import java.net.URLDecoder 11 | import com.microsoft.commondatamodel.objectmodel.cdm.CdmTraitReference 12 | import scala.collection.JavaConverters._ 13 | 14 | 15 | case class CDMPartition(partitionNumber: Int, header: Boolean=true) extends InputPartition 16 | 17 | class CDMSimpleScan(val storage: String, 18 | val container: String, 19 | val manifestPath: String, 20 | val manifestFileName: String, 21 | val entityName: String, 22 | val entDefContAndPath: String, 23 | val auth: Auth, 24 | val conf:Configuration, 25 | val dataConverter: DataConverter, 26 | val cdmSource: CDMSource.Value, 27 | val entityDefinitionStorage: String, 28 | val maxCDMThreads: Int, 29 | val mode: String) extends Scan with Batch{ 30 | 31 | val logger = LoggerFactory.getLogger(classOf[CDMSimpleScan]) 32 | 33 | val serializedHadoopOConf= SerializedABFSHadoopConf.getConfiguration(storage, container, auth, conf) 34 | 35 | val tokenProvider = if (auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopOConf, storage)) else None 36 | 37 | val cdmModel = new CDMModelReader(storage, container, manifestPath, manifestFileName, entityName, entDefContAndPath, auth, 
tokenProvider, cdmSource, entityDefinitionStorage, 38 | maxCDMThreads) 39 | 40 | //TODO: Make this a accessor class to retrieve tuple items 41 | var entity = cdmModel.entityDecHandleError(entityName, serializedHadoopOConf) 42 | 43 | override def readSchema() = { 44 | SparkCDMLogger.logEventToKustoForPerf({ 45 | cdmModel.getSchema(entity.parentManifest, entity.entityDec) 46 | },this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Reading CDM entity and convert it to spark schema", Some(logger)) 47 | } 48 | 49 | override def toBatch: Batch = this 50 | 51 | def getReader(fType: String, uriPath: String, filePath: String, schema: StructType, serializedHadoopConf: SparkSerializableConfiguration, delimiter: Char): ReaderConnector ={ 52 | return fType match { 53 | case "is.partition.format.CSV" => new CSVReaderConnector(uriPath, filePath, serializedHadoopConf, delimiter, mode) 54 | case "is.partition.format.parquet" => new ParquetReaderConnector(uriPath, filePath, schema, serializedHadoopConf) 55 | } 56 | } 57 | override def planInputPartitions(): Array[InputPartition] = { 58 | /* Fetch the partitions and their names from the CDMModel*/ 59 | val factoryList = new java.util.ArrayList[InputPartition] 60 | val man = entity.parentManifest 61 | val eDec = entity.entityDec 62 | 63 | // Calling fileStatusCheckAsync() on model.json breaks because the CDM library 64 | // assumes a "entity".cdm.json file exists 65 | if (manifestFileName != Constants.MODEL_JSON) { 66 | entity.entityDec.fileStatusCheckAsync().get() 67 | } 68 | 69 | for(partition <- eDec.getDataPartitions.asScala) { 70 | // the relative location of the path 71 | val loc = cdmModel.getRelPath(partition.getLocation) 72 | assert(!loc.startsWith("https:/")) 73 | val absPath = cdmModel.cdmCorpus.getStorage.createAbsoluteCorpusPath(loc,eDec) 74 | //The full path to data partition with the adapter stripped off 75 | val relPath= cdmModel.getRelPath(absPath) 76 | 77 | val uriPrefix = "https://"+storage+container 78 | 79 | // Decode strings because hadoop cannot parse URI-encoded strings 80 | val decodedFilePath = URLDecoder.decode(manifestPath + relPath, "UTF-8") 81 | 82 | //we track the header and pass it in to the reader so that we know if the first line is a header row 83 | var header = false 84 | val schema = readSchema(); 85 | var delimiter = Constants.DEFAULT_DELIMITER 86 | val fileReader = { 87 | 88 | if (partition.getExhibitsTraits.asScala.size > 0 && 89 | partition.getExhibitsTraits.asScala.find(_.getNamedReference.startsWith("is.partition.format")) != None) { 90 | val traits = partition.getExhibitsTraits.asScala.find(_.getNamedReference.startsWith("is.partition.format")).get 91 | assert(traits.getNamedReference == "is.partition.format.CSV" || 92 | traits.getNamedReference == "is.partition.format.parquet") 93 | // if arguments are defined, determine whether the the files have headers prepended to them 94 | val arguments = traits.asInstanceOf[CdmTraitReference].getArguments().asScala 95 | val headerArg = arguments.find(_.getName() == "columnHeaders") 96 | if (headerArg != None) { 97 | header = headerArg.get.getValue().toString.toBoolean 98 | } 99 | val delimiterArg = arguments.find(_.getName() == "delimiter") 100 | if (delimiterArg != None) { 101 | val strDelimiter = delimiterArg.get.getValue.toString; 102 | if(strDelimiter.length > 1) throw new IllegalArgumentException(String.format(Messages.invalidDelimiterCharacter, strDelimiter)) 103 | delimiter = strDelimiter.charAt(0) 104 | } 105 | val reader = 
getReader(traits.getNamedReference, uriPrefix, decodedFilePath, schema, serializedHadoopOConf, delimiter) 106 | if (reader.isInstanceOf[ParquetReaderConnector] && Constants.PERMISSIVE.equalsIgnoreCase(mode)) { 107 | throw new IllegalArgumentException(String.format(Messages.invalidPermissiveMode)) 108 | } 109 | reader 110 | } else { 111 | SparkCDMLogger.log(Level.DEBUG, "No Named Reference Trait \"is.partition.format\" (CSV/Parquet", logger) 112 | new CSVReaderConnector(uriPrefix, decodedFilePath, serializedHadoopOConf, delimiter, mode) 113 | } 114 | } 115 | 116 | factoryList.add(new CDMInputPartition(storage, container, fileReader, header, readSchema(), dataConverter, mode)) 117 | } 118 | SparkCDMLogger.log(Level.DEBUG, "Count of partitions - "+eDec.getDataPartitions.size() + " Entity - " + eDec.getEntityName + " Manifest -"+ man.getManifestName, logger) 119 | factoryList.asScala.toArray 120 | } 121 | 122 | override def createReaderFactory(): PartitionReaderFactory = { 123 | new CDMPartitionReaderFactory( ) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/CSVReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.net.URLDecoder 4 | import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZoneId} 5 | import java.time.format.{DateTimeFormatter, DateTimeParseException} 6 | import java.time.temporal.ChronoUnit 7 | import com.microsoft.cdm.utils.{Constants, CsvParserFactory, SparkSerializableConfiguration} 8 | import com.microsoft.cdm.log.SparkCDMLogger 9 | import com.univocity.parsers.csv.CsvParser 10 | import org.apache.hadoop.fs.Path 11 | import org.apache.parquet.hadoop.util.HadoopInputFile 12 | import org.apache.spark.sql.types.{BooleanType, ByteType, DataType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, TimestampType} 13 | import org.apache.spark.unsafe.types.UTF8String 14 | import org.slf4j.LoggerFactory 15 | import org.slf4j.event.Level 16 | 17 | class CSVReaderConnector(httpPrefix:String, filePath: String, serConf:SparkSerializableConfiguration, delimiter: Char, mode: String) extends ReaderConnector { 18 | val logger = LoggerFactory.getLogger(classOf[CSVReaderConnector]) 19 | SparkCDMLogger.log(Level.DEBUG, "CSV Reader for partition at path: " + httpPrefix + filePath, logger) 20 | 21 | private var goodRow:Boolean = true 22 | private var parser: CsvParser = _ 23 | private val dateFormatStrings = List( 24 | "yyyy-MM-dd", 25 | "M/d/yyyy" ) 26 | 27 | private val localTimeFormatsNonStandard= List( 28 | "M/d/yyyy H:mm", 29 | "M/d/yyyy h:mm:ss a", 30 | "M/d/yyyy H:mm:ss", 31 | "yyyy-MM-dd H:mm:ss.S", 32 | "yyyy-MM-dd H:mm:ss.SS", 33 | "yyyy-MM-dd H:mm:ss.SSS", 34 | "yyyy-MM-dd H:mm:ss.SSSS", 35 | "yyyy-MM-dd H:mm:ss.SSSSS", 36 | "yyyy-MM-dd H:mm:ss.SSSSSS", 37 | "yyyy-MM-dd H:mm:ss", 38 | "MMM d yyyy h:mma") 39 | 40 | private val timeFormatStrings = List( 41 | "HH:mm:ss", 42 | "HH:mm:ss.S", 43 | "HH:mm:ss.SS", 44 | "HH:mm:ss.SSS", 45 | "HH:mm:ss.SSSS", 46 | "HH:mm:ss.SSSSS", 47 | "HH:mm:ss.SSSSSS") 48 | 49 | def build: Unit = { 50 | try { 51 | val path = new Path(filePath) 52 | val inputFile = HadoopInputFile.fromPath(path, serConf.value) 53 | val inputStream = inputFile.newStream() 54 | parser = CsvParserFactory.build(delimiter) 55 | parser.beginParsing { 56 | inputStream 57 | } 58 | } catch { 59 | case e: Throwable => 
SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 60 | } 61 | } 62 | 63 | def close(): Unit = { 64 | } 65 | 66 | def readRow(): Array[Any] = { 67 | val arr = parser.parseNext() 68 | arr.asInstanceOf[Array[Any]] 69 | } 70 | 71 | /* if the conversion failed, value is null, return false to indicate that whole row should be null */ 72 | def checkResult(ret: Any): (Any, Boolean) = { 73 | if (ret == null) { 74 | (ret, false) 75 | } else { 76 | (ret, true) 77 | } 78 | } 79 | 80 | def isValidRow(): Boolean = goodRow 81 | 82 | def jsonToData(dt: DataType, value: Any, mode: String): Any= { 83 | /* null is a valid value */ 84 | if (value == null) { 85 | null 86 | } else { 87 | 88 | val result = dt match { 89 | case ByteType => util.Try(value.toString.toByte).getOrElse(null) 90 | case ShortType => util.Try(value.toString.toShort).getOrElse(null) 91 | case IntegerType => util.Try(value.toString.toInt).getOrElse(null) 92 | case LongType => util.Try(value.toString.toLong).getOrElse(null) 93 | case DoubleType => util.Try(value.toString.toDouble).getOrElse(null) 94 | case FloatType => util.Try(value.toString.toFloat).getOrElse(null) 95 | case DecimalType() => util.Try(Decimal(value.toString)).getOrElse(null) 96 | case BooleanType => util.Try(value.toString.toBoolean).getOrElse(null) 97 | case DateType => { 98 | if (value != None && value != null) { 99 | val date = tryParseDate(value.toString, mode) 100 | 101 | /* If we can't parse the date we return a null. This enables permissive mode to work*/ 102 | if (date == null) { 103 | null 104 | } else { 105 | date.toEpochDay.toInt 106 | } 107 | } else { 108 | null 109 | } 110 | } 111 | case StringType => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 112 | case TimestampType => { 113 | if (value != None && value != null) { 114 | val date = tryParseDateTime(value.toString, mode) 115 | 116 | /* If we can't parse the date we return a null. This enables permissive mode to work*/ 117 | if (date == null) { 118 | null 119 | } else { 120 | date 121 | } 122 | } else { 123 | null 124 | } 125 | } 126 | case _ => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 127 | } 128 | if (result == null) { 129 | val msg = "Mode: " + mode + ". Could not parse " + value + " as " + dt.simpleString + ", converting to null" 130 | SparkCDMLogger.log(Level.ERROR, msg, logger) 131 | 132 | /* 133 | *If we want pure fail-fast, we should add below exception 134 | */ 135 | /* 136 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 137 | throw new IllegalArgumentException(msg) 138 | }*/ 139 | goodRow = false 140 | } 141 | result 142 | } 143 | } 144 | 145 | def tryParseDate(dateString: String, mode: String): LocalDate= { 146 | for (formatString <- dateFormatStrings) { 147 | try { 148 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 149 | val localDate= LocalDate.parse(dateString, dateTimeFormatter) 150 | return localDate 151 | } catch { 152 | case e: DateTimeParseException=> 153 | } 154 | } 155 | 156 | val msg = "Mode: " + mode + ". 
Could not parse " + dateString + " using any possible format" 157 | SparkCDMLogger.log(Level.ERROR, msg, logger) 158 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 159 | throw new IllegalArgumentException(msg) 160 | } 161 | null 162 | } 163 | 164 | def tryParseDateTime(dateString: String, mode: String): java.lang.Long = { 165 | 166 | val localTimeFormats = List(DateTimeFormatter.ISO_OFFSET_DATE_TIME, 167 | DateTimeFormatter.ISO_INSTANT) 168 | 169 | /* Conversions that to local time first */ 170 | for (format <- localTimeFormats) { 171 | var instant: Instant = null; 172 | try { 173 | val i = Instant.from(format.parse(dateString)) 174 | val zt = i.atZone(ZoneId.systemDefault()) 175 | instant = zt.toLocalDateTime.atZone(ZoneId.systemDefault()).toInstant(); 176 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 177 | } catch { 178 | case e: ArithmeticException => { 179 | return instant.toEpochMilli()*1000 180 | } 181 | case e: DateTimeParseException=> 182 | } 183 | } 184 | 185 | /* Local Time formatting */ 186 | for (format <- List(DateTimeFormatter.ISO_LOCAL_DATE_TIME)) { 187 | var instant: Instant = null 188 | try { 189 | val localDateTime = LocalDateTime.parse(dateString, format) 190 | instant = localDateTime.atZone(ZoneId.systemDefault()).toInstant(); 191 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 192 | } catch { 193 | case e: ArithmeticException => { 194 | return instant.toEpochMilli()*1000 195 | } 196 | case e: DateTimeParseException => 197 | } 198 | } 199 | 200 | /* Non-common formats in local time */ 201 | for (formatString <- localTimeFormatsNonStandard) { 202 | var instant: Instant = null 203 | try { 204 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 205 | val localDateTime = LocalDateTime.parse(dateString, dateTimeFormatter) 206 | /* Assume non-standard times are in UTC */ 207 | instant = localDateTime.atZone(ZoneId.of("UTC")).toInstant(); 208 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 209 | } catch { 210 | case e: ArithmeticException => { 211 | return instant.toEpochMilli()*1000 212 | } 213 | case e: DateTimeParseException => 214 | } 215 | } 216 | 217 | /* Just Dates (no-time element) formats formatting */ 218 | for (formatString <- dateFormatStrings) { 219 | var instant: Instant = null 220 | try { 221 | val dateTimeFormatter = DateTimeFormatter.ofPattern(formatString) 222 | val localDate = LocalDate.parse(dateString, dateTimeFormatter) 223 | val localDateTime1 = localDate.atStartOfDay(); 224 | instant = localDateTime1.atZone(ZoneId.of("UTC")).toInstant(); 225 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 226 | } catch { 227 | case e: ArithmeticException => { 228 | return instant.toEpochMilli()*1000 229 | } 230 | case e: DateTimeParseException => 231 | } 232 | } 233 | 234 | /* Finally, this could just be a Time - Try that */ 235 | for (formatString <- timeFormatStrings) { 236 | var instant: Instant = null 237 | try { 238 | val formatterTime1 = DateTimeFormatter.ofPattern(formatString) 239 | val ls = LocalTime.parse(dateString, formatterTime1) 240 | instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.of("UTC")).toInstant 241 | return ChronoUnit.MICROS.between(Instant.EPOCH, instant) 242 | } catch { 243 | case e: ArithmeticException => { 244 | return instant.toEpochMilli()*1000 245 | } 246 | case e: DateTimeParseException => 247 | } 248 | } 249 | 250 | 251 | val msg = "Mode: " + mode + ". 
Could not parse " + dateString + " using any possible format" 252 | SparkCDMLogger.log(Level.ERROR, msg, logger) 253 | if (Constants.FAILFAST.equalsIgnoreCase(mode)) { 254 | throw new IllegalArgumentException(msg) 255 | } 256 | null 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/ParquetReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectOutputStream} 4 | import java.math.BigInteger 5 | import java.text.SimpleDateFormat 6 | import java.time.temporal.{ChronoField} 7 | import java.time.{Instant, LocalDate, ZoneId} 8 | import java.util.Base64 9 | import java.nio.charset.StandardCharsets.UTF_8 10 | import com.microsoft.cdm.utils.{Constants, SparkSerializableConfiguration} 11 | import com.microsoft.cdm.log.SparkCDMLogger 12 | import org.apache.commons.io.output.ByteArrayOutputStream 13 | import org.apache.hadoop.fs.Path 14 | import org.apache.parquet.column.page.PageReadStore 15 | import org.apache.parquet.example.data.Group 16 | import org.apache.parquet.example.data.simple.{NanoTime, SimpleGroup} 17 | import org.apache.parquet.example.data.simple.convert.GroupRecordConverter 18 | import org.apache.parquet.hadoop.ParquetFileReader 19 | import org.apache.parquet.io.{ColumnIOFactory, MessageColumnIO, RecordReader} 20 | import org.apache.parquet.schema.{MessageType, OriginalType} 21 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.types.{ArrayType, BooleanType, ByteType, DataType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType, TimestampType} 24 | import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} 25 | import org.apache.spark.unsafe.types.UTF8String 26 | import org.slf4j.LoggerFactory 27 | import org.slf4j.event.Level 28 | 29 | 30 | class ParquetReaderConnector(httpPrefix: String, 31 | filePath: String, 32 | sparkSchema: StructType, 33 | serializedHadoopConf:SparkSerializableConfiguration) extends ReaderConnector { 34 | 35 | /* 36 | * Note. If these variables are initialized in the constructor, we get the "objects are not serializable" 37 | * for all of the classes below. Therefore, we need to wait to initialize the class until it is on the worker. 38 | */ 39 | var i = 0 40 | var rows = 0 41 | var pages: PageReadStore = _ 42 | var schema: MessageType = _ 43 | var reader: ParquetFileReader = _ 44 | var recordReader: RecordReader[Group] = _ 45 | var columnIO: MessageColumnIO = _ 46 | var path:Path = _ 47 | var thisSparkSchema: StructType = _ 48 | 49 | val logger = LoggerFactory.getLogger(classOf[ParquetReaderConnector]) 50 | SparkCDMLogger.log(Level.DEBUG, "Parquet Reader for partition at path: " + httpPrefix + filePath, logger) 51 | 52 | def build() { 53 | try { 54 | val path = new Path(filePath) 55 | val readFooter= ParquetFileReader.readFooter(serializedHadoopConf.value, path) 56 | schema = readFooter.getFileMetaData.getSchema 57 | columnIO = new ColumnIOFactory().getColumnIO(schema); 58 | reader = new ParquetFileReader(serializedHadoopConf.value, path, readFooter) 59 | //does this have to be in a loop? 
60 | pages = reader.readNextRowGroup() 61 | if(pages != null) { 62 | rows = pages.getRowCount().toInt; 63 | recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)) 64 | } 65 | this.thisSparkSchema = sparkSchema 66 | } catch { 67 | case e: Throwable => SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 68 | } 69 | } 70 | 71 | def close (): Unit = { 72 | reader.close() 73 | } 74 | 75 | def readRow(): Array[Any] = { 76 | if (i < rows) { 77 | i += 1 78 | getRowAsString(recordReader.read(), thisSparkSchema) 79 | } else { 80 | pages = reader.readNextRowGroup() 81 | if (pages == null) { 82 | //No more 83 | null 84 | } else { 85 | recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)) 86 | assert(i 164 | arr(field) = Decimal.apply(g.getInteger(field, index), parquetDecimal.getPrecision, parquetDecimal.getScale).toString() 165 | case PrimitiveTypeName.INT64 => 166 | arr(field) = Decimal.apply(g.getLong(field, index), parquetDecimal.getPrecision, parquetDecimal.getScale).toString() 167 | case PrimitiveTypeName.BINARY => 168 | arr(field) = new java.math.BigDecimal(new BigInteger(g.getBinary(field, index).getBytes), parquetDecimal.getScale).toString 169 | case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => 170 | arr(field) = new java.math.BigDecimal(new BigInteger(g.getBinary(field, index).getBytes), parquetDecimal.getScale).toString 171 | } 172 | } else if (fieldType.getOriginalType == OriginalType.TIME_MICROS 173 | && ptype == PrimitiveTypeName.INT64) { 174 | arr(field) = g.getLong(field, index) 175 | } else { 176 | arr(field) = g.getValueToString(field, index) 177 | } 178 | } 179 | }else{ 180 | val listgroup = g.asInstanceOf[SimpleGroup] 181 | if (fieldType.getOriginalType == OriginalType.LIST) { 182 | val elementGroup = listgroup.getGroup(field, index) 183 | /* get how many structs are present in the array */ 184 | val repeatedListSize = elementGroup.getFieldRepetitionCount("list") 185 | val serializedCombinedStruct = new StringBuilder 186 | for(i <- 0 until repeatedListSize) { 187 | /* check if its an empty array*/ 188 | if (elementGroup.getGroup(index, i).toString != "") { 189 | val rowAsVal = getRowAsString(elementGroup.getGroup(index, i).getGroup(0, 0), struct.fields(field).dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]) 190 | serializedCombinedStruct.append(serializeObject(rowAsVal)) 191 | serializedCombinedStruct.append(" ") 192 | } 193 | } 194 | if ( serializedCombinedStruct.length > 0) serializedCombinedStruct.setLength(serializedCombinedStruct.length -1) 195 | arr(field) = serializedCombinedStruct.toString() 196 | } else { 197 | val rowAsVal = getRowAsString(listgroup.getGroup(field, index), struct.fields(field).dataType.asInstanceOf[StructType]); 198 | /* Serializing is necessary because we want to encode the rowAsVal object as a single string. 
199 | rowAsVal is an array of string */ 200 | arr(field) = serializeObject(rowAsVal) 201 | } 202 | } 203 | } 204 | } 205 | arr 206 | } 207 | 208 | def isValidRow(): Boolean = true 209 | 210 | def jsonToData(dt: DataType, value: Any, mode: String): Any = { 211 | return dt match { 212 | case ar: ArrayType => { 213 | util.Try({ 214 | val structs = value.toString.split(" ") 215 | val seq = structs.zipWithIndex.map{ case (col, index) => 216 | val dataType = ar.elementType 217 | jsonToData(dataType, col, mode) 218 | } 219 | ArrayData.toArrayData(seq) 220 | }).getOrElse(null) 221 | } 222 | case BooleanType => util.Try(value.toString.toBoolean).getOrElse(null) 223 | case ByteType => util.Try(value.toString.toByte).getOrElse(null) 224 | case ShortType => util.Try(value.toString.toShort).getOrElse(null) 225 | case DateType => util.Try(getNumberOfDaysFromEpoch(value.toString)).getOrElse(null) 226 | case DecimalType() => util.Try(Decimal(value.toString)).getOrElse(null) 227 | case DoubleType => util.Try(value.toString.toDouble).getOrElse(null) 228 | case FloatType => util.Try(value.toString.toFloat).getOrElse(null) 229 | case IntegerType => util.Try(value.toString.toInt).getOrElse(null) 230 | case LongType => util.Try(value.toString.toLong).getOrElse(null) 231 | case StringType => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 232 | case TimestampType => { 233 | if (value != None && value != null) { 234 | return value.asInstanceOf[Long] 235 | } else { 236 | null 237 | } 238 | } 239 | case st: StructType => { 240 | util.Try({ 241 | /* we decode the binary string to an array of string containing nested values */ 242 | val arr = deSerializeObject(value.toString.getBytes()); 243 | val seq = arr.zipWithIndex.map { case (col, index) => 244 | val dataType = st.fields(index).dataType 245 | jsonToData(dataType, col, mode) 246 | } 247 | val isAllNull = arr.forall(x => x == null) 248 | if (isAllNull) null else InternalRow.fromSeq(seq) 249 | }).getOrElse(null) 250 | } 251 | case _ => util.Try(UTF8String.fromString(value.toString)).getOrElse(null) 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/read/ReaderConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.read 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.TimeZone 5 | import com.microsoft.cdm.utils.Constants 6 | import com.microsoft.cdm.utils.TimestampFormatter 7 | import org.apache.spark.sql.types.DataType 8 | 9 | @SerialVersionUID(100L) 10 | trait ReaderConnector extends Serializable { 11 | 12 | val dateFormatter = new SimpleDateFormat(Constants.SINGLE_DATE_FORMAT) 13 | val timestampFormatter = TimestampFormatter(Constants.TIMESTAMP_FORMAT, TimeZone.getDefault) 14 | 15 | /** 16 | * build() is used as a constructor, to initialize local variables 17 | */ 18 | def build 19 | 20 | /** 21 | * Close any open streams if they exist 22 | */ 23 | def close 24 | 25 | /** 26 | * This method is to used to convert to Spark/CDM data types 27 | * @param dataType 28 | * @param col 29 | * @return 30 | */ 31 | def jsonToData(dataType: DataType, col: Any, mode: String): Any 32 | 33 | 34 | def isValidRow(): Boolean 35 | 36 | /** 37 | * Read a Row as a string. 
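   * Implementations return null once the partition is exhausted; CDMDataReader.next() relies on that as the end-of-data signal.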
38 | * XXX: This is not sufficient for complex types 39 | */ 40 | def readRow: Array[Any] 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMAuthentication.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | object CdmAuthType extends Enumeration { 4 | val AppReg, Sas, Token = Value 5 | } 6 | 7 | trait Auth { 8 | def getAppId: String 9 | def getAppKey: String 10 | def getTenantId: String 11 | def getSASToken: String 12 | def getAuthType: String 13 | } 14 | 15 | case class SasAuth(sasToken: String) extends Auth { 16 | override def getAuthType: String = CdmAuthType.Sas.toString() 17 | override def getSASToken: String = sasToken 18 | override def getAppId: String = "" 19 | override def getAppKey: String = "" 20 | override def getTenantId: String = "" 21 | } 22 | 23 | case class AppRegAuth(appId: String, appKey: String, tenantId: String) extends Auth { 24 | override def getAuthType: String = CdmAuthType.AppReg.toString() 25 | override def getAppId: String = appId 26 | override def getAppKey: String = appKey 27 | override def getTenantId: String = tenantId 28 | override def getSASToken: String = "" 29 | } 30 | 31 | case class TokenAuth() extends Auth { 32 | override def getAuthType: String = CdmAuthType.Token.toString() 33 | override def getAppId: String = "" 34 | override def getAppKey: String = "" 35 | override def getTenantId: String = "" 36 | override def getSASToken: String = "" 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMModelReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | 4 | class CDMModelReader(storage: String, 5 | container: String, 6 | manifestPath: String, 7 | manifestFileName: String, 8 | entityName: String, 9 | entDefContAndPath: String, 10 | auth: Auth, 11 | tokenProvider: Option[CDMTokenProvider], 12 | cdmSource: CDMSource.Value, 13 | entityDefinitionStorage: String, 14 | maxCDMThreads: Int) extends CDMModelCommon (storage, container, manifestPath, manifestFileName, 15 | entityName, "", entDefContAndPath, auth, 16 | tokenProvider, "/", cdmSource, entityDefinitionStorage, 17 | maxCDMThreads){ 18 | 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.cdm.log.SparkCDMLogger 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 7 | import org.slf4j.LoggerFactory 8 | import org.slf4j.event.Level 9 | 10 | class CDMOptions(options: CaseInsensitiveStringMap) { 11 | 12 | val logger = LoggerFactory.getLogger(classOf[CDMOptions]) 13 | 14 | var appId: String = "" 15 | var appKey: String = "" 16 | var tenantId : String = "" 17 | var sasToken: String = "" 18 | var auth: Auth = null 19 | 20 | val storage = getRequiredArgument(options, "storage") 21 | val entity= getRequiredArgument(options,"entity") 22 | val newManifestPath= getRequiredArgument(options,"manifestPath") 23 | 24 | val manipathPathInput = getContainerManifestPathAndFile(newManifestPath) 25 | var manifestPath= manipathPathInput.manifestPath 26 | 
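  /* getContainerManifestPathAndFile (defined below) splits the manifestPath option into its three parts,
   * e.g. "/mycontainer/sales/default.manifest.cdm.json" (placeholder names) yields
   * container = "/mycontainer", manifestPath = "/sales/", manifestFileName = "default.manifest.cdm.json". */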
val manifestFileName = manipathPathInput.manifestFileName 27 | val container = manipathPathInput.container 28 | 29 | val maxCDMThreadsString = if (options.containsKey("maxCDMThreads")) options.get("maxCDMThreads") else "100" 30 | if (!isNumeric(maxCDMThreadsString)) throw new Exception(String.format("%s - %s", Messages.invalidThreadCount, maxCDMThreadsString)) 31 | val maxCDMThreads = maxCDMThreadsString.toInt 32 | if (maxCDMThreads < 1 ) throw new Exception(String.format("%s - %s", Messages.invalidThreadCount, maxCDMThreadsString)) 33 | 34 | val cdmSource = 35 | if (options.containsKey("cdmSource")) { 36 | val cdmSourceValue = options.get("cdmSource") 37 | CDMSource.getValue(cdmSourceValue) 38 | } else 39 | CDMSource.REFERENCED 40 | 41 | 42 | 43 | var conf : Configuration = SparkSession.builder.getOrCreate.sessionState.newHadoopConf() 44 | Environment.sparkPlatform = SparkPlatform.getPlatform(conf) 45 | if (getAuthType(options) == CdmAuthType.AppReg.toString()) { 46 | appId = getRequiredArgument(options,"appId") 47 | appKey = getRequiredArgument(options,"appKey") 48 | tenantId = getRequiredArgument(options,"tenantId") 49 | auth = AppRegAuth(appId, appKey, tenantId) 50 | } else if (getAuthType(options) == CdmAuthType.Sas.toString()) { 51 | sasToken = getRequiredArgument(options,"sasToken") 52 | auth = SasAuth(sasToken) 53 | } else if (getAuthType(options) == CdmAuthType.Token.toString()) { 54 | auth = TokenAuth() 55 | } else { 56 | if (Environment.sparkPlatform == SparkPlatform.Other){ 57 | throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly) 58 | } 59 | } 60 | 61 | def isNumeric(input: String): Boolean = input.forall(_.isDigit) 62 | 63 | private def getRequiredArgument(options: CaseInsensitiveStringMap, arg: String): String = { 64 | val result = if (options.containsKey(arg)) options.get(arg) else { 65 | throw new Exception(s"'$arg' is a required argument!") 66 | } 67 | result 68 | } 69 | 70 | def getAuthType(options: CaseInsensitiveStringMap): String = { 71 | val appIdPresent = options.containsKey("appId") 72 | val appKeyPresent = options.containsKey("appKey") 73 | val tenantIdPresent = options.containsKey("tenantId") 74 | val sasTokenPresent = options.containsKey("sasToken") 75 | val result = if (appIdPresent || appKeyPresent|| tenantIdPresent) { 76 | //make sure all creds are present 77 | if (!appIdPresent || !appKeyPresent || !tenantIdPresent) { 78 | throw new Exception("All creds must exist") 79 | } 80 | SparkCDMLogger.log(Level.INFO,"Using app registration for authentication", logger) 81 | CdmAuthType.AppReg.toString() 82 | } else if (sasTokenPresent) { 83 | SparkCDMLogger.log(Level.INFO,"Using SAS token for authentication", logger) 84 | CdmAuthType.Sas.toString() 85 | } else { 86 | SparkCDMLogger.log(Level.INFO, "Using managed identities for authentication", logger) 87 | CdmAuthType.Token.toString() 88 | } 89 | result 90 | } 91 | 92 | def checkValidFileName(manifestFileName: String) = { 93 | if(manifestFileName != Constants.MODEL_JSON && !manifestFileName.contains(".manifest.cdm.json")) { 94 | throw new Exception(String.format("Invalid manifest filename provided - %s", manifestFileName)) 95 | } 96 | } 97 | 98 | def getContainerManifestPathAndFile(manifestContainerPath: String) = { 99 | 100 | var manifestContainerPathTemp = manifestContainerPath 101 | if(manifestContainerPath.startsWith("/") && manifestContainerPath.length > 1) { 102 | manifestContainerPathTemp = manifestContainerPath.substring(1) 103 | } 104 | val manifestFileNameStartIndex = 
manifestContainerPathTemp.lastIndexOf("/") + 1 105 | val manifestFileName = manifestContainerPathTemp.substring(manifestFileNameStartIndex) 106 | 107 | checkValidFileName(manifestFileName) 108 | 109 | val containerEndIndex = manifestContainerPathTemp.indexOf("/") 110 | if(containerEndIndex == -1) { 111 | throw new Exception("Container is not specified in the manifestPath") 112 | } 113 | var container = manifestContainerPathTemp.substring(0, containerEndIndex) 114 | container = if(container.startsWith("/")) container else "/" + container 115 | 116 | var manifestPath = manifestContainerPathTemp.substring(containerEndIndex, manifestFileNameStartIndex) 117 | manifestPath = if(manifestPath.startsWith("/")) manifestPath else "/" + manifestPath 118 | 119 | ManifestPath(container, manifestPath, manifestFileName) 120 | } 121 | 122 | val fileFormatType = if(options.containsKey("format")) options.get("format") else "csv" 123 | val overrideConfigPathIn = if (options.containsKey("configPath")) options.get("configPath") else "" 124 | val overrideConfigPath = if (overrideConfigPathIn.startsWith("/")) overrideConfigPathIn else "/" + overrideConfigPathIn 125 | 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMParquetSchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.parquet.schema._ 22 | import org.apache.parquet.schema.OriginalType._ 23 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ 24 | import org.apache.parquet.schema.Type.Repetition._ 25 | import org.apache.parquet.schema.Types.MessageTypeBuilder 26 | import org.apache.spark.sql.internal.SQLConf 27 | import org.apache.spark.sql.types._ 28 | 29 | 30 | /** 31 | * This converter class is used to convert Spark SQL [[StructType]] to Parquet [[MessageType]]. 32 | * 33 | * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 34 | * and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. 35 | * When set to false, use standard format defined in parquet-format spec. This argument only 36 | * affects Parquet write path. 37 | * @param outputTimestampType which parquet timestamp type to use when writing. 
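 * The cdmSchema argument passed to convert() carries the CDM type name of each attribute (for
 * example "Time"), which can override the default Spark-to-Parquet mapping for TimestampType fields.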
38 | */ 39 | class CDMSparkToParquetSchemaConverter( 40 | writeLegacyParquetFormat: Boolean = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get, 41 | outputTimestampType: SQLConf.ParquetOutputTimestampType.Value = 42 | SQLConf.ParquetOutputTimestampType.INT96) { 43 | 44 | def this(conf: SQLConf) = this( 45 | writeLegacyParquetFormat = conf.writeLegacyParquetFormat, 46 | outputTimestampType = conf.parquetOutputTimestampType) 47 | 48 | def this(conf: Configuration) = this( 49 | writeLegacyParquetFormat = conf.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean, 50 | outputTimestampType = SQLConf.ParquetOutputTimestampType.withName( 51 | conf.get(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key))) 52 | 53 | def convert(catalystSchema: StructType, cdmSchema: Iterable[Any]): MessageType = { 54 | val bm = Types .buildMessage() 55 | convert(catalystSchema, cdmSchema, bm) 56 | bm.named(SPARK_PARQUET_SCHEMA_NAME) 57 | } 58 | 59 | def convert(schema: StructType, cdmSchema: Iterable[Any], bm : MessageTypeBuilder):Unit = { 60 | val arr = cdmSchema.toArray 61 | schema.fields.zipWithIndex.foreach { case (field, i) => 62 | bm.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED, arr(i))) 63 | } 64 | } 65 | 66 | def convertField(field: StructField, repetition: Type.Repetition, cdmType: Any): Type = { 67 | checkFieldName(field.name) 68 | 69 | field.dataType match { 70 | // =================== 71 | // Simple atomic types 72 | // =================== 73 | 74 | case BooleanType => 75 | Types.primitive(BOOLEAN, repetition).named(field.name) 76 | 77 | case ByteType => 78 | Types.primitive(INT32, repetition).as(INT_8).named(field.name) 79 | 80 | case ShortType => 81 | Types.primitive(INT32, repetition).as(INT_16).named(field.name) 82 | 83 | case IntegerType => 84 | Types.primitive(INT32, repetition).named(field.name) 85 | 86 | case LongType => 87 | Types.primitive(INT64, repetition).named(field.name) 88 | 89 | case FloatType => 90 | Types.primitive(FLOAT, repetition).named(field.name) 91 | 92 | case DoubleType => 93 | Types.primitive(DOUBLE, repetition).named(field.name) 94 | 95 | case StringType => 96 | Types.primitive(BINARY, repetition).as(UTF8).named(field.name) 97 | 98 | case DateType => 99 | Types.primitive(INT32, repetition).as(DATE).named(field.name) 100 | 101 | // NOTE: Spark SQL can write timestamp values to Parquet using INT96, TIMESTAMP_MICROS or 102 | // TIMESTAMP_MILLIS. TIMESTAMP_MICROS is recommended but INT96 is the default to keep the 103 | // behavior same as before. 104 | // 105 | // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond 106 | // timestamp in Impala for some historical reasons. It's not recommended to be used for any 107 | // other types and will probably be deprecated in some future version of parquet-format spec. 108 | // That's the reason why parquet-format spec only defines `TIMESTAMP_MILLIS` and 109 | // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`. 110 | // 111 | // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting 112 | // from Spark 1.5.0, we resort to a timestamp type with microsecond precision so that we can 113 | // store a timestamp into a `Long`. This design decision is subject to change though, for 114 | // example, we may resort to nanosecond precision in the future. 
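      // For example, with TIMESTAMP_MICROS an instant such as 2020-01-01T00:00:00Z is written as
      // the INT64 value 1577836800000000 (microseconds since the Unix epoch).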
115 | case TimestampType => { 116 | /* Spark-CDM: 117 | * * If there is a metadata field overwrite to type Time (implicit write) set the parquet field to type Time 118 | * * IF there is an explicit cdmType type of Type "Time", also set the field to type Time */ 119 | if (field.metadata.contains(Constants.MD_DATATYPE_OVERRIDE) && 120 | field.metadata.getString(Constants.MD_DATATYPE_OVERRIDE).equals(Constants.MD_DATATYPE_OVERRIDE_TIME) 121 | || cdmType.equals("Time")) { 122 | Types.primitive(INT64, repetition).as(OriginalType.TIME_MICROS).named(field.name) 123 | } else { 124 | outputTimestampType match { 125 | case SQLConf.ParquetOutputTimestampType.INT96 => 126 | Types.primitive(INT96, repetition).named(field.name) 127 | case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS => 128 | Types.primitive(INT64, repetition).as(TIMESTAMP_MICROS).named(field.name) 129 | case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MILLIS => 130 | Types.primitive(INT64, repetition).as(TIMESTAMP_MILLIS).named(field.name) 131 | } 132 | } 133 | } 134 | 135 | case BinaryType => 136 | Types.primitive(BINARY, repetition).named(field.name) 137 | 138 | case DecimalType() => { 139 | val decimal = field.dataType.asInstanceOf[DecimalType] 140 | val precision = decimal.precision 141 | val scale = decimal.scale 142 | if (writeLegacyParquetFormat) { 143 | // ====================== 144 | // Decimals (legacy mode) 145 | // ====================== 146 | // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and 147 | // always store decimals in fixed-length byte arrays. To keep compatibility with these older 148 | // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated 149 | // by `DECIMAL`. 150 | Types 151 | .primitive(FIXED_LEN_BYTE_ARRAY, repetition) 152 | .as(DECIMAL) 153 | .precision(precision) 154 | .scale(scale) 155 | .length(Decimal.minBytesForPrecision(precision)) 156 | .named(field.name) 157 | 158 | // ======================== 159 | // Decimals (standard mode) 160 | // ======================== 161 | } else if (precision <= Decimal.MAX_INT_DIGITS) { 162 | Types 163 | .primitive(INT32, repetition) 164 | .as(DECIMAL) 165 | .precision(precision) 166 | .scale(scale) 167 | .named(field.name) 168 | } else if (precision <= Decimal.MAX_LONG_DIGITS) { 169 | Types 170 | .primitive(INT64, repetition) 171 | .as(DECIMAL) 172 | .precision(precision) 173 | .scale(scale) 174 | .named(field.name) 175 | } else { 176 | Types 177 | .primitive(FIXED_LEN_BYTE_ARRAY, repetition) 178 | .as(DECIMAL) 179 | .precision(precision) 180 | .scale(scale) 181 | .length(Decimal.minBytesForPrecision(precision)) 182 | .named(field.name) 183 | } 184 | } 185 | 186 | // =================================== 187 | // ArrayType and MapType (legacy mode) 188 | // =================================== 189 | 190 | // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level 191 | // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro 192 | // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element 193 | // field name "array" is borrowed from parquet-avro. 
194 | case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => 195 | // group (LIST) { 196 | // optional group bag { 197 | // repeated array; 198 | // } 199 | // } 200 | 201 | // This should not use `listOfElements` here because this new method checks if the 202 | // element name is `element` in the `GroupType` and throws an exception if not. 203 | // As mentioned above, Spark prior to 1.4.x writes `ArrayType` as `LIST` but with 204 | // `array` as its element name as below. Therefore, we build manually 205 | // the correct group type here via the builder. (See SPARK-16777) 206 | Types 207 | .buildGroup(repetition).as(LIST) 208 | .addField(Types 209 | .buildGroup(REPEATED) 210 | // "array" is the name chosen by parquet-hive (1.7.0 and prior version) 211 | .addField(convertField(StructField("array", elementType, nullable),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 212 | .named("bag")) 213 | .named(field.name) 214 | 215 | // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level 216 | // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is 217 | // covered by the backwards-compatibility rules implemented in `isElementType()`. 218 | case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => 219 | // group (LIST) { 220 | // repeated element; 221 | // } 222 | 223 | // Here too, we should not use `listOfElements`. (See SPARK-16777) 224 | Types 225 | .buildGroup(repetition).as(LIST) 226 | // "array" is the name chosen by parquet-avro (1.7.0 and prior version) 227 | .addField(convertField(StructField("array", elementType, nullable), REPEATED, cdmType.asInstanceOf[Iterable[Any]])) 228 | .named(field.name) 229 | 230 | // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by 231 | // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
232 | case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => 233 | // group (MAP) { 234 | // repeated group map (MAP_KEY_VALUE) { 235 | // required key; 236 | // value; 237 | // } 238 | // } 239 | ConversionPatterns.mapType( 240 | repetition, 241 | field.name, 242 | convertField(StructField("key", keyType, nullable = false),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]]), 243 | convertField(StructField("value", valueType, valueContainsNull),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 244 | 245 | // ===================================== 246 | // ArrayType and MapType (standard mode) 247 | // ===================================== 248 | 249 | case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => 250 | // group (LIST) { 251 | // repeated group list { 252 | // element; 253 | // } 254 | // } 255 | Types 256 | .buildGroup(repetition).as(LIST) 257 | .addField( 258 | Types.repeatedGroup() 259 | .addField(convertField(StructField("element", elementType, containsNull), if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 260 | .named("list")) 261 | .named(field.name) 262 | 263 | case MapType(keyType, valueType, valueContainsNull) => 264 | // group (MAP) { 265 | // repeated group key_value { 266 | // required key; 267 | // value; 268 | // } 269 | // } 270 | Types 271 | .buildGroup(repetition).as(MAP) 272 | .addField( 273 | Types 274 | .repeatedGroup() 275 | .addField(convertField(StructField("key", keyType, nullable = false),if (field.nullable) OPTIONAL else REQUIRED, cdmType.asInstanceOf[Iterable[Any]])) 276 | .addField(convertField(StructField("value", valueType, valueContainsNull),if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 277 | .named("key_value")) 278 | .named(field.name) 279 | 280 | // =========== 281 | // Other types 282 | // =========== 283 | 284 | case StructType(fields) => { 285 | val bg = Types.buildGroup(repetition) 286 | fields.zipWithIndex.foreach{ 287 | case (field, fieldIndex) => { 288 | val cdmStruct = cdmType.asInstanceOf[List[Any]] 289 | bg.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED, 290 | cdmStruct(fieldIndex))) 291 | } 292 | } 293 | bg.named(field.name) 294 | } 295 | /* fields.foldLeft(Types.buildGroup(repetition)) {(builder, field) => { 296 | builder.addField(convertField(field, if (field.nullable) OPTIONAL else REQUIRED,cdmType.asInstanceOf[Iterable[Any]])) 297 | } 298 | }.named(field.name) 299 | 300 | */ 301 | 302 | //case udt: UserDefinedType[_] => 303 | // convertField(field.copy(dataType = udt.sqlType)) 304 | 305 | case _ => 306 | throw new Exception(s"Unsupported data type ${field.dataType.catalogString}") 307 | } 308 | } 309 | private val SPARK_PARQUET_SCHEMA_NAME = "spark_schema" 310 | 311 | private val EMPTY_MESSAGE: MessageType = 312 | Types.buildMessage().named(SPARK_PARQUET_SCHEMA_NAME) 313 | 314 | private def checkFieldName(name: String): Unit = { 315 | // ,;{}()\n\t= and space are special characters in Parquet schema 316 | checkConversionRequirement( 317 | !name.matches(".*[ ,;{}()\n\t=].*"), 318 | s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=". 319 | |Please use alias to rename it. 
320 | """.stripMargin.split("\n").mkString(" ").trim) 321 | } 322 | 323 | private def checkFieldNames(names: Seq[String]): Unit = { 324 | names.foreach(checkFieldName) 325 | } 326 | 327 | private def checkConversionRequirement(f: => Boolean, message: String): Unit = { 328 | if (!f) { 329 | throw new Exception(message) 330 | } 331 | } 332 | } 333 | 334 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMSASTokenProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | import com.microsoft.cdm.utils.Constants.SASTOKEN_CONF_SETTING 3 | import org.apache.hadoop.conf.Configuration 4 | 5 | class CDMSASTokenProvider extends org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider { 6 | var sasToken = "" 7 | override def getSASToken(account: String, fileSystem: String, path: String, operation: String): String = { 8 | sasToken 9 | } 10 | 11 | override def initialize(configuration: Configuration, accountName: String): Unit = { 12 | sasToken = configuration.get(SASTOKEN_CONF_SETTING) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CDMTokenProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.azure.synapse.tokenlibrary.TokenLibrary 4 | import com.microsoft.commondatamodel.objectmodel.utilities.network.TokenProvider 5 | import com.databricks.backend.daemon.data.client.adl.AdlGen2CredentialContextTokenProvider 6 | 7 | class CDMTokenProvider(serConf: SparkSerializableConfiguration, accountName: String) extends TokenProvider { 8 | val platform = SparkPlatform.getPlatform(serConf.value) 9 | var startTime = System.currentTimeMillis() 10 | 11 | 12 | var curToken:String = 13 | if (platform == SparkPlatform.DataBricks) { 14 | val adpProvider = new AdlGen2CredentialContextTokenProvider() 15 | val dbToken = adpProvider.getToken().getAccessToken() 16 | dbToken 17 | } else if (platform == SparkPlatform.Synapse) { 18 | getSynapseToken 19 | } else { 20 | throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly) 21 | } 22 | 23 | def isTokenValid(): Boolean = { 24 | var validToken = true 25 | if (platform == SparkPlatform.DataBricks) { 26 | val endTime = System.currentTimeMillis(); 27 | if ((endTime - startTime) > Constants.MILLIS_PER_HOUR) { 28 | validToken = false 29 | } 30 | } 31 | validToken 32 | } 33 | 34 | private def getSynapseToken: String = { 35 | val resource = s"""{"audience": "storage", "name": "$accountName"}""" 36 | val token = TokenLibrary.getAccessToken(resource) 37 | token.token 38 | } 39 | 40 | private def getCachedSynapseToken: String = { 41 | if (!TokenLibrary.isValid(curToken)) { 42 | curToken = getSynapseToken 43 | } 44 | "Bearer " + curToken 45 | } 46 | 47 | /* 48 | * Databricks token cannot be called from an asynchronous context without an ExecutorService, which is 49 | * not what the CDM-SDK does. Their getToken method (below) is called asynchronously without an 50 | * executorService. However, since the Databricks token cannot be refreshed inside of a job, we cannot 51 | * get a new token even if a job lasts longer than one hour. Therefore, we can simply grab the token 52 | * during initialization (synchronous context call) and always return the same cached token. 
We
53 | * need to error out if we try to use the same token on a request that spans over one hour. This is what
54 | * the isTokenValid() method is responsible for.
55 | *
56 | * If the Databricks team implements a refreshToken mechanism, we need the CDM-SDK to implement an
57 | * executorService for their asynchronous calls so that we can call Databricks' refreshToken()
58 | * method at runtime.
59 | */
60 | private def getDataBricksToken: String = {
61 | "Bearer " + curToken
62 | }
63 | 
64 | @Override
65 | override def getToken: String = {
66 | platform match {
67 | case SparkPlatform.DataBricks => getDataBricksToken
68 | case SparkPlatform.Synapse => getCachedSynapseToken
69 | case SparkPlatform.Other => throw new Exception(Messages.managedIdentitiesSynapseDataBricksOnly)
70 | }
71 | }
72 | }
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/CDMUtils.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.cdm.log.SparkCDMLogger
4 | import java.util.Calendar
5 | 
6 | import com.microsoft.commondatamodel.objectmodel.cdm.{CdmCorpusDefinition, CdmEntityDeclarationDefinition, CdmEntityDefinition, CdmManifestDefinition}
7 | import com.microsoft.commondatamodel.objectmodel.enums.CdmStatusLevel
8 | import com.microsoft.commondatamodel.objectmodel.utilities.EventCallback
9 | import org.apache.hadoop.conf.Configuration
10 | import org.apache.spark.sql.types.StructType
11 | import org.slf4j.LoggerFactory
12 | import org.slf4j.event.Level
13 | 
14 | import util.control.Breaks._
15 | import scala.collection.mutable.ArrayBuffer
16 | 
17 | case class CDMEntity(rootManifest: CdmManifestDefinition, parentManifest: CdmManifestDefinition, entityDec:CdmEntityDeclarationDefinition, var schema: StructType)
18 | case class SchemaDiffOutput(isSame: Boolean, diffIndex: Int, path: ArrayBuffer[String])
19 | case class CDMDecimalType(precision: Int, scale: Int) {
20 | override def toString() : String = {
21 | this.getClass.getName + precision + scale;
22 | }
23 | }
24 | case class FileFormatSettings(fileFormat: String, delimiter: Char, showHeader: Boolean)
25 | case class ManifestPath(container: String , manifestPath: String, manifestFileName: String)
26 | 
27 | /**
28 | * Enum containing possible data types for CDM data.
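 * For example (illustrative only, based on DataConverter later in this repo):
 *   new DataConverter().toCdmDataType(IntegerType)   // CDMDataType.integer
 *   new DataConverter().toCdmDataType(LongType)      // CDMDataType.bigInteger
 *   new DataConverter().toCdmDataType(TimestampType) // CDMDataType.dateTime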
29 | */ 30 | 31 | object CDMDataType extends Enumeration { 32 | val byte, bigInteger, smallInteger, integer, date, dateTime, guid, float, string, double, decimal, boolean, time, entity= Value 33 | } 34 | 35 | object CDMDataFormat extends Enumeration { 36 | val Byte, Int64, Int32, Int16, Date, DateTime, Guid, Float, String, Double, Decimal, Boolean, DateTimeOffset, Time, entity = Value 37 | } 38 | 39 | 40 | /** 41 | * Platform the connector is running on 42 | */ 43 | object SparkPlatform extends Enumeration { 44 | val DataBricks, Synapse, Local, Other = Value 45 | 46 | def getPlatform(conf: Configuration): Value = { 47 | val host = conf.get("spark.driver.host") 48 | // Use these conf settings to determine the platform 49 | if (conf.get("spark.databricks.preemption.enabled") != null) { 50 | SparkPlatform.DataBricks 51 | } else if (conf.get("spark.synapse.session.token") != null){ 52 | SparkPlatform.Synapse 53 | } else if (host.equals("localhost")){ 54 | SparkPlatform.Local 55 | } else { 56 | SparkPlatform.Other 57 | } 58 | } 59 | } 60 | 61 | // callback implementation to fetch Logs from CDM SDK 62 | object CDMCallback extends EventCallback { 63 | val fromCDMSDK = "CDM-SDK Library" 64 | val logger = LoggerFactory.getLogger(fromCDMSDK) 65 | 66 | override def apply(cdmStatusLevel: CdmStatusLevel, message: String): Unit = { 67 | // Dev debug 68 | // println(s"[${cdmStatusLevel}] ${message}") 69 | 70 | if(cdmStatusLevel == CdmStatusLevel.Error) { 71 | SparkCDMLogger.log(Level.ERROR, message, logger) 72 | SparkCDMLogger.logEventToKusto(fromCDMSDK, "", Level.ERROR, message) 73 | if (message.contains("saveDocumentAsAsync")) { 74 | throw new Exception(message) 75 | } 76 | if (message.contains("Adapter not found for the namespace") && Constants.MODE.equals("write")) { 77 | throw new Exception(String.format(Messages.overrideConfigJson, message)) 78 | } 79 | if (Constants.MODE.equals("write")) { 80 | throw new Exception(message) 81 | } 82 | } 83 | } 84 | } 85 | 86 | // Singleton to retrieve the current date as a folder for partitions to be written to 87 | object CDMDataFolder { 88 | def getDataFolderWithDate ():String = { 89 | val cal = Calendar.getInstance() 90 | val year= cal.get(Calendar.YEAR) 91 | val month = "%02d".format((cal.get(Calendar.MONTH ) + 1)) 92 | val day = "%02d".format(cal.get(Calendar.DATE)) 93 | Constants.PARTITION_DIR_PATTERN.format(year, month, day) 94 | } 95 | } 96 | 97 | object CDMSource extends Enumeration { 98 | val REFERENCED, BUILTIN = Value 99 | 100 | def getValue(input: String) : CDMSource.Value ={ 101 | var result : CDMSource.Value = null 102 | val cdmSource = CDMSource.values; 103 | breakable { 104 | for (cdm <- cdmSource) { 105 | if (cdm.toString.equalsIgnoreCase(input)) { 106 | result = cdm 107 | break 108 | } 109 | } 110 | } 111 | if(result == null) throw new IllegalArgumentException(String.format(Messages.invalidCDMSourceName, input)) else result 112 | } 113 | } 114 | 115 | 116 | 117 | case class EntityNotFoundException(private val message: String = "") extends Exception(message) 118 | case class ManifestNotFoundException(private val message: String = "") extends Exception(message) 119 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/CdmAdapterProvider.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import com.microsoft.commondatamodel.objectmodel.storage.AdlsAdapter 4 | trait CdmAdapterProvider { 5 | def 
getAdlsAdapter : AdlsAdapter 6 | } 7 | 8 | object CdmAdapterProvider { 9 | 10 | private class CdmTokenAuthAdapter(val storage: String, val root: String, val tokenProvider: CDMTokenProvider) extends CdmAdapterProvider { 11 | override def getAdlsAdapter : AdlsAdapter = { 12 | new AdlsAdapter(storage, root, tokenProvider) 13 | } 14 | } 15 | 16 | private class CdmAppRegAdapter(val storage: String, val root: String, val auth: Auth) extends CdmAdapterProvider { 17 | override def getAdlsAdapter: AdlsAdapter = { 18 | new AdlsAdapter(storage, root, auth.getTenantId, auth.getAppId, auth.getAppKey) 19 | } 20 | } 21 | 22 | private class CdmSASAuthAdapter(val storage: String, val root: String, val auth: Auth) extends CdmAdapterProvider { 23 | override def getAdlsAdapter: AdlsAdapter = { 24 | val adapter = new AdlsAdapter(storage, root) 25 | adapter.setSasToken(auth.getSASToken) 26 | adapter 27 | } 28 | } 29 | 30 | def apply(storage: String, rootPath: String, auth: Auth, token: Option[CDMTokenProvider]): AdlsAdapter = { 31 | if(auth.getAuthType == CdmAuthType.AppReg.toString()) { 32 | new CdmAppRegAdapter(storage, rootPath, auth).getAdlsAdapter 33 | } else if(auth.getAuthType == CdmAuthType.Sas.toString()){ 34 | new CdmSASAuthAdapter(storage, rootPath, auth).getAdlsAdapter 35 | } else { 36 | new CdmTokenAuthAdapter(storage, rootPath, token.get).getAdlsAdapter 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/Constants.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import java.math.MathContext 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | 7 | /** 8 | * Various constants for spark-csv. 9 | */ 10 | object Constants { 11 | 12 | // TODO: ensure these match the data provided 13 | //val DATE_FORMATS = Array("MM/dd/yyyy", "MM/dd/yyyy hh:mm:ss a") 14 | //val OUTPUT_FORMAT = "MM/dd/yyyy hh:mm:ss a" 15 | 16 | val MILLIS_PER_HOUR = 3600000 17 | val DECIMAL_PRECISION = 37 18 | val MATH_CONTEXT = new MathContext(28) 19 | val SINGLE_DATE_FORMAT = "M/d/yyyy" 20 | val TIMESTAMP_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS" 21 | 22 | val MD_TRAITS = "traits" 23 | val MD_DATATYPE_OVERRIDE = "datatype" 24 | val MD_DATATYPE_OVERRIDE_TIME= "Time" 25 | 26 | val CDM_ARRAY_TRAIT = "is.linkedEntity.array" 27 | val CDM_DECIMAL_TRAIT = "is.dataFormat.numeric.shaped" 28 | 29 | val LOGICAL_ENTITY_DIR = "LogicalDefinition" 30 | val defaultCompressionFormat = "snappy" 31 | val SPARK_MODELROOT_NAMESPACE = "SparkModelRoot" 32 | val CDM_DEFAULT_PRECISION = 18 33 | val CDM_DEFAULT_SCALE = 4 34 | var PRODUCTION = true 35 | val DEFAULT_DELIMITER = ',' 36 | val SPARK_NAMESPACE = "SparkManifestLocation" 37 | val PARTITION_DIR_PATTERN ="%s-%s-%s" 38 | val GLOB_PATTERN="%s-%s-*.%s" 39 | var EXCEPTION_TEST = false 40 | var SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS = "%s.rename.manifest.cdm.json" 41 | var MODE = "" 42 | var MODEL_JSON = "model.json" 43 | var KUSTO_ENABLED = true 44 | var SASTOKEN_CONF_SETTING = "com.microsoft.cdm.sastoken" 45 | 46 | // For permissive/fail fast CSV reading mode. 
47 | // Could be an Enumeration, but strings enable case-insensitive comparison
48 | val PERMISSIVE = "permissive"
49 | val FAILFAST = "failfast"
50 | val DROPMALFORMED= "dropmalformed"
51 | }
52 | 
53 | object Environment {
54 | var sparkPlatform: SparkPlatform.Value = _
55 | }
56 | class Constants {
57 | 
58 | }
59 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/CsvParserFactory.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.univocity.parsers.csv.{CsvParser, CsvParserSettings, CsvWriter, CsvWriterSettings}
4 | 
5 | import java.io.OutputStreamWriter
6 | /**
7 | * Builds Univocity CsvParser instances.
8 | */
9 | object CsvParserFactory {
10 | def build(delimiter: Char): CsvParser = {
11 | val settings = new CsvParserSettings()
12 | val format = settings.getFormat
13 | format.setDelimiter(delimiter)
14 | settings.setLineSeparatorDetectionEnabled(true)
15 | settings.setMaxCharsPerColumn(-1)
16 | settings.setMaxColumns(512 * 4)
17 | new CsvParser(settings)
18 | }
19 | 
20 | def buildWriter(outputWriter: OutputStreamWriter, delimiter: Char): CsvWriter = {
21 | val settings = new CsvWriterSettings()
22 | settings.getFormat.setDelimiter(delimiter)
23 | settings.getFormat.setLineSeparator("\n")
24 | settings.setMaxCharsPerColumn(-1)
25 | new CsvWriter(outputWriter, settings)
26 | }
27 | }
28 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/DataConverter.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import java.text.SimpleDateFormat
4 | import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder}
5 | import java.time.temporal.ChronoField
6 | import java.time.{Instant, LocalDate, ZoneId}
7 | import java.util.TimeZone
8 | import java.util.concurrent.TimeUnit
9 | 
10 | import org.apache.spark.sql.types._
11 | import org.slf4j.LoggerFactory
12 | 
13 | /**
14 | * Converts between CSV/CDM data and Spark data types.
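 * A minimal illustrative sketch of the mapping (not exhaustive):
 *   new DataConverter().toSparkType(CDMDataFormat.Int64, 0, 0)    // LongType
 *   new DataConverter().toSparkType(CDMDataFormat.Decimal, 18, 4) // DecimalType(18, 4)
 *   new DataConverter().toSparkType(CDMDataFormat.Guid, 0, 0)     // StringType (there is no UuidType in Spark)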
15 | */ 16 | @SerialVersionUID(100L) 17 | class DataConverter extends Serializable{ 18 | 19 | val logger = LoggerFactory.getLogger(classOf[DataConverter]) 20 | val dateFormatter = new SimpleDateFormat(Constants.SINGLE_DATE_FORMAT) 21 | 22 | 23 | def toSparkType(dt: CDMDataFormat.Value, precision: Int, scale: Int) = { 24 | val map = Map( 25 | CDMDataFormat.Byte -> ByteType, 26 | CDMDataFormat.Int16 -> ShortType, 27 | CDMDataFormat.Int32 -> IntegerType, 28 | CDMDataFormat.Int64 -> LongType, 29 | CDMDataFormat.Date -> DateType, 30 | CDMDataFormat.DateTime -> TimestampType, 31 | CDMDataFormat.String -> StringType, 32 | CDMDataFormat.Double -> DoubleType, 33 | CDMDataFormat.Decimal -> DecimalType(precision, scale), 34 | CDMDataFormat.Boolean -> BooleanType, 35 | CDMDataFormat.DateTimeOffset -> TimestampType, 36 | CDMDataFormat.Guid -> StringType, //There is no UuidType in Spark 37 | CDMDataFormat.Time -> TimestampType, 38 | CDMDataFormat.Float -> FloatType 39 | ) 40 | map(dt) 41 | } 42 | 43 | def toParquet(dataType: DataType, data: Any): Any= { 44 | (dataType, data) match { 45 | case (_, null) => null 46 | case (IntegerType, _) => data.asInstanceOf[Int] 47 | case (StringType, _) => data.asInstanceOf[String] 48 | case _ => data.toString 49 | } 50 | } 51 | 52 | def toCdmDataFormat(dt: DataType): CDMDataFormat.Value = { 53 | return dt match { 54 | case ByteType => CDMDataFormat.Byte 55 | case ShortType => CDMDataFormat.Int16 56 | case IntegerType => CDMDataFormat.Int32 57 | case LongType => CDMDataFormat.Int64 58 | case DateType => CDMDataFormat.Date 59 | case StringType => CDMDataFormat.String 60 | case DoubleType => CDMDataFormat.Double 61 | case DecimalType() => CDMDataFormat.Decimal 62 | case BooleanType => CDMDataFormat.Boolean 63 | case TimestampType => CDMDataFormat.DateTime 64 | case structType: StructType => CDMDataFormat.entity 65 | case FloatType => CDMDataFormat.Float 66 | } 67 | } 68 | def toCdmDataFormatOverride(dt: String): CDMDataFormat.Value = { 69 | return dt match { 70 | case Constants.MD_DATATYPE_OVERRIDE_TIME=> CDMDataFormat.Time 71 | } 72 | } 73 | 74 | def toCdmDataType(dt: DataType): CDMDataType.Value = { 75 | return dt match { 76 | case ByteType => CDMDataType.byte 77 | case ShortType => CDMDataType.smallInteger 78 | case IntegerType => CDMDataType.integer 79 | case LongType => CDMDataType.bigInteger 80 | case DateType => CDMDataType.date 81 | case StringType => CDMDataType.string 82 | case DoubleType => CDMDataType.double 83 | case DecimalType() => CDMDataType.decimal 84 | case BooleanType => CDMDataType.boolean 85 | case TimestampType => CDMDataType.dateTime 86 | case structType: StructType => CDMDataType.entity 87 | case FloatType => CDMDataType.float 88 | } 89 | } 90 | 91 | def toCdmDataTypeOverride(dt: String): CDMDataType.Value = { 92 | return dt match { 93 | case Constants.MD_DATATYPE_OVERRIDE_TIME => CDMDataType.time 94 | } 95 | } 96 | 97 | def dataToString(data: Any, dataType: DataType, cdmType:Any): String = { 98 | (dataType, data, cdmType) match { 99 | case (_, null, _) => null 100 | case (DateType, v: Number, _) => { 101 | LocalDate.ofEpochDay(v.intValue()).toString 102 | } 103 | case (TimestampType, v: Number, "DateTimeOffset") => { 104 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 105 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 106 | val date = instant.atZone(ZoneId.systemDefault()) 107 | /* 108 | * Using this format 
rather than ISO_OFFSET_DATE_TIME forces the format 109 | * to use +00:00 when the local time is UTC 110 | */ 111 | val formatter = new DateTimeFormatterBuilder() 112 | .appendPattern("yyyy-MM-dd'T'HH:mm:ss") 113 | .appendFraction(ChronoField.NANO_OF_SECOND, 0, 6, true) 114 | .appendOffset("+HH:MM","+00:00").toFormatter // min 0 max 6 115 | date.format(formatter) 116 | } 117 | case (TimestampType, v: Number, "DateTime") => { 118 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 119 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 120 | val date = instant.atZone(ZoneId.systemDefault()) 121 | date.format(DateTimeFormatter.ISO_INSTANT) 122 | } 123 | case (TimestampType, v: Long, "Time") => { 124 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(v.asInstanceOf[Long], TimeUnit.SECONDS.toMicros(1))) 125 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(v.asInstanceOf[Long]), nanoAdjustment); 126 | val localTime= instant.atZone(ZoneId.of("UTC")).toLocalTime 127 | localTime.format(DateTimeFormatter.ofPattern("HH:mm:ss.SSSSSS")).toString 128 | } 129 | case _ => { 130 | data.toString 131 | } 132 | } 133 | } 134 | 135 | } 136 | 137 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/DateTimeFormatterHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | import com.google.common.cache.CacheBuilder 21 | import com.microsoft.cdm.utils.DateTimeFormatterHelper._ 22 | 23 | import java.time._ 24 | import java.time.chrono.IsoChronology 25 | import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle} 26 | import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries} 27 | import java.util.Locale 28 | 29 | trait DateTimeFormatterHelper { 30 | protected def toZonedDateTime( 31 | temporalAccessor: TemporalAccessor, 32 | zoneId: ZoneId): ZonedDateTime = { 33 | // Parsed input might not have time related part. In that case, time component is set to zeros. 34 | val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime) 35 | val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime 36 | // Parsed input must have date component. At least, year must present in temporalAccessor. 
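// Illustrative example: a date-only input such as "2020-03-01" parsed with pattern "yyyy-MM-dd"
// carries no local time, so LocalTime.MIDNIGHT is used and the result is 2020-03-01T00:00 at the supplied zoneId.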
37 | val localDate = temporalAccessor.query(TemporalQueries.localDate) 38 | 39 | ZonedDateTime.of(localDate, localTime, zoneId) 40 | } 41 | protected def toInstantWithZoneId(temporalAccessor: TemporalAccessor, zoneId: ZoneId): Instant = { 42 | val localTime = if (temporalAccessor.query(TemporalQueries.localTime) == null) { 43 | LocalTime.ofNanoOfDay(0) 44 | } else { 45 | LocalTime.from(temporalAccessor) 46 | } 47 | val localDate = LocalDate.from(temporalAccessor) 48 | val localDateTime = LocalDateTime.of(localDate, localTime) 49 | val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) 50 | Instant.from(zonedDateTime) 51 | } 52 | 53 | // Gets a formatter from the cache or creates new one. The buildFormatter method can be called 54 | // a few times with the same parameters in parallel if the cache does not contain values 55 | // associated to those parameters. Since the formatter is immutable, it does not matter. 56 | // In this way, synchronised is intentionally omitted in this method to make parallel calls 57 | // less synchronised. 58 | // The Cache.get method is not used here to avoid creation of additional instances of Callable. 59 | protected def getOrCreateFormatter(pattern: String, locale: Locale): DateTimeFormatter = { 60 | val key = (pattern, locale) 61 | var formatter = cache.getIfPresent(key) 62 | if (formatter == null) { 63 | formatter = buildFormatter(pattern, locale) 64 | cache.put(key, formatter) 65 | } 66 | formatter 67 | } 68 | } 69 | 70 | private object DateTimeFormatterHelper { 71 | val cache = CacheBuilder.newBuilder() 72 | .maximumSize(128) 73 | .build[(String, Locale), DateTimeFormatter]() 74 | 75 | def buildFormatter(pattern: String, locale: Locale): DateTimeFormatter = { 76 | new DateTimeFormatterBuilder() 77 | .parseCaseInsensitive() 78 | .appendPattern(pattern) 79 | .parseDefaulting(ChronoField.ERA, 1) 80 | .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1) 81 | .parseDefaulting(ChronoField.DAY_OF_MONTH, 1) 82 | .parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0) 83 | .parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0) 84 | .toFormatter(locale) 85 | .withChronology(IsoChronology.INSTANCE) 86 | .withResolverStyle(ResolverStyle.STRICT) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/Messages.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import java.math.MathContext 4 | 5 | object Messages { 6 | val invalidThreadCount= "The maxCDMThreads parameter is invalid" 7 | val invalidIndexSchema = "The dataframe schema does not match the cdm schema for field \"%s\". Path : \"%s\"" 8 | val mismatchedSizeSchema = "The dataframe schema and cdm schema don't have an equal number of fields. Entity Path: \"%s\"" 9 | val invalidCompressionFormat = "Invalid Compression format specified - %s" 10 | val entityDefinitionModelFileNotFound= "Entity definition model file \"%s\" not found" 11 | val invalidDecimalFormat = "Invalid Decimal(%s,%s)" 12 | val onlyStructsInArraySupported = "Arrays with primitive types/MapType/ArrayType not yet supported" 13 | val cdmDataFormatNotYetSupported = "CDM dataformat for %s not yet supported" 14 | val nestedTypesNotSupported = "Cannot write nested types to csv file. Please change the format to parquet." 15 | val characterNotInRange = "Invalid Delimiter - %s. 
The provided delimiter should be in a valid char range : 0-65535"
16 | val managedIdentitiesSynapseDataBricksOnly ="Managed identities only supported on Synapse or Databricks"
17 | val managedIdentitiesDatabricksTimeout = "Databricks jobs must not last more than 1 hour (they have no refresh mechanism)"
18 | val invalidDelimiterCharacter = "Invalid Delimiter - %s. Only one character is allowed in delimiter. Input should be a character."
19 | val overrideConfigJson = "%s. Please override config.json by option \"configPath\""
20 | val configJsonPathNotFound = "Config.json not found in %s."
21 | val incorrectDataFolderFormat = "Incorrect Data Folder format %s - follow DateTimeFormatter format"
22 | val invalidCDMSourceName = "Invalid cdmSource provided - %s. cdmSource can either be - builtin or referenced"
23 | val invalidBothStandardAndEntityDefCont = "Specifying CdmStandard and entityDefinitionModelRoot is not valid"
24 | val entityDefStorageAppCredError = "entityDefinitionStorage option is supported only with managed identities."
25 | val incompatibleFileWithDataframe = "The number of columns in CSV/parquet file is not equal to the number of fields in Spark StructType. Either modify the attributes in manifest to make it equal to the number of columns in CSV/parquet files or modify the csv/parquet file"
26 | val invalidMode = "Invalid mode provided. Supports - permissive or failfast"
27 | val invalidPermissiveMode = "Permissive Mode not supported with Parquet files"
28 | val dropMalformedNotSupported ="DropMalformed mode is not supported"
29 | }
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/OverridenCdmStandardsAdapter.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.commondatamodel.objectmodel.storage.CdmStandardsAdapter
4 | 
5 | class OverridenCdmStandardsAdapter @throws[ClassNotFoundException] extends CdmStandardsAdapter {
6 | override def fetchConfig(): String = {
7 | "{\"config\":{},\"type\": \"cdm-standards\"}"
8 | }
9 | }
10 | 
--------------------------------------------------------------------------------
/src/main/scala/com/microsoft/cdm/utils/SerializedABFSHadoopConf.scala:
--------------------------------------------------------------------------------
1 | package com.microsoft.cdm.utils
2 | 
3 | import com.microsoft.cdm.utils.Constants.SASTOKEN_CONF_SETTING
4 | import org.apache.hadoop.conf.Configuration
5 | 
6 | object SerializedABFSHadoopConf {
7 | def getConfiguration(storage: String,
8 | container: String,
9 | auth: Auth,
10 | conf: Configuration): SparkSerializableConfiguration = {
11 | conf.set("fs.defaultFS", "abfss:/" + container + "@" + storage + "/")
12 | if (auth.getAuthType == CdmAuthType.AppReg.toString()) {
13 | conf.set("fs.azure.account.auth.type", "OAuth")
14 | conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
15 | conf.set("fs.azure.account.oauth2.client.id", auth.getAppId)
16 | conf.set("fs.azure.account.oauth2.client.secret", auth.getAppKey)
17 | conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/" + auth.getTenantId + "/oauth2/token")
18 | } else if (auth.getAuthType == CdmAuthType.Sas.toString()) {
19 | conf.set("fs.azure.account.auth.type", "SAS")
20 | conf.set("fs.azure.sas.token.provider.type", "com.microsoft.cdm.utils.CDMSASTokenProvider")
21 |
conf.set("fs.azure.account.hns.enabled", "true") 22 | conf.set("fs.abfss.impl.disable.cache", "true") // disable cache for abfss creation so that the sas tokens for different folders don't conflict. 23 | conf.set(SASTOKEN_CONF_SETTING, auth.getSASToken) // setting to store the sas token 24 | } 25 | new SparkSerializableConfiguration(new Configuration(conf)) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/SparkSerializableConfiguration.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.FileSystem 5 | 6 | import java.io.{IOException, ObjectInputStream, ObjectOutputStream} 7 | import scala.util.control.NonFatal 8 | 9 | @SerialVersionUID(100L) 10 | class SparkSerializableConfiguration(@transient var value: Configuration) extends Serializable { 11 | 12 | def getFileSystem() : FileSystem = { 13 | FileSystem.get(value) 14 | } 15 | 16 | private def writeObject(out: ObjectOutputStream): Unit = tryOrIOException { 17 | out.defaultWriteObject() 18 | value.write(out) 19 | } 20 | 21 | private def readObject(in: ObjectInputStream): Unit = tryOrIOException { 22 | value = new Configuration(false) 23 | value.readFields(in) 24 | } 25 | 26 | def tryOrIOException(block: => Unit) { 27 | try { 28 | block 29 | } catch { 30 | case e: IOException => { 31 | throw e 32 | } 33 | case NonFatal(t) => { 34 | throw new IOException(t) 35 | } 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/StructTypeMetadata.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.utils 2 | 3 | import org.apache.spark.sql.types.{Metadata, StructType} 4 | 5 | import scala.collection.mutable.HashMap 6 | 7 | object StructTypeMetadata{ 8 | object StructTypeMetadataMap{ 9 | lazy val metadataMap = HashMap.empty[String, Metadata] 10 | def setMD(s: String, md: Metadata): Unit = metadataMap(s) = md 11 | def getMD(s: String): Metadata = metadataMap.getOrElse(s, Metadata.empty) 12 | } 13 | 14 | // Since StructTypeMetadataMap is a singleton, it's the key that determines which 15 | // metadata object is returned. 16 | implicit class StructTypeMetadataMap(s: StructType) { 17 | def getMetadata(s: String): Metadata = StructTypeMetadataMap.getMD(s) 18 | def setMetadata(s:String, name: Metadata) = StructTypeMetadataMap.setMD(s, name) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/utils/TimestampFormatter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.cdm.utils 19 | 20 | 21 | import org.apache.spark.sql.catalyst.util.{DateTimeConstants, DateTimeUtils} 22 | 23 | import java.text.ParseException 24 | import java.time._ 25 | import java.time.format.DateTimeParseException 26 | import java.time.temporal.{TemporalAccessor, TemporalQueries} 27 | import java.util.concurrent.TimeUnit.NANOSECONDS 28 | import java.util.{Locale, TimeZone} 29 | import scala.util.control.NonFatal 30 | 31 | @SerialVersionUID(100L) 32 | sealed trait TimestampFormatter extends Serializable { 33 | /** 34 | * Parses a timestamp in a string and converts it to microseconds. 35 | * 36 | * @param s - string with timestamp to parse 37 | * @return microseconds since epoch. 38 | * @throws ParseException can be thrown by legacy parser 39 | * @throws DateTimeParseException can be thrown by new parser 40 | * @throws DateTimeException unable to obtain local date or time 41 | */ 42 | @throws(classOf[ParseException]) 43 | @throws(classOf[DateTimeParseException]) 44 | @throws(classOf[DateTimeException]) 45 | def parse(s: String): Long 46 | def format(us: Long): String 47 | } 48 | 49 | class Iso8601TimestampFormatter( 50 | pattern: String, 51 | zoneId: ZoneId, 52 | locale: Locale) extends TimestampFormatter with DateTimeFormatterHelper { 53 | @transient 54 | private lazy val formatter = getOrCreateFormatter(pattern, locale) 55 | //private val NANOSECONDS = 1000L 56 | private def toInstant(s: String): Instant = { 57 | val temporalAccessor = formatter.parse(s) 58 | if (temporalAccessor.query(TemporalQueries.offset()) == null) { 59 | toInstantWithZoneId(temporalAccessor, zoneId) 60 | } else { 61 | Instant.from(temporalAccessor) 62 | } 63 | } 64 | def instantToMicros(instant: Instant): Long = { 65 | val us = Math.multiplyExact(instant.getEpochSecond, DateTimeConstants.MICROS_PER_SECOND) 66 | val result = Math.addExact(us, NANOSECONDS.toMicros(instant.getNano)) 67 | result 68 | } 69 | 70 | private val specialValueRe = """(\p{Alpha}+)\p{Blank}*(.*)""".r 71 | private def today(zoneId: ZoneId): ZonedDateTime = { 72 | Instant.now().atZone(zoneId).`with`(LocalTime.MIDNIGHT) 73 | } 74 | def getZoneId(timeZoneId: String): ZoneId = ZoneId.of(timeZoneId, ZoneId.SHORT_IDS) 75 | /** 76 | * Extracts special values from an input string ignoring case. 77 | * @param input - a trimmed string 78 | * @param zoneId - zone identifier used to get the current date. 79 | * @return some special value in lower case or None. 
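 * Illustrative behaviour (following the checks below): "EPOCH" -> Some("epoch");
 * "now UTC" -> None, since "now" must not carry a time zone; "2019-01-01" -> None,
 * because the first character is not a letter.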
80 | */ 81 | private def extractSpecialValue(input: String, zoneId: ZoneId): Option[String] = { 82 | def isValid(value: String, timeZoneId: String): Boolean = { 83 | // Special value can be without any time zone 84 | if (timeZoneId.isEmpty) return true 85 | // "now" must not have the time zone field 86 | if (value.compareToIgnoreCase("now") == 0) return false 87 | // If the time zone field presents in the input, it must be resolvable 88 | try { 89 | getZoneId(timeZoneId) 90 | true 91 | } catch { 92 | case NonFatal(_) => false 93 | } 94 | } 95 | assert(input.trim.length == input.length) 96 | if (input.length < 3 || !input(0).isLetter) return None 97 | input match { 98 | case specialValueRe(v, z) if isValid(v, z) => Some(v.toLowerCase(Locale.US)) 99 | case _ => None 100 | } 101 | } 102 | 103 | 104 | 105 | override protected def toZonedDateTime( 106 | temporalAccessor: TemporalAccessor, 107 | zoneId: ZoneId): ZonedDateTime = { 108 | // Parsed input might not have time related part. In that case, time component is set to zeros. 109 | val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime) 110 | val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime 111 | // Parsed input must have date component. At least, year must present in temporalAccessor. 112 | val localDate = temporalAccessor.query(TemporalQueries.localDate) 113 | 114 | ZonedDateTime.of(localDate, localTime, zoneId) 115 | } 116 | 117 | override def parse(s: String): Long = instantToMicros(toInstant(s)) 118 | 119 | override def format(us: Long): String = { 120 | val secs = Math.floorDiv(us, DateTimeConstants.MICROS_PER_SECOND) 121 | val mos = Math.floorMod(us, DateTimeConstants.MICROS_PER_SECOND) 122 | val instant = Instant.ofEpochSecond(secs, mos * 1000L) 123 | 124 | formatter.withZone(zoneId).format(instant) 125 | } 126 | } 127 | 128 | object TimestampFormatter { 129 | val defaultPattern: String = "yyyy-MM-dd HH:mm:ss" 130 | val defaultLocale: Locale = Locale.US 131 | 132 | def apply(format: String, timeZone: TimeZone, locale: Locale): TimestampFormatter = { 133 | new Iso8601TimestampFormatter(format, timeZone.toZoneId, locale) 134 | } 135 | 136 | def apply(format: String, timeZone: TimeZone): TimestampFormatter = { 137 | apply(format, timeZone, defaultLocale) 138 | } 139 | 140 | def apply(timeZone: TimeZone): TimestampFormatter = { 141 | apply(defaultPattern, timeZone, defaultLocale) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMBatchWriter.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io.IOException 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CDMDataFormat, CDMDecimalType, CDMModelWriter, CDMTokenProvider, CdmAuthType, Constants, DataConverter, Messages, SchemaDiffOutput, SerializedABFSHadoopConf} 7 | import org.apache.hadoop.fs.{FileSystem, Path} 8 | import org.apache.parquet.hadoop.util.HadoopOutputFile 9 | import org.apache.spark.sql.SaveMode 10 | import org.apache.spark.sql.connector.write.{BatchWrite, DataWriterFactory, PhysicalWriteInfo, WriterCommitMessage} 11 | import org.apache.spark.sql.types.{ArrayType, DecimalType, StringType, StructType} 12 | import org.slf4j.LoggerFactory 13 | import org.slf4j.event.Level 14 | 15 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 16 | 17 | class CDMBatchWriter(jobId: String, writeMode: SaveMode, 
schema: StructType, cdmOptions: CDMWriteOptions) extends BatchWrite { 18 | 19 | val logger = LoggerFactory.getLogger(classOf[CDMBatchWriter]) 20 | 21 | val serializedHadoopConf = SerializedABFSHadoopConf.getConfiguration(cdmOptions.storage, cdmOptions.container, cdmOptions.auth, cdmOptions.conf) 22 | 23 | val tokenProvider = if (cdmOptions.auth.getAuthType == CdmAuthType.Token.toString()) Some(new CDMTokenProvider(serializedHadoopConf, cdmOptions.storage)) else None 24 | 25 | val cdmModel = new CDMModelWriter(cdmOptions.storage, 26 | cdmOptions.container, 27 | cdmOptions.manifestPath, 28 | cdmOptions.manifestFileName, 29 | cdmOptions.manifestName, 30 | cdmOptions.entity, 31 | cdmOptions.useSubManifest, 32 | cdmOptions.entityDefinition, 33 | cdmOptions.entDefContAndPath, 34 | jobId, 35 | cdmOptions.fileFormatSettings, 36 | cdmOptions.auth, tokenProvider, 37 | cdmOptions.overrideConfigPath, 38 | cdmOptions.cdmSource, 39 | cdmOptions.entityDefinitionStorage, 40 | cdmOptions.maxCDMThreads) 41 | 42 | val cdmEntity = cdmModel.entityExists(cdmOptions.entity, serializedHadoopConf) 43 | 44 | /* This list is used track the list of partitions in order to know which files to delete in case of an abort */ 45 | val partitionList = ListBuffer[FileCommitMessage]() 46 | 47 | if (Constants.PRODUCTION && cdmOptions.manifestFileName.equals(Constants.MODEL_JSON)) { 48 | throw new Exception("Writing model.json is not supported.") 49 | } 50 | 51 | def compare(cdmSchema: Iterable[Any], schema: StructType, path: ArrayBuffer[String]): SchemaDiffOutput = { 52 | if(cdmSchema.size != schema.fields.length) return SchemaDiffOutput(false, -1, path) 53 | val dv = new DataConverter; 54 | val arr = cdmSchema.toArray 55 | schema.fields.zipWithIndex.foreach{ case (field, i) => 56 | path.append(field.name) 57 | if (field.dataType.isInstanceOf[StructType]) { 58 | val diff = compare(arr(i).asInstanceOf[Iterable[Any]], field.dataType.asInstanceOf[StructType], path) 59 | if(!diff.isSame) return diff 60 | } 61 | else if(field.dataType.isInstanceOf[ArrayType]){ 62 | val arrayElementType = field.dataType.asInstanceOf[ArrayType].elementType 63 | val diff = compare(arr(i).asInstanceOf[Iterable[Any]], arrayElementType.asInstanceOf[StructType], path) 64 | if(!diff.isSame) return diff 65 | } 66 | else if (field.dataType.isInstanceOf[DecimalType]) { 67 | if(arr(i).isInstanceOf[CDMDecimalType]) { 68 | val cdmDecimal = arr(i).asInstanceOf[CDMDecimalType] 69 | val sparkDecimal = field.dataType.asInstanceOf[DecimalType] 70 | if(cdmDecimal.precision != sparkDecimal.precision || cdmDecimal.scale != sparkDecimal.scale) { 71 | return SchemaDiffOutput(false, i, path) 72 | } 73 | }else { 74 | return SchemaDiffOutput(false, i, path) 75 | } 76 | } 77 | else if (arr(i).equals("Guid")) { 78 | if (!field.dataType.equals(StringType)) { 79 | return SchemaDiffOutput(false, i, path) 80 | } 81 | } 82 | else { 83 | try { 84 | if (!dv.toSparkType(CDMDataFormat.withName(arr(i).toString), 0, 0).getClass.equals(field.dataType.getClass)) { 85 | return SchemaDiffOutput(false, i, path) 86 | } 87 | } 88 | /*NoSuchElementException will thrown when toSparkType functions doesn't get the specified CDMDataType. 
This can happen for array, structs and CDMDecimaltype */ 89 | catch{ 90 | case e : java.util.NoSuchElementException => { 91 | return SchemaDiffOutput(false, i, path) 92 | } 93 | } 94 | } 95 | path.remove(path.length - 1) //backtrack 96 | } 97 | SchemaDiffOutput(true, -1, path) 98 | } 99 | 100 | def printPath(path: ArrayBuffer[String]): String = { 101 | var result = new StringBuilder() 102 | for (i <- path) { 103 | result.append(i) 104 | result.append(" > ") 105 | } 106 | if (result.length > 0) result.setLength(result.length - 3) 107 | result.toString() 108 | } 109 | 110 | def isNestedTypePresent() : Boolean = { 111 | val structFieldtype = schema.fields.find(each => (each.dataType.isInstanceOf[StructType] || each.dataType.isInstanceOf[ArrayType])).getOrElse(null) 112 | if(structFieldtype != null) true else false 113 | } 114 | override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = { 115 | 116 | if(isNestedTypePresent() && "csv".equals(cdmOptions.fileFormatSettings.fileFormat)) { 117 | throw new IllegalArgumentException(Messages.nestedTypesNotSupported) 118 | } 119 | 120 | var cdmSchema: Iterable[Any] = null 121 | // Get the time taken to run this block of code in Kusto 122 | SparkCDMLogger.logEventToKustoForPerf( 123 | { 124 | cdmSchema = if (cdmEntity.entityDec != null) { 125 | if (writeMode == SaveMode.ErrorIfExists) { 126 | throw new IOException("Entity " + cdmOptions.entity + " exists with SaveMode.ErrorIfExists set") 127 | } else if (writeMode == SaveMode.Overwrite) { 128 | if (cdmOptions.entityDefinition == "") { 129 | // cdm schema will be processed from the dataframe schema in case of overwrite and entity exists 130 | cdmModel.getCDMSchemaFromStructType(schema) 131 | } else { 132 | cdmModel.getCDMSchemaTypesAsSeqFromPredefinedSource(cdmOptions.entityDefinition) 133 | } 134 | cdmModel.getCDMSchemaFromStructType(schema) 135 | } else { 136 | cdmModel.getCDMSchemaTypesAsSeq(cdmOptions.entity, serializedHadoopConf) 137 | } 138 | } else { 139 | if (cdmOptions.entityDefinition == "") { 140 | /* create types from scratch*/ 141 | cdmModel.getCDMSchemaFromStructType(schema) 142 | } else { 143 | cdmModel.getCDMSchemaTypesAsSeqFromPredefinedSource(cdmOptions.entityDefinition) 144 | } 145 | } 146 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Get CDM Schema for entityDefinition - " + (if (cdmOptions.entityDefinition.isEmpty) cdmOptions.entity else cdmOptions.entityDefinition), Some(logger)) 147 | 148 | val resultPath = ArrayBuffer[String]() 149 | resultPath.append(cdmOptions.entity) 150 | val compareResult = compare(cdmSchema, schema, resultPath) 151 | val prettyPath = printPath(compareResult.path) 152 | 153 | compareResult match { 154 | case SchemaDiffOutput(false, -1, path) => throw new Exception(String.format(Messages.mismatchedSizeSchema, prettyPath)) 155 | case SchemaDiffOutput(false, i, path) => throw new Exception(String.format(Messages.invalidIndexSchema, prettyPath.substring(prettyPath.lastIndexOf(" ") + 1), prettyPath)) 156 | case _ => 157 | } 158 | 159 | new CDMDataWriterFactory(cdmOptions.storage, 160 | cdmOptions.container, 161 | cdmOptions.entity, 162 | schema, 163 | cdmOptions.manifestPath, 164 | cdmOptions.useSubManifest, 165 | cdmSchema.toList, 166 | 167 | cdmOptions.dataFolder, cdmOptions.fileFormatSettings, jobId, cdmOptions.compressionCodec, serializedHadoopConf) 168 | } 169 | 170 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 171 | /* 172 | * The list of partitions to we will 
write to
173 | */
174 | messages.foreach { e =>
175 | val message = e.asInstanceOf[FileCommitMessage]
176 | partitionList.append(message)
177 | }
178 | 
179 | /*
180 | * If we are using managed identities and Databricks, verify that the token has not expired. If it has expired,
181 | * notify the user with an exception
182 | */
183 | if (tokenProvider != None && !tokenProvider.get.isTokenValid()) {
184 | throw new Exception(Messages.managedIdentitiesDatabricksTimeout)
185 | }
186 | 
187 | if (cdmEntity.entityDec != null) {
188 | // val cdmEntity = cdmModel.getEntityDec(entity, serializedHadoopConf)
189 | writeMode match {
190 | case SaveMode.ErrorIfExists => throw new Exception("Entity " + cdmOptions.entity + " exists with SaveMode.ErrorIfExists set")
191 | case SaveMode.Overwrite => {
192 | val fs= serializedHadoopConf.getFileSystem()
193 | val oldPartitions = cdmModel.getOldPartitions(fs, cdmEntity) // Gets the old partitions that need to be deleted after the overwritten entity is created.
194 | SparkCDMLogger.logEventToKustoForPerf(
195 | {
196 | if(cdmModel.createEntity(schema, partitionList, true, cdmOptions.dataFolder, serializedHadoopConf, cdmEntity) && cdmOptions.useSubManifest) {
197 | deleteOldSubManifest(fs)
198 | }
199 | deleteOldPartitionsFromDisk(cdmOptions.useSubManifest, fs, oldPartitions)
200 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "SaveMode - Overwrite : delete partition and create entity.." + logInfo, Some(logger))
201 | 
202 | 
203 | }
204 | case SaveMode.Append => {
205 | SparkCDMLogger.logEventToKustoForPerf(
206 | {
207 | cdmModel.updateEntity(cdmEntity, partitionList, cdmOptions.dataFolder)
208 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "SaveMode - Append : update entity.." + logInfo, Some(logger))
209 | }
210 | case _ => {
211 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "Other SaveMode." + writeMode, Some(logger))
212 | }
213 | }
214 | } else {
215 | SparkCDMLogger.logEventToKustoForPerf(
216 | {
217 | // Entity does not exist, so create it
218 | cdmModel.createEntity(schema, partitionList, false, cdmOptions.dataFolder, serializedHadoopConf, cdmEntity)
219 | }, this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.DEBUG, "Entity does not exist. Creating entity.." + logInfo, Some(logger))
220 | }
221 | }
222 | 
223 | def logInfo() : String = {
224 | 
225 | return "[Entity : " + cdmOptions.entity +
226 | ", Manifest Path : " + cdmOptions.storage + cdmOptions.container + cdmOptions.manifestPath +
227 | ", entDefContAndPath : " + cdmOptions.entDefContAndPath + cdmOptions.entityDefinition +"]";
228 | }
229 | 
230 | // When the Spark job is aborted, delete the partitions that were created.
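// For example (illustrative), a committed CSV partition written to
//   <manifestPath><entity>/<dataFolder>/<entity>-<jobId>-<partitionId>.csv
// is deleted here if present, together with the temporary "<entity>.rename.manifest.cdm.json"
// submanifest that only exists for overwrites with useSubManifest enabled.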
231 | def cleanupOnAbort() = { 232 | val fs= serializedHadoopConf.getFileSystem() 233 | var path: Path = null 234 | for (file <- partitionList) { 235 | path = new Path(cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.dataFolder + "/" + file.name + file.extension) 236 | if (fs.exists(path)) { 237 | SparkCDMLogger.logEventToKusto(this.getClass.getName, 238 | Thread.currentThread.getStackTrace()(1).getMethodName, 239 | Level.ERROR, "CDMDataSourceWriter abort - Deleting partition- " + HadoopOutputFile.fromPath(path, serializedHadoopConf.value).toString, 240 | Some(logger)) 241 | fs.delete(path) 242 | } 243 | } 244 | // Only in case of overwrite with submanifests 245 | path = new Path(String.format(Constants.SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS, cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity)) 246 | if(fs.exists(path)) { 247 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "CDMDataSourceWriter abort - Deleting submanifest with overwritten partition locations - " + HadoopOutputFile.fromPath(path, serializedHadoopConf.value).toString, Some(logger)) 248 | fs.delete(path) 249 | } 250 | } 251 | 252 | 253 | def deleteOldPartitionsFromDisk(useSubManifest: Boolean, fs: FileSystem, oldPartitions: ArrayBuffer[Path]) = { 254 | for( oldPartition <- oldPartitions) { 255 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.INFO, "deleting partitions from disk- " + oldPartition.toString, Some(logger)) 256 | fs.delete(oldPartition) 257 | } 258 | } 259 | 260 | def deleteOldSubManifest(fs: FileSystem) = { 261 | val oldPath = new Path(cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity +".manifest.cdm.json") 262 | val newPath = new Path(String.format(Constants.SUBMANIFEST_WITH_OVERWRITTEN_PARTITIONS, cdmOptions.manifestPath + cdmOptions.entity + "/" + cdmOptions.entity)) 263 | fs.delete(oldPath) 264 | SparkCDMLogger.log(Level.INFO, "Renaming " + newPath.getName + " to " + oldPath.getName , logger) 265 | fs.rename(newPath, oldPath) // renames .rename.manifest.cdm.json to .manifest.cdm.json 266 | } 267 | 268 | override def abort(messages: Array[WriterCommitMessage]): Unit = { 269 | cleanupOnAbort() 270 | SparkCDMLogger.logEventToKusto(this.getClass.getName, Thread.currentThread.getStackTrace()(1).getMethodName, Level.ERROR, "CDMBatchWriter abort " + logInfo, Some(logger)) 271 | 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMDataWriter.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io._ 4 | 5 | import com.microsoft.cdm.utils.{CsvParserFactory, DataConverter} 6 | import org.apache.commons.io.FilenameUtils 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | 12 | /** 13 | * Writes a single partition of CDM data to a single CSV in ADLSgen2. 14 | * @param schema schema of the data to write. 15 | * @param fileWriter Type of file writer CSV/parquet 16 | * @param dataConverter converter between Spark and CDM data. 
17 | */ 18 | class CDMDataWriter( var schema: StructType, 19 | var fileWriter: WriterConnector, 20 | var dataConverter: DataConverter) extends DataWriter[InternalRow] { 21 | 22 | fileWriter.build(schema) 23 | /** 24 | * Called by Spark runtime. Writes a row of data to an in-memory csv file. 25 | * @param row row of data to write. 26 | */ 27 | def write(row: InternalRow): Unit = { 28 | 29 | // TODO: periodically dump buffer when it gets to a certain size 30 | fileWriter.writeRow(row, dataConverter) 31 | } 32 | 33 | /** 34 | * Called by Spark runtime when all data has been written. Uploads the in-memory buffer to the output CSV/parquet file. 35 | * @return commit message specifying location of the output csv/parquet file. 36 | */ 37 | def commit: WriterCommitMessage = { 38 | 39 | fileWriter.upload 40 | 41 | // Pass the file path back so we can add it as a file to the CDM model 42 | val path = fileWriter.getPath.stripPrefix("/") 43 | val name = FilenameUtils.getBaseName(path) 44 | val extension = FilenameUtils.EXTENSION_SEPARATOR_STR + FilenameUtils.getExtension(path) 45 | new FileCommitMessage(name=name, fileLocation = path, extension) 46 | } 47 | 48 | /** 49 | * Called by spark runtime. 50 | */ 51 | def abort(): Unit = { 52 | /* TODO: Closing is not aborting*/ 53 | fileWriter.abort 54 | } 55 | 56 | override def close(): Unit = { 57 | fileWriter.close 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMDataWriterFactory.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.net.URLDecoder 4 | 5 | import com.microsoft.cdm.utils.{Constants, DataConverter, FileFormatSettings, SparkSerializableConfiguration} 6 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | /** 12 | * Factory class. Creates a CDMDataWriter instance for a single partition of data to write. 13 | * @param storage The storage account 14 | * @param container The container name in the storage account. 15 | * @param entity The name of the entity that will be written. 16 | * @param schema Spark schema 17 | * @param manifestPath path relative to storage and container where manifest will be written 18 | * @param useSubManifest indicates if subManifest can be used (Boolean) 19 | * @param cdmSchema cdmSchema 20 | * @param fileFormatSettings file settings including - header, delimiter, file type [parquet or csv] 21 | * @param jobId id of the write job. 22 | * @param compression compression codec name 23 | * @param serConf Spark serialization configuration 24 | */ 25 | @SerialVersionUID(100L) 26 | class CDMDataWriterFactory(var storage: String, 27 | var container: String, 28 | var entity: String, 29 | var schema: StructType, 30 | var manifestPath: String, 31 | var useSubManifest: Boolean, 32 | var cdmSchema: List[Any], 33 | val dataDir: String, 34 | var fileFormatSettings: FileFormatSettings, 35 | var jobId: String, 36 | var compression: CompressionCodecName, 37 | var serConf: SparkSerializableConfiguration) extends DataWriterFactory { 38 | 39 | // TODO: error handling. we're basically assuming successful writes. Need to add logic to remove/rewrite files on failure. 
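// Illustrative example of the partition file naming used below (entity "Person" and jobId "j1" are
// hypothetical): for partitionId 3 and csv output, the writer targets
//   <manifestPath>Person/<dataDir>/Person-j1-3.csv
// while parquet with the default snappy codec would produce Person-j1-3.snappy.parquet.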
40 | 41 | override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] = { 42 | var path = manifestPath + entity + "/" + dataDir 43 | path = URLDecoder.decode(path, "UTF-8") 44 | val fileWriter = fileFormatSettings.fileFormat match{ 45 | case "csv" => { 46 | val prefix = "https://" + storage + container 47 | val filename = entity + "-" + jobId + "-" +partitionId + ".csv" 48 | new CSVWriterConnector( prefix, path + "/" + filename, cdmSchema, serConf, fileFormatSettings) 49 | } 50 | case "parquet" => { 51 | val prefix ="https://" +storage + container 52 | val filename = entity + "-" + jobId + "-" + partitionId + compression.getExtension + ".parquet" 53 | new ParquetWriterConnector(prefix, path + "/" + filename, cdmSchema, compression, serConf) 54 | } 55 | } 56 | new CDMDataWriter(schema, fileWriter, new DataConverter()) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMWriteOptions.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.time.LocalDateTime 4 | import java.time.format.DateTimeFormatter 5 | 6 | import com.microsoft.cdm.utils.{CDMDataFolder, CDMOptions, CdmAuthType, Constants, FileFormatSettings, Messages} 7 | 8 | import sys.process._ 9 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | 12 | class CDMWriteOptions(options: CaseInsensitiveStringMap) extends CDMOptions(options) { 13 | 14 | Constants.MODE = "write" 15 | val manifestName = if(options.containsKey("manifestName")) options.get("manifestName") else "default" 16 | val useSubManifest = if(options.containsKey("useSubManifest")) options.get("useSubManifest").toBoolean else false 17 | val entDefIn = if (options.containsKey("entityDefinitionPath")) options.get("entityDefinitionPath") else "" 18 | val useCdmStandard = if (options.containsKey("useCdmStandardModelRoot")) options.get("useCdmStandardModelRoot").toBoolean else false 19 | val entDefModelRootIn= if (options.containsKey("entityDefinitionModelRoot")) options.get("entityDefinitionModelRoot") else "" 20 | val compressionFormat= if (options.containsKey("compression")) options.get("compression") else "snappy" 21 | val customDataFolderPattern = if (options.containsKey("dataFolderFormat")) options.get("dataFolderFormat") else "" 22 | var entityDefinitionStorage = if (options.containsKey("entityDefinitionStorage")) options.get("entityDefinitionStorage") else storage 23 | 24 | if (useCdmStandard && !entDefModelRootIn.isEmpty) { 25 | throw new Exception(Messages.invalidBothStandardAndEntityDefCont) 26 | } 27 | 28 | if (Constants.PRODUCTION && manifestFileName.equals(Constants.MODEL_JSON)) { 29 | throw new Exception("Writing model.json is not supported.") 30 | } 31 | 32 | var entDefContAndPath = getEntityDefinitionPath(useCdmStandard, container, manifestPath, entDefModelRootIn) 33 | val compressionCodec = getCompression(compressionFormat) 34 | 35 | import com.microsoft.cdm.utils.Environment 36 | import com.microsoft.cdm.utils.SparkPlatform 37 | 38 | if ((Environment.sparkPlatform eq SparkPlatform.DataBricks) && compressionFormat.equals("lzo")) checkLzo(compressionCodec) 39 | 40 | if (getAuthType(options) != CdmAuthType.Token.toString()) { 41 | if(!entityDefinitionStorage.equals(storage)) { 42 | throw new IllegalArgumentException(Messages.entityDefStorageAppCredError) 43 | } 44 | } 45 | 46 | // 
if there is no entity definition model root, use the output CDM storage account 47 | if(entDefModelRootIn.isEmpty) { 48 | entityDefinitionStorage = storage 49 | } 50 | 51 | val dataFolder = 52 | if (customDataFolderPattern == ""){ 53 | CDMDataFolder.getDataFolderWithDate() 54 | } else { 55 | try{ 56 | val dataFormatter = DateTimeFormatter.ofPattern(customDataFolderPattern) 57 | dataFormatter.format(LocalDateTime.now) 58 | } catch { 59 | case e: Exception => throw new IllegalArgumentException(String.format(Messages.incorrectDataFolderFormat, customDataFolderPattern)) 60 | } 61 | } 62 | 63 | var entityDefinition = if (entDefIn.isEmpty || entDefIn.startsWith("/")) entDefIn else "/"+ entDefIn 64 | 65 | val delimiter = getDelimiterChar(options.get("delimiter")) 66 | val showHeader = if(options.containsKey("columnHeaders")) options.get("columnHeaders").toBoolean else true 67 | 68 | val fileFormatSettings = FileFormatSettings(fileFormatType, delimiter, showHeader) 69 | 70 | 71 | 72 | // Return the container + path to use for the CDM adapter. If useCdmStandard is set, set it to an empty string, which is the flag 73 | // in the CDMCommonModel to use the CdmStandard adapter 74 | private def getEntityDefinitionPath(useCdmStandard: Boolean, origContainer:String, origPath: String, entDefModelRootIn: String) = { 75 | 76 | val entDefPath = if(entDefModelRootIn.isEmpty || entDefModelRootIn.startsWith("/")) entDefModelRootIn else "/"+ entDefModelRootIn 77 | 78 | if (useCdmStandard) { 79 | "" 80 | } else { 81 | //If "entityDefnitionModelRoot" was empty, use the CDM metadata container -> which would be container plus the origPath 82 | if (entDefModelRootIn.isEmpty()) { 83 | origContainer + origPath.dropRight(1) 84 | } else { 85 | entDefPath 86 | } 87 | } 88 | } 89 | 90 | def getCompression(compressionFormat: String): CompressionCodecName = { 91 | try { 92 | CompressionCodecName.fromConf(compressionFormat) 93 | }catch { 94 | case e: IllegalArgumentException => throw new IllegalArgumentException(String.format(Messages.invalidCompressionFormat, compressionFormat)) 95 | } 96 | } 97 | 98 | def checkLzo(compression: CompressionCodecName) = { 99 | try { 100 | val version = "which lzop".!! 101 | Class.forName(compression.getHadoopCompressionCodecClassName) 102 | } 103 | catch { 104 | case _ => throw new UnsupportedOperationException("Codec class " + compression.getHadoopCompressionCodecClassName + " is not available. 
" + 105 | "If running on databricks, please read: https://docs.databricks.com/data/data-sources/read-lzo.html ") 106 | } 107 | } 108 | 109 | def getDelimiterChar(value: String): Char = { 110 | val delimiter = if(value != null) value else ","; 111 | if (delimiter.length > 1) { 112 | throw new IllegalArgumentException(String.format(Messages.invalidDelimiterCharacter, delimiter)) 113 | } 114 | delimiter.charAt(0) 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CDMWriterBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import com.microsoft.cdm.utils.{CDMOptions, Constants} 4 | import org.apache.spark.sql.SaveMode 5 | import org.apache.spark.sql.connector.write.{BatchWrite, SupportsOverwrite, SupportsTruncate, WriteBuilder} 6 | import org.apache.spark.sql.sources.Filter 7 | import org.apache.spark.sql.types.StructType 8 | 9 | class CDMWriterBuilder(queryId: String, schema: StructType, writeMode: SaveMode, options: CDMWriteOptions) extends WriteBuilder 10 | with SupportsOverwrite 11 | with SupportsTruncate { 12 | 13 | Constants.MODE = "write" 14 | 15 | override def buildForBatch(): BatchWrite = new CDMBatchWriter(queryId, writeMode, schema, options) 16 | 17 | override def overwrite(filters: Array[Filter]): WriteBuilder = new CDMWriterBuilder(queryId, schema, SaveMode.Overwrite, options) 18 | 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/CSVWriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.io.OutputStreamWriter 4 | 5 | import com.microsoft.cdm.log.SparkCDMLogger 6 | import com.microsoft.cdm.utils.{CsvParserFactory, DataConverter, FileFormatSettings, SparkSerializableConfiguration} 7 | import com.univocity.parsers.csv.CsvWriter 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.parquet.hadoop.util.HadoopOutputFile 10 | import org.apache.parquet.io.PositionOutputStream 11 | import org.apache.spark.sql.catalyst.InternalRow 12 | import org.apache.spark.sql.types.StructType 13 | import org.slf4j.LoggerFactory 14 | import org.slf4j.event.Level 15 | 16 | import scala.collection.JavaConversions 17 | 18 | class CSVWriterConnector(prefix: String, 19 | filePath: String, 20 | cdmSchema: List[Any], 21 | var serConf:SparkSerializableConfiguration, 22 | fileFormatSettings: FileFormatSettings) extends WriterConnector { 23 | val logger = LoggerFactory.getLogger(classOf[CSVWriterConnector]) 24 | private var stream:PositionOutputStream = _ 25 | private var writer:CsvWriter = _ 26 | private var schema: StructType = _ 27 | private val httpPath= prefix + filePath 28 | SparkCDMLogger.log(Level.INFO, "CSV Writer for partition at path: " + prefix + filePath, logger) 29 | 30 | def getPath(): String = httpPath 31 | 32 | def build(inSchema: StructType): Unit = { 33 | try { 34 | schema = inSchema 35 | val path = new Path(filePath) 36 | val fs = path.getFileSystem(serConf.value) 37 | val oFile = HadoopOutputFile.fromPath(path, serConf.value) 38 | stream = oFile.create(fs.getDefaultBlockSize(path)) 39 | writer = CsvParserFactory.buildWriter(new OutputStreamWriter(stream), fileFormatSettings.delimiter) 40 | if(fileFormatSettings.showHeader) { 41 | val headers = schema.fields.map(_.name) 42 | writer.writeHeaders(headers: _*) 43 | } 44 
| } catch { 45 | case e: Throwable => SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 46 | } 47 | } 48 | 49 | def upload() = { 50 | writer.close() 51 | } 52 | 53 | 54 | def writeRow(row: InternalRow, dataConverter: DataConverter): Unit = { 55 | val strings = JavaConversions.seqAsJavaList(row.toSeq(schema).zipWithIndex.map{ case(col, index) => 56 | dataConverter.dataToString(col, schema.fields(index).dataType, cdmSchema(index)) 57 | }) 58 | 59 | var strArray = new Array[String](strings.size) 60 | strArray = strings.toArray(strArray) 61 | writer.writeRow(strArray) 62 | } 63 | 64 | def abort(): Unit = { 65 | SparkCDMLogger.log(Level.ERROR, "CSV Writer aborting.." + prefix + filePath, logger) 66 | writer.close() 67 | } 68 | 69 | def close (): Unit = { 70 | writer.close() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/FileCommitMessage.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import org.apache.spark.sql.connector.write.WriterCommitMessage 4 | 5 | // TODO: there's a better scala idiom for this than class 6 | /** 7 | * Commit message returned from CDMDataWriter on successful write. One for each partition gets returned to 8 | * CDMDataSourceWriter. 9 | * @param name name of the partition. 10 | * @param fileLocation output csv/parquet file for the partition. 11 | */ 12 | class FileCommitMessage(val name: String, val fileLocation: String, val extension: String) extends WriterCommitMessage { 13 | name + fileLocation 14 | 15 | // return the partition name 16 | def getPartition(): String = { 17 | name 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/ParquetWriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import java.nio.{ByteBuffer, ByteOrder} 4 | import java.time.{Instant, ZoneId} 5 | import java.util.TimeZone 6 | 7 | import com.microsoft.cdm.utils.{CDMSparkToParquetSchemaConverter, DataConverter, Messages, SparkSerializableConfiguration} 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.parquet.column.ParquetProperties 10 | import org.apache.parquet.example.data.Group 11 | import org.apache.parquet.example.data.simple.{NanoTime, SimpleGroup, SimpleGroupFactory} 12 | import org.apache.parquet.hadoop.ParquetWriter 13 | import org.apache.parquet.hadoop.example.GroupWriteSupport 14 | import org.apache.parquet.io.api.Binary 15 | import org.apache.parquet.schema.{MessageType, Type} 16 | import org.apache.spark.sql.catalyst.InternalRow 17 | import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.slf4j.LoggerFactory 21 | import java.util.concurrent.TimeUnit 22 | 23 | import com.microsoft.cdm.log.SparkCDMLogger 24 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 25 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 26 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} 27 | import org.slf4j.event.Level 28 | 29 | import scala.collection.JavaConverters._ 30 | 31 | 32 | class ParquetWriterConnector(httpPrefix:String, 33 | filePath: String, 34 | cdmSchema: List[Any], 35 | compression: CompressionCodecName, 36 | var seriazliedHadoopConf: 
SparkSerializableConfiguration) extends WriterConnector { 37 | val logger = LoggerFactory.getLogger(classOf[ParquetWriterConnector]) 38 | private var structType: StructType= _ 39 | private var schema: MessageType = _ 40 | private var writer: ParquetWriter[Group]= _ 41 | private var groupFactory:SimpleGroupFactory = _ 42 | private var path:Path=_ 43 | private val httpPath = httpPrefix + filePath 44 | def getPath: String = httpPath 45 | private var converter: CDMSparkToParquetSchemaConverter=_ 46 | 47 | val NANOS_PER_HOUR: Long = TimeUnit.HOURS.toNanos (1) 48 | val NANOS_PER_MINUTE: Long = TimeUnit.MINUTES.toNanos (1) 49 | val NANOS_PER_SECOND: Long = TimeUnit.SECONDS.toNanos (1) 50 | val NANOS_PER_MILLISECOND: Long = TimeUnit.MILLISECONDS.toNanos (1) 51 | 52 | SparkCDMLogger.log(Level.INFO, "Parquet Writer for partition at path: " + httpPrefix + filePath, logger) 53 | 54 | def build(inStructType: StructType): Unit = { 55 | try { 56 | /* 57 | * CCDMSparkToParquetSchemaConverter is a modified version of SparkToParquetSchemaConverter. 58 | * Since Spark does not support the TIME type, We use this to tell Parquet that the field type should be TIME. 59 | * We do this ine one of two ways: 60 | * * Set a metadata overwrite field set to Time on implicit write 61 | * * the CDM type is of type Time 62 | * In either case, we will set the column to be of type TIME and not type Timestamp. 63 | * If Spark supported a Time type, we would not need to do this. However, they do not. See: 64 | * https://github.com/apache/spark/pull/25678 65 | */ 66 | converter = new CDMSparkToParquetSchemaConverter(writeLegacyParquetFormat = false) 67 | structType = inStructType 68 | schema = converter.convert(inStructType, cdmSchema) 69 | groupFactory = new SimpleGroupFactory(schema) 70 | GroupWriteSupport.setSchema(schema, seriazliedHadoopConf.value) 71 | val writeSupport = new GroupWriteSupport() 72 | path = new Path(filePath) 73 | writer = new ParquetWriter[Group]( 74 | path, 75 | writeSupport, 76 | compression, 77 | ParquetWriter.DEFAULT_BLOCK_SIZE, 78 | ParquetWriter.DEFAULT_PAGE_SIZE, 79 | ParquetWriter.DEFAULT_PAGE_SIZE, 80 | ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, 81 | ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, 82 | ParquetProperties.DEFAULT_WRITER_VERSION, 83 | seriazliedHadoopConf.value 84 | ) 85 | } 86 | catch { 87 | case e : Exception => { 88 | SparkCDMLogger.log(Level.ERROR, e.printStackTrace.toString, logger) 89 | } 90 | } 91 | } 92 | 93 | def upload() = { 94 | writer.close() 95 | } 96 | 97 | 98 | def parseDateToBinary(value: Long) = { 99 | val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(value.toLong) 100 | 101 | // Write INT96 timestamp 102 | val timestampBuffer = new Array[Byte](12) 103 | val buf = ByteBuffer.wrap(timestampBuffer) 104 | buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) 105 | 106 | // This is the properly encoded INT96 timestamp 107 | val tsValue = Binary.fromReusedByteArray(timestampBuffer); 108 | tsValue; 109 | } 110 | 111 | /** 112 | * Converts Decimal to FIX_LEN_BYTE_ARRAY of len @param typeLength 113 | * @param decimal 114 | * @param typeLength 115 | * @return 116 | */ 117 | def decimaltoBytes(decimal: Decimal, typeLength: Int): Array[Byte] = { 118 | val bigDecimal = decimal.toJavaBigDecimal 119 | val bytes = new Array[Byte](typeLength) 120 | val fillByte: Byte = if (bigDecimal.signum < 0) 0xFF.toByte else 0x00.toByte 121 | val unscaled: Array[Byte] = bigDecimal.unscaledValue.toByteArray 122 | 123 | // If unscaled.length > typeLength. 
it means we cannot accommodate it in the `bytes` array, because the FIXED_LEN_BYTE_ARRAY has size = typeLength 124 | if (unscaled.length > typeLength) { 125 | throw new UnsupportedOperationException("Decimal size greater than "+ typeLength+" bytes") 126 | } 127 | // Fill each byte with either fillByte (sign padding) or the corresponding unscaled byte 128 | val offset = typeLength - unscaled.length 129 | for( i <- 0 until bytes.length) 130 | { 131 | if (i < offset) bytes(i) = fillByte else bytes(i) = unscaled(i - offset) 132 | } 133 | bytes 134 | } 135 | 136 | def writeDecimal(group: Group, index: Int, decimal: Decimal): Unit = { 137 | val primitive = group.getType.getType(index).asPrimitiveType() 138 | primitive.getPrimitiveTypeName match { 139 | case PrimitiveTypeName.INT32 => group.add(index, decimal.toUnscaledLong.asInstanceOf[Int]) 140 | case PrimitiveTypeName.INT64 => group.add(index, decimal.toUnscaledLong) 141 | case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => { 142 | val typeLength = primitive.getTypeLength 143 | val byteArray = decimaltoBytes(decimal, typeLength) 144 | group.add(index, Binary.fromReusedByteArray(byteArray)) 145 | } 146 | case PrimitiveTypeName.BINARY => { 147 | val typeLength = primitive.getTypeLength 148 | val byteArray = decimaltoBytes(decimal, typeLength) 149 | group.add(index, Binary.fromReusedByteArray(byteArray)) 150 | } 151 | case _ => throw new UnsupportedOperationException("Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); 152 | } 153 | } 154 | 155 | def writeRowUtil(row: InternalRow, group: Group, structType: StructType, cdmSchemaLocal: List[Any]): Unit = { 156 | if (row != null) { 157 | row.toSeq(structType).zipWithIndex.foreach { 158 | case (field, index) => { 159 | (structType.fields(index).dataType, field) match { 160 | case (_, null) => { 161 | // If precision <= scale, Spark stores a null value; we don't want that, so we guard against it here. 162 | if (structType.fields(index).dataType.isInstanceOf[DecimalType]) { 163 | val primitive = group.getType.getType(index).asPrimitiveType() 164 | val precision = primitive.getDecimalMetadata.getPrecision 165 | val scale = primitive.getDecimalMetadata.getScale 166 | if (precision <= scale) { 167 | throw new IllegalArgumentException(String.format(Messages.invalidDecimalFormat, new Integer(precision), new Integer(scale))) 168 | } 169 | } 170 | } 171 | case (ByteType, _) => group.add(index, field.asInstanceOf[Byte]) 172 | case (ShortType, _) => group.add(index, field.asInstanceOf[Short]) 173 | case (ar: ArrayType, _) => { 174 | val arrayData = field.asInstanceOf[ArrayData] 175 | val arrayElementType = structType.fields(index).dataType.asInstanceOf[ArrayType].elementType 176 | /* Convert the Spark type to a Parquet type schema. Here a Spark ArrayType gets converted to a Parquet GroupType. 177 | This is the converted parquetType schema: 178 | `optional group field.name (LIST) { 179 | repeated group list { 180 | optional group element { 181 | optional <type> <field1>; 182 | optional <type> <field2>; 183 | } 184 | } 185 | }` */ 186 | val parquetType = converter.convertField(structType.fields(index), Type.Repetition.REPEATED, cdmSchemaLocal(index)) 187 | 188 | /* Create a Row group from the converted schema above to insert data */ 189 | val mainGroup = new SimpleGroup(parquetType.asGroupType()) 190 | 191 | /* `repeated group list` is represented as the 0th index in mainGroup. 192 | Adding that as a group here because later we will insert the StructType data inside this group. 193 | Refer: https://github.com/apache/parquet-mr/blob/b2d366a83f293914195f9de86d918f8ddd944374/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java#L80 */ 194 | mainGroup.addGroup(0) 195 | 196 | /* parquetType will always have the field name "list". 
Get this group to add the array of objects 197 | This is repeatedGroup structure 198 | repeated group list { 199 | optional group element { 200 | optional ; 201 | optional ; 202 | } 203 | } */ 204 | val repeatedGroup = mainGroup.getGroup("list", 0) 205 | val subGroupFactory = new SimpleGroupFactory(converter.convert(arrayElementType.asInstanceOf[StructType], cdmSchemaLocal(index).asInstanceOf[List[Any]])) 206 | val iterator = arrayData.toObjectArray(arrayElementType).iterator 207 | 208 | /* Iterate through the `arrayData` */ 209 | while (iterator.hasNext) { 210 | val subgroup = subGroupFactory.newGroup(); 211 | val itemType = iterator.next().asInstanceOf[InternalRow] 212 | writeRowUtil(itemType, subgroup, arrayElementType.asInstanceOf[StructType],cdmSchemaLocal(index).asInstanceOf[List[Any]]); 213 | repeatedGroup.add(0, subgroup) 214 | } 215 | group.add(index, mainGroup); 216 | } 217 | case (BooleanType, _) => group.add(index, field.asInstanceOf[Boolean]) 218 | case (DateType, _) => group.add(index, field.asInstanceOf[Integer]) 219 | case (DoubleType, _) => group.add(index, field.asInstanceOf[Double]) 220 | case (DecimalType(), _) => { 221 | val decimal = field.asInstanceOf[Decimal] 222 | writeDecimal(group, index, decimal) 223 | } 224 | case (FloatType, _) => group.add(index, field.asInstanceOf[Float]) 225 | case (IntegerType, _) => group.add(index, field.asInstanceOf[Int]) 226 | case (LongType, _) => group.add(index, field.asInstanceOf[Long]) 227 | case (StringType, _) => { 228 | val string = field.asInstanceOf[UTF8String].toString 229 | group.add(index, string) 230 | } 231 | case (TimestampType, _) => { 232 | if (cdmSchemaLocal(index).equals("Time")) { 233 | val value = field.asInstanceOf[Long]; 234 | val nanoAdjustment = TimeUnit.MICROSECONDS.toNanos(Math.floorMod(value, TimeUnit.SECONDS.toMicros(1))) 235 | val instant = Instant.ofEpochSecond(TimeUnit.MICROSECONDS.toSeconds(value.asInstanceOf[Long]), nanoAdjustment); 236 | val localTime= instant.atZone(ZoneId.of("UTC")).toLocalTime 237 | group.add(index, localTime.toNanoOfDay / 1000) 238 | } else { 239 | val value = field.asInstanceOf[Long]; 240 | val binary = parseDateToBinary(value) 241 | group.add(index, NanoTime.fromBinary(binary)) 242 | } 243 | } 244 | case _ => { 245 | if (structType.fields(index).dataType.isInstanceOf[StructType]) { 246 | val subSchema = structType.fields(index).dataType.asInstanceOf[StructType]; 247 | val subGroupFactory = new SimpleGroupFactory(converter.convert(subSchema, cdmSchemaLocal(index).asInstanceOf[List[Any]])) 248 | val subgroup = subGroupFactory.newGroup(); 249 | writeRowUtil(field.asInstanceOf[InternalRow], subgroup, structType.fields(index).dataType.asInstanceOf[StructType], cdmSchemaLocal(index).asInstanceOf[List[Any]]) 250 | group.add(index, subgroup); 251 | } else { 252 | group.add(index, field.toString) 253 | } 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | 261 | def writeRow(row: InternalRow, dataConverter: DataConverter) { 262 | val group = groupFactory.newGroup() 263 | writeRowUtil(row, group, structType, cdmSchema); 264 | writer.write(group) 265 | } 266 | 267 | 268 | def abort(): Unit = { 269 | SparkCDMLogger.log(Level.ERROR, "ParquetWriter aborting.." 
+ httpPrefix + filePath, logger) 270 | writer.close() 271 | } 272 | 273 | def close (): Unit = { 274 | writer.close() 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /src/main/scala/com/microsoft/cdm/write/WriterConnector.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.write 2 | 3 | import com.microsoft.cdm.utils.DataConverter 4 | import org.apache.spark.sql.catalyst.InternalRow 5 | import org.apache.spark.sql.types.StructType 6 | 7 | @SerialVersionUID(100L) 8 | trait WriterConnector extends Serializable { 9 | def getPath: String 10 | 11 | def build(schema: StructType) 12 | 13 | def writeRow(row: InternalRow, dataConverter: DataConverter) 14 | 15 | def upload 16 | 17 | def abort 18 | 19 | def close 20 | } 21 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, consoleAppender 2 | log4j.logger.com.microsoft.cdm=INFO, consoleAppender 3 | log4j.logger.com.microsoft.commondatamodel=WARN, consoleAppender 4 | 5 | log4j.appender.consoleAppender=org.apache.log4j.ConsoleAppender 6 | log4j.appender.consoleAppender.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.consoleAppender.layout.ConversionPattern=[%t] %-5p %c %x - %m%n 8 | -------------------------------------------------------------------------------- /src/test/scala/com/microsoft/cdm/test/TestData.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.cdm.test 2 | 3 | import java.sql.Timestamp 4 | import java.time.{LocalDate, LocalTime, ZoneId} 5 | import java.time.format.DateTimeFormatter 6 | 7 | import com.microsoft.cdm.utils.Constants 8 | import com.microsoft.cdm.utils.Constants.DECIMAL_PRECISION 9 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 10 | import org.apache.spark.sql.types.{ArrayType, BooleanType, ByteType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, MetadataBuilder, ShortType, StringType, StructField, StructType, TimestampType} 11 | 12 | class TestData(val spark: SparkSession) { 13 | 14 | val date= java.sql.Date.valueOf("2015-03-31"); 15 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 16 | 17 | def prepareDataWithAllTypes(): DataFrame = { 18 | val byteVal = 2.toByte 19 | val shortVal = 129.toShort 20 | val data = Seq( 21 | Row("tim", 1, true, 12.34,6L, date, Decimal(999.00), timestamp, 2f, byteVal, shortVal), 22 | Row("tddim", 1, false, 13.34,7L, date, Decimal(434.3), timestamp, 3.59f, byteVal, shortVal), 23 | Row("tddim", 1, false, 13.34,7L, date, Decimal(100.0), timestamp, 3.59f, byteVal, shortVal), 24 | Row("tddim", 1, false, 13.34,7L, date, Decimal(99.898), timestamp, 3.59f, byteVal, shortVal), 25 | Row("tim", 1, true, 12.34,6L, date, Decimal(1.3), timestamp, 3.59f, byteVal, shortVal), 26 | Row("tddim", 1, false, 13.34,7L, date, Decimal(99999.3), timestamp, 3590.9f, byteVal, shortVal), 27 | Row("tddim", 1, false, 13.34,7L, date, Decimal(4324.4324324), timestamp, 359.8f, byteVal, shortVal), 28 | Row("tddim", 1, false, 13.34,7L, date, Decimal(42.4), timestamp, 3.593f, byteVal, shortVal), 29 | Row("tddim", 1, false, 13.34,7L, date, Decimal(1.43434), timestamp, 3.59f, byteVal, shortVal), 30 | Row("tddim", 1, false, 13.34,7L, date, Decimal(0.0167), timestamp, 3.59f, byteVal, shortVal), 31 | 
Row("tddim", 1, false, 13.34,7L, date, Decimal(0.00032), timestamp, 332.33f, byteVal, shortVal), 32 | Row("tddim", 1, false, 13.34,7L, date, Decimal(78.5), timestamp, 3.53232f, byteVal, shortVal) 33 | ) 34 | 35 | val schema = new StructType() 36 | .add(StructField("name", StringType, true)) 37 | .add(StructField("id", IntegerType, true)) 38 | .add(StructField("flag", BooleanType, true)) 39 | .add(StructField("salary", DoubleType, true)) 40 | .add(StructField("phone", LongType, true)) 41 | .add(StructField("dob", DateType, true)) 42 | .add(StructField("weight", DecimalType(Constants.DECIMAL_PRECISION,7), true)) 43 | .add(StructField("time", TimestampType, true)) 44 | .add(StructField("float", FloatType, true)) 45 | .add(StructField("byte", ByteType, true)) 46 | .add(StructField("short", ShortType, true)) 47 | 48 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 49 | } 50 | 51 | 52 | def prepareNullAndEmptyArrays() = { 53 | val data = Seq( 54 | Row(Array(null, null)) , 55 | Row(Array()), 56 | Row(null), 57 | Row(Array(null, Row(null))) 58 | ) 59 | val schema = new StructType() 60 | .add(StructField("name", ArrayType(StructType(List(StructField("name", StringType, true)))), true) 61 | ) 62 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 63 | df 64 | } 65 | 66 | def prepareSimpleDataArrayWithTime() : DataFrame = { 67 | 68 | val formatterTime1 = DateTimeFormatter.ofPattern("HH:mm:ss") 69 | val ls = LocalTime.parse("10:09:08", formatterTime1) 70 | val instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.systemDefault()).toInstant 71 | val timestamp = Timestamp.from(instant) 72 | 73 | val md = new MetadataBuilder().putString(Constants.MD_DATATYPE_OVERRIDE, Constants.MD_DATATYPE_OVERRIDE_TIME).build() 74 | val data = Seq( 75 | Row(Array(Row("RowOneArray1", 1, timestamp))), 76 | Row(Array(Row("RowTwoArray1", 3, timestamp), Row("RowTwoArray2", 4, timestamp))) 77 | ) 78 | val schema = new StructType() 79 | .add(StructField("name", ArrayType(StructType( 80 | List(StructField("name", StringType, true), 81 | StructField("number", IntegerType, true), 82 | StructField("aTime", TimestampType, true,md)))), 83 | true) 84 | ) 85 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data,1), schema) 86 | df 87 | } 88 | 89 | def prepareNestedDataArrays() : DataFrame = { 90 | val date= java.sql.Date.valueOf("2015-03-31") 91 | 92 | val formatterTime1 = DateTimeFormatter.ofPattern("HH:mm:ss") 93 | val ls = LocalTime.parse("10:09:08", formatterTime1) 94 | val instant = ls.atDate(LocalDate.of(1970, 1, 1)).atZone(ZoneId.systemDefault()).toInstant 95 | val timestamp = Timestamp.from(instant) 96 | 97 | val data = Seq( Row(13, Row("Str1", true, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub1", Row(timestamp), Array(Row("RowOneArray1", 1, timestamp), Row("RowOneArray2", 2, timestamp))))) , 98 | Row(24, Row("Str2", false, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub2", Row(timestamp), Array(Row("RowTwoArray1", 3, timestamp), Row("RowTwoArray2", 4, timestamp), Row("RowTwoArray3", 5, timestamp), Row("RowTwoArray4", 6, timestamp))))) 99 | ) 100 | 101 | val schema = new StructType() 102 | .add(StructField("id", IntegerType, true)) 103 | .add(StructField("details", new StructType() 104 | .add(StructField("name", StringType, true)) 105 | .add(StructField("flag", BooleanType, true)) 106 | .add(StructField("salary", DoubleType, true)) 107 | .add(StructField("phone", LongType, true)) 108 | .add(StructField("dob", DateType, true)) 109 | 
.add(StructField("weight", DecimalType(DECIMAL_PRECISION,1), true)) 110 | .add(StructField("time", TimestampType, true)) 111 | .add(StructField("subRow", new StructType() 112 | .add(StructField("name", StringType, true)) 113 | .add(StructField("level3", new StructType() 114 | .add(StructField("time1", TimestampType, true)) 115 | ) 116 | ) 117 | .add(StructField("hit_songs", ArrayType(StructType(List(StructField("name", StringType, true), 118 | StructField("number", IntegerType, true), 119 | StructField("aTime", TimestampType, true))), true), true)) 120 | ) 121 | ))) 122 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data,1), schema) 123 | df 124 | } 125 | 126 | def prepareNestedData(): DataFrame = { 127 | val date= java.sql.Date.valueOf("2015-03-31") 128 | val timestamp = new java.sql.Timestamp(System.currentTimeMillis()); 129 | val data = Seq( 130 | Row(13, Row("Str1", true, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub1", Row(timestamp)))) , 131 | Row(24, Row("Str2", false, 12.34,6L, date, Decimal(2.3), timestamp, Row("sub2", Row(timestamp)))) 132 | ) 133 | 134 | val schema = new StructType() 135 | .add(StructField("id", IntegerType, true)) 136 | .add(StructField("details", new StructType() 137 | .add(StructField("name", StringType, true)) 138 | .add(StructField("flag", BooleanType, true)) 139 | .add(StructField("salary", DoubleType, true)) 140 | .add(StructField("phone", LongType, true)) 141 | .add(StructField("dob", DateType, true)) 142 | .add(StructField("weight", DecimalType(DECIMAL_PRECISION,1), true)) 143 | .add(StructField("time", TimestampType, true)) 144 | .add(StructField("subRow", new StructType() 145 | .add(StructField("name", StringType, true)) 146 | .add(StructField("level3", new StructType() 147 | .add(StructField("time1", TimestampType, true)) 148 | ) 149 | ) 150 | ) 151 | ))) 152 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 153 | df 154 | } 155 | 156 | def prepareNullData(): DataFrame = { 157 | 158 | val data = Seq( 159 | Row(null, null, null, null,null, null, null, null, null, null, null) 160 | ) 161 | 162 | val schema = new StructType() 163 | .add(StructField("name", StringType, true)) 164 | .add(StructField("id", IntegerType, true)) 165 | .add(StructField("flag", BooleanType, true)) 166 | .add(StructField("salary", DoubleType, true)) 167 | .add(StructField("phone", LongType, true)) 168 | .add(StructField("dob", DateType, true)) 169 | .add(StructField("weight", DecimalType(Constants.DECIMAL_PRECISION,7), true)) 170 | .add(StructField("time", TimestampType, true)) 171 | .add(StructField("float", FloatType, true)) 172 | .add(StructField("byte", ByteType, true)) 173 | .add(StructField("short", ShortType, true)) 174 | 175 | spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 176 | } 177 | 178 | } 179 | -------------------------------------------------------------------------------- /test/spark-cdm-connector-assembly-0.18.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/test/spark-cdm-connector-assembly-0.18.2.jar -------------------------------------------------------------------------------- /test/spark-cdm-connector-assembly-permissive.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/spark-cdm-connector/3a2f09767dabad4c97c3b849210603f08f7ff6bd/test/spark-cdm-connector-assembly-permissive.jar -------------------------------------------------------------------------------- /test/tests: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
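A minimal end-to-end sketch of how the write path in this section is driven from Spark, reusing TestData from the tests above. The option keys compression, dataFolderFormat and useSubManifest are taken from CDMWriteOptions; storage, manifestPath, entity and format, as well as the short format name "com.microsoft.cdm", are assumptions about the base CDMOptions/DefaultSource classes, which are not part of this excerpt, and all account, container and entity values are placeholders.

import org.apache.spark.sql.SparkSession
import com.microsoft.cdm.test.TestData

object CDMWriteSample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cdm-write-sample").getOrCreate()
    // Build a dataframe covering all primitive types supported by the connector
    val df = new TestData(spark).prepareDataWithAllTypes()

    df.write
      .format("com.microsoft.cdm")                          // assumed short name registered by DefaultSource
      .option("storage", "myaccount.dfs.core.windows.net")  // placeholder account (parsed in CDMOptions, not shown)
      .option("manifestPath", "mycontainer/cdmdata/default.manifest.cdm.json") // placeholder manifest path
      .option("entity", "AllTypes")                         // placeholder entity name
      .option("format", "parquet")                          // selects the ParquetWriterConnector branch
      .option("compression", "gzip")                        // validated via getCompression / CompressionCodecName.fromConf
      .option("dataFolderFormat", "'year'yyyy'/month'MM")   // formatted with DateTimeFormatter.ofPattern
      .option("useSubManifest", "true")
      .mode("overwrite")                                    // routed through CDMWriterBuilder.overwrite
      .save()

    spark.stop()
  }
}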