├── .github └── dependabot.yml ├── .gitignore ├── README.md ├── SECURITY.md ├── azure ├── AzureDataBricksJob │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── com │ │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ ├── GeoFinder.java │ │ │ │ └── MDCCloseableFactory.java │ │ ├── resources │ │ │ └── com │ │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ └── azuredatabricksjob │ │ │ │ └── log4j.properties │ │ └── scala │ │ │ ├── com │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ ├── CassandraSinkForeach.scala │ │ │ │ ├── JobConfiguration.scala │ │ │ │ ├── StreamingMetricsListener.scala │ │ │ │ ├── TaxiCabReader.scala │ │ │ │ ├── TryWith.scala │ │ │ │ ├── Utils.scala │ │ │ │ └── package.scala │ │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ ├── metrics │ │ │ └── source │ │ │ │ ├── AppAccumulators.scala │ │ │ │ └── AppMetrics.scala │ │ │ └── sql │ │ │ └── catalyst │ │ │ ├── csv │ │ │ ├── CSVExprUtils.scala │ │ │ ├── CSVHeaderChecker.scala │ │ │ ├── CSVOptions.scala │ │ │ └── UnivocityParser.scala │ │ │ ├── expressions │ │ │ ├── ExprUtils.scala │ │ │ └── csvExpressions.scala │ │ │ └── util │ │ │ └── FailureSafeParser.scala │ │ └── test │ │ └── scala │ │ ├── com │ │ └── microsoft │ │ │ └── pnp │ │ │ ├── SparkSuitBase.scala │ │ │ ├── TaxiFareMapperTester.scala │ │ │ └── TaxiRideMapperTester.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── streaming │ │ └── UtilsTests.scala ├── deployresources.json └── pom.xml └── onprem ├── DataLoader ├── .vscode │ ├── launch.json │ └── tasks.json ├── DataFormat.cs ├── ObjectPool.cs ├── Program.cs ├── StreamReaderExtensions.cs ├── TaxiData.cs ├── TaxiFare.cs ├── TaxiRide.cs └── taxi.csproj ├── Dockerfile └── main.env /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "nuget" # See documentation for possible values 9 | directory: "/onprem/DataLoader" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "docker" # See documentation for possible values 13 | directory: "/onprem" # Location of package manifests 14 | schedule: 15 | interval: "weekly" 16 | - package-ecosystem: "maven" # See documentation for possible values 17 | directory: "/azure" # Location of package manifests 18 | schedule: 19 | interval: "weekly" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | build/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | 28 | # MSTest test Results 29 | [Tt]est[Rr]esult*/ 30 | [Bb]uild[Ll]og.* 31 | 32 | # NUNIT 33 | *.VisualState.xml 34 | TestResult.xml 35 | 36 | # Build Results of an ATL Project 37 | [Dd]ebugPS/ 38 | [Rr]eleasePS/ 39 | dlldata.c 40 | 41 | # DNX 42 | project.lock.json 43 | artifacts/ 44 | 45 | *_i.c 46 | *_p.c 47 | *_i.h 48 | *.ilk 49 | *.meta 50 | *.obj 51 | *.pch 52 | *.pdb 53 | *.pgc 54 | *.pgd 55 | *.rsp 56 | *.sbr 57 | *.tlb 58 | *.tli 59 | *.tlh 60 | *.tmp 61 | *.tmp_proj 62 | *.log 63 | *.vspscc 64 | *.vssscc 65 | .builds 66 | *.pidb 67 | *.svclog 68 | *.scc 69 | 70 | # Chutzpah Test files 71 | _Chutzpah* 72 | 73 | # Visual C++ cache files 74 | ipch/ 75 | *.aps 76 | *.ncb 77 | *.opensdf 78 | *.sdf 79 | *.cachefile 80 | 81 | # Visual Studio profiler 82 | *.psess 83 | *.vsp 84 | *.vspx 85 | 86 | # TFS 2012 Local Workspace 87 | $tf/ 88 | 89 | # Guidance Automation Toolkit 90 | *.gpState 91 | 92 | # ReSharper is a .NET coding add-in 93 | _ReSharper*/ 94 | *.[Rr]e[Ss]harper 95 | *.DotSettings.user 96 | 97 | # JustCode is a .NET coding add-in 98 | .JustCode 99 | 100 | # TeamCity is a build add-in 101 | _TeamCity* 102 | 103 | # DotCover is a Code Coverage Tool 104 | *.dotCover 105 | 106 | # NCrunch 107 | _NCrunch_* 108 | .*crunch*.local.xml 109 | 110 | # MightyMoose 111 | *.mm.* 112 | AutoTest.Net/ 113 | 114 | # Web workbench (sass) 115 | .sass-cache/ 116 | 117 | # Installshield output folder 118 | [Ee]xpress/ 119 | 120 | # DocProject is a documentation generator add-in 121 | DocProject/buildhelp/ 122 | DocProject/Help/*.HxT 123 | DocProject/Help/*.HxC 124 | DocProject/Help/*.hhc 125 | DocProject/Help/*.hhk 126 | DocProject/Help/*.hhp 127 | DocProject/Help/Html2 128 | DocProject/Help/html 129 | 130 | # Click-Once directory 131 | publish/ 132 | 133 | # Publish Web Output 134 | *.[Pp]ublish.xml 135 | *.azurePubxml 136 | ## TODO: Comment the next line if you want to checkin your 137 | ## web deploy settings but do note that will include unencrypted 138 | ## passwords 139 | #*.pubxml 140 | 141 | *.publishproj 142 | 143 | # NuGet Packages 144 | *.nupkg 145 | # The packages folder can be ignored because of Package Restore 146 | **/packages/* 147 | # except build/, which is used as an MSBuild target. 148 | !**/packages/build/ 149 | # Uncomment if necessary however generally it will be regenerated when needed 150 | #!**/packages/repositories.config 151 | 152 | # Windows Azure Build Output 153 | csx/ 154 | *.build.csdef 155 | 156 | # Windows Store app package directory 157 | AppPackages/ 158 | 159 | # Visual Studio cache files 160 | # files ending in .cache can be ignored 161 | *.[Cc]ache 162 | # but keep track of directories ending in .cache 163 | !*.[Cc]ache/ 164 | 165 | # Others 166 | ClientBin/ 167 | [Ss]tyle[Cc]op.* 168 | ~$* 169 | *~ 170 | *.dbmdl 171 | *.dbproj.schemaview 172 | *.pfx 173 | *.publishsettings 174 | node_modules/ 175 | orleans.codegen.cs 176 | 177 | # RIA/Silverlight projects 178 | Generated_Code/ 179 | 180 | # Backup & report files from converting an old project file 181 | # to a newer Visual Studio version. 
Backup files are not needed, 182 | # because we have git ;-) 183 | _UpgradeReport_Files/ 184 | Backup*/ 185 | UpgradeLog*.XML 186 | UpgradeLog*.htm 187 | 188 | # SQL Server files 189 | *.mdf 190 | *.ldf 191 | 192 | # Business Intelligence projects 193 | *.rdl.data 194 | *.bim.layout 195 | *.bim_*.settings 196 | 197 | # Microsoft Fakes 198 | FakesAssemblies/ 199 | 200 | # Node.js Tools for Visual Studio 201 | .ntvs_analysis.dat 202 | 203 | # Visual Studio 6 build log 204 | *.plg 205 | 206 | # Visual Studio 6 workspace options file 207 | *.opt 208 | 209 | # LightSwitch generated files 210 | GeneratedArtifacts/ 211 | _Pvt_Extensions/ 212 | ModelManifest.xml 213 | 214 | #Not to include .ds_store file 215 | .DS_Store 216 | 217 | #Not to include target files 218 | */target/** 219 | .idea 220 | project 221 | 222 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 223 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 224 | 225 | # User-specific stuff 226 | .idea/**/workspace.xml 227 | .idea/**/tasks.xml 228 | .idea/**/usage.statistics.xml 229 | .idea/**/dictionaries 230 | .idea/**/shelf 231 | 232 | # Generated files 233 | .idea/**/contentModel.xml 234 | 235 | # Sensitive or high-churn files 236 | .idea/**/dataSources/ 237 | .idea/**/dataSources.ids 238 | .idea/**/dataSources.local.xml 239 | .idea/**/sqlDataSources.xml 240 | .idea/**/dynamic.xml 241 | .idea/**/uiDesigner.xml 242 | .idea/**/dbnavigator.xml 243 | 244 | # Gradle 245 | .idea/**/gradle.xml 246 | .idea/**/libraries 247 | 248 | # Gradle and Maven with auto-import 249 | # When using Gradle or Maven with auto-import, you should exclude module files, 250 | # since they will be recreated, and may cause churn. Uncomment if using 251 | # auto-import. 
252 | # .idea/modules.xml 253 | # .idea/*.iml 254 | # .idea/modules 255 | 256 | # CMake 257 | cmake-build-*/ 258 | 259 | # Mongo Explorer plugin 260 | .idea/**/mongoSettings.xml 261 | 262 | # File-based project format 263 | *.iws 264 | 265 | # IntelliJ 266 | out/ 267 | 268 | # mpeltonen/sbt-idea plugin 269 | .idea_modules/ 270 | 271 | # JIRA plugin 272 | atlassian-ide-plugin.xml 273 | 274 | # Cursive Clojure plugin 275 | .idea/replstate.xml 276 | 277 | # Crashlytics plugin (for Android Studio and IntelliJ) 278 | com_crashlytics_export_strings.xml 279 | crashlytics.properties 280 | crashlytics-build.properties 281 | fabric.properties 282 | 283 | # Editor-based Rest Client 284 | .idea/httpRequests 285 | 286 | # Java 287 | # Compiled class file 288 | *.class 289 | 290 | # Log file 291 | *.log 292 | 293 | # BlueJ files 294 | *.ctxt 295 | 296 | # Mobile Tools for Java (J2ME) 297 | .mtj.tmp/ 298 | 299 | # Package Files # 300 | *.jar 301 | *.war 302 | *.nar 303 | *.ear 304 | *.zip 305 | *.tar.gz 306 | *.rar 307 | 308 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 309 | hs_err_pid* 310 | 311 | # Maven 312 | target/ 313 | pom.xml.tag 314 | pom.xml.releaseBackup 315 | pom.xml.versionsBackup 316 | pom.xml.next 317 | release.properties 318 | dependency-reduced-pom.xml 319 | buildNumber.properties 320 | .mvn/timing.properties 321 | .mvn/wrapper/maven-wrapper.jar 322 | 323 | # This is to ignore IntelliJ project files, since we use Maven 324 | *.iml 325 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stream processing with Azure Databricks 2 | 3 | This reference architecture shows an end-to-end [stream processing](https://docs.microsoft.com/azure/architecture/data-guide/big-data/real-time-processing) pipeline. This type of pipeline has four stages: ingest, process, store, and analysis and reporting. For this reference architecture, the pipeline ingests data from two sources, performs a join on related records from each stream, enriches the result, and calculates an average in real time. The results are stored for further analysis. 4 | 5 | ![](https://github.com/mspnp/architecture-center/blob/master/docs/reference-architectures/data/images/stream-processing-databricks.png) 6 | 7 | **Scenario**: A taxi company collects data about each taxi trip. For this scenario, we assume there are two separate devices sending data. The taxi has a meter that sends information about each ride — the duration, distance, and pickup and dropoff locations. A separate device accepts payments from customers and sends data about fares. To spot ridership trends, the taxi company wants to calculate the average tip per mile driven, in real time, for each neighborhood. 8 | 9 | ## Deploy the solution 10 | 11 | A deployment for this reference architecture is available on [GitHub](https://github.com/mspnp/azure-databricks-streaming-analytics). 12 | 13 | ### Prerequisites 14 | 15 | 1. Clone, fork, or download this GitHub repository. 16 | 17 | 2. Install [Docker](https://www.docker.com/) to run the data generator. 18 | 19 | 3. Install [Azure CLI 2.0](https://docs.microsoft.com/cli/azure/install-azure-cli?view=azure-cli-latest). 20 | 21 | 4. Install [Databricks CLI](https://docs.microsoft.com/azure/databricks/dev-tools/cli/). 22 | 23 | 5. 
From a command prompt, bash prompt, or PowerShell prompt, sign into your Azure account as follows: 24 | 25 | ```bash 26 | az login 27 | ``` 28 | 29 | 6. Optional - Install a Java IDE, with the following resources: 30 | - JDK 1.8 31 | - Scala SDK 2.12 32 | - Maven 3.6.3 33 | > Note: Instructions are included for building via a docker container if you do not want to install a Java IDE. 34 | 35 | ### Download the New York City taxi and neighborhood data files 36 | 37 | 1. Create a directory named `DataFile` in the root of the cloned Github repository in your local file system. 38 | 39 | 2. Open a web browser and navigate to . 40 | 41 | 3. Click the **Download** button on this page to download a zip file of all the taxi data for that year. 42 | 43 | 4. Extract the zip file to the `DataFile` directory. 44 | 45 | > Note: This zip file contains other zip files. Don't extract the child zip files. 46 | 47 | The directory structure should look like the following: 48 | 49 | ```shell 50 | /DataFile 51 | /FOIL2013 52 | trip_data_1.zip 53 | trip_data_2.zip 54 | trip_data_3.zip 55 | ... 56 | ``` 57 | 58 | 5. Open a web browser and navigate to . 59 | 60 | 6. Under the section **County Subdivisions** click the dropdown an select **New York**. 61 | 62 | 7. Copy the **cb_2019_36_cousub_500k.zip** file from your browser's **downloads** directory to the `DataFile` directory. 63 | 64 | ### Deploy the Azure resources 65 | 66 | 1. From a shell or Windows Command Prompt, run the following command and follow the sign-in prompt: 67 | 68 | ```bash 69 | az login 70 | ``` 71 | 72 | 2. Navigate to the folder named `azure` in the GitHub repository directory: 73 | 74 | ```bash 75 | cd azure 76 | ``` 77 | 78 | 3. Run the following commands to deploy the Azure resources: 79 | 80 | ```bash 81 | export resourceGroup='[Resource group name]' 82 | export resourceLocation='[Region]' 83 | export eventHubNamespace='[Event Hubs namespace name]' 84 | export databricksWorkspaceName='[Azure Databricks workspace name]' 85 | export cosmosDatabaseAccount='[Cosmos DB database name]' 86 | export logAnalyticsWorkspaceName='[Log Analytics workspace name]' 87 | export logAnalyticsWorkspaceRegion='[Log Analytics region]' 88 | 89 | # Create a resource group 90 | az group create --name $resourceGroup --location $resourceLocation 91 | 92 | # Deploy resources 93 | az deployment group create --resource-group $resourceGroup \ 94 | --template-file ./deployresources.json --parameters \ 95 | eventHubNamespace=$eventHubNamespace \ 96 | databricksWorkspaceName=$databricksWorkspaceName \ 97 | cosmosDatabaseAccount=$cosmosDatabaseAccount \ 98 | logAnalyticsWorkspaceName=$logAnalyticsWorkspaceName \ 99 | logAnalyticsWorkspaceRegion=$logAnalyticsWorkspaceRegion 100 | ``` 101 | 102 | 4. The output of the deployment is written to the console once complete. Search the output for the following JSON: 103 | 104 | ```JSON 105 | "outputs": { 106 | "cosmosDb": { 107 | "type": "Object", 108 | "value": { 109 | "hostName": , 110 | "secret": , 111 | "username": 112 | } 113 | }, 114 | "eventHubs": { 115 | "type": "Object", 116 | "value": { 117 | "taxi-fare-eh": , 118 | "taxi-ride-eh": 119 | } 120 | }, 121 | "logAnalytics": { 122 | "type": "Object", 123 | "value": { 124 | "secret": , 125 | "workspaceId": 126 | } 127 | } 128 | }, 129 | ``` 130 | 131 | These values are the secrets that will be added to Databricks secrets in upcoming sections. Keep them secure until you add them in those sections. 132 | 133 | ### Add a Cassandra table to the Cosmos DB Account 134 | 135 | 1. 
In the Azure portal, navigate to the resource group created in the **deploy the Azure resources** section above. Click on **Azure Cosmos DB Account**. Create a table with the Cassandra API. 136 | 137 | 2. In the **overview** blade, click **add table**. 138 | 139 | 3. When the **add table** blade opens, enter `newyorktaxi` in the **Keyspace name** text box. 140 | 141 | 4. In the **enter CQL command to create the table** section, enter `neighborhoodstats` in the text box beside `newyorktaxi`. 142 | 143 | 5. In the text box below, enter the following: 144 | 145 | ```shell 146 | (neighborhood text, window_end timestamp, number_of_rides bigint, total_fare_amount double, total_tip_amount double, average_fare_amount double, average_tip_amount double, primary key(neighborhood, window_end)) 147 | ``` 148 | 149 | 6. In the **Table throughput** section confirm that `Autoscale` is selected and that value `4000` is in the **Table Max RU/s** text box. 150 | 151 | 7. Click **OK**. 152 | 153 | ### Add the Databricks secrets using the Databricks CLI 154 | 155 | > Tip: Make sure you have authenticated your Databricks CLI configuration. The simplest method in bash is to run: 156 | > 157 | > ```bash 158 | > export DATABRICKS_AAD_TOKEN=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d | jq .accessToken --raw-output) 159 | > databricks configure --aad-token --host 160 | > ``` 161 | > 162 | > The resource GUID (2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) is a fixed value. For other options see [Set up authentication](https://docs.microsoft.com/azure/databricks/dev-tools/cli/#--set-up-authentication) in the Azure Databricks documentation. 163 | > If you see a JSONDecodeError error when running a command, your token has exired and you can refresh by running the commands above again. 164 | 165 | First, enter the secrets for EventHub: 166 | 167 | 1. Using the **Azure Databricks CLI** installed in step 4 of the prerequisites, create the Azure Databricks secret scope: 168 | 169 | ```bash 170 | databricks secrets create-scope --scope "azure-databricks-job" 171 | ``` 172 | 173 | 2. Add the secret for the taxi ride EventHub: 174 | 175 | ```bash 176 | databricks secrets put --scope "azure-databricks-job" --key "taxi-ride" 177 | ``` 178 | 179 | Once executed, this command opens the vi editor. Enter the **taxi-ride-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 180 | 181 | 3. Add the secret for the taxi fare EventHub: 182 | 183 | ```bash 184 | databricks secrets put --scope "azure-databricks-job" --key "taxi-fare" 185 | ``` 186 | 187 | Once executed, this command opens the vi editor. Enter the **taxi-fare-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 188 | 189 | Next, enter the secrets for Cosmos DB: 190 | 191 | 1. Using the **Azure Databricks CLI**, add the secret for the Cosmos DB user name: 192 | 193 | ```bash 194 | databricks secrets put --scope azure-databricks-job --key "cassandra-username" 195 | ``` 196 | 197 | Once executed, this command opens the vi editor. Enter the **username** value from the **CosmosDb** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 198 | 199 | 2. 
Next, add the secret for the Cosmos DB password: 200 | 201 | ```bash 202 | databricks secrets put --scope azure-databricks-job --key "cassandra-password" 203 | ``` 204 | 205 | Once executed, this command opens the vi editor. Enter the **secret** value from the **CosmosDb** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 206 | 207 | > Note: If using an [Azure Key Vault-backed secret scope](https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#azure-key-vault-backed-scopes), the scope must be named **azure-databricks-job** and the secrets must have the exact same names as those above. 208 | 209 | ### Add the Census Neighborhoods data file to the Databricks file system 210 | 211 | 1. Create a directory in the Databricks file system: 212 | 213 | ```bash 214 | dbfs mkdirs dbfs:/azure-databricks-job 215 | ``` 216 | 217 | 2. Navigate to the DataFile folder and enter the following: 218 | 219 | ```bash 220 | dbfs cp cb_2020_36_cousub_500k.zip dbfs:/azure-databricks-job/ 221 | ``` 222 | 223 | > Note: The filename may change if you obtain a shapefile for a different year. 224 | 225 | ### Build the .jar files for the Databricks job 226 | 227 | 1. To build the jars using a docker container from a bash prompt change to the **azure** directory and run: 228 | 229 | ```bash 230 | docker run -it --rm -v `pwd`:/streaming_azuredatabricks_azure -v ~/.m2:/root/.m2 maven:3.6.3-jdk-8 mvn -f /streaming_azuredatabricks_azure/pom.xml package 231 | ``` 232 | 233 | > Note: Alternately, use your Java IDE to import the Maven project file named **pom.xml** located in the **azure** directory. Perform a clean build. 234 | 235 | 1. The outputs of the build is a file named **azure-databricks-job-1.0-SNAPSHOT.jar** in the **./AzureDataBricksJob/target** directory. 236 | 237 | ### Create a Databricks cluster 238 | 239 | 1. In the Databricks workspace, click **Compute**, then click **Create cluster**. Enter the cluster name you created in step 3 of the **configure custom logging for the Databricks job** section above. 240 | 241 | 1. Select **Standard** for **Cluster Mode**. 242 | 243 | 1. Set **Databricks runtime version** to **7.3 Extended Support (Scala 2.12, Apache Spark 3.0.1)** 244 | 245 | 1. Deselect **Enable autoscaling**. 246 | 247 | 1. Set **Worker Type** to **Standard_DS3_v2**. 248 | 249 | 1. Set **Workers** to **2**. 250 | 251 | 1. Set **Driver Type** to **Same as worker** 252 | 253 | #### Optional - Configure Azure Log Analytics 254 | 255 | 1. Follow the instructions in [Monitoring Azure Databricks](https://github.com/mspnp/spark-monitoring) to build the monitoring library and upload the resulting library files to your workspace. 256 | 257 | 1. Click on **Advanced Options** then **Init Scripts**. 258 | 259 | 1. Enter **dbfs:/databricks/spark-monitoring/spark-monitoring.sh**. 260 | 261 | 1. Click the **Add** button. 262 | 263 | 1. Click the **Create Cluster** button. 264 | 265 | ### Install dependent libraries on cluster 266 | 267 | 1. In the Databricks user interface, click on the **home** button. 268 | 269 | 2. Click on **Compute** in the navigtation menu on the left then click on the cluster you created in the **Create a Databricks cluster** step. 270 | 271 | 3. Click on **Libraries**, then click **Install New**. 272 | 273 | 4. In the **Library Source** control, select **Maven**. 274 | 275 | 5. Under the **Maven Coordinates** text box, enter `com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21`. 276 | 277 | 6. 
Select **Install**. 278 | 279 | 8. Repeat steps 3 - 6 for the `com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.0.1` Maven coordinate. 280 | 281 | 9. Repeat steps 3 - 5 for the `org.geotools:gt-shapefile:23.0` Maven coordinate. 282 | 283 | 10. Enter `https://repo.osgeo.org/repository/release/` in the **Repository** text box. 284 | 285 | 11. Click **Install**. 286 | 287 | ### Create a Databricks job 288 | 289 | 1. Copy the **azure-databricks-job-1.0-SNAPSHOT.jar** file to the Databricks file system by entering the following command in the **Databricks CLI**: 290 | 291 | ```bash 292 | databricks fs cp --overwrite AzureDataBricksJob/target/azure-databricks-job-1.0-SNAPSHOT.jar dbfs:/azure-databricks-job/ 293 | ``` 294 | 295 | 1. In the Databricks workspace, click "Jobs", "create job". 296 | 297 | 1. Enter a job name. 298 | 299 | 1. In the **Task** area, change **Type** to `JAR` and Enter `com.microsoft.pnp.TaxiCabReader` in the **Main Class** field. 300 | 301 | 1. Under **Dependent Libraries** click **Add**, this opens the **Add dependent library** dialog box. 302 | 303 | 1. Change **Library Source** to **DBFS/ADLS**, confirm that Library Type is **Jar** and enter `dbfs:/azure-databricks-job/azure-databricks-job-1.0-SNAPSHOT.jar` in the **File Path** text box and select **Add**. 304 | 305 | 1. In the **Parameters** field, enter the following (replace **\** with a value from above): 306 | 307 | ```shell 308 | ["-n","jar:file:/dbfs/azure-databricks-job/cb_2020_36_cousub_500k.zip!/cb_2020_36_cousub_500k.shp","--taxi-ride-consumer-group","taxi-ride-eh-cg","--taxi-fare-consumer-group","taxi-fare-eh-cg","--window-interval","1 hour","--cassandra-host",""] 309 | ``` 310 | 311 | 1. Under **Cluster**, click the drop down arrow and select the cluster created the **Create a Databricks cluster** section. 312 | 313 | 1. Click Create 314 | 315 | 1. Select the **Runs** tab and click **Run Now**. 316 | 317 | ### Run the data generator 318 | 319 | 1. Navigate to the directory `onprem` in the GitHub repository. 320 | 321 | ```bash 322 | cd ../onprem 323 | ``` 324 | 325 | 1. Update the values in the file **main.env** as follows: 326 | 327 | ```shell 328 | RIDE_EVENT_HUB=[Connection string for the taxi-ride event hub] 329 | FARE_EVENT_HUB=[Connection string for the taxi-fare event hub] 330 | RIDE_DATA_FILE_PATH=/DataFile/FOIL2013 331 | MINUTES_TO_LEAD=0 332 | PUSH_RIDE_DATA_FIRST=false 333 | ``` 334 | 335 | The connection string for the taxi-ride event hub is the **taxi-ride-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. The connection string for the taxi-fare event hub the **taxi-fare-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. 336 | 337 | 1. Run the following command to build the Docker image. 338 | 339 | ```bash 340 | docker build --no-cache -t dataloader . 341 | ``` 342 | 343 | 1. Navigate back to the repository root directory. 344 | 345 | ```bash 346 | cd .. 347 | ``` 348 | 349 | 1. Run the following command to run the Docker image. 350 | 351 | ```bash 352 | docker run -v `pwd`/DataFile:/DataFile --env-file=onprem/main.env dataloader:latest 353 | ``` 354 | 355 | The output should look like the following: 356 | 357 | ```shell 358 | Created 10000 records for TaxiFare 359 | Created 10000 records for TaxiRide 360 | Created 20000 records for TaxiFare 361 | Created 20000 records for TaxiRide 362 | Created 30000 records for TaxiFare 363 | ... 
364 | ``` 365 | 366 | Hit CTRL+C to cancel the generation of data. 367 | 368 | ### Verify the solution is running 369 | 370 | To verify the Databricks job is running correctly, open the Azure portal and navigate to the Cosmos DB database. Open the **Data Explorer** blade and examine the data in the **neighborhoodstats** table, you should see results similar to: 371 | 372 | | average_fare _amount | average_tip _amount | neighborhood | number_of_rides | total_fare _amount | total_tip _amount | window_end | 373 | | --- | --- | --- | --- | --- | --- | --- | 374 | | 10.5 | 1.0 | Bronx | 1 | 10.5 | 1.0 | 1/1/2013 8:02:00 AM +00:00 | 375 | | 12.67 | 2.6 | Brooklyn | 3 | 38 | 7.8 | 1/1/2013 8:02:00 AM +00:00 | 376 | | 14.98 | 0.73 | Manhattan | 52 | 779 | 37.83 | 1/1/2013 8:02:00 AM +00:00 | 377 | | ... | ... | ... | ... | ... | ... | ... | 378 | 379 | > [1] Donovan, Brian; Work, Dan (2016): New York City Taxi Trip Data (2010-2013). University of Illinois at Urbana-Champaign. 380 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | azure-databricks-ra 6 | com.microsoft.pnp 7 | 1.0-SNAPSHOT 8 | ../pom.xml 9 | 10 | azure-databricks-job 11 | 1.0-SNAPSHOT 12 | ${project.artifactId} 13 | jar 14 | 15 | 1.8 16 | 1.8 17 | UTF-8 18 | 2.12.12 19 | 2.12 20 | 4.2.0 21 | 22 | 23 | 24 | com.microsoft.azure 25 | azure-eventhubs-spark_${scala.compat.version} 26 | 2.3.21 27 | provided 28 | 29 | 30 | org.geotools 31 | gt-shapefile 32 | 23.0 33 | provided 34 | 35 | 36 | org.eclipse.jetty 37 | jetty-server 38 | 9.4.43.v20210629 39 | provided 40 | 41 | 42 | com.databricks 43 | dbutils-api_${scala.compat.version} 44 | 0.0.5 45 | provided 46 | 47 | 48 | org.rogach 49 | scallop_${scala.compat.version} 50 | 3.5.1 51 | 52 | 53 | com.datastax.spark 54 | spark-cassandra-connector_2.12 55 | 3.0.1 56 | provided 57 | 58 | 59 | org.locationtech.jts 60 | jts-core 61 | 1.18.2 62 | 63 | 64 | 65 | 66 | osgeo 67 | Open Source Geospatial Foundation Repository 68 | https://repo.osgeo.org/repository/release/ 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.scalatest 76 | scalatest-maven-plugin 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-shade-plugin 81 | 82 | 83 | package 84 | 85 | shade 86 | 87 | 88 | true 89 | 90 | 91 | *:* 92 | 93 | META-INF/*.SF 94 | META-INF/*.DSA 95 | META-INF/*.RSA 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/java/com/microsoft/pnp/GeoFinder.java: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp; 2 | 3 | import org.locationtech.jts.geom.Coordinate; 4 | import org.locationtech.jts.geom.GeometryFactory; 5 | import org.locationtech.jts.geom.Point; 6 | import org.geotools.data.FeatureSource; 7 | import org.geotools.data.collection.SpatialIndexFeatureCollection; 8 | import org.geotools.data.collection.SpatialIndexFeatureSource; 9 | import org.geotools.data.shapefile.ShapefileDataStore; 10 | import org.geotools.factory.CommonFactoryFinder; 11 | import org.geotools.feature.FeatureCollection; 12 | import org.geotools.feature.FeatureIterator; 13 | import org.opengis.feature.Feature; 14 | import org.opengis.filter.Filter; 15 | import org.opengis.filter.FilterFactory2; 16 | import org.opengis.filter.expression.PropertyName; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | import scala.Serializable; 20 | 21 | import java.io.IOException; 22 | import java.net.URL; 23 | import java.util.Optional; 24 | 25 | 26 | public class GeoFinder implements Serializable { 27 | private static final Logger logger = LoggerFactory.getLogger(GeoFinder.class); 28 | 29 | private final FeatureSource featureSource; 30 | private final FilterFactory2 filterFactory; 31 | private final PropertyName propertyName; 32 | private final GeometryFactory geometryFactory; 33 | 34 | private GeoFinder(FeatureSource featureSource, FilterFactory2 filterFactory, PropertyName propertyName) 
{ 35 | this.featureSource = featureSource; 36 | this.filterFactory = filterFactory; 37 | this.propertyName = propertyName; 38 | this.geometryFactory = new GeometryFactory(); 39 | } 40 | 41 | public Optional getNeighborhood(double longitude, double latitude) { 42 | logger.debug(String.format("Searching for coordinate (%f, %f)", longitude, latitude)); 43 | Point point = this.geometryFactory.createPoint(new Coordinate(longitude, latitude)); 44 | Filter filter = this.filterFactory.contains(propertyName, filterFactory.literal(point)); 45 | try { 46 | FeatureCollection featureCollection = this.featureSource.getFeatures(filter); 47 | try (FeatureIterator iterator = featureCollection.features()) { 48 | if (iterator.hasNext()) { 49 | Feature feature = iterator.next(); 50 | return Optional.of(feature.getProperty("NAME").getValue().toString()); 51 | } 52 | } 53 | } catch (IOException ex) { 54 | 55 | logger.warn(String.format("Error searching for coordinate (%f, %f)", longitude, latitude), ex); 56 | } 57 | 58 | return Optional.of("Unknown"); 59 | } 60 | 61 | public static GeoFinder createGeoFinder(URL shapeFileUrl) throws IOException { 62 | try { 63 | logger.info(String.format("Using shapefile: %s", shapeFileUrl)); 64 | ShapefileDataStore dataStore = new ShapefileDataStore(shapeFileUrl); 65 | String[] typeNames = dataStore.getTypeNames(); 66 | String typeName = typeNames[0]; 67 | 68 | logger.info(String.format("Reading content %s", typeName)); 69 | FeatureSource featureSource = new SpatialIndexFeatureSource( 70 | new SpatialIndexFeatureCollection(dataStore.getFeatureSource(typeName).getFeatures())); 71 | 72 | FilterFactory2 filterFactory = CommonFactoryFinder.getFilterFactory2(); 73 | PropertyName propertyName = filterFactory.property(dataStore 74 | .getSchema(typeName) 75 | .getGeometryDescriptor() 76 | .getName()); 77 | return new GeoFinder(featureSource, filterFactory, propertyName); 78 | } catch (IOException ex) { 79 | logger.error(String.format("Error loading Geospatial data from %s", shapeFileUrl), ex); 80 | throw ex; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/java/com/microsoft/pnp/MDCCloseableFactory.java: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp; 2 | 3 | import org.slf4j.MDC; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.Optional; 8 | 9 | public class MDCCloseableFactory { 10 | private class MDCCloseable implements AutoCloseable { 11 | public MDCCloseable(Map mdc) { 12 | // Log4j supports Map, but slf4j wants Map 13 | // Because of type erasure, this should be okay, but we can try to find a 14 | // way to fix the warnings later. 
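            // For reference: org.slf4j.MDC.setContextMap is declared to take a Map<String, String>,
            // so the raw cast below only silences the compiler; with erasure nothing is checked at
            // runtime, and the Log4j 1.x MDC underneath stores the values as plain objects.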
15 | MDC.setContextMap((Map)mdc); 16 | } 17 | 18 | @Override 19 | public void close() { 20 | MDC.clear(); 21 | } 22 | } 23 | 24 | private Optional> context; 25 | 26 | public MDCCloseableFactory() { 27 | this(null); 28 | } 29 | 30 | public MDCCloseableFactory(Map context) { 31 | this.context = Optional.ofNullable(context); 32 | } 33 | 34 | public AutoCloseable create(Map mdc) { 35 | // Values in mdc will override context 36 | Map newMDC = new HashMap<>(); 37 | this.context.ifPresent(c -> newMDC.putAll(c)); 38 | newMDC.putAll(mdc); 39 | return new MDCCloseable(newMDC); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/resources/com/microsoft/pnp/azuredatabricksjob/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.appender.A1=com.microsoft.pnp.logging.loganalytics.LogAnalyticsAppender 2 | log4j.appender.A1.logType=taxijob 3 | log4j.appender.A1.layout=com.microsoft.pnp.log4j.JSONLayout 4 | log4j.appender.A1.layout.LocationInfo=false 5 | log4j.additivity.com.microsoft.pnp=false 6 | log4j.logger.com.microsoft.pnp=INFO, A1 7 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/CassandraSinkForeach.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.sql.{ForeachWriter, Row} 5 | 6 | class CassandraSinkForeach(con: CassandraConnector) 7 | extends ForeachWriter[Row] { 8 | 9 | // This class implements the interface ForeachWriter, which has methods that get called 10 | // whenever there is a sequence of rows generated as output 11 | def open(partitionId: Long, version: Long): Boolean = { 12 | true 13 | } 14 | 15 | def process(record: Row) = { 16 | con.withSessionDo(session => { 17 | val bound = session.prepare( 18 | s""" 19 | |insert into newyorktaxi.neighborhoodstats (neighborhood,window_end,number_of_rides,total_fare_amount,total_tip_amount,average_fare_amount,average_tip_amount) 20 | | values(?, ?, ?, ?, ?, ?, ?)""" 21 | 22 | ).bind( 23 | record.getString(2), 24 | record.getTimestamp(1).toInstant(), 25 | record.getLong(3).asInstanceOf[AnyRef], 26 | record.getDouble(4).asInstanceOf[AnyRef], 27 | record.getDouble(5).asInstanceOf[AnyRef], 28 | record.getDouble(6).asInstanceOf[AnyRef], 29 | record.getDouble(7).asInstanceOf[AnyRef] 30 | ) 31 | 32 | session.execute(bound) 33 | }) 34 | 35 | } 36 | 37 | def close(errorOrNull: Throwable): Unit = { 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/JobConfiguration.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.net.URL 4 | 5 | import org.rogach.scallop._ 6 | import org.apache.spark.sql.catalyst.util.IntervalUtils.stringToInterval 7 | import org.apache.spark.unsafe.types.UTF8String 8 | 9 | class JobConfiguration(arguments: Seq[String]) extends ScallopConf(arguments) with Serialization { 10 | val neighborhoodFileURL = opt[URL]( 11 | name = "neighborhood-file-url", 12 | short = 'n', 13 | required = true 14 | )(urlConverter) 15 | 16 | val taxiRideConsumerGroup = opt[String](default = Some("$Default")) 17 | val taxiFareConsumerGroup = opt[String](default = Some("$Default")) 18 | 19 
| // Intervals 20 | val windowInterval = opt[String](default = Some("1 hour"), validate = isValidInterval) 21 | val taxiRideWatermarkInterval = opt[String](default = Some("3 minutes"), validate = isValidInterval) 22 | val taxiFareWatermarkInterval = opt[String](default = Some("3 minutes"), validate = isValidInterval) 23 | 24 | val secretScope = opt[String](default = Some("azure-databricks-job")) 25 | val taxiRideEventHubSecretName = opt[String](default = Some("taxi-ride")) 26 | val taxiFareEventHubSecretName = opt[String](default = Some("taxi-fare")) 27 | 28 | val cassandraHost = opt[String]() 29 | 30 | // cassandra secrets 31 | val cassandraUserSecretName = opt[String](default = Some("cassandra-username")) 32 | val cassandraPasswordSecretName = opt[String](default = Some("cassandra-password")) 33 | 34 | verify() 35 | 36 | private def isValidInterval(interval: String): Boolean = { 37 | // This is the same check spark uses 38 | val intervalString = if (interval.startsWith("interval")) { 39 | interval 40 | } else { 41 | "interval " + interval 42 | } 43 | val cal = stringToInterval(UTF8String.fromString(intervalString)) 44 | cal != null 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/StreamingMetricsListener.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.apache.spark.sql.streaming.StreamingQueryListener 4 | import org.apache.spark.sql.streaming.StreamingQueryListener._ 5 | import org.slf4j.{Logger, LoggerFactory} 6 | 7 | class StreamingMetricsListener() extends StreamingQueryListener { 8 | lazy val logger: Logger = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) 9 | lazy val mdcFactory: MDCCloseableFactory = new MDCCloseableFactory() 10 | 11 | override def onQueryStarted(event: QueryStartedEvent): Unit = {} 12 | 13 | override def onQueryProgress(event: QueryProgressEvent): Unit = { 14 | try { 15 | //parsing the telemetry Payload and logging to ala 16 | TryWith(this.mdcFactory.create(Utils.parsePayload(event)))( 17 | c => { 18 | this.logger.info("onQueryProgress") 19 | } 20 | ) 21 | } 22 | 23 | catch { 24 | case e: Exception => this.logger.error("onQueryProgress", e) 25 | } 26 | } 27 | 28 | override def onQueryTerminated(event: QueryTerminatedEvent): Unit = { 29 | if (event.exception.nonEmpty) { 30 | this.logger.error(event.exception.get) 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/TaxiCabReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.eventhubs.{EventHubsConf, EventPosition} 5 | import org.apache.spark.metrics.source.{AppAccumulators, AppMetrics} 6 | import org.apache.spark.sql.catalyst.expressions.{CsvToStructs, Expression} 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.streaming.OutputMode 9 | import org.apache.spark.sql.types.{StringType, StructType} 10 | import org.apache.spark.sql.{Column, SparkSession} 11 | import org.apache.spark.{SparkConf, SparkEnv} 12 | 13 | object TaxiCabReader { 14 | private def withExpr(expr: Expression): Column = new Column(expr) 15 | 16 | def main(args: Array[String]) { 17 | 18 | val conf = new JobConfiguration(args) 19 | val 
rideEventHubConnectionString = getSecret(
 20 |             conf.secretScope(), conf.taxiRideEventHubSecretName())
 21 |         val fareEventHubConnectionString = getSecret(
 22 |             conf.secretScope(), conf.taxiFareEventHubSecretName())
 23 | 
 24 |         val cassandraEndPoint = conf.cassandraHost()
 25 | 
 26 |         val cassandraUserName = getSecret(
 27 |             conf.secretScope(), conf.cassandraUserSecretName())
 28 |         val cassandraPassword = getSecret(
 29 |             conf.secretScope(), conf.cassandraPasswordSecretName())
 30 | 
 31 |         val spark = SparkSession
 32 |             .builder
 33 |             .getOrCreate
 34 | 
 35 |         import spark.implicits._
 36 | 
 37 |         // The Databricks Spark session is created up front and its configuration cannot be
 38 |         // updated afterwards, so this SparkConf is built from the secret values solely to
 39 |         // initialize the Cassandra driver.
 40 |         // Note: with spark-submit, the Spark session is created in the main method, so any
 41 |         // values provided while initializing Spark are available afterwards through
 42 |         // sparkSession.conf
 43 |         val sparkConfForCassandraDriver = new SparkConf(true)
 44 |             .set("spark.cassandra.connection.host", cassandraEndPoint)
 45 |             .set("spark.cassandra.connection.port", "10350")
 46 |             .set("spark.cassandra.connection.ssl.enabled", "true")
 47 |             .set("spark.cassandra.auth.username", cassandraUserName)
 48 |             .set("spark.cassandra.auth.password", cassandraPassword)
 49 |             .set("spark.master", "local[10]")
 50 |             .set("spark.cassandra.output.batch.size.rows", "1")
 51 |             .set("spark.cassandra.connection.remoteConnectionsPerExecutor", "2")
 52 |             .set("spark.cassandra.output.concurrent.writes", "5")
 53 |             .set("spark.cassandra.output.batch.grouping.buffer.size", "300")
 54 |             .set("spark.cassandra.connection.keepAliveMS", "5000")
 55 | 
 56 |         // The connector is initialized in the driver. It is serializable, so it can be
 57 |         // sent to the foreach sink that executes on the workers. 
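        // The connector instance is captured by CassandraSinkForeach below; on the executors,
        // each withSessionDo call is expected to reuse a cached session rather than opening a
        // new connection per row.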
58 | val connector = CassandraConnector(sparkConfForCassandraDriver) 59 | 60 | @transient val appMetrics = new AppMetrics(spark.sparkContext) 61 | appMetrics.registerGauge("metrics.malformedrides", AppAccumulators.getRideInstance(spark.sparkContext)) 62 | appMetrics.registerGauge("metrics.malformedfares", AppAccumulators.getFareInstance(spark.sparkContext)) 63 | SparkEnv.get.metricsSystem.registerSource(appMetrics) 64 | 65 | @transient lazy val NeighborhoodFinder = GeoFinder.createGeoFinder(conf.neighborhoodFileURL()) 66 | 67 | val neighborhoodFinder = (lon: Double, lat: Double) => { 68 | NeighborhoodFinder.getNeighborhood(lon, lat).get() 69 | } 70 | val to_neighborhood = spark.udf.register("neighborhoodFinder", neighborhoodFinder) 71 | 72 | def from_csv(e: Column, schema: StructType, options: Map[String, String]): Column = withExpr { 73 | CsvToStructs(schema, options, e.expr) 74 | } 75 | 76 | spark.streams.addListener(new StreamingMetricsListener()) 77 | 78 | val rideEventHubOptions = EventHubsConf(rideEventHubConnectionString) 79 | .setConsumerGroup(conf.taxiRideConsumerGroup()) 80 | .setStartingPosition(EventPosition.fromStartOfStream) 81 | val rideEvents = spark.readStream 82 | .format("eventhubs") 83 | .options(rideEventHubOptions.toMap) 84 | .load 85 | 86 | val fareEventHubOptions = EventHubsConf(fareEventHubConnectionString) 87 | .setConsumerGroup(conf.taxiFareConsumerGroup()) 88 | .setStartingPosition(EventPosition.fromStartOfStream) 89 | val fareEvents = spark.readStream 90 | .format("eventhubs") 91 | .options(fareEventHubOptions.toMap) 92 | .load 93 | 94 | val transformedRides = rideEvents 95 | .select( 96 | $"body" 97 | .cast(StringType) 98 | .as("messageData"), 99 | from_json($"body".cast(StringType), RideSchema) 100 | .as("ride")) 101 | .transform(ds => { 102 | ds.withColumn( 103 | "errorMessage", 104 | when($"ride".isNull, 105 | lit("Error decoding JSON")) 106 | .otherwise(lit(null)) 107 | ) 108 | }) 109 | 110 | val malformedRides = AppAccumulators.getRideInstance(spark.sparkContext) 111 | 112 | val rides = transformedRides 113 | .filter(r => { 114 | if (r.isNullAt(r.fieldIndex("errorMessage"))) { 115 | true 116 | } 117 | else { 118 | malformedRides.add(1) 119 | false 120 | } 121 | }) 122 | .select( 123 | $"ride.*", 124 | to_neighborhood($"ride.pickupLon", $"ride.pickupLat") 125 | .as("pickupNeighborhood"), 126 | to_neighborhood($"ride.dropoffLon", $"ride.dropoffLat") 127 | .as("dropoffNeighborhood") 128 | ) 129 | .withWatermark("pickupTime", conf.taxiRideWatermarkInterval()) 130 | 131 | val csvOptions = Map("header" -> "true", "multiLine" -> "true") 132 | val transformedFares = fareEvents 133 | .select( 134 | $"body" 135 | .cast(StringType) 136 | .as("messageData"), 137 | from_csv($"body".cast(StringType), FareSchema, csvOptions) 138 | .as("fare")) 139 | .transform(ds => { 140 | ds.withColumn( 141 | "errorMessage", 142 | when($"fare".isNull, 143 | lit("Error decoding CSV")) 144 | .when(to_timestamp($"fare.pickupTimeString", "yyyy-MM-dd HH:mm:ss").isNull, 145 | lit("Error parsing pickupTime")) 146 | .otherwise(lit(null)) 147 | ) 148 | }) 149 | .transform(ds => { 150 | ds.withColumn( 151 | "pickupTime", 152 | when($"fare".isNull, 153 | lit(null)) 154 | .otherwise(to_timestamp($"fare.pickupTimeString", "yyyy-MM-dd HH:mm:ss")) 155 | ) 156 | }) 157 | 158 | 159 | val malformedFares = AppAccumulators.getFareInstance(spark.sparkContext) 160 | 161 | val fares = transformedFares 162 | .filter(r => { 163 | if (r.isNullAt(r.fieldIndex("errorMessage"))) { 164 | true 165 | } 166 | else { 
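                    // The row failed CSV decoding or pickupTime parsing; count it so the value
                    // surfaces through the "metrics.malformedfares" gauge registered above.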
167 | malformedFares.add(1) 168 | false 169 | } 170 | }) 171 | .select( 172 | $"fare.*", 173 | $"pickupTime" 174 | ) 175 | .withWatermark("pickupTime", conf.taxiFareWatermarkInterval()) 176 | 177 | val mergedTaxiTrip = rides.join(fares, Seq("medallion", "hackLicense", "vendorId", "pickupTime")) 178 | 179 | 180 | val maxAvgFarePerNeighborhood = mergedTaxiTrip.selectExpr("medallion", "hackLicense", "vendorId", "pickupTime", "rateCode", "storeAndForwardFlag", "dropoffTime", "passengerCount", "tripTimeInSeconds", "tripDistanceInMiles", "pickupLon", "pickupLat", "dropoffLon", "dropoffLat", "paymentType", "fareAmount", "surcharge", "mtaTax", "tipAmount", "tollsAmount", "totalAmount", "pickupNeighborhood", "dropoffNeighborhood") 181 | .groupBy(window($"pickupTime", conf.windowInterval()), $"pickupNeighborhood") 182 | .agg( 183 | count("*").as("rideCount"), 184 | sum($"fareAmount").as("totalFareAmount"), 185 | sum($"tipAmount").as("totalTipAmount"), 186 | (sum($"fareAmount")/count("*")).as("averageFareAmount"), 187 | (sum($"tipAmount")/count("*")).as("averageTipAmount") 188 | ) 189 | .select($"window.start", $"window.end", $"pickupNeighborhood", $"rideCount", $"totalFareAmount", $"totalTipAmount", $"averageFareAmount", $"averageTipAmount") 190 | 191 | maxAvgFarePerNeighborhood 192 | .writeStream 193 | .queryName("maxAvgFarePerNeighborhood_cassandra_insert") 194 | .outputMode(OutputMode.Append()) 195 | .foreach(new CassandraSinkForeach(connector)) 196 | .start() 197 | .awaitTermination() 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/TryWith.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import scala.util.control.NonFatal 4 | import scala.util.{Failure, Try} 5 | 6 | object TryWith { 7 | def apply[C <: AutoCloseable, R](resource: => C)(f: C => R): Try[R] = 8 | Try(resource).flatMap(resourceInstance => { 9 | try { 10 | val returnValue = f(resourceInstance) 11 | Try(resourceInstance.close()).map(_ => returnValue) 12 | } 13 | catch { 14 | case NonFatal(exceptionInFunction) => 15 | try { 16 | resourceInstance.close() 17 | Failure(exceptionInFunction) 18 | } 19 | catch { 20 | case NonFatal(exceptionInClose) => 21 | exceptionInFunction.addSuppressed(exceptionInClose) 22 | Failure(exceptionInFunction) 23 | } 24 | } 25 | }) 26 | } 27 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.time.{ZoneId, ZonedDateTime} 4 | import java.util.HashMap 5 | 6 | import org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent 7 | 8 | object Utils { 9 | def parsePayload(event: QueryProgressEvent): HashMap[String, AnyRef]={ 10 | val date = java.time.format 11 | .DateTimeFormatter.RFC_1123_DATE_TIME.format(ZonedDateTime.now(ZoneId.of("GMT"))) 12 | 13 | val metrics = new HashMap[String, AnyRef]() 14 | metrics.put("id", event.progress.id) 15 | metrics.put("sink", event.progress.sink) 16 | metrics.put("durationms", event.progress.durationMs.asInstanceOf[AnyRef]) 17 | metrics.put("inputRowsPerSecond", event.progress.inputRowsPerSecond.asInstanceOf[AnyRef]) 18 | metrics.put("procRowsPerSecond", event.progress.processedRowsPerSecond.asInstanceOf[AnyRef]) 19 | metrics.put("inputRows", 
event.progress.numInputRows.asInstanceOf[AnyRef]) 20 | metrics.put("DateValue", date.toString) 21 | 22 | metrics 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/package.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft 2 | 3 | import org.apache.spark.sql.types._ 4 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils.secrets 5 | 6 | package object pnp { 7 | 8 | def getSecret(secretScope: String, secretName: String): String = { 9 | secrets.get(secretScope, secretName) 10 | } 11 | 12 | val RideSchema = new StructType() 13 | .add("rateCode", IntegerType) 14 | .add("storeAndForwardFlag", StringType) 15 | .add("dropoffTime", TimestampType) 16 | .add("passengerCount", IntegerType) 17 | .add("tripTimeInSeconds", DoubleType) 18 | .add("tripDistanceInMiles", DoubleType) 19 | .add("pickupLon", DoubleType) 20 | .add("pickupLat", DoubleType) 21 | .add("dropoffLon", DoubleType) 22 | .add("dropoffLat", DoubleType) 23 | .add("medallion", LongType) 24 | .add("hackLicense", LongType) 25 | .add("vendorId", StringType) 26 | .add("pickupTime", TimestampType) 27 | .add("errorMessage", StringType) 28 | .add("messageData", StringType) 29 | 30 | val FareSchema = new StructType() 31 | .add("medallion", LongType) 32 | .add("hackLicense", LongType) 33 | .add("vendorId",StringType) 34 | .add("pickupTimeString", StringType) 35 | .add("paymentType", StringType) 36 | .add("fareAmount", DoubleType) 37 | .add("surcharge", DoubleType) 38 | .add("mtaTax", DoubleType) 39 | .add("tipAmount", DoubleType) 40 | .add("tollsAmount", DoubleType) 41 | .add("totalAmount", DoubleType) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/metrics/source/AppAccumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.source 2 | import org.apache.spark.SparkContext 3 | import org.apache.spark.util.LongAccumulator 4 | 5 | object AppAccumulators { 6 | @volatile private var fareInstance: LongAccumulator = _ 7 | @volatile private var rideInstance: LongAccumulator = _ 8 | 9 | def getFareInstance(sc: SparkContext): LongAccumulator = { 10 | if (fareInstance == null) { 11 | synchronized { 12 | if (fareInstance == null) { 13 | fareInstance = sc.longAccumulator("MalformedFareCount") 14 | } 15 | } 16 | } 17 | fareInstance 18 | } 19 | 20 | def getRideInstance(sc: SparkContext): LongAccumulator = { 21 | if (rideInstance == null) { 22 | synchronized { 23 | if (rideInstance == null) { 24 | rideInstance = sc.longAccumulator("MalformedRideCount") 25 | } 26 | } 27 | } 28 | rideInstance 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/metrics/source/AppMetrics.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.source 2 | 3 | import com.codahale.metrics.{Gauge, MetricRegistry} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.util.LongAccumulator 6 | 7 | class AppMetrics(sc: SparkContext) extends Source { 8 | override val metricRegistry = new MetricRegistry 9 | override val sourceName = "%s.AppMetrics".format(sc.appName) 10 | 11 | def registerGauge(metricName: String, acc: LongAccumulator) { 12 | val metric = 
new Gauge[Long] { 13 | override def getValue: Long = { 14 | acc.value 15 | } 16 | } 17 | metricRegistry.register(MetricRegistry.name(metricName), metric) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.catalyst.csv 2 | 3 | object CSVExprUtils { 4 | /** 5 | * Filter ignorable rows for CSV iterator (lines empty and starting with `comment`). 6 | * This is currently being used in CSV reading path and CSV schema inference. 7 | */ 8 | def filterCommentAndEmpty(iter: Iterator[String], options: CSVOptions): Iterator[String] = { 9 | iter.filter { line => 10 | line.trim.nonEmpty && !line.startsWith(options.comment.toString) 11 | } 12 | } 13 | 14 | def skipComments(iter: Iterator[String], options: CSVOptions): Iterator[String] = { 15 | if (options.isCommentSet) { 16 | val commentPrefix = options.comment.toString 17 | iter.dropWhile { line => 18 | line.trim.isEmpty || line.trim.startsWith(commentPrefix) 19 | } 20 | } else { 21 | iter.dropWhile(_.trim.isEmpty) 22 | } 23 | } 24 | 25 | /** 26 | * Extracts header and moves iterator forward so that only data remains in it 27 | */ 28 | def extractHeader(iter: Iterator[String], options: CSVOptions): Option[String] = { 29 | val nonEmptyLines = skipComments(iter, options) 30 | if (nonEmptyLines.hasNext) { 31 | Some(nonEmptyLines.next()) 32 | } else { 33 | None 34 | } 35 | } 36 | 37 | /** 38 | * Helper method that converts string representation of a character to actual character. 39 | * It handles some Java escaped strings and throws exception if given string is longer than one 40 | * character. 41 | */ 42 | @throws[IllegalArgumentException] 43 | def toChar(str: String): Char = { 44 | (str: Seq[Char]) match { 45 | case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string") 46 | case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." + 47 | " It has special meaning as beginning of an escape sequence." + 48 | " To get the backslash character, pass a string with two backslashes as the delimiter.") 49 | case Seq(c) => c 50 | case Seq('\\', 't') => '\t' 51 | case Seq('\\', 'r') => '\r' 52 | case Seq('\\', 'b') => '\b' 53 | case Seq('\\', 'f') => '\f' 54 | // In case user changes quote char and uses \" as delimiter in options 55 | case Seq('\\', '\"') => '\"' 56 | case Seq('\\', '\'') => '\'' 57 | case Seq('\\', '\\') => '\\' 58 | case _ if str == """\u0000""" => '\u0000' 59 | case Seq('\\', _) => 60 | throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str") 61 | case _ => 62 | throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str") 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import com.univocity.parsers.csv.CsvParser 21 | import org.apache.spark.internal.Logging 22 | import org.apache.spark.sql.internal.SQLConf 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * Checks that column names in a CSV header and field names in the schema are the same 27 | * by taking into account case sensitivity. 28 | * 29 | * @param schema provided (or inferred) schema to which CSV must conform. 30 | * @param options parsed CSV options. 31 | * @param source name of CSV source that are currently checked. It is used in error messages. 32 | * @param isStartOfFile indicates if the currently processing partition is the start of the file. 33 | * if unknown or not applicable (for instance when the input is a dataset), 34 | * can be omitted. 35 | */ 36 | class CSVHeaderChecker( 37 | schema: StructType, 38 | options: CSVOptions, 39 | source: String, 40 | isStartOfFile: Boolean = false) extends Logging { 41 | 42 | // Indicates if it is set to `false`, comparison of column names and schema field 43 | // names is not case sensitive. 44 | private val caseSensitive = SQLConf.get.caseSensitiveAnalysis 45 | 46 | // Indicates if it is `true`, column names are ignored otherwise the CSV column 47 | // names are checked for conformance to the schema. In the case if 48 | // the column name don't conform to the schema, an exception is thrown. 49 | private val enforceSchema = options.enforceSchema 50 | 51 | /** 52 | * Checks that column names in a CSV header and field names in the schema are the same 53 | * by taking into account case sensitivity. 54 | * 55 | * @param columnNames names of CSV columns that must be checked against to the schema. 56 | */ 57 | private def checkHeaderColumnNames(columnNames: Array[String]): Unit = { 58 | if (columnNames != null) { 59 | val fieldNames = schema.map(_.name).toIndexedSeq 60 | val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) 61 | var errorMessage: Option[String] = None 62 | 63 | if (headerLen == schemaSize) { 64 | var i = 0 65 | while (errorMessage.isEmpty && i < headerLen) { 66 | var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) 67 | if (!caseSensitive) { 68 | // scalastyle:off caselocale 69 | nameInSchema = nameInSchema.toLowerCase 70 | nameInHeader = nameInHeader.toLowerCase 71 | // scalastyle:on caselocale 72 | } 73 | if (nameInHeader != nameInSchema) { 74 | errorMessage = Some( 75 | s"""|CSV header does not conform to the schema. 
76 | | Header: ${columnNames.mkString(", ")} 77 | | Schema: ${fieldNames.mkString(", ")} 78 | |Expected: ${fieldNames(i)} but found: ${columnNames(i)} 79 | |$source""".stripMargin) 80 | } 81 | i += 1 82 | } 83 | } else { 84 | errorMessage = Some( 85 | s"""|Number of column in CSV header is not equal to number of fields in the schema: 86 | | Header length: $headerLen, schema size: $schemaSize 87 | |$source""".stripMargin) 88 | } 89 | 90 | errorMessage.foreach { msg => 91 | if (enforceSchema) { 92 | logWarning(msg) 93 | } else { 94 | throw new IllegalArgumentException(msg) 95 | } 96 | } 97 | } 98 | } 99 | 100 | // This is currently only used to parse CSV from Dataset[String]. 101 | def checkHeaderColumnNames(line: String): Unit = { 102 | if (options.headerFlag) { 103 | val parser = new CsvParser(options.asParserSettings) 104 | checkHeaderColumnNames(parser.parseLine(line)) 105 | } 106 | } 107 | 108 | // This is currently only used to parse CSV with multiLine mode. 109 | private[csv] def checkHeaderColumnNames(tokenizer: CsvParser): Unit = { 110 | assert(options.multiLine, "This method should be executed with multiLine.") 111 | if (options.headerFlag) { 112 | val firstRecord = tokenizer.parseNext() 113 | checkHeaderColumnNames(firstRecord) 114 | } 115 | } 116 | 117 | // This is currently only used to parse CSV with non-multiLine mode. 118 | private[csv] def checkHeaderColumnNames(lines: Iterator[String], tokenizer: CsvParser): Unit = { 119 | assert(!options.multiLine, "This method should not be executed with multiline.") 120 | // Checking that column names in the header are matched to field names of the schema. 121 | // The header will be removed from lines. 122 | // Note: if there are only comments in the first block, the header would probably 123 | // be not extracted. 124 | if (options.headerFlag && isStartOfFile) { 125 | CSVExprUtils.extractHeader(lines, options).foreach { header => 126 | checkHeaderColumnNames(tokenizer.parseLine(header)) 127 | } 128 | } 129 | } 130 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import java.nio.charset.StandardCharsets 21 | import java.util.{Locale, TimeZone} 22 | 23 | import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, UnescapedQuoteHandling} 24 | import org.apache.commons.lang3.time.FastDateFormat 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql.catalyst.util._ 27 | 28 | class CSVOptions( 29 | @transient val parameters: CaseInsensitiveMap[String], 30 | val columnPruning: Boolean, 31 | defaultTimeZoneId: String, 32 | defaultColumnNameOfCorruptRecord: String) 33 | extends Logging with Serializable { 34 | 35 | def this( 36 | parameters: Map[String, String], 37 | columnPruning: Boolean, 38 | defaultTimeZoneId: String, 39 | defaultColumnNameOfCorruptRecord: String = "") = { 40 | this( 41 | CaseInsensitiveMap(parameters), 42 | columnPruning, 43 | defaultTimeZoneId, 44 | defaultColumnNameOfCorruptRecord) 45 | } 46 | 47 | private def getChar(paramName: String, default: Char): Char = { 48 | val paramValue = parameters.get(paramName) 49 | paramValue match { 50 | case None => default 51 | case Some(null) => default 52 | case Some(value) if value.length == 0 => '\u0000' 53 | case Some(value) if value.length == 1 => value.charAt(0) 54 | case _ => throw new RuntimeException(s"$paramName cannot be more than one character") 55 | } 56 | } 57 | 58 | private def getInt(paramName: String, default: Int): Int = { 59 | val paramValue = parameters.get(paramName) 60 | paramValue match { 61 | case None => default 62 | case Some(null) => default 63 | case Some(value) => try { 64 | value.toInt 65 | } catch { 66 | case e: NumberFormatException => 67 | throw new RuntimeException(s"$paramName should be an integer. Found $value") 68 | } 69 | } 70 | } 71 | 72 | private def getBool(paramName: String, default: Boolean = false): Boolean = { 73 | val param = parameters.getOrElse(paramName, default.toString) 74 | if (param == null) { 75 | default 76 | } else if (param.toLowerCase(Locale.ROOT) == "true") { 77 | true 78 | } else if (param.toLowerCase(Locale.ROOT) == "false") { 79 | false 80 | } else { 81 | throw new Exception(s"$paramName flag can be true or false") 82 | } 83 | } 84 | 85 | val delimiter = CSVExprUtils.toChar( 86 | parameters.getOrElse("sep", parameters.getOrElse("delimiter", ","))) 87 | val parseMode: ParseMode = parameters.get("mode").map(ParseMode.fromString).getOrElse(PermissiveMode) 88 | val charset = parameters.getOrElse("encoding", 89 | parameters.getOrElse("charset", StandardCharsets.UTF_8.name())) 90 | 91 | val quote = getChar("quote", '\"') 92 | val escape = getChar("escape", '\\') 93 | val charToEscapeQuoteEscaping = parameters.get("charToEscapeQuoteEscaping") match { 94 | case None => None 95 | case Some(null) => None 96 | case Some(value) if value.length == 0 => None 97 | case Some(value) if value.length == 1 => Some(value.charAt(0)) 98 | case _ => 99 | throw new RuntimeException("charToEscapeQuoteEscaping cannot be more than one character") 100 | } 101 | val comment = getChar("comment", '\u0000') 102 | 103 | val headerFlag = getBool("header") 104 | val inferSchemaFlag = getBool("inferSchema") 105 | val ignoreLeadingWhiteSpaceInRead = getBool("ignoreLeadingWhiteSpace", default = false) 106 | val ignoreTrailingWhiteSpaceInRead = getBool("ignoreTrailingWhiteSpace", default = false) 107 | 108 | // For write, both options were `true` by default. We leave it as `true` for 109 | // backwards compatibility. 
110 | val ignoreLeadingWhiteSpaceFlagInWrite = getBool("ignoreLeadingWhiteSpace", default = true) 111 | val ignoreTrailingWhiteSpaceFlagInWrite = getBool("ignoreTrailingWhiteSpace", default = true) 112 | 113 | val columnNameOfCorruptRecord = 114 | parameters.getOrElse("columnNameOfCorruptRecord", defaultColumnNameOfCorruptRecord) 115 | 116 | val nullValue = parameters.getOrElse("nullValue", "") 117 | 118 | val nanValue = parameters.getOrElse("nanValue", "NaN") 119 | 120 | val positiveInf = parameters.getOrElse("positiveInf", "Inf") 121 | val negativeInf = parameters.getOrElse("negativeInf", "-Inf") 122 | 123 | 124 | val compressionCodec: Option[String] = { 125 | val name = parameters.get("compression").orElse(parameters.get("codec")) 126 | name.map(CompressionCodecs.getCodecClassName) 127 | } 128 | 129 | val timeZone: TimeZone = DateTimeUtils.getTimeZone( 130 | parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) 131 | 132 | // Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe. 133 | val dateFormat: FastDateFormat = 134 | FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US) 135 | 136 | val timestampFormat: FastDateFormat = 137 | FastDateFormat.getInstance( 138 | parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, Locale.US) 139 | 140 | val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) 141 | 142 | val maxColumns = getInt("maxColumns", 20480) 143 | 144 | val maxCharsPerColumn = getInt("maxCharsPerColumn", -1) 145 | 146 | val escapeQuotes = getBool("escapeQuotes", true) 147 | 148 | val quoteAll = getBool("quoteAll", false) 149 | 150 | val inputBufferSize = 128 151 | 152 | val isCommentSet = this.comment != '\u0000' 153 | 154 | val samplingRatio = 155 | parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) 156 | 157 | /** 158 | * Forcibly apply the specified or inferred schema to datasource files. 159 | * If the option is enabled, headers of CSV files will be ignored. 160 | */ 161 | val enforceSchema = getBool("enforceSchema", default = true) 162 | 163 | 164 | /** 165 | * String representation of an empty value in read and in write. 166 | */ 167 | val emptyValue = parameters.get("emptyValue") 168 | /** 169 | * The string is returned when CSV reader doesn't have any characters for input value, 170 | * or an empty quoted string `""`. Default value is empty string. 171 | */ 172 | val emptyValueInRead = emptyValue.getOrElse("") 173 | /** 174 | * The value is used instead of an empty string in write. 
Default value is `""` 175 | */ 176 | val emptyValueInWrite = emptyValue.getOrElse("\"\"") 177 | 178 | def asWriterSettings: CsvWriterSettings = { 179 | val writerSettings = new CsvWriterSettings() 180 | val format = writerSettings.getFormat 181 | format.setDelimiter(delimiter) 182 | format.setQuote(quote) 183 | format.setQuoteEscape(escape) 184 | charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) 185 | format.setComment(comment) 186 | writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) 187 | writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) 188 | writerSettings.setNullValue(nullValue) 189 | writerSettings.setEmptyValue(emptyValueInWrite) 190 | writerSettings.setSkipEmptyLines(true) 191 | writerSettings.setQuoteAllFields(quoteAll) 192 | writerSettings.setQuoteEscapingEnabled(escapeQuotes) 193 | writerSettings 194 | } 195 | 196 | def asParserSettings: CsvParserSettings = { 197 | val settings = new CsvParserSettings() 198 | val format = settings.getFormat 199 | format.setDelimiter(delimiter) 200 | format.setQuote(quote) 201 | format.setQuoteEscape(escape) 202 | charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) 203 | format.setComment(comment) 204 | settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) 205 | settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) 206 | settings.setReadInputOnSeparateThread(false) 207 | settings.setInputBufferSize(inputBufferSize) 208 | settings.setMaxColumns(maxColumns) 209 | settings.setNullValue(nullValue) 210 | settings.setEmptyValue(emptyValueInRead) 211 | settings.setMaxCharsPerColumn(maxCharsPerColumn) 212 | settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) 213 | settings.setLineSeparatorDetectionEnabled(multiLine == true) 214 | 215 | // This is for handling a header, so we'll just blindly skip 216 | if (headerFlag) { 217 | settings.setNumberOfRowsToSkip(1) 218 | } 219 | settings 220 | } 221 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import java.io.{InputStream, StringReader} 21 | import java.math.BigDecimal 22 | 23 | import com.univocity.parsers.csv.CsvParser 24 | import org.apache.spark.internal.Logging 25 | import org.apache.spark.sql.catalyst.InternalRow 26 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 27 | import org.apache.spark.sql.catalyst.util.{BadRecordException, DateTimeUtils, FailureSafeParser} 28 | import org.apache.spark.sql.types._ 29 | import org.apache.spark.unsafe.types.UTF8String 30 | 31 | import scala.util.Try 32 | import scala.util.control.NonFatal 33 | 34 | 35 | /** 36 | * Constructs a parser for a given schema that translates CSV data to an [[InternalRow]]. 37 | * 38 | * @param dataSchema The CSV data schema that is specified by the user, or inferred from underlying 39 | * data files. 40 | * @param requiredSchema The schema of the data that should be output for each row. This should be a 41 | * subset of the columns in dataSchema. 42 | * @param options Configuration options for a CSV parser. 43 | */ 44 | class UnivocityParser( 45 | dataSchema: StructType, 46 | requiredSchema: StructType, 47 | val options: CSVOptions) extends Logging { 48 | require(requiredSchema.toSet.subsetOf(dataSchema.toSet), 49 | s"requiredSchema (${requiredSchema.catalogString}) should be the subset of " + 50 | s"dataSchema (${dataSchema.catalogString}).") 51 | 52 | def this(schema: StructType, options: CSVOptions) = this(schema, schema, options) 53 | 54 | // A `ValueConverter` is responsible for converting the given value to a desired type. 55 | private type ValueConverter = String => Any 56 | 57 | // This index is used to reorder parsed tokens 58 | private val tokenIndexArr = 59 | requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f))).toArray 60 | 61 | // When column pruning is enabled, the parser only parses the required columns based on 62 | // their positions in the data schema. 63 | private val parsedSchema = if (options.columnPruning) requiredSchema else dataSchema 64 | 65 | val tokenizer = { 66 | val parserSetting = options.asParserSettings 67 | // When to-be-parsed schema is shorter than the to-be-read data schema, we let Univocity CSV 68 | // parser select a sequence of fields for reading by their positions. 69 | // if (options.columnPruning && requiredSchema.length < dataSchema.length) { 70 | if (parsedSchema.length < dataSchema.length) { 71 | parserSetting.selectIndexes(tokenIndexArr: _*) 72 | } 73 | new CsvParser(parserSetting) 74 | } 75 | 76 | private val row = new GenericInternalRow(requiredSchema.length) 77 | 78 | // Retrieve the raw record string. 79 | private def getCurrentInput: UTF8String = { 80 | UTF8String.fromString(tokenizer.getContext.currentParsedContent().stripLineEnd) 81 | } 82 | 83 | // This parser first picks some tokens from the input tokens, according to the required schema, 84 | // then parse these tokens and put the values in a row, with the order specified by the required 85 | // schema. 86 | // 87 | // For example, let's say there is CSV data as below: 88 | // 89 | // a,b,c 90 | // 1,2,A 91 | // 92 | // So the CSV data schema is: ["a", "b", "c"] 93 | // And let's say the required schema is: ["c", "b"] 94 | // 95 | // with the input tokens, 96 | // 97 | // input tokens - [1, 2, "A"] 98 | // 99 | // Each input token is placed in each output row's position by mapping these. 
In this case, 100 | // 101 | // output row - ["A", 2] 102 | private val valueConverters: Array[ValueConverter] = { 103 | requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray 104 | } 105 | 106 | /** 107 | * Create a converter which converts the string value to a value according to a desired type. 108 | * Currently, we do not support complex types (`ArrayType`, `MapType`, `StructType`). 109 | * 110 | * For other nullable types, returns null if it is null or equals to the value specified 111 | * in `nullValue` option. 112 | */ 113 | def makeConverter( 114 | name: String, 115 | dataType: DataType, 116 | nullable: Boolean = true, 117 | options: CSVOptions): ValueConverter = dataType match { 118 | case _: ByteType => (d: String) => 119 | nullSafeDatum(d, name, nullable, options)(_.toByte) 120 | 121 | case _: ShortType => (d: String) => 122 | nullSafeDatum(d, name, nullable, options)(_.toShort) 123 | 124 | case _: IntegerType => (d: String) => 125 | nullSafeDatum(d, name, nullable, options)(_.toInt) 126 | 127 | case _: LongType => (d: String) => 128 | nullSafeDatum(d, name, nullable, options)(_.toLong) 129 | 130 | case _: FloatType => (d: String) => 131 | nullSafeDatum(d, name, nullable, options) { 132 | case options.nanValue => Float.NaN 133 | case options.negativeInf => Float.NegativeInfinity 134 | case options.positiveInf => Float.PositiveInfinity 135 | case datum => datum.toFloat 136 | } 137 | 138 | case _: DoubleType => (d: String) => 139 | nullSafeDatum(d, name, nullable, options) { 140 | case options.nanValue => Double.NaN 141 | case options.negativeInf => Double.NegativeInfinity 142 | case options.positiveInf => Double.PositiveInfinity 143 | case datum => datum.toDouble 144 | } 145 | 146 | case _: BooleanType => (d: String) => 147 | nullSafeDatum(d, name, nullable, options)(_.toBoolean) 148 | 149 | case dt: DecimalType => (d: String) => 150 | nullSafeDatum(d, name, nullable, options) { datum => 151 | val value = new BigDecimal(datum.replaceAll(",", "")) 152 | Decimal(value, dt.precision, dt.scale) 153 | } 154 | 155 | case _: TimestampType => (d: String) => 156 | nullSafeDatum(d, name, nullable, options) { datum => 157 | // This one will lose microseconds parts. 158 | // See https://issues.apache.org/jira/browse/SPARK-10681. 159 | Try(options.timestampFormat.parse(datum).getTime * 1000L) 160 | } 161 | 162 | case _: DateType => (d: String) => 163 | nullSafeDatum(d, name, nullable, options) { datum => 164 | // This one will lose microseconds parts. 
165 | // See https://issues.apache.org/jira/browse/SPARK-10681.x 166 | Try(DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)) 167 | } 168 | 169 | case _: StringType => (d: String) => 170 | nullSafeDatum(d, name, nullable, options)(UTF8String.fromString) 171 | 172 | case udt: UserDefinedType[_] => (datum: String) => 173 | makeConverter(name, udt.sqlType, nullable, options) 174 | 175 | // We don't actually hit this exception though, we keep it for understandability 176 | case _ => throw new RuntimeException(s"Unsupported type: ${dataType.typeName}") 177 | } 178 | 179 | private def nullSafeDatum( 180 | datum: String, 181 | name: String, 182 | nullable: Boolean, 183 | options: CSVOptions)(converter: ValueConverter): Any = { 184 | if (datum == options.nullValue || datum == null) { 185 | if (!nullable) { 186 | throw new RuntimeException(s"null value found but field $name is not nullable.") 187 | } 188 | null 189 | } else { 190 | converter.apply(datum) 191 | } 192 | } 193 | 194 | /** 195 | * Parses a single CSV string and turns it into either one resulting row or no row (if the 196 | * the record is malformed). 197 | */ 198 | // We are going to change this to handle headers 199 | def parse(input: String): InternalRow = convert( 200 | Try {tokenizer.parseAll(new StringReader(input)).get(0)}.getOrElse(null) 201 | ) 202 | 203 | private val getToken = if (options.columnPruning) { 204 | (tokens: Array[String], index: Int) => tokens(index) 205 | } else { 206 | (tokens: Array[String], index: Int) => tokens(tokenIndexArr(index)) 207 | } 208 | 209 | private def convert(tokens: Array[String]): InternalRow = { 210 | if (tokens == null) { 211 | throw BadRecordException( 212 | () => getCurrentInput, 213 | () => None, 214 | new RuntimeException("Malformed CSV record")) 215 | } else if (tokens.length != parsedSchema.length) { 216 | // If the number of tokens doesn't match the schema, we should treat it as a malformed record. 217 | // However, we still have chance to parse some of the tokens, by adding extra null tokens in 218 | // the tail if the number is smaller, or by dropping extra tokens if the number is larger. 219 | val checkedTokens = if (parsedSchema.length > tokens.length) { 220 | tokens ++ new Array[String](parsedSchema.length - tokens.length) 221 | } else { 222 | tokens.take(parsedSchema.length) 223 | } 224 | def getPartialResult(): Option[InternalRow] = { 225 | try { 226 | Some(convert(checkedTokens)) 227 | } catch { 228 | case _: BadRecordException => None 229 | } 230 | } 231 | // For records with less or more tokens than the schema, tries to return partial results 232 | // if possible. 233 | throw BadRecordException( 234 | () => getCurrentInput, 235 | () => getPartialResult(), 236 | new RuntimeException("Malformed CSV record")) 237 | } else { 238 | try { 239 | // When the length of the returned tokens is identical to the length of the parsed schema, 240 | // we just need to convert the tokens that correspond to the required columns. 241 | var i = 0 242 | while (i < requiredSchema.length) { 243 | row(i) = valueConverters(i).apply(getToken(tokens, i)) 244 | i += 1 245 | } 246 | row 247 | } catch { 248 | case NonFatal(e) => 249 | // For corrupted records with the number of tokens same as the schema, 250 | // CSV reader doesn't support partial results. All fields other than the field 251 | // configured by `columnNameOfCorruptRecord` are set to `null`. 
252 | throw BadRecordException(() => getCurrentInput, () => None, e) 253 | } 254 | } 255 | } 256 | } 257 | 258 | private[sql] object UnivocityParser { 259 | 260 | /** 261 | * Parses a stream that contains CSV strings and turns it into an iterator of tokens. 262 | */ 263 | def tokenizeStream( 264 | inputStream: InputStream, 265 | shouldDropHeader: Boolean, 266 | tokenizer: CsvParser): Iterator[Array[String]] = { 267 | val handleHeader: () => Unit = 268 | () => if (shouldDropHeader) tokenizer.parseNext 269 | 270 | convertStream(inputStream, tokenizer, handleHeader)(tokens => tokens) 271 | } 272 | 273 | /** 274 | * Parses a stream that contains CSV strings and turns it into an iterator of rows. 275 | */ 276 | def parseStream( 277 | inputStream: InputStream, 278 | parser: UnivocityParser, 279 | headerChecker: CSVHeaderChecker, 280 | schema: StructType): Iterator[InternalRow] = { 281 | val tokenizer = parser.tokenizer 282 | val safeParser = new FailureSafeParser[Array[String]]( 283 | input => Seq(parser.convert(input)), 284 | parser.options.parseMode, 285 | schema, 286 | parser.options.columnNameOfCorruptRecord, 287 | parser.options.multiLine) 288 | 289 | val handleHeader: () => Unit = 290 | () => headerChecker.checkHeaderColumnNames(tokenizer) 291 | 292 | convertStream(inputStream, tokenizer, handleHeader) { tokens => 293 | safeParser.parse(tokens) 294 | }.flatten 295 | } 296 | 297 | private def convertStream[T]( 298 | inputStream: InputStream, 299 | tokenizer: CsvParser, 300 | handleHeader: () => Unit)( 301 | convert: Array[String] => T) = new Iterator[T] { 302 | tokenizer.beginParsing(inputStream) 303 | 304 | // We can handle header here since here the stream is open. 305 | handleHeader() 306 | 307 | private var nextRecord = tokenizer.parseNext() 308 | 309 | override def hasNext: Boolean = nextRecord != null 310 | 311 | override def next(): T = { 312 | if (!hasNext) { 313 | throw new NoSuchElementException("End of stream") 314 | } 315 | val curRecord = convert(nextRecord) 316 | nextRecord = tokenizer.parseNext() 317 | curRecord 318 | } 319 | } 320 | 321 | /** 322 | * Parses an iterator that contains CSV strings and turns it into an iterator of rows. 323 | */ 324 | def parseIterator( 325 | lines: Iterator[String], 326 | parser: UnivocityParser, 327 | headerChecker: CSVHeaderChecker, 328 | schema: StructType): Iterator[InternalRow] = { 329 | headerChecker.checkHeaderColumnNames(lines, parser.tokenizer) 330 | 331 | val options = parser.options 332 | 333 | val filteredLines: Iterator[String] = CSVExprUtils.filterCommentAndEmpty(lines, options) 334 | 335 | val safeParser = new FailureSafeParser[String]( 336 | input => Seq(parser.parse(input)), 337 | parser.options.parseMode, 338 | schema, 339 | parser.options.columnNameOfCorruptRecord, 340 | parser.options.multiLine) 341 | filteredLines.flatMap(safeParser.parse) 342 | } 343 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.expressions 19 | 20 | import org.apache.spark.sql.AnalysisException 21 | import org.apache.spark.sql.catalyst.util.ArrayBasedMapData 22 | import org.apache.spark.sql.types.{MapType, StringType, StructType} 23 | 24 | object ExprUtils { 25 | 26 | def evalSchemaExpr(exp: Expression): StructType = exp match { 27 | case Literal(s, StringType) => StructType.fromDDL(s.toString) 28 | case e => throw new AnalysisException( 29 | s"Schema should be specified in DDL format as a string literal instead of ${e.sql}") 30 | } 31 | 32 | def convertToMapData(exp: Expression): Map[String, String] = exp match { 33 | case m: CreateMap 34 | if m.dataType.acceptsType(MapType(StringType, StringType, valueContainsNull = false)) => 35 | val arrayMap = m.eval().asInstanceOf[ArrayBasedMapData] 36 | ArrayBasedMapData.toScalaMap(arrayMap).map { case (key, value) => 37 | key.toString -> value.toString 38 | } 39 | case m: CreateMap => 40 | throw new AnalysisException( 41 | s"A type of keys and values in map() must be string, but got ${m.dataType.catalogString}") 42 | case _ => 43 | throw new AnalysisException("Must use a map() function for options") 44 | } 45 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.expressions 19 | 20 | import java.io.StringReader 21 | 22 | import org.apache.spark.sql.AnalysisException 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.csv._ 25 | import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback 26 | import org.apache.spark.sql.catalyst.util._ 27 | import org.apache.spark.sql.types._ 28 | import org.apache.spark.unsafe.types.UTF8String 29 | 30 | /** 31 | * Converts a CSV input string to a [[StructType]] with the specified schema. 
32 | */ 33 | // scalastyle:off line.size.limit 34 | @ExpressionDescription( 35 | usage = "_FUNC_(csvStr, schema[, options]) - Returns a struct value with the given `csvStr` and `schema`.", 36 | examples = """ 37 | Examples: 38 | > SELECT _FUNC_('1, 0.8', 'a INT, b DOUBLE'); 39 | {"a":1, "b":0.8} 40 | > SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) 41 | {"time":2015-08-26 00:00:00.0} 42 | """, 43 | since = "3.0.0") 44 | // scalastyle:on line.size.limit 45 | case class CsvToStructs( 46 | schema: StructType, 47 | options: Map[String, String], 48 | child: Expression, 49 | timeZoneId: Option[String] = None) 50 | extends UnaryExpression 51 | with TimeZoneAwareExpression 52 | with CodegenFallback 53 | with ExpectsInputTypes 54 | with NullIntolerant { 55 | 56 | override def nullable: Boolean = child.nullable 57 | 58 | // The CSV input data might be missing certain fields. We force the nullability 59 | // of the user-provided schema to avoid data corruptions. 60 | val nullableSchema: StructType = schema.asNullable 61 | 62 | // Used in `FunctionRegistry` 63 | def this(child: Expression, schema: Expression, options: Map[String, String]) = 64 | this( 65 | schema = ExprUtils.evalSchemaExpr(schema), 66 | options = options, 67 | child = child, 68 | timeZoneId = None) 69 | 70 | def this(child: Expression, schema: Expression) = this(child, schema, Map.empty[String, String]) 71 | 72 | def this(child: Expression, schema: Expression, options: Expression) = 73 | this( 74 | schema = ExprUtils.evalSchemaExpr(schema), 75 | options = ExprUtils.convertToMapData(options), 76 | child = child, 77 | timeZoneId = None) 78 | 79 | // This converts parsed rows to the desired output by the given schema. 80 | @transient 81 | lazy val converter = (rows: Iterator[InternalRow]) => { 82 | if (rows.hasNext) { 83 | val result = rows.next() 84 | // CSV's parser produces one record only. 85 | assert(!rows.hasNext) 86 | result 87 | } else { 88 | throw new IllegalArgumentException("Expected one row from CSV parser.") 89 | } 90 | } 91 | 92 | @transient lazy val parser = { 93 | val parsedOptions = new CSVOptions(options, columnPruning = true, timeZoneId.get) 94 | val mode = parsedOptions.parseMode 95 | if (mode != PermissiveMode && mode != FailFastMode) { 96 | throw new AnalysisException(s"from_csv() doesn't support the ${mode.name} mode. 
" + 97 | s"Acceptable modes are ${PermissiveMode.name} and ${FailFastMode.name}.") 98 | } 99 | val actualSchema = 100 | StructType(nullableSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 101 | val rawParser = new UnivocityParser(actualSchema, actualSchema, parsedOptions) 102 | new FailureSafeParser[String]( 103 | input => { 104 | Seq(rawParser.parse(input)) 105 | }, 106 | mode, 107 | nullableSchema, 108 | parsedOptions.columnNameOfCorruptRecord, 109 | parsedOptions.multiLine) 110 | } 111 | 112 | override def dataType: DataType = nullableSchema 113 | 114 | override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { 115 | copy(timeZoneId = Option(timeZoneId)) 116 | } 117 | 118 | override def nullSafeEval(input: Any): Any = { 119 | val csv = input.asInstanceOf[UTF8String].toString 120 | converter(parser.parse(csv)) 121 | } 122 | 123 | override def inputTypes: Seq[AbstractDataType] = StringType :: Nil 124 | 125 | override def prettyName: String = "from_csv" 126 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.util 19 | 20 | import org.apache.spark.SparkException 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 23 | import org.apache.spark.sql.types.StructType 24 | import org.apache.spark.unsafe.types.UTF8String 25 | 26 | class FailureSafeParser[IN]( 27 | rawParser: IN => Seq[InternalRow], 28 | mode: ParseMode, 29 | schema: StructType, 30 | columnNameOfCorruptRecord: String, 31 | isMultiLine: Boolean) { 32 | 33 | private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) 34 | private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) 35 | private val resultRow = new GenericInternalRow(schema.length) 36 | private val nullResult = new GenericInternalRow(schema.length) 37 | 38 | // This function takes 2 parameters: an optional partial result, and the bad record. If the given 39 | // schema doesn't contain a field for corrupted record, we just return the partial result or a 40 | // row with all fields null. If the given schema contains a field for corrupted record, we will 41 | // set the bad record to this field, and set other fields according to the partial result or null. 
42 | private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { 43 | if (corruptFieldIndex.isDefined) { 44 | (row, badRecord) => { 45 | var i = 0 46 | while (i < actualSchema.length) { 47 | val from = actualSchema(i) 48 | resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull 49 | i += 1 50 | } 51 | resultRow(corruptFieldIndex.get) = badRecord() 52 | resultRow 53 | } 54 | } else { 55 | (row, _) => row.getOrElse(nullResult) 56 | } 57 | } 58 | 59 | private val skipParsing = !isMultiLine && mode == PermissiveMode && schema.isEmpty 60 | 61 | def parse(input: IN): Iterator[InternalRow] = { 62 | try { 63 | if (skipParsing) { 64 | Iterator.single(InternalRow.empty) 65 | } else { 66 | rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) 67 | } 68 | } catch { 69 | case e: BadRecordException => mode match { 70 | case PermissiveMode => 71 | Iterator(toResultRow(e.partialResult(), e.record)) 72 | case DropMalformedMode => 73 | Iterator.empty 74 | case FailFastMode => 75 | throw new SparkException("Malformed records are detected in record parsing. " + 76 | s"Parse Mode: ${FailFastMode.name}.", e.cause) 77 | } 78 | } 79 | } 80 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/SparkSuitBase.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.FunSuite 5 | 6 | abstract class SparkSuiteBase extends FunSuite { 7 | lazy val sparkContext = SparkSuiteBase.sparkContext 8 | 9 | } 10 | 11 | object SparkSuiteBase { 12 | private val master = "local[*]" 13 | private val appName = "data_load_testing" 14 | private lazy val sparkContext: SparkSession = new SparkSession.Builder().appName(appName).master(master).getOrCreate() 15 | 16 | } 17 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/TaxiFareMapperTester.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.scalatest.Matchers 4 | 5 | import scala.util.{Failure, Success, Try} 6 | 7 | class TaxiFareMapperTester extends SparkSuiteBase with Matchers { 8 | 9 | /* 10 | test("should_map_fare_pickup_time_to_ride_pickup_time_format") { 11 | val expected = "2013-01-01T00:04:27+00:00" 12 | val inputFarePickUptime = "2013-01-01 00:04:27" 13 | val actual = TaxiFareMapper.mapFarePickUpTimeToRidePickUpTimeFormat(inputFarePickUptime) 14 | assert(actual.contentEquals(expected)) 15 | } 16 | 17 | test("csv string with first line comma separated header fields and second line comma separated value fields should be a valid ") { 18 | 19 | val inputString = "header1,header2,header3\nvalue1,value2,value3" 20 | 21 | var shouldSetTrueInSuccessCase = false 22 | TaxiFareMapper.validateHeaderEmbededCsvString(inputString) match { 23 | 24 | case Success(_) => shouldSetTrueInSuccessCase = true 25 | case Failure(_) => shouldSetTrueInSuccessCase = false 26 | 27 | } 28 | assert(shouldSetTrueInSuccessCase) 29 | } 30 | 31 | test("csv string with only comma separated value fields and no header fields should be a invalid ") { 32 | val inputString = "value1,value2,value3" 33 | 34 | var shouldSetTrueInFailureCase = false 35 | TaxiFareMapper.validateHeaderEmbededCsvString(inputString) match { 36 | case Success(_) => 
shouldSetTrueInFailureCase = false 37 | case Failure(_) => shouldSetTrueInFailureCase = true 38 | } 39 | assert(shouldSetTrueInFailureCase) 40 | } 41 | 42 | test("csv content with less than 11 fields") { 43 | val invalidCsvContent = "2013000717,2013000714,CMT,2013-01-01 00:04:27,CRD,8.5,0.5,0.5,2.37,0" 44 | var shouldSetTrueInFailureCase = false 45 | var actualErrorMessage = "" 46 | 47 | Try(TaxiFareMapper.mapCsvToTaxiFare(invalidCsvContent)) match { 48 | case Success(_) => shouldSetTrueInFailureCase = false 49 | case Failure(exception) => 50 | shouldSetTrueInFailureCase = true 51 | actualErrorMessage = exception.getMessage 52 | } 53 | 54 | val expectedErrorMessage = TaxiFareMapper.invalidTaxiFareCsv 55 | assert(shouldSetTrueInFailureCase) 56 | assert(expectedErrorMessage.contentEquals(actualErrorMessage)) 57 | } 58 | */ 59 | } 60 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/TaxiRideMapperTester.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types.{StringType, StructType, TimestampType} 7 | import org.scalatest.{BeforeAndAfterEach, Matchers} 8 | import org.slf4j.LoggerFactory 9 | 10 | import scala.util.{Failure, Success} 11 | 12 | class TaxiRideMapperTester extends SparkSuiteBase with Matchers with BeforeAndAfterEach { 13 | 14 | 15 | val logger = LoggerFactory.getLogger("TaxiRideMapperTester") 16 | 17 | override def beforeEach(): Unit = { } 18 | 19 | override def afterEach() { } 20 | 21 | /* 22 | test("it should parse valid json and match mapJsonToTaxiRide success case") { 23 | logger.info("it should parse valid json and match mapJsonToTaxiRide success case") 24 | val taxiRideJsonString = "{\"rateCode\":1,\"storeAndForwardFlag\":\"N\",\"dropoffTime\":\"2013-01-01T00:11:20+00:00\",\"passengerCount\":1,\"tripTimeInSeconds\":413.0,\"tripDistanceInMiles\":2.3,\"pickupLon\":-73.97912,\"pickupLat\":40.7623177,\"dropoffLon\":-73.95027,\"dropoffLat\":40.77126,\"medallion\":2013000717,\"hackLicense\":2013000714,\"vendorId\":\"CMT\",\"pickupTime\":\"2013-01-01T00:04:27+00:00\"}" 25 | 26 | var shouldMapToTaxiRide = false 27 | TaxiRideMapper.mapJsonToTaxiRide(taxiRideJsonString) match { 28 | case Success(_) => shouldMapToTaxiRide = true 29 | case Failure(_) => shouldMapToTaxiRide = false 30 | } 31 | 32 | assert(shouldMapToTaxiRide) 33 | println(1) 34 | } 35 | 36 | test("it should parse corrupted taxi ride json and match mapJsonToTaxiRide failure case") { 37 | val taxiRideJsonString = "{\"menu\": {\n \"id\": \"file\",\n \"value\": \"File\",\n \"popup\": {\n \"menuitem\": [\n {\"value\": \"New\", \"onclick\": \"CreateNewDoc()\"},\n {\"value\": \"Open\", \"onclick\": \"OpenDoc()\"},\n {\"value\": \"Close\", \"onclick\": \"CloseDoc()\"}\n ]\n }\n}}" 38 | val expected = "0_0_null_null" 39 | var taxiRide: TaxiRide = null 40 | TaxiRideMapper.mapJsonToTaxiRide(taxiRideJsonString) match { 41 | case Success(value) => taxiRide = value 42 | case Failure(_) => 43 | } 44 | 45 | assert(taxiRide.key.contentEquals(expected)) 46 | println(2) 47 | } 48 | 49 | test("it should parse valid json and match validateJsonString success case") { 50 | val validJson = "{\n \"fruit\": \"Apple\",\n \"size\": \"Large\",\n \"color\": \"Red\"\n}" 51 | 52 | var shouldValidateToTrue = false 53 | TaxiRideMapper.validateJsonString(validJson) match { 54 | case Success(_) 
=> shouldValidateToTrue = true 55 | case Failure(_) => shouldValidateToTrue = false 56 | } 57 | 58 | assert(shouldValidateToTrue) 59 | println(3) 60 | } 61 | 62 | test("it should parse invalid json and match validateJsonString failure case") { 63 | val invalidJson = "some invalid json string" 64 | 65 | var shouldValidateToTrue = false 66 | TaxiRideMapper.validateJsonString(invalidJson) match { 67 | case Success(_) => shouldValidateToTrue = true 68 | case Failure(_) => shouldValidateToTrue = false 69 | } 70 | 71 | assert(!shouldValidateToTrue) 72 | println(4) 73 | } 74 | 75 | test("it should map a valid taxi ride json to a valid enrichedtaxi ride record") { 76 | val rideContent = "{\"rateCode\":1,\"storeAndForwardFlag\":\"N\",\"dropoffTime\":\"2013-01-01T00:11:20+00:00\",\"passengerCount\":1,\"tripTimeInSeconds\":413.0,\"tripDistanceInMiles\":2.3,\"pickupLon\":-73.97912,\"pickupLat\":40.7623177,\"dropoffLon\":-73.95027,\"dropoffLat\":40.77126,\"medallion\":2013000717,\"hackLicense\":2013000714,\"vendorId\":\"CMT\",\"pickupTime\":\"2013-01-01T00:04:27+00:00\"}" 77 | val recordIngestedTime = "2018-08-23 12:44:19.818" 78 | 79 | val rideDataFrameSchema = new StructType() 80 | .add("rideContent", StringType, true) 81 | .add("recordIngestedTime", TimestampType, true) 82 | 83 | val rideData = Seq( 84 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 85 | ) 86 | 87 | import sparkContext.implicits._ 88 | 89 | val rideDataFrame = sparkContext.createDataFrame( 90 | sparkContext.sparkContext.parallelize(rideData), 91 | rideDataFrameSchema 92 | ) 93 | 94 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 95 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 96 | 97 | val expectedCount = 1 98 | val actualCount = enrichedTaxiRideRecords.count() 99 | 100 | assert(actualCount == expectedCount) 101 | println(5) 102 | } 103 | 104 | test("it should map a invalid json string to a invalid enriched taxi ride record") { 105 | val rideContent = "some invalid json string" 106 | val recordIngestedTime = "2018-08-23 12:44:19.818" 107 | 108 | val rideDataFrameSchema = new StructType() 109 | .add("rideContent", StringType, true) 110 | .add("recordIngestedTime", TimestampType, true) 111 | 112 | import sparkContext.implicits._ 113 | val rideData = Seq( 114 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 115 | ) 116 | 117 | val rideDataFrame = sparkContext.createDataFrame( 118 | sparkContext.sparkContext.parallelize(rideData), 119 | rideDataFrameSchema 120 | ) 121 | 122 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 123 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 124 | 125 | val expectedCount = 0 126 | val actualCount = enrichedTaxiRideRecords.count() 127 | 128 | assert(actualCount == expectedCount) 129 | println(6) 130 | } 131 | 132 | test("it should map a valid json string but a corrupted taxiride string to a invalid enriched taxi ride record") { 133 | val rideContent = "{\"menu\": {\n \"id\": \"file\",\n \"value\": \"File\",\n \"popup\": {\n \"menuitem\": [\n {\"value\": \"New\", \"onclick\": \"CreateNewDoc()\"},\n {\"value\": \"Open\", \"onclick\": \"OpenDoc()\"},\n {\"value\": \"Close\", \"onclick\": \"CloseDoc()\"}\n ]\n }\n}}" 134 | val recordIngestedTime = "2018-08-23 12:44:19.818" 135 | 136 | val rideDataFrameSchema = new StructType() 137 | .add("rideContent", StringType, true) 138 | .add("recordIngestedTime", TimestampType, true) 139 | 140 | val 
rideData = Seq( 141 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 142 | ) 143 | 144 | import sparkContext.implicits._ 145 | 146 | val rideDataFrame = sparkContext.createDataFrame( 147 | sparkContext.sparkContext.parallelize(rideData), 148 | rideDataFrameSchema 149 | ) 150 | 151 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 152 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 153 | 154 | val expectedCount = 0 155 | val actualCount = enrichedTaxiRideRecords.count() 156 | 157 | assert(actualCount == expectedCount) 158 | println(7) 159 | } 160 | */ 161 | } 162 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/org/apache/spark/sql/streaming/UtilsTests.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.streaming 2 | 3 | import java.util.HashMap 4 | import java.util.UUID.randomUUID 5 | 6 | import com.microsoft.pnp.{SparkSuiteBase, Utils} 7 | import org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent 8 | import org.scalatest.Matchers 9 | 10 | class UtilsTests[sql] extends SparkSuiteBase with Matchers { 11 | 12 | test("should_parse_queryprogress_telemetry") { 13 | val guid = randomUUID() 14 | val duration: java.util.Map[String, java.lang.Long] = new HashMap[String, java.lang.Long] 15 | val eventTime: java.util.Map[String, String] = new HashMap[String, String] 16 | 17 | duration.put("addBatch", 100L) 18 | duration.put("getBatch", 200L) 19 | val source: SourceProgress = new SourceProgress("source", "start", "end", 100, 200, 300) 20 | val sourcearr = new Array[SourceProgress](1) 21 | sourcearr(0) = source 22 | 23 | val progressEvent = new QueryProgressEvent( 24 | new StreamingQueryProgress( 25 | guid, guid, 26 | "streamTest", "time", 27 | 10, 10, duration, 28 | eventTime, 29 | null, sourcearr, null, null 30 | ) 31 | ) 32 | 33 | val metrics = Utils.parsePayload(progressEvent) 34 | assert(progressEvent.progress.id === metrics.get("id")) 35 | assert(progressEvent.progress.numInputRows === metrics.get("inputRows")) 36 | assert(progressEvent.progress.processedRowsPerSecond === metrics.get("procRowsPerSecond")) 37 | assert(progressEvent.progress.inputRowsPerSecond === metrics.get("inputRowsPerSecond")) 38 | assert(progressEvent.progress.durationMs.get("addBatch") === 39 | metrics.get("durationms").asInstanceOf[HashMap[String, AnyRef]].get("addBatch")) 40 | assert(progressEvent.progress.durationMs.get("getBatch") === 41 | metrics.get("durationms").asInstanceOf[HashMap[String, AnyRef]].get("getBatch")) 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /azure/deployresources.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "eventHubNamespace": { 6 | "type": "string" 7 | }, 8 | "databricksWorkspaceName": { 9 | "type":"string" 10 | }, 11 | "cosmosDatabaseAccount": { 12 | "type": "string" 13 | }, 14 | "logAnalyticsWorkspaceName": { 15 | "type": "string" 16 | }, 17 | "logAnalyticsWorkspaceRegion": { 18 | "type": "string" 19 | } 20 | }, 21 | "variables": { 22 | "eventHubNames": [ 23 | "taxi-ride-eh", 24 | "taxi-fare-eh" 25 | ], 26 | "copy": [ 27 | { 28 | "name": "eventHubs", 29 | "count": 
"[length(variables('eventHubNames'))]", 30 | "input": { 31 | "name": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')])]", 32 | "consumerGroupName": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')], '/', concat(variables('eventHubNames')[copyIndex('eventHubs')], '-cg'))]", 33 | "authorizationRuleName": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')], '/', concat(variables('eventHubNames')[copyIndex('eventHubs')], '-ap'))]", 34 | "authorizationRuleResourceId": "[resourceId('Microsoft.EventHub/namespaces/eventHubs/authorizationRules', parameters('eventHubNamespace'), variables('eventHubNames')[copyIndex('eventHubs')], concat(variables('eventHubNames')[copyIndex('eventHubs')], '-ap'))]" 35 | } 36 | } 37 | ], 38 | "dataBricksResourceGroup": "[concat(resourceGroup().name, '-', parameters('databricksWorkspaceName'), '-', uniqueString(resourceGroup().name, '-', parameters('databricksWorkspaceName')))]" 39 | }, 40 | "resources": [ 41 | { 42 | "type": "Microsoft.Operationalinsights/workspaces", 43 | "name": "[parameters('logAnalyticsWorkspaceName')]", 44 | "apiVersion": "2015-11-01-preview", 45 | "location": "[parameters('logAnalyticsWorkspaceRegion')]", 46 | "properties": { 47 | "sku": { 48 | "name": "pergb2018" 49 | }, 50 | "retentionInDays": 30 51 | } 52 | }, 53 | { 54 | "type": "Microsoft.EventHub/namespaces", 55 | "name": "[parameters('eventHubNamespace')]", 56 | "apiVersion": "2017-04-01", 57 | "location": "[resourceGroup().location]", 58 | "sku": { 59 | "name": "Standard", 60 | "tier": "Standard" 61 | } 62 | }, 63 | { 64 | "type": "Microsoft.EventHub/namespaces/eventhubs", 65 | "name": "[variables('eventHubs')[copyIndex()].name]", 66 | "apiVersion": "2017-04-01", 67 | "copy": { 68 | "count": "[length(variables('eventHubs'))]", 69 | "mode": "Parallel", 70 | "name": "eventHubs" 71 | }, 72 | "properties": { 73 | "messageRetentionInDays": 3, 74 | "partitionCount": 8 75 | }, 76 | "dependsOn": [ 77 | "[parameters('eventHubNamespace')]" 78 | ] 79 | }, 80 | { 81 | "type": "Microsoft.EventHub/namespaces/eventhubs/consumergroups", 82 | "name": "[variables('eventHubs')[copyIndex()].consumerGroupName]", 83 | "apiVersion": "2017-04-01", 84 | "copy": { 85 | "count": "[length(variables('eventHubs'))]", 86 | "mode": "Parallel", 87 | "name": "consumerGroups" 88 | }, 89 | "properties": {}, 90 | "dependsOn": [ 91 | "eventHubs" 92 | ] 93 | }, 94 | { 95 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 96 | "name": "[variables('eventHubs')[copyIndex()].authorizationRuleName]", 97 | "apiVersion": "2017-04-01", 98 | "copy": { 99 | "count": "[length(variables('eventHubs'))]", 100 | "mode": "Parallel", 101 | "name": "authorizationRules" 102 | }, 103 | "properties": { 104 | "rights": [ 105 | "Listen", 106 | "Send" 107 | ] 108 | }, 109 | "dependsOn": [ 110 | "consumerGroups" 111 | ] 112 | }, 113 | { 114 | "type": "Microsoft.Databricks/workspaces", 115 | "name": "[parameters('databricksWorkspaceName')]", 116 | "location": "[resourceGroup().location]", 117 | "apiVersion": "2018-04-01", 118 | "sku": { 119 | "name": "premium" 120 | }, 121 | "properties": { 122 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('dataBricksResourceGroup'))]" 123 | } 124 | }, 125 | { 126 | "name": "[parameters('cosmosDatabaseAccount')]", 127 | "type": "Microsoft.DocumentDB/databaseAccounts", 128 | "apiVersion": "2015-04-08", 129 | 
"location": "[resourceGroup().location]", 130 | "kind": "GlobalDocumentDB", 131 | "tags": { 132 | "defaultExperience": "Cassandra" 133 | }, 134 | "properties": { 135 | "databaseAccountOfferType": "Standard", 136 | "locations": [ 137 | { 138 | "locationName": "[resourceGroup().location]", 139 | "failoverPriority": 0 140 | } 141 | ], 142 | "capabilities": [ 143 | { 144 | "name": "EnableCassandra" 145 | } 146 | ] 147 | } 148 | }, 149 | { 150 | "type": "Microsoft.Resources/deployments", 151 | "apiVersion": "2017-05-10", 152 | "name": "outputGeneration", 153 | "properties": { 154 | "mode": "Incremental", 155 | "template": { 156 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 157 | "contentVersion": "1.0.0.0", 158 | "parameters": { 159 | }, 160 | "variables": { 161 | }, 162 | "resources": [ 163 | ], 164 | "outputs": { 165 | "cosmosDb": { 166 | "type": "object", 167 | "value": { 168 | "username": "[substring(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, 8, sub(indexOf(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, '.'), 8))]", 169 | "hostName": "[substring(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, 8, sub(length(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint), 13))]", 170 | "secret": "[listKeys(parameters('cosmosDatabaseAccount'), '2015-04-08').primaryMasterKey]" 171 | } 172 | }, 173 | "eventHubs": { 174 | "type": "object", 175 | "value": { 176 | "[variables('eventHubNames')[0]]": "[listKeys(variables('eventHubs')[0].authorizationRuleResourceId, '2017-04-01').primaryConnectionString]", 177 | "[variables('eventHubNames')[1]]": "[listKeys(variables('eventHubs')[1].authorizationRuleResourceId, '2017-04-01').primaryConnectionString]" 178 | } 179 | }, 180 | "logAnalytics": { 181 | "type": "object", 182 | "value": { 183 | "workspaceId": "[reference(parameters('logAnalyticsWorkspaceName')).customerId]", 184 | "secret": "[listKeys(parameters('logAnalyticsWorkspaceName'), '2015-11-01-preview').primarySharedKey]" 185 | } 186 | } 187 | } 188 | } 189 | }, 190 | "dependsOn": [ 191 | "authorizationRules", 192 | "[parameters('cosmosDatabaseAccount')]", 193 | "[parameters('databricksWorkspaceName')]" 194 | ] 195 | } 196 | ], 197 | "outputs": { 198 | "cosmosDb": { 199 | "type": "object", 200 | "value": "[reference('outputGeneration').outputs.cosmosDb.value]" 201 | }, 202 | "logAnalytics": { 203 | "type": "object", 204 | "value": "[reference('outputGeneration').outputs.logAnalytics.value]" 205 | }, 206 | "eventHubs": { 207 | "type": "object", 208 | "value": "[reference('outputGeneration').outputs.eventHubs.value]" 209 | } 210 | } 211 | } -------------------------------------------------------------------------------- /azure/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.microsoft.pnp 8 | azure-databricks-ra 9 | 1.0-SNAPSHOT 10 | pom 11 | 12 | AzureDataBricksJob 13 | 14 | 15 | 1.8 16 | 1.8 17 | 2.12.12 18 | 2.12 19 | 3.0.1 20 | UTF-8 21 | UTF-8 22 | 3.0.9 23 | 1.7.30 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | provided 32 | 33 | 34 | org.apache.spark 35 | spark-core_${scala.compat.version} 36 | ${spark.version} 37 | provided 38 | 39 | 40 | org.apache.spark 41 | spark-sql_${scala.compat.version} 42 | ${spark.version} 43 | provided 44 | 45 | 46 | org.apache.spark 47 | spark-streaming_${scala.compat.version} 48 | ${spark.version} 49 | provided 50 | 51 | 52 | org.slf4j 53 | 
slf4j-api 54 | ${slf4j.version} 55 | provided 56 | 57 | 58 | org.scalatest 59 | scalatest_${scala.compat.version} 60 | ${scalatest.version} 61 | test 62 | 63 | 64 | 65 | 66 | 67 | org.scala-lang 68 | scala-library 69 | 70 | 71 | org.apache.spark 72 | spark-core_${scala.compat.version} 73 | 74 | 75 | org.apache.spark 76 | spark-sql_${scala.compat.version} 77 | 78 | 79 | org.apache.spark 80 | spark-streaming_${scala.compat.version} 81 | 82 | 83 | org.slf4j 84 | slf4j-api 85 | 86 | 87 | junit 88 | junit 89 | 4.13.2 90 | test 91 | 92 | 93 | org.scalatest 94 | scalatest_${scala.compat.version} 95 | test 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | net.alchim31.maven 104 | scala-maven-plugin 105 | 3.4.2 106 | 107 | 108 | 109 | compile 110 | testCompile 111 | 112 | 113 | ${scala.version} 114 | ${scala.compat.version} 115 | 116 | -target:jvm-${maven.compiler.target} 117 | -dependencyfile 118 | ${project.build.directory}/.scala_dependencies 119 | 120 | 121 | -source 122 | ${maven.compiler.source} 123 | -target 124 | ${maven.compiler.target} 125 | 126 | 127 | 128 | 129 | 130 | 131 | org.apache.maven.plugins 132 | maven-compiler-plugin 133 | 3.8.0 134 | 135 | 136 | 137 | compile 138 | testCompile 139 | 140 | 141 | 142 | 143 | ${maven.compiler.source} 144 | ${maven.compiler.target} 145 | 146 | -Xlint 147 | 148 | 149 | 150 | 151 | org.apache.maven.plugins 152 | maven-dependency-plugin 153 | 3.0.2 154 | 155 | 156 | copy-dependencies 157 | package 158 | 159 | copy-dependencies 160 | 161 | 162 | runtime 163 | ${project.build.directory} 164 | 165 | 166 | 167 | 168 | 169 | org.apache.maven.plugins 170 | maven-surefire-plugin 171 | 2.22.0 172 | 173 | 174 | 175 | 176 | 177 | 178 | org.apache.maven.plugins 179 | maven-shade-plugin 180 | 3.1.1 181 | 182 | 183 | package 184 | 185 | shade 186 | 187 | 188 | true 189 | 190 | 191 | *:* 192 | 193 | META-INF/*.SF 194 | META-INF/*.DSA 195 | META-INF/*.RSA 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | org.scalatest 205 | scalatest-maven-plugin 206 | 2.0.0 207 | 208 | ${project.build.directory}/surefire-reports 209 | . 
210 | TestSuiteReport.txt 211 | 212 | 213 | 214 | test 215 | 216 | test 217 | 218 | 219 | 220 | 221 | 222 | org.apache.maven.plugins 223 | maven-clean-plugin 224 | 3.1.0 225 | 226 | 227 | auto-clean 228 | initialize 229 | 230 | clean 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | net.alchim31.maven 241 | scala-maven-plugin 242 | 243 | 244 | org.apache.maven.plugins 245 | maven-compiler-plugin 246 | 247 | 248 | org.apache.maven.plugins 249 | maven-dependency-plugin 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-surefire-plugin 254 | 255 | 256 | org.apache.maven.plugins 257 | maven-shade-plugin 258 | 259 | 260 | org.apache.maven.plugins 261 | maven-clean-plugin 262 | 263 | 264 | org.scalatest 265 | scalatest-maven-plugin 266 | 267 | 268 | 269 | -------------------------------------------------------------------------------- /onprem/DataLoader/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to find out which attributes exist for C# debugging 3 | // Use hover for the description of the existing attributes 4 | // For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": ".NET Core Launch (console)", 9 | "type": "coreclr", 10 | "request": "launch", 11 | "preLaunchTask": "build", 12 | "program": "${workspaceRoot}/bin/Debug/netcoreapp2.0/taxi.dll", 13 | "args": [], 14 | "cwd": "${workspaceRoot}", 15 | "stopAtEntry": false, 16 | "console": "internalConsole", 17 | "env": { 18 | "RIDE_EVENT_HUB": "", 19 | "FARE_EVENT_HUB": "", 20 | "RIDE_DATA_FILE_PATH": "", 21 | "MINUTES_TO_LEAD": "", 22 | "PUSH_RIDE_DATA_FIRST": "" 23 | } 24 | }, 25 | { 26 | "name": ".NET Core Attach", 27 | "type": "coreclr", 28 | "request": "attach", 29 | "processId": "${command:pickProcess}" 30 | } 31 | ,] 32 | } 33 | -------------------------------------------------------------------------------- /onprem/DataLoader/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "build", 6 | "command": "dotnet", 7 | "type": "process", 8 | "group": { 9 | "kind": "build", 10 | "isDefault": true 11 | }, 12 | "args": [ 13 | "build", 14 | "${workspaceFolder}/taxi.csproj" 15 | ], 16 | "problemMatcher": "$msCompile" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /onprem/DataLoader/DataFormat.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | public enum DataFormat 4 | { 5 | Csv, 6 | Json 7 | } 8 | } -------------------------------------------------------------------------------- /onprem/DataLoader/ObjectPool.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Collections.Concurrent; 5 | 6 | public class ObjectPool 7 | where T: class 8 | { 9 | private BlockingCollection _pool = new BlockingCollection(); 10 | private Func _factory; 11 | private int _poolSize; 12 | 13 | public ObjectPool(Func factory, int poolSize = 10) 14 | { 15 | _factory = factory ?? 
throw new ArgumentNullException(nameof(factory)); 16 | _poolSize = poolSize; 17 | Initialize(); 18 | } 19 | 20 | private void Initialize() 21 | { 22 | for (int i = 0; i < _poolSize; i++) 23 | { 24 | _pool.Add(new ObjectPoolObject(_factory(), this)); 25 | } 26 | } 27 | 28 | public ObjectPoolObject GetObject() 29 | { 30 | return _pool.Take(); 31 | } 32 | 33 | private void Return(ObjectPoolObject obj) 34 | { 35 | if (obj == null) 36 | { 37 | throw new ArgumentNullException(nameof(obj)); 38 | } 39 | 40 | _pool.Add(obj); 41 | } 42 | 43 | public class ObjectPoolObject : IDisposable 44 | { 45 | private T _obj; 46 | private ObjectPool _objectPool; 47 | 48 | internal ObjectPoolObject(T obj, ObjectPool objectPool) 49 | { 50 | _obj = obj ?? throw new ArgumentNullException(nameof(obj)); 51 | _objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); 52 | } 53 | 54 | public void Dispose() 55 | { 56 | _objectPool.Return(this); 57 | } 58 | 59 | public T Value 60 | { 61 | get => _obj; 62 | } 63 | 64 | public static explicit operator T(ObjectPoolObject poolObject) 65 | { 66 | return poolObject._obj; 67 | } 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /onprem/DataLoader/Program.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Collections.Concurrent; 5 | using System.Collections.Generic; 6 | using System.IO; 7 | using System.IO.Compression; 8 | using System.Linq; 9 | using System.Text; 10 | using System.Threading; 11 | using System.Threading.Tasks; 12 | using Microsoft.Azure.EventHubs; 13 | using Newtonsoft.Json; 14 | using System.Threading.Tasks.Dataflow; 15 | 16 | 17 | class Program 18 | { 19 | 20 | private static CancellationTokenSource cts; 21 | private static async Task ReadData(ICollection pathList, Func factory, 22 | ObjectPool pool, int randomSeed, AsyncConsole console, int waittime, DataFormat dataFormat) 23 | where T : TaxiData 24 | { 25 | 26 | 27 | if (pathList == null) 28 | { 29 | throw new ArgumentNullException(nameof(pathList)); 30 | } 31 | 32 | if (factory == null) 33 | { 34 | throw new ArgumentNullException(nameof(factory)); 35 | } 36 | 37 | if (pool == null) 38 | { 39 | throw new ArgumentNullException(nameof(pool)); 40 | } 41 | 42 | if (console == null) 43 | { 44 | throw new ArgumentNullException(nameof(console)); 45 | } 46 | 47 | if (waittime > 0) 48 | { 49 | TimeSpan span = TimeSpan.FromMilliseconds(waittime); 50 | await Task.Delay(span); 51 | } 52 | 53 | string typeName = typeof(T).Name; 54 | Random random = new Random(randomSeed); 55 | 56 | // buffer block that holds the messages . consumer will fetch records from this block asynchronously. 57 | BufferBlock buffer = new BufferBlock(new DataflowBlockOptions() 58 | { 59 | BoundedCapacity = 100000 60 | }); 61 | 62 | // consumer that sends the data to event hub asynchronoulsy. 
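// The pipeline below is a TPL Dataflow producer/consumer pair: the file reader pushes parsed
// records into the bounded BufferBlock declared above, and the linked ActionBlock drains it,
// sending each record to Event Hubs through a pooled EventHubClient. PropagateCompletion lets
// buffer.Complete() flow through to the consumer; BoundedCapacity (100000) applies back-pressure
// on the reader, MaxDegreeOfParallelism (100) bounds concurrent sends, and a faulted send cancels
// the shared CancellationTokenSource so both sides stop.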
63 | var consumer = new ActionBlock<T>( 64 | (t) => 65 | { 66 | using (var client = pool.GetObject()) 67 | { 68 | return client.Value.SendAsync(new EventData(Encoding.UTF8.GetBytes( 69 | t.GetData(dataFormat))), t.PartitionKey).ContinueWith( 70 | async task => 71 | { 72 | cts.Cancel(); 73 | await console.WriteLine(task.Exception.InnerException.Message); 74 | await console.WriteLine($"event hub client failed for {typeName}"); 75 | } 76 | , TaskContinuationOptions.OnlyOnFaulted 77 | ); 78 | } 79 | }, 80 | new ExecutionDataflowBlockOptions 81 | { 82 | BoundedCapacity = 100000, 83 | CancellationToken = cts.Token, 84 | MaxDegreeOfParallelism = 100, 85 | } 86 | ); 87 | 88 | // link the buffer to the consumer. 89 | buffer.LinkTo(consumer, new DataflowLinkOptions() 90 | { 91 | PropagateCompletion = true 92 | }); 93 | 94 | long messages = 0; 95 | 96 | List<Task> taskList = new List<Task>(); 97 | 98 | var readTask = Task.Factory.StartNew( 99 | async () => 100 | { 101 | // iterate through the path list and act on each file from here on 102 | foreach (var path in pathList) 103 | { 104 | using (var archive = new ZipArchive(File.OpenRead(path), 105 | ZipArchiveMode.Read)) 106 | { 107 | foreach (var entry in archive.Entries) 108 | { 109 | using (var reader = new StreamReader(entry.Open())) 110 | { 111 | 112 | var header = reader.ReadLines() 113 | .First(); 114 | // read the remaining data lines 115 | var lines = reader.ReadLines() 116 | .Skip(1); 117 | 118 | 119 | // for each line, send to the event hub 120 | foreach (var line in lines) 121 | { 122 | // proceed only if the previous send operation was successful; 123 | // cancellation is requested if a send fails. 124 | if (cts.IsCancellationRequested) 125 | { 126 | break; 127 | } 128 | await buffer.SendAsync(factory(line, header)).ConfigureAwait(false); 129 | if (++messages % 10000 == 0) 130 | { 131 | // brief random delay after every 10000 buffered messages to pace the producer 132 | await Task.Delay(random.Next(100, 1000)) 133 | .ConfigureAwait(false); 134 | await console.WriteLine($"Created {messages} records for {typeName}").ConfigureAwait(false); 135 | } 136 | 137 | } 138 | } 139 | 140 | if (cts.IsCancellationRequested) 141 | { 142 | break; 143 | } 144 | } 145 | 146 | if (cts.IsCancellationRequested) 147 | { 148 | break; 149 | } 150 | } 151 | 152 | buffer.Complete(); 153 | await Task.WhenAll(buffer.Completion, consumer.Completion); 154 | await console.WriteLine($"Created total {messages} records for {typeName}").ConfigureAwait(false); 155 | } 156 | } 157 | ).Unwrap().ContinueWith( 158 | async task => 159 | { 160 | cts.Cancel(); 161 | await console.WriteLine($"failed to read files for {typeName}").ConfigureAwait(false); 162 | await console.WriteLine(task.Exception.InnerException.Message).ConfigureAwait(false); 163 | } 164 | , TaskContinuationOptions.OnlyOnFaulted 165 | ); 166 | 167 | 168 | // await consumer and reader completion. If sending fails at any point, 169 | // the exception is thrown and caught.
This cancels the read operation and stops all further work. 170 | 171 | try 172 | { 173 | await Task.WhenAll(consumer.Completion, readTask); 174 | } 175 | catch (Exception ex) 176 | { 177 | cts.Cancel(); 178 | await console.WriteLine(ex.Message).ConfigureAwait(false); 179 | await console.WriteLine($"failed to send files for {typeName}").ConfigureAwait(false); 180 | throw; 181 | } 182 | 183 | } 184 | 185 | 186 | private static (string RideConnectionString, 187 | string FareConnectionString, 188 | ICollection<string> RideDataFiles, 189 | ICollection<string> TripDataFiles, 190 | int MillisecondsToRun, 191 | int MillisecondsToLead, 192 | bool sendRideDataFirst) ParseArguments() 193 | { 194 | 195 | var rideConnectionString = Environment.GetEnvironmentVariable("RIDE_EVENT_HUB"); 196 | var fareConnectionString = Environment.GetEnvironmentVariable("FARE_EVENT_HUB"); 197 | var rideDataFilePath = Environment.GetEnvironmentVariable("RIDE_DATA_FILE_PATH"); 198 | var numberOfMillisecondsToRun = (int.TryParse(Environment.GetEnvironmentVariable("SECONDS_TO_RUN"), out int outputSecondToRun) ? outputSecondToRun : 0) * 1000; 199 | var numberOfMillisecondsToLead = (int.TryParse(Environment.GetEnvironmentVariable("MINUTES_TO_LEAD"), out int outputMinutesToLead) ? outputMinutesToLead : 0) * 60000; 200 | var pushRideDataFirst = bool.TryParse(Environment.GetEnvironmentVariable("PUSH_RIDE_DATA_FIRST"), out Boolean outputPushRideDataFirst) ? outputPushRideDataFirst : false; 201 | 202 | if (string.IsNullOrWhiteSpace(rideConnectionString)) 203 | { 204 | throw new ArgumentException("rideConnectionString must be provided"); 205 | } 206 | 207 | if (string.IsNullOrWhiteSpace(fareConnectionString)) 208 | { 209 | throw new ArgumentException("fareConnectionString must be provided"); 210 | } 211 | 212 | if (string.IsNullOrWhiteSpace(rideDataFilePath)) 213 | { 214 | throw new ArgumentException("rideDataFilePath must be provided"); 215 | } 216 | 217 | if (!Directory.Exists(rideDataFilePath)) 218 | { 219 | throw new ArgumentException("ride file path does not exist"); 220 | } 221 | // get only the ride files in order. trip_data_1.zip gets read before trip_data_2.zip 222 | var rideDataFiles = Directory.EnumerateFiles(rideDataFilePath) 223 | .Where(p => Path.GetFileNameWithoutExtension(p).Contains("trip_data")) 224 | .OrderBy(p => 225 | { 226 | var filename = Path.GetFileNameWithoutExtension(p); 227 | var indexString = filename.Substring(filename.LastIndexOf('_') + 1); 228 | var index = int.TryParse(indexString, out int i) ? i : throw new ArgumentException("tripdata file must be named in format trip_data_*.zip"); 229 | return index; 230 | }).ToArray(); 231 | 232 | // get only the fare files in order 233 | var fareDataFiles = Directory.EnumerateFiles(rideDataFilePath) 234 | .Where(p => Path.GetFileNameWithoutExtension(p).Contains("trip_fare")) 235 | .OrderBy(p => 236 | { 237 | var filename = Path.GetFileNameWithoutExtension(p); 238 | var indexString = filename.Substring(filename.LastIndexOf('_') + 1); 239 | var index = int.TryParse(indexString, out int i) ?
i : throw new ArgumentException("tripfare file must be named in format trip_fare_*.zip"); 240 | return index; 241 | }).ToArray(); 242 | 243 | if (rideDataFiles.Length == 0) 244 | { 245 | throw new ArgumentException($"trip data files at {rideDataFilePath} does not exist"); 246 | } 247 | 248 | if (fareDataFiles.Length == 0) 249 | { 250 | throw new ArgumentException($"fare data files at {rideDataFilePath} does not exist"); 251 | } 252 | 253 | return (rideConnectionString, fareConnectionString, rideDataFiles, fareDataFiles, numberOfMillisecondsToRun, numberOfMillisecondsToLead, pushRideDataFirst); 254 | } 255 | 256 | 257 | // blocking collection that helps to print to console the messages on progress on the read and send of files to event hub. 258 | private class AsyncConsole 259 | { 260 | private BlockingCollection _blockingCollection = new BlockingCollection(); 261 | private CancellationToken _cancellationToken; 262 | private Task _writerTask; 263 | 264 | public AsyncConsole(CancellationToken cancellationToken = default(CancellationToken)) 265 | { 266 | _cancellationToken = cancellationToken; 267 | _writerTask = Task.Factory.StartNew((state) => 268 | { 269 | var token = (CancellationToken)state; 270 | string msg; 271 | while (!token.IsCancellationRequested) 272 | { 273 | if (_blockingCollection.TryTake(out msg, 500)) 274 | { 275 | Console.WriteLine(msg); 276 | } 277 | } 278 | 279 | while (_blockingCollection.TryTake(out msg, 100)) 280 | { 281 | Console.WriteLine(msg); 282 | } 283 | }, _cancellationToken, TaskCreationOptions.LongRunning); 284 | } 285 | 286 | public Task WriteLine(string toWrite) 287 | { 288 | _blockingCollection.Add(toWrite); 289 | return Task.FromResult(0); 290 | } 291 | 292 | public Task WriterTask 293 | { 294 | get { return _writerTask; } 295 | } 296 | } 297 | 298 | // start of the read task 299 | public static async Task Main(string[] args) 300 | { 301 | try 302 | { 303 | var arguments = ParseArguments(); 304 | var rideClient = EventHubClient.CreateFromConnectionString( 305 | arguments.RideConnectionString 306 | ); 307 | var fareClient = EventHubClient.CreateFromConnectionString( 308 | arguments.FareConnectionString 309 | ); 310 | 311 | cts = arguments.MillisecondsToRun == 0 ? 
new CancellationTokenSource() : new CancellationTokenSource(arguments.MillisecondsToRun); 312 | 313 | Console.CancelKeyPress += (s, e) => 314 | { 315 | //Console.WriteLine("Cancelling data generation"); 316 | cts.Cancel(); 317 | e.Cancel = true; 318 | }; 319 | 320 | 321 | AsyncConsole console = new AsyncConsole(cts.Token); 322 | 323 | var rideClientPool = new ObjectPool(() => EventHubClient.CreateFromConnectionString(arguments.RideConnectionString), 100); 324 | var fareClientPool = new ObjectPool(() => EventHubClient.CreateFromConnectionString(arguments.FareConnectionString), 100); 325 | 326 | 327 | var numberOfMillisecondsToLead = arguments.MillisecondsToLead; 328 | var pushRideDataFirst = arguments.sendRideDataFirst; 329 | 330 | var rideTaskWaitTime = 0; 331 | var fareTaskWaitTime = 0; 332 | 333 | if (numberOfMillisecondsToLead > 0) 334 | { 335 | if (!pushRideDataFirst) 336 | { 337 | rideTaskWaitTime = numberOfMillisecondsToLead; 338 | } 339 | else 340 | { 341 | fareTaskWaitTime = numberOfMillisecondsToLead; 342 | } 343 | } 344 | 345 | 346 | var rideTask = ReadData(arguments.RideDataFiles, 347 | TaxiRide.FromString, rideClientPool, 100, console, 348 | rideTaskWaitTime, DataFormat.Json); 349 | 350 | var fareTask = ReadData(arguments.TripDataFiles, 351 | TaxiFare.FromString, fareClientPool, 200, console, 352 | fareTaskWaitTime, DataFormat.Csv); 353 | 354 | 355 | await Task.WhenAll(rideTask, fareTask, console.WriterTask); 356 | Console.WriteLine("Data generation complete"); 357 | } 358 | catch (Exception ex) 359 | { 360 | Console.WriteLine(ex.Message); 361 | Console.WriteLine("Data generation failed"); 362 | return 1; 363 | } 364 | 365 | return 0; 366 | } 367 | } 368 | } -------------------------------------------------------------------------------- /onprem/DataLoader/StreamReaderExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using Microsoft.Azure.EventHubs; 6 | 7 | namespace Taxi 8 | { 9 | public static class StreamReaderExtensions 10 | { 11 | public static IEnumerable ReadLines(this StreamReader reader) 12 | { 13 | if (reader == null) 14 | { 15 | throw new ArgumentNullException(nameof(reader)); 16 | } 17 | 18 | string line = null; 19 | while ((line = reader.ReadLine()) != null) 20 | { 21 | yield return line; 22 | } 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiData.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public abstract class TaxiData 10 | { 11 | public TaxiData() 12 | { 13 | } 14 | 15 | [JsonProperty] 16 | public long Medallion { get; set; } 17 | 18 | [JsonProperty] 19 | public long HackLicense { get; set; } 20 | 21 | [JsonProperty] 22 | public string VendorId { get; set; } 23 | 24 | [JsonProperty] 25 | public DateTimeOffset PickupTime { get; set; } 26 | 27 | [JsonIgnore] 28 | public string PartitionKey 29 | { 30 | get => $"{Medallion}_{HackLicense}_{VendorId}"; 31 | } 32 | 33 | [JsonIgnore] 34 | protected string CsvHeader { get; set; } 35 | 36 | 37 | [JsonIgnore] 38 | protected string CsvString { get; set; } 39 | 40 | public string GetData(DataFormat dataFormat) 41 | { 42 | if (dataFormat == 
DataFormat.Csv) 43 | { 44 | return $"{CsvHeader}\r\n{CsvString}"; 45 | } 46 | else if (dataFormat == DataFormat.Json) 47 | { 48 | return JsonConvert.SerializeObject(this); 49 | } 50 | else 51 | { 52 | throw new ArgumentException($"Invalid DataFormat: {dataFormat}"); 53 | } 54 | } 55 | } 56 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiFare.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public class TaxiFare : TaxiData 10 | 11 | { 12 | public TaxiFare() 13 | { 14 | } 15 | 16 | [JsonProperty] 17 | public string PaymentType { get; set; } 18 | 19 | [JsonProperty] 20 | public float FareAmount { get; set; } 21 | 22 | [JsonProperty] 23 | public float Surcharge { get; set; } 24 | 25 | [JsonProperty("mtaTax")] 26 | public float MTATax { get; set; } 27 | 28 | [JsonProperty] 29 | public float TipAmount { get; set; } 30 | 31 | [JsonProperty] 32 | public float TollsAmount { get; set; } 33 | 34 | [JsonProperty] 35 | public float TotalAmount { get; set; } 36 | 37 | public static TaxiFare FromString(string line,string header) 38 | { 39 | if (string.IsNullOrWhiteSpace(line)) 40 | { 41 | throw new ArgumentException($"{nameof(line)} cannot be null, empty, or only whitespace"); 42 | } 43 | 44 | string[] tokens = line.Split(','); 45 | if (tokens.Length != 11) 46 | { 47 | throw new ArgumentException($"Invalid record: {line}"); 48 | } 49 | 50 | var fare = new TaxiFare(); 51 | fare.CsvString = line; 52 | fare.CsvHeader = header; 53 | try 54 | { 55 | fare.Medallion = long.Parse(tokens[0]); 56 | fare.HackLicense = long.Parse(tokens[1]); 57 | fare.VendorId = tokens[2]; 58 | fare.PickupTime = DateTimeOffset.ParseExact( 59 | tokens[3], "yyyy-MM-dd HH:mm:ss", 60 | CultureInfo.InvariantCulture, 61 | DateTimeStyles.AssumeUniversal); 62 | fare.PaymentType = tokens[4]; 63 | fare.FareAmount = float.TryParse(tokens[5], out float result) ? result : 0.0f; 64 | fare.Surcharge = float.TryParse(tokens[6], out result) ? result : 0.0f; 65 | fare.MTATax = float.TryParse(tokens[7], out result) ? result : 0.0f; 66 | fare.TipAmount = float.TryParse(tokens[8], out result) ? result : 0.0f; 67 | fare.TollsAmount = float.TryParse(tokens[9], out result) ? result : 0.0f; 68 | fare.TotalAmount = float.TryParse(tokens[10], out result) ? 
result : 0.0f; 69 | return fare; 70 | } 71 | catch (Exception ex) 72 | { 73 | throw new ArgumentException($"Invalid record: {line}", ex); 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiRide.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public class TaxiRide : TaxiData 10 | 11 | { 12 | public TaxiRide() 13 | { 14 | } 15 | 16 | [JsonProperty] 17 | public int RateCode { get; set; } 18 | 19 | [JsonProperty] 20 | public string StoreAndForwardFlag { get; set; } 21 | 22 | [JsonProperty] 23 | public DateTimeOffset DropoffTime { get; set; } 24 | 25 | [JsonProperty] 26 | public int PassengerCount { get; set; } 27 | 28 | [JsonProperty] 29 | public float TripTimeInSeconds { get; set; } 30 | 31 | [JsonProperty] 32 | public float TripDistanceInMiles { get; set; } 33 | 34 | [JsonProperty] 35 | public float PickupLon { get; set; } 36 | 37 | [JsonProperty] 38 | public float PickupLat { get; set; } 39 | 40 | [JsonProperty] 41 | public float DropoffLon { get; set; } 42 | 43 | [JsonProperty] 44 | public float DropoffLat { get; set; } 45 | 46 | public static TaxiRide FromString(string line,string header) 47 | { 48 | if (string.IsNullOrWhiteSpace(line)) 49 | { 50 | throw new ArgumentException($"{nameof(line)} cannot be null, empty, or only whitespace"); 51 | } 52 | 53 | string[] tokens = line.Split(','); 54 | if (tokens.Length != 14) 55 | { 56 | throw new ArgumentException($"Invalid record: {line}"); 57 | } 58 | 59 | var ride = new TaxiRide(); 60 | ride.CsvString = line; 61 | ride.CsvHeader = header; 62 | try 63 | { 64 | ride.Medallion = long.Parse(tokens[0]); 65 | ride.HackLicense = long.Parse(tokens[1]); 66 | ride.VendorId = tokens[2]; 67 | ride.RateCode = int.Parse(tokens[3]); 68 | ride.StoreAndForwardFlag = tokens[4]; 69 | ride.PickupTime = DateTimeOffset.ParseExact( 70 | tokens[5], "yyyy-MM-dd HH:mm:ss", 71 | CultureInfo.InvariantCulture, 72 | DateTimeStyles.AssumeUniversal); 73 | ride.DropoffTime = DateTimeOffset.ParseExact( 74 | tokens[6], "yyyy-MM-dd HH:mm:ss", 75 | CultureInfo.InvariantCulture, 76 | DateTimeStyles.AssumeUniversal); 77 | ride.PassengerCount = int.Parse(tokens[7]); 78 | ride.TripTimeInSeconds = float.Parse(tokens[8]); 79 | ride.TripDistanceInMiles = float.Parse(tokens[9]); 80 | 81 | ride.PickupLon = float.TryParse(tokens[10], out float result) ? result : 0.0f; 82 | ride.PickupLat = float.TryParse(tokens[11], out result) ? result : 0.0f; 83 | ride.DropoffLon = float.TryParse(tokens[12], out result) ? result : 0.0f; 84 | ride.DropoffLat = float.TryParse(tokens[13], out result) ? 
result : 0.0f; 85 | return ride; 86 | } 87 | catch (Exception ex) 88 | { 89 | throw new ArgumentException($"Invalid record: {line}", ex); 90 | } 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /onprem/DataLoader/taxi.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.1 6 | latest 7 | win10-x64 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /onprem/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/dotnet/core/sdk:3.1 as build 2 | RUN apt-get update 3 | RUN apt-get install -y git 4 | RUN git clone --recursive https://github.com/mspnp/azure-databricks-streaming-analytics.git && cd azure-databricks-streaming-analytics && git fetch && git checkout master 5 | WORKDIR azure-databricks-streaming-analytics/onprem/DataLoader 6 | RUN dotnet build 7 | RUN dotnet publish -f netcoreapp3.1 -c Release 8 | FROM mcr.microsoft.com/dotnet/core/runtime:3.1 AS runtime 9 | WORKDIR DataLoader 10 | COPY --from=build azure-databricks-streaming-analytics/onprem/DataLoader/bin/Release/netcoreapp3.1/publish . 11 | ENTRYPOINT ["dotnet" , "taxi.dll"] 12 | -------------------------------------------------------------------------------- /onprem/main.env: -------------------------------------------------------------------------------- 1 | RIDE_EVENT_HUB= 2 | FARE_EVENT_HUB= 3 | RIDE_DATA_FILE_PATH=/DataFile/FOIL2013 4 | MINUTES_TO_LEAD=0 5 | PUSH_RIDE_DATA_FIRST=false 6 | --------------------------------------------------------------------------------
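Usage note: a minimal sketch of how the data loader might be built and run with the Dockerfile and main.env above, assuming the trip_data_*.zip and trip_fare_*.zip archives sit in a local folder; the host path /data/FOIL2013 and the image tag taxi-dataloader are illustrative choices rather than names from this repository, and the two Event Hub connection strings in main.env must be filled in first.

docker build -t taxi-dataloader ./onprem
docker run --rm --env-file ./onprem/main.env -v /data/FOIL2013:/DataFile/FOIL2013 taxi-dataloader

The -v mount targets /DataFile/FOIL2013 because that is the RIDE_DATA_FILE_PATH configured in main.env.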