├── .github └── dependabot.yml ├── .gitignore ├── README.md ├── SECURITY.md ├── azure ├── AzureDataBricksJob │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── com │ │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ ├── GeoFinder.java │ │ │ │ └── MDCCloseableFactory.java │ │ ├── resources │ │ │ └── com │ │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ └── azuredatabricksjob │ │ │ │ └── log4j.properties │ │ └── scala │ │ │ ├── com │ │ │ └── microsoft │ │ │ │ └── pnp │ │ │ │ ├── CassandraSinkForeach.scala │ │ │ │ ├── JobConfiguration.scala │ │ │ │ ├── StreamingMetricsListener.scala │ │ │ │ ├── TaxiCabReader.scala │ │ │ │ ├── TryWith.scala │ │ │ │ ├── Utils.scala │ │ │ │ └── package.scala │ │ │ └── org │ │ │ └── apache │ │ │ └── spark │ │ │ ├── metrics │ │ │ └── source │ │ │ │ ├── AppAccumulators.scala │ │ │ │ └── AppMetrics.scala │ │ │ └── sql │ │ │ └── catalyst │ │ │ ├── csv │ │ │ ├── CSVExprUtils.scala │ │ │ ├── CSVHeaderChecker.scala │ │ │ ├── CSVOptions.scala │ │ │ └── UnivocityParser.scala │ │ │ ├── expressions │ │ │ ├── ExprUtils.scala │ │ │ └── csvExpressions.scala │ │ │ └── util │ │ │ └── FailureSafeParser.scala │ │ └── test │ │ └── scala │ │ ├── com │ │ └── microsoft │ │ │ └── pnp │ │ │ ├── SparkSuitBase.scala │ │ │ ├── TaxiFareMapperTester.scala │ │ │ └── TaxiRideMapperTester.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── streaming │ │ └── UtilsTests.scala ├── deployresources.json └── pom.xml └── onprem ├── DataLoader ├── .vscode │ ├── launch.json │ └── tasks.json ├── DataFormat.cs ├── ObjectPool.cs ├── Program.cs ├── StreamReaderExtensions.cs ├── TaxiData.cs ├── TaxiFare.cs ├── TaxiRide.cs └── taxi.csproj ├── Dockerfile └── main.env /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "nuget" # See documentation for possible values 9 | directory: "/onprem/DataLoader" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "docker" # See documentation for possible values 13 | directory: "/onprem" # Location of package manifests 14 | schedule: 15 | interval: "weekly" 16 | - package-ecosystem: "maven" # See documentation for possible values 17 | directory: "/azure" # Location of package manifests 18 | schedule: 19 | interval: "weekly" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | build/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | 28 | # MSTest test Results 29 | [Tt]est[Rr]esult*/ 30 | [Bb]uild[Ll]og.* 31 | 32 | # NUNIT 33 | *.VisualState.xml 34 | TestResult.xml 35 | 36 | # Build Results of an ATL Project 37 | [Dd]ebugPS/ 38 | [Rr]eleasePS/ 39 | dlldata.c 40 | 41 | # DNX 42 | project.lock.json 43 | artifacts/ 44 | 45 | *_i.c 46 | *_p.c 47 | *_i.h 48 | *.ilk 49 | *.meta 50 | *.obj 51 | *.pch 52 | *.pdb 53 | *.pgc 54 | *.pgd 55 | *.rsp 56 | *.sbr 57 | *.tlb 58 | *.tli 59 | *.tlh 60 | *.tmp 61 | *.tmp_proj 62 | *.log 63 | *.vspscc 64 | *.vssscc 65 | .builds 66 | *.pidb 67 | *.svclog 68 | *.scc 69 | 70 | # Chutzpah Test files 71 | _Chutzpah* 72 | 73 | # Visual C++ cache files 74 | ipch/ 75 | *.aps 76 | *.ncb 77 | *.opensdf 78 | *.sdf 79 | *.cachefile 80 | 81 | # Visual Studio profiler 82 | *.psess 83 | *.vsp 84 | *.vspx 85 | 86 | # TFS 2012 Local Workspace 87 | $tf/ 88 | 89 | # Guidance Automation Toolkit 90 | *.gpState 91 | 92 | # ReSharper is a .NET coding add-in 93 | _ReSharper*/ 94 | *.[Rr]e[Ss]harper 95 | *.DotSettings.user 96 | 97 | # JustCode is a .NET coding add-in 98 | .JustCode 99 | 100 | # TeamCity is a build add-in 101 | _TeamCity* 102 | 103 | # DotCover is a Code Coverage Tool 104 | *.dotCover 105 | 106 | # NCrunch 107 | _NCrunch_* 108 | .*crunch*.local.xml 109 | 110 | # MightyMoose 111 | *.mm.* 112 | AutoTest.Net/ 113 | 114 | # Web workbench (sass) 115 | .sass-cache/ 116 | 117 | # Installshield output folder 118 | [Ee]xpress/ 119 | 120 | # DocProject is a documentation generator add-in 121 | DocProject/buildhelp/ 122 | DocProject/Help/*.HxT 123 | DocProject/Help/*.HxC 124 | DocProject/Help/*.hhc 125 | DocProject/Help/*.hhk 126 | DocProject/Help/*.hhp 127 | DocProject/Help/Html2 128 | DocProject/Help/html 129 | 130 | # Click-Once directory 131 | publish/ 132 | 133 | # Publish Web Output 134 | *.[Pp]ublish.xml 135 | *.azurePubxml 136 | ## TODO: Comment the next line if you want to checkin your 137 | ## web deploy settings but do note that will include unencrypted 138 | ## passwords 139 | #*.pubxml 140 | 141 | *.publishproj 142 | 143 | # NuGet Packages 144 | *.nupkg 145 | # The packages folder can be ignored because of Package Restore 146 | **/packages/* 147 | # except build/, which is used as an MSBuild target. 148 | !**/packages/build/ 149 | # Uncomment if necessary however generally it will be regenerated when needed 150 | #!**/packages/repositories.config 151 | 152 | # Windows Azure Build Output 153 | csx/ 154 | *.build.csdef 155 | 156 | # Windows Store app package directory 157 | AppPackages/ 158 | 159 | # Visual Studio cache files 160 | # files ending in .cache can be ignored 161 | *.[Cc]ache 162 | # but keep track of directories ending in .cache 163 | !*.[Cc]ache/ 164 | 165 | # Others 166 | ClientBin/ 167 | [Ss]tyle[Cc]op.* 168 | ~$* 169 | *~ 170 | *.dbmdl 171 | *.dbproj.schemaview 172 | *.pfx 173 | *.publishsettings 174 | node_modules/ 175 | orleans.codegen.cs 176 | 177 | # RIA/Silverlight projects 178 | Generated_Code/ 179 | 180 | # Backup & report files from converting an old project file 181 | # to a newer Visual Studio version. 
Backup files are not needed, 182 | # because we have git ;-) 183 | _UpgradeReport_Files/ 184 | Backup*/ 185 | UpgradeLog*.XML 186 | UpgradeLog*.htm 187 | 188 | # SQL Server files 189 | *.mdf 190 | *.ldf 191 | 192 | # Business Intelligence projects 193 | *.rdl.data 194 | *.bim.layout 195 | *.bim_*.settings 196 | 197 | # Microsoft Fakes 198 | FakesAssemblies/ 199 | 200 | # Node.js Tools for Visual Studio 201 | .ntvs_analysis.dat 202 | 203 | # Visual Studio 6 build log 204 | *.plg 205 | 206 | # Visual Studio 6 workspace options file 207 | *.opt 208 | 209 | # LightSwitch generated files 210 | GeneratedArtifacts/ 211 | _Pvt_Extensions/ 212 | ModelManifest.xml 213 | 214 | #Not to include .ds_store file 215 | .DS_Store 216 | 217 | #Not to include target files 218 | */target/** 219 | .idea 220 | project 221 | 222 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 223 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 224 | 225 | # User-specific stuff 226 | .idea/**/workspace.xml 227 | .idea/**/tasks.xml 228 | .idea/**/usage.statistics.xml 229 | .idea/**/dictionaries 230 | .idea/**/shelf 231 | 232 | # Generated files 233 | .idea/**/contentModel.xml 234 | 235 | # Sensitive or high-churn files 236 | .idea/**/dataSources/ 237 | .idea/**/dataSources.ids 238 | .idea/**/dataSources.local.xml 239 | .idea/**/sqlDataSources.xml 240 | .idea/**/dynamic.xml 241 | .idea/**/uiDesigner.xml 242 | .idea/**/dbnavigator.xml 243 | 244 | # Gradle 245 | .idea/**/gradle.xml 246 | .idea/**/libraries 247 | 248 | # Gradle and Maven with auto-import 249 | # When using Gradle or Maven with auto-import, you should exclude module files, 250 | # since they will be recreated, and may cause churn. Uncomment if using 251 | # auto-import. 
252 | # .idea/modules.xml 253 | # .idea/*.iml 254 | # .idea/modules 255 | 256 | # CMake 257 | cmake-build-*/ 258 | 259 | # Mongo Explorer plugin 260 | .idea/**/mongoSettings.xml 261 | 262 | # File-based project format 263 | *.iws 264 | 265 | # IntelliJ 266 | out/ 267 | 268 | # mpeltonen/sbt-idea plugin 269 | .idea_modules/ 270 | 271 | # JIRA plugin 272 | atlassian-ide-plugin.xml 273 | 274 | # Cursive Clojure plugin 275 | .idea/replstate.xml 276 | 277 | # Crashlytics plugin (for Android Studio and IntelliJ) 278 | com_crashlytics_export_strings.xml 279 | crashlytics.properties 280 | crashlytics-build.properties 281 | fabric.properties 282 | 283 | # Editor-based Rest Client 284 | .idea/httpRequests 285 | 286 | # Java 287 | # Compiled class file 288 | *.class 289 | 290 | # Log file 291 | *.log 292 | 293 | # BlueJ files 294 | *.ctxt 295 | 296 | # Mobile Tools for Java (J2ME) 297 | .mtj.tmp/ 298 | 299 | # Package Files # 300 | *.jar 301 | *.war 302 | *.nar 303 | *.ear 304 | *.zip 305 | *.tar.gz 306 | *.rar 307 | 308 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 309 | hs_err_pid* 310 | 311 | # Maven 312 | target/ 313 | pom.xml.tag 314 | pom.xml.releaseBackup 315 | pom.xml.versionsBackup 316 | pom.xml.next 317 | release.properties 318 | dependency-reduced-pom.xml 319 | buildNumber.properties 320 | .mvn/timing.properties 321 | .mvn/wrapper/maven-wrapper.jar 322 | 323 | # This is to ignore IntelliJ project files, since we use Maven 324 | *.iml 325 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stream processing with Azure Databricks 2 | 3 | This reference architecture shows an end-to-end [stream processing](https://docs.microsoft.com/azure/architecture/data-guide/big-data/real-time-processing) pipeline. This type of pipeline has four stages: ingest, process, store, and analysis and reporting. For this reference architecture, the pipeline ingests data from two sources, performs a join on related records from each stream, enriches the result, and calculates an average in real time. The results are stored for further analysis. 4 | 5 | ![](https://github.com/mspnp/architecture-center/blob/master/docs/reference-architectures/data/images/stream-processing-databricks.png) 6 | 7 | **Scenario**: A taxi company collects data about each taxi trip. For this scenario, we assume there are two separate devices sending data. The taxi has a meter that sends information about each ride — the duration, distance, and pickup and dropoff locations. A separate device accepts payments from customers and sends data about fares. To spot ridership trends, the taxi company wants to calculate the average tip per mile driven, in real time, for each neighborhood. 8 | 9 | ## Deploy the solution 10 | 11 | A deployment for this reference architecture is available on [GitHub](https://github.com/mspnp/azure-databricks-streaming-analytics). 12 | 13 | ### Prerequisites 14 | 15 | 1. Clone, fork, or download this GitHub repository. 16 | 17 | 2. Install [Docker](https://www.docker.com/) to run the data generator. 18 | 19 | 3. Install [Azure CLI 2.0](https://docs.microsoft.com/cli/azure/install-azure-cli?view=azure-cli-latest). 20 | 21 | 4. Install [Databricks CLI](https://docs.microsoft.com/azure/databricks/dev-tools/cli/). 22 | 23 | 5. 
From a command prompt, bash prompt, or PowerShell prompt, sign into your Azure account as follows: 24 | 25 | ```bash 26 | az login 27 | ``` 28 | 29 | 6. Optional - Install a Java IDE, with the following resources: 30 | - JDK 1.8 31 | - Scala SDK 2.12 32 | - Maven 3.6.3 33 | > Note: Instructions are included for building via a docker container if you do not want to install a Java IDE. 34 | 35 | ### Download the New York City taxi and neighborhood data files 36 | 37 | 1. Create a directory named `DataFile` in the root of the cloned Github repository in your local file system. 38 | 39 | 2. Open a web browser and navigate to . 40 | 41 | 3. Click the **Download** button on this page to download a zip file of all the taxi data for that year. 42 | 43 | 4. Extract the zip file to the `DataFile` directory. 44 | 45 | > Note: This zip file contains other zip files. Don't extract the child zip files. 46 | 47 | The directory structure should look like the following: 48 | 49 | ```shell 50 | /DataFile 51 | /FOIL2013 52 | trip_data_1.zip 53 | trip_data_2.zip 54 | trip_data_3.zip 55 | ... 56 | ``` 57 | 58 | 5. Open a web browser and navigate to . 59 | 60 | 6. Under the section **County Subdivisions** click the dropdown an select **New York**. 61 | 62 | 7. Copy the **cb_2019_36_cousub_500k.zip** file from your browser's **downloads** directory to the `DataFile` directory. 63 | 64 | ### Deploy the Azure resources 65 | 66 | 1. From a shell or Windows Command Prompt, run the following command and follow the sign-in prompt: 67 | 68 | ```bash 69 | az login 70 | ``` 71 | 72 | 2. Navigate to the folder named `azure` in the GitHub repository directory: 73 | 74 | ```bash 75 | cd azure 76 | ``` 77 | 78 | 3. Run the following commands to deploy the Azure resources: 79 | 80 | ```bash 81 | export resourceGroup='[Resource group name]' 82 | export resourceLocation='[Region]' 83 | export eventHubNamespace='[Event Hubs namespace name]' 84 | export databricksWorkspaceName='[Azure Databricks workspace name]' 85 | export cosmosDatabaseAccount='[Cosmos DB database name]' 86 | export logAnalyticsWorkspaceName='[Log Analytics workspace name]' 87 | export logAnalyticsWorkspaceRegion='[Log Analytics region]' 88 | 89 | # Create a resource group 90 | az group create --name $resourceGroup --location $resourceLocation 91 | 92 | # Deploy resources 93 | az deployment group create --resource-group $resourceGroup \ 94 | --template-file ./deployresources.json --parameters \ 95 | eventHubNamespace=$eventHubNamespace \ 96 | databricksWorkspaceName=$databricksWorkspaceName \ 97 | cosmosDatabaseAccount=$cosmosDatabaseAccount \ 98 | logAnalyticsWorkspaceName=$logAnalyticsWorkspaceName \ 99 | logAnalyticsWorkspaceRegion=$logAnalyticsWorkspaceRegion 100 | ``` 101 | 102 | 4. The output of the deployment is written to the console once complete. Search the output for the following JSON: 103 | 104 | ```JSON 105 | "outputs": { 106 | "cosmosDb": { 107 | "type": "Object", 108 | "value": { 109 | "hostName": , 110 | "secret": , 111 | "username": 112 | } 113 | }, 114 | "eventHubs": { 115 | "type": "Object", 116 | "value": { 117 | "taxi-fare-eh": , 118 | "taxi-ride-eh": 119 | } 120 | }, 121 | "logAnalytics": { 122 | "type": "Object", 123 | "value": { 124 | "secret": , 125 | "workspaceId": 126 | } 127 | } 128 | }, 129 | ``` 130 | 131 | These values are the secrets that will be added to Databricks secrets in upcoming sections. Keep them secure until you add them in those sections. 132 | 133 | ### Add a Cassandra table to the Cosmos DB Account 134 | 135 | 1. 
In the Azure portal, navigate to the resource group created in the **deploy the Azure resources** section above. Click on **Azure Cosmos DB Account**. Create a table with the Cassandra API. 136 | 137 | 2. In the **overview** blade, click **add table**. 138 | 139 | 3. When the **add table** blade opens, enter `newyorktaxi` in the **Keyspace name** text box. 140 | 141 | 4. In the **enter CQL command to create the table** section, enter `neighborhoodstats` in the text box beside `newyorktaxi`. 142 | 143 | 5. In the text box below, enter the following: 144 | 145 | ```shell 146 | (neighborhood text, window_end timestamp, number_of_rides bigint, total_fare_amount double, total_tip_amount double, average_fare_amount double, average_tip_amount double, primary key(neighborhood, window_end)) 147 | ``` 148 | 149 | 6. In the **Table throughput** section confirm that `Autoscale` is selected and that value `4000` is in the **Table Max RU/s** text box. 150 | 151 | 7. Click **OK**. 152 | 153 | ### Add the Databricks secrets using the Databricks CLI 154 | 155 | > Tip: Make sure you have authenticated your Databricks CLI configuration. The simplest method in bash is to run: 156 | > 157 | > ```bash 158 | > export DATABRICKS_AAD_TOKEN=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d | jq .accessToken --raw-output) 159 | > databricks configure --aad-token --host 160 | > ``` 161 | > 162 | > The resource GUID (2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) is a fixed value. For other options see [Set up authentication](https://docs.microsoft.com/azure/databricks/dev-tools/cli/#--set-up-authentication) in the Azure Databricks documentation. 163 | > If you see a JSONDecodeError error when running a command, your token has exired and you can refresh by running the commands above again. 164 | 165 | First, enter the secrets for EventHub: 166 | 167 | 1. Using the **Azure Databricks CLI** installed in step 4 of the prerequisites, create the Azure Databricks secret scope: 168 | 169 | ```bash 170 | databricks secrets create-scope --scope "azure-databricks-job" 171 | ``` 172 | 173 | 2. Add the secret for the taxi ride EventHub: 174 | 175 | ```bash 176 | databricks secrets put --scope "azure-databricks-job" --key "taxi-ride" 177 | ``` 178 | 179 | Once executed, this command opens the vi editor. Enter the **taxi-ride-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 180 | 181 | 3. Add the secret for the taxi fare EventHub: 182 | 183 | ```bash 184 | databricks secrets put --scope "azure-databricks-job" --key "taxi-fare" 185 | ``` 186 | 187 | Once executed, this command opens the vi editor. Enter the **taxi-fare-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 188 | 189 | Next, enter the secrets for Cosmos DB: 190 | 191 | 1. Using the **Azure Databricks CLI**, add the secret for the Cosmos DB user name: 192 | 193 | ```bash 194 | databricks secrets put --scope azure-databricks-job --key "cassandra-username" 195 | ``` 196 | 197 | Once executed, this command opens the vi editor. Enter the **username** value from the **CosmosDb** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 198 | 199 | 2. 
Next, add the secret for the Cosmos DB password: 200 | 201 | ```bash 202 | databricks secrets put --scope azure-databricks-job --key "cassandra-password" 203 | ``` 204 | 205 | Once executed, this command opens the vi editor. Enter the **secret** value from the **CosmosDb** output section in step 4 of the *deploy the Azure resources* section. Save and exit vi (if in edit mode hit ESC, then type ":wq"). 206 | 207 | > Note: If using an [Azure Key Vault-backed secret scope](https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#azure-key-vault-backed-scopes), the scope must be named **azure-databricks-job** and the secrets must have the exact same names as those above. 208 | 209 | ### Add the Census Neighborhoods data file to the Databricks file system 210 | 211 | 1. Create a directory in the Databricks file system: 212 | 213 | ```bash 214 | dbfs mkdirs dbfs:/azure-databricks-job 215 | ``` 216 | 217 | 2. Navigate to the DataFile folder and enter the following: 218 | 219 | ```bash 220 | dbfs cp cb_2020_36_cousub_500k.zip dbfs:/azure-databricks-job/ 221 | ``` 222 | 223 | > Note: The filename may change if you obtain a shapefile for a different year. 224 | 225 | ### Build the .jar files for the Databricks job 226 | 227 | 1. To build the jars using a docker container from a bash prompt change to the **azure** directory and run: 228 | 229 | ```bash 230 | docker run -it --rm -v `pwd`:/streaming_azuredatabricks_azure -v ~/.m2:/root/.m2 maven:3.6.3-jdk-8 mvn -f /streaming_azuredatabricks_azure/pom.xml package 231 | ``` 232 | 233 | > Note: Alternately, use your Java IDE to import the Maven project file named **pom.xml** located in the **azure** directory. Perform a clean build. 234 | 235 | 1. The outputs of the build is a file named **azure-databricks-job-1.0-SNAPSHOT.jar** in the **./AzureDataBricksJob/target** directory. 236 | 237 | ### Create a Databricks cluster 238 | 239 | 1. In the Databricks workspace, click **Compute**, then click **Create cluster**. Enter the cluster name you created in step 3 of the **configure custom logging for the Databricks job** section above. 240 | 241 | 1. Select **Standard** for **Cluster Mode**. 242 | 243 | 1. Set **Databricks runtime version** to **7.3 Extended Support (Scala 2.12, Apache Spark 3.0.1)** 244 | 245 | 1. Deselect **Enable autoscaling**. 246 | 247 | 1. Set **Worker Type** to **Standard_DS3_v2**. 248 | 249 | 1. Set **Workers** to **2**. 250 | 251 | 1. Set **Driver Type** to **Same as worker** 252 | 253 | #### Optional - Configure Azure Log Analytics 254 | 255 | 1. Follow the instructions in [Monitoring Azure Databricks](https://github.com/mspnp/spark-monitoring) to build the monitoring library and upload the resulting library files to your workspace. 256 | 257 | 1. Click on **Advanced Options** then **Init Scripts**. 258 | 259 | 1. Enter **dbfs:/databricks/spark-monitoring/spark-monitoring.sh**. 260 | 261 | 1. Click the **Add** button. 262 | 263 | 1. Click the **Create Cluster** button. 264 | 265 | ### Install dependent libraries on cluster 266 | 267 | 1. In the Databricks user interface, click on the **home** button. 268 | 269 | 2. Click on **Compute** in the navigtation menu on the left then click on the cluster you created in the **Create a Databricks cluster** step. 270 | 271 | 3. Click on **Libraries**, then click **Install New**. 272 | 273 | 4. In the **Library Source** control, select **Maven**. 274 | 275 | 5. Under the **Maven Coordinates** text box, enter `com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21`. 276 | 277 | 6. 
Select **Install**. 278 | 279 | 8. Repeat steps 3 - 6 for the `com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.0.1` Maven coordinate. 280 | 281 | 9. Repeat steps 3 - 5 for the `org.geotools:gt-shapefile:23.0` Maven coordinate. 282 | 283 | 10. Enter `https://repo.osgeo.org/repository/release/` in the **Repository** text box. 284 | 285 | 11. Click **Install**. 286 | 287 | ### Create a Databricks job 288 | 289 | 1. Copy the **azure-databricks-job-1.0-SNAPSHOT.jar** file to the Databricks file system by entering the following command in the **Databricks CLI**: 290 | 291 | ```bash 292 | databricks fs cp --overwrite AzureDataBricksJob/target/azure-databricks-job-1.0-SNAPSHOT.jar dbfs:/azure-databricks-job/ 293 | ``` 294 | 295 | 1. In the Databricks workspace, click "Jobs", "create job". 296 | 297 | 1. Enter a job name. 298 | 299 | 1. In the **Task** area, change **Type** to `JAR` and Enter `com.microsoft.pnp.TaxiCabReader` in the **Main Class** field. 300 | 301 | 1. Under **Dependent Libraries** click **Add**, this opens the **Add dependent library** dialog box. 302 | 303 | 1. Change **Library Source** to **DBFS/ADLS**, confirm that Library Type is **Jar** and enter `dbfs:/azure-databricks-job/azure-databricks-job-1.0-SNAPSHOT.jar` in the **File Path** text box and select **Add**. 304 | 305 | 1. In the **Parameters** field, enter the following (replace **\** with a value from above): 306 | 307 | ```shell 308 | ["-n","jar:file:/dbfs/azure-databricks-job/cb_2020_36_cousub_500k.zip!/cb_2020_36_cousub_500k.shp","--taxi-ride-consumer-group","taxi-ride-eh-cg","--taxi-fare-consumer-group","taxi-fare-eh-cg","--window-interval","1 hour","--cassandra-host",""] 309 | ``` 310 | 311 | 1. Under **Cluster**, click the drop down arrow and select the cluster created the **Create a Databricks cluster** section. 312 | 313 | 1. Click Create 314 | 315 | 1. Select the **Runs** tab and click **Run Now**. 316 | 317 | ### Run the data generator 318 | 319 | 1. Navigate to the directory `onprem` in the GitHub repository. 320 | 321 | ```bash 322 | cd ../onprem 323 | ``` 324 | 325 | 1. Update the values in the file **main.env** as follows: 326 | 327 | ```shell 328 | RIDE_EVENT_HUB=[Connection string for the taxi-ride event hub] 329 | FARE_EVENT_HUB=[Connection string for the taxi-fare event hub] 330 | RIDE_DATA_FILE_PATH=/DataFile/FOIL2013 331 | MINUTES_TO_LEAD=0 332 | PUSH_RIDE_DATA_FIRST=false 333 | ``` 334 | 335 | The connection string for the taxi-ride event hub is the **taxi-ride-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. The connection string for the taxi-fare event hub the **taxi-fare-eh** value from the **eventHubs** output section in step 4 of the *deploy the Azure resources* section. 336 | 337 | 1. Run the following command to build the Docker image. 338 | 339 | ```bash 340 | docker build --no-cache -t dataloader . 341 | ``` 342 | 343 | 1. Navigate back to the repository root directory. 344 | 345 | ```bash 346 | cd .. 347 | ``` 348 | 349 | 1. Run the following command to run the Docker image. 350 | 351 | ```bash 352 | docker run -v `pwd`/DataFile:/DataFile --env-file=onprem/main.env dataloader:latest 353 | ``` 354 | 355 | The output should look like the following: 356 | 357 | ```shell 358 | Created 10000 records for TaxiFare 359 | Created 10000 records for TaxiRide 360 | Created 20000 records for TaxiFare 361 | Created 20000 records for TaxiRide 362 | Created 30000 records for TaxiFare 363 | ... 
364 | ``` 365 | 366 | Hit CTRL+C to cancel the generation of data. 367 | 368 | ### Verify the solution is running 369 | 370 | To verify the Databricks job is running correctly, open the Azure portal and navigate to the Cosmos DB database. Open the **Data Explorer** blade and examine the data in the **neighborhoodstats** table, you should see results similar to: 371 | 372 | | average_fare _amount | average_tip _amount | neighborhood | number_of_rides | total_fare _amount | total_tip _amount | window_end | 373 | | --- | --- | --- | --- | --- | --- | --- | 374 | | 10.5 | 1.0 | Bronx | 1 | 10.5 | 1.0 | 1/1/2013 8:02:00 AM +00:00 | 375 | | 12.67 | 2.6 | Brooklyn | 3 | 38 | 7.8 | 1/1/2013 8:02:00 AM +00:00 | 376 | | 14.98 | 0.73 | Manhattan | 52 | 779 | 37.83 | 1/1/2013 8:02:00 AM +00:00 | 377 | | ... | ... | ... | ... | ... | ... | ... | 378 | 379 | > [1] Donovan, Brian; Work, Dan (2016): New York City Taxi Trip Data (2010-2013). University of Illinois at Urbana-Champaign. 380 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | azure-databricks-ra 6 | com.microsoft.pnp 7 | 1.0-SNAPSHOT 8 | ../pom.xml 9 | 10 | azure-databricks-job 11 | 1.0-SNAPSHOT 12 | ${project.artifactId} 13 | jar 14 | 15 | 1.8 16 | 1.8 17 | UTF-8 18 | 2.12.12 19 | 2.12 20 | 4.2.0 21 | 22 | 23 | 24 | com.microsoft.azure 25 | azure-eventhubs-spark_${scala.compat.version} 26 | 2.3.21 27 | provided 28 | 29 | 30 | org.geotools 31 | gt-shapefile 32 | 23.0 33 | provided 34 | 35 | 36 | org.eclipse.jetty 37 | jetty-server 38 | 9.4.43.v20210629 39 | provided 40 | 41 | 42 | com.databricks 43 | dbutils-api_${scala.compat.version} 44 | 0.0.5 45 | provided 46 | 47 | 48 | org.rogach 49 | scallop_${scala.compat.version} 50 | 3.5.1 51 | 52 | 53 | com.datastax.spark 54 | spark-cassandra-connector_2.12 55 | 3.0.1 56 | provided 57 | 58 | 59 | org.locationtech.jts 60 | jts-core 61 | 1.18.2 62 | 63 | 64 | 65 | 66 | osgeo 67 | Open Source Geospatial Foundation Repository 68 | https://repo.osgeo.org/repository/release/ 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.scalatest 76 | scalatest-maven-plugin 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-shade-plugin 81 | 82 | 83 | package 84 | 85 | shade 86 | 87 | 88 | true 89 | 90 | 91 | *:* 92 | 93 | META-INF/*.SF 94 | META-INF/*.DSA 95 | META-INF/*.RSA 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/java/com/microsoft/pnp/GeoFinder.java: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp; 2 | 3 | import org.locationtech.jts.geom.Coordinate; 4 | import org.locationtech.jts.geom.GeometryFactory; 5 | import org.locationtech.jts.geom.Point; 6 | import org.geotools.data.FeatureSource; 7 | import org.geotools.data.collection.SpatialIndexFeatureCollection; 8 | import org.geotools.data.collection.SpatialIndexFeatureSource; 9 | import org.geotools.data.shapefile.ShapefileDataStore; 10 | import org.geotools.factory.CommonFactoryFinder; 11 | import org.geotools.feature.FeatureCollection; 12 | import org.geotools.feature.FeatureIterator; 13 | import org.opengis.feature.Feature; 14 | import org.opengis.filter.Filter; 15 | import org.opengis.filter.FilterFactory2; 16 | import org.opengis.filter.expression.PropertyName; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | import scala.Serializable; 20 | 21 | import java.io.IOException; 22 | import java.net.URL; 23 | import java.util.Optional; 24 | 25 | 26 | public class GeoFinder implements Serializable { 27 | private static final Logger logger = LoggerFactory.getLogger(GeoFinder.class); 28 | 29 | private final FeatureSource featureSource; 30 | private final FilterFactory2 filterFactory; 31 | private final PropertyName propertyName; 32 | private final GeometryFactory geometryFactory; 33 | 34 | private GeoFinder(FeatureSource featureSource, FilterFactory2 filterFactory, PropertyName propertyName) 
{ 35 | this.featureSource = featureSource; 36 | this.filterFactory = filterFactory; 37 | this.propertyName = propertyName; 38 | this.geometryFactory = new GeometryFactory(); 39 | } 40 | 41 | public Optional getNeighborhood(double longitude, double latitude) { 42 | logger.debug(String.format("Searching for coordinate (%f, %f)", longitude, latitude)); 43 | Point point = this.geometryFactory.createPoint(new Coordinate(longitude, latitude)); 44 | Filter filter = this.filterFactory.contains(propertyName, filterFactory.literal(point)); 45 | try { 46 | FeatureCollection featureCollection = this.featureSource.getFeatures(filter); 47 | try (FeatureIterator iterator = featureCollection.features()) { 48 | if (iterator.hasNext()) { 49 | Feature feature = iterator.next(); 50 | return Optional.of(feature.getProperty("NAME").getValue().toString()); 51 | } 52 | } 53 | } catch (IOException ex) { 54 | 55 | logger.warn(String.format("Error searching for coordinate (%f, %f)", longitude, latitude), ex); 56 | } 57 | 58 | return Optional.of("Unknown"); 59 | } 60 | 61 | public static GeoFinder createGeoFinder(URL shapeFileUrl) throws IOException { 62 | try { 63 | logger.info(String.format("Using shapefile: %s", shapeFileUrl)); 64 | ShapefileDataStore dataStore = new ShapefileDataStore(shapeFileUrl); 65 | String[] typeNames = dataStore.getTypeNames(); 66 | String typeName = typeNames[0]; 67 | 68 | logger.info(String.format("Reading content %s", typeName)); 69 | FeatureSource featureSource = new SpatialIndexFeatureSource( 70 | new SpatialIndexFeatureCollection(dataStore.getFeatureSource(typeName).getFeatures())); 71 | 72 | FilterFactory2 filterFactory = CommonFactoryFinder.getFilterFactory2(); 73 | PropertyName propertyName = filterFactory.property(dataStore 74 | .getSchema(typeName) 75 | .getGeometryDescriptor() 76 | .getName()); 77 | return new GeoFinder(featureSource, filterFactory, propertyName); 78 | } catch (IOException ex) { 79 | logger.error(String.format("Error loading Geospatial data from %s", shapeFileUrl), ex); 80 | throw ex; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/java/com/microsoft/pnp/MDCCloseableFactory.java: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp; 2 | 3 | import org.slf4j.MDC; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.Optional; 8 | 9 | public class MDCCloseableFactory { 10 | private class MDCCloseable implements AutoCloseable { 11 | public MDCCloseable(Map mdc) { 12 | // Log4j supports Map, but slf4j wants Map 13 | // Because of type erasure, this should be okay, but we can try to find a 14 | // way to fix the warnings later. 
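            // For reference: org.slf4j.MDC.setContextMap is declared to take a Map<String, String>,
            // so the raw cast below only silences the compiler; with erasure nothing is checked at
            // runtime, and the Log4j 1.x MDC underneath stores the values as plain objects.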
15 | MDC.setContextMap((Map)mdc); 16 | } 17 | 18 | @Override 19 | public void close() { 20 | MDC.clear(); 21 | } 22 | } 23 | 24 | private Optional> context; 25 | 26 | public MDCCloseableFactory() { 27 | this(null); 28 | } 29 | 30 | public MDCCloseableFactory(Map context) { 31 | this.context = Optional.ofNullable(context); 32 | } 33 | 34 | public AutoCloseable create(Map mdc) { 35 | // Values in mdc will override context 36 | Map newMDC = new HashMap<>(); 37 | this.context.ifPresent(c -> newMDC.putAll(c)); 38 | newMDC.putAll(mdc); 39 | return new MDCCloseable(newMDC); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/resources/com/microsoft/pnp/azuredatabricksjob/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.appender.A1=com.microsoft.pnp.logging.loganalytics.LogAnalyticsAppender 2 | log4j.appender.A1.logType=taxijob 3 | log4j.appender.A1.layout=com.microsoft.pnp.log4j.JSONLayout 4 | log4j.appender.A1.layout.LocationInfo=false 5 | log4j.additivity.com.microsoft.pnp=false 6 | log4j.logger.com.microsoft.pnp=INFO, A1 7 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/CassandraSinkForeach.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.sql.{ForeachWriter, Row} 5 | 6 | class CassandraSinkForeach(con: CassandraConnector) 7 | extends ForeachWriter[Row] { 8 | 9 | // This class implements the interface ForeachWriter, which has methods that get called 10 | // whenever there is a sequence of rows generated as output 11 | def open(partitionId: Long, version: Long): Boolean = { 12 | true 13 | } 14 | 15 | def process(record: Row) = { 16 | con.withSessionDo(session => { 17 | val bound = session.prepare( 18 | s""" 19 | |insert into newyorktaxi.neighborhoodstats (neighborhood,window_end,number_of_rides,total_fare_amount,total_tip_amount,average_fare_amount,average_tip_amount) 20 | | values(?, ?, ?, ?, ?, ?, ?)""" 21 | 22 | ).bind( 23 | record.getString(2), 24 | record.getTimestamp(1).toInstant(), 25 | record.getLong(3).asInstanceOf[AnyRef], 26 | record.getDouble(4).asInstanceOf[AnyRef], 27 | record.getDouble(5).asInstanceOf[AnyRef], 28 | record.getDouble(6).asInstanceOf[AnyRef], 29 | record.getDouble(7).asInstanceOf[AnyRef] 30 | ) 31 | 32 | session.execute(bound) 33 | }) 34 | 35 | } 36 | 37 | def close(errorOrNull: Throwable): Unit = { 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/JobConfiguration.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.net.URL 4 | 5 | import org.rogach.scallop._ 6 | import org.apache.spark.sql.catalyst.util.IntervalUtils.stringToInterval 7 | import org.apache.spark.unsafe.types.UTF8String 8 | 9 | class JobConfiguration(arguments: Seq[String]) extends ScallopConf(arguments) with Serialization { 10 | val neighborhoodFileURL = opt[URL]( 11 | name = "neighborhood-file-url", 12 | short = 'n', 13 | required = true 14 | )(urlConverter) 15 | 16 | val taxiRideConsumerGroup = opt[String](default = Some("$Default")) 17 | val taxiFareConsumerGroup = opt[String](default = Some("$Default")) 18 | 19 
| // Intervals 20 | val windowInterval = opt[String](default = Some("1 hour"), validate = isValidInterval) 21 | val taxiRideWatermarkInterval = opt[String](default = Some("3 minutes"), validate = isValidInterval) 22 | val taxiFareWatermarkInterval = opt[String](default = Some("3 minutes"), validate = isValidInterval) 23 | 24 | val secretScope = opt[String](default = Some("azure-databricks-job")) 25 | val taxiRideEventHubSecretName = opt[String](default = Some("taxi-ride")) 26 | val taxiFareEventHubSecretName = opt[String](default = Some("taxi-fare")) 27 | 28 | val cassandraHost = opt[String]() 29 | 30 | // cassandra secrets 31 | val cassandraUserSecretName = opt[String](default = Some("cassandra-username")) 32 | val cassandraPasswordSecretName = opt[String](default = Some("cassandra-password")) 33 | 34 | verify() 35 | 36 | private def isValidInterval(interval: String): Boolean = { 37 | // This is the same check spark uses 38 | val intervalString = if (interval.startsWith("interval")) { 39 | interval 40 | } else { 41 | "interval " + interval 42 | } 43 | val cal = stringToInterval(UTF8String.fromString(intervalString)) 44 | cal != null 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/StreamingMetricsListener.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.apache.spark.sql.streaming.StreamingQueryListener 4 | import org.apache.spark.sql.streaming.StreamingQueryListener._ 5 | import org.slf4j.{Logger, LoggerFactory} 6 | 7 | class StreamingMetricsListener() extends StreamingQueryListener { 8 | lazy val logger: Logger = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) 9 | lazy val mdcFactory: MDCCloseableFactory = new MDCCloseableFactory() 10 | 11 | override def onQueryStarted(event: QueryStartedEvent): Unit = {} 12 | 13 | override def onQueryProgress(event: QueryProgressEvent): Unit = { 14 | try { 15 | //parsing the telemetry Payload and logging to ala 16 | TryWith(this.mdcFactory.create(Utils.parsePayload(event)))( 17 | c => { 18 | this.logger.info("onQueryProgress") 19 | } 20 | ) 21 | } 22 | 23 | catch { 24 | case e: Exception => this.logger.error("onQueryProgress", e) 25 | } 26 | } 27 | 28 | override def onQueryTerminated(event: QueryTerminatedEvent): Unit = { 29 | if (event.exception.nonEmpty) { 30 | this.logger.error(event.exception.get) 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/TaxiCabReader.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.eventhubs.{EventHubsConf, EventPosition} 5 | import org.apache.spark.metrics.source.{AppAccumulators, AppMetrics} 6 | import org.apache.spark.sql.catalyst.expressions.{CsvToStructs, Expression} 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.streaming.OutputMode 9 | import org.apache.spark.sql.types.{StringType, StructType} 10 | import org.apache.spark.sql.{Column, SparkSession} 11 | import org.apache.spark.{SparkConf, SparkEnv} 12 | 13 | object TaxiCabReader { 14 | private def withExpr(expr: Expression): Column = new Column(expr) 15 | 16 | def main(args: Array[String]) { 17 | 18 | val conf = new JobConfiguration(args) 19 | val 
rideEventHubConnectionString = getSecret(
 20 |             conf.secretScope(), conf.taxiRideEventHubSecretName())
 21 |         val fareEventHubConnectionString = getSecret(
 22 |             conf.secretScope(), conf.taxiFareEventHubSecretName())
 23 | 
 24 |         val cassandraEndPoint = conf.cassandraHost()
 25 | 
 26 |         val cassandraUserName = getSecret(
 27 |             conf.secretScope(), conf.cassandraUserSecretName())
 28 |         val cassandraPassword = getSecret(
 29 |             conf.secretScope(), conf.cassandraPasswordSecretName())
 30 | 
 31 |         val spark = SparkSession
 32 |             .builder
 33 |             .getOrCreate
 34 | 
 35 |         import spark.implicits._
 36 | 
 37 |         // The Databricks Spark session is created up front and its configuration cannot be
 38 |         // updated afterwards, so this SparkConf is built from the secret values solely to
 39 |         // initialize the Cassandra driver.
 40 |         // Note: with spark-submit, the Spark session is created in the main method, so any
 41 |         // values provided while initializing Spark are available afterwards through
 42 |         // sparkSession.conf
 43 |         val sparkConfForCassandraDriver = new SparkConf(true)
 44 |             .set("spark.cassandra.connection.host", cassandraEndPoint)
 45 |             .set("spark.cassandra.connection.port", "10350")
 46 |             .set("spark.cassandra.connection.ssl.enabled", "true")
 47 |             .set("spark.cassandra.auth.username", cassandraUserName)
 48 |             .set("spark.cassandra.auth.password", cassandraPassword)
 49 |             .set("spark.master", "local[10]")
 50 |             .set("spark.cassandra.output.batch.size.rows", "1")
 51 |             .set("spark.cassandra.connection.remoteConnectionsPerExecutor", "2")
 52 |             .set("spark.cassandra.output.concurrent.writes", "5")
 53 |             .set("spark.cassandra.output.batch.grouping.buffer.size", "300")
 54 |             .set("spark.cassandra.connection.keepAliveMS", "5000")
 55 | 
 56 |         // The connector is initialized in the driver. It is serializable, so it can be
 57 |         // sent to the foreach sink that executes on the workers. 
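        // The connector instance is captured by CassandraSinkForeach below; on the executors,
        // each withSessionDo call is expected to reuse a cached session rather than opening a
        // new connection per row.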
58 | val connector = CassandraConnector(sparkConfForCassandraDriver) 59 | 60 | @transient val appMetrics = new AppMetrics(spark.sparkContext) 61 | appMetrics.registerGauge("metrics.malformedrides", AppAccumulators.getRideInstance(spark.sparkContext)) 62 | appMetrics.registerGauge("metrics.malformedfares", AppAccumulators.getFareInstance(spark.sparkContext)) 63 | SparkEnv.get.metricsSystem.registerSource(appMetrics) 64 | 65 | @transient lazy val NeighborhoodFinder = GeoFinder.createGeoFinder(conf.neighborhoodFileURL()) 66 | 67 | val neighborhoodFinder = (lon: Double, lat: Double) => { 68 | NeighborhoodFinder.getNeighborhood(lon, lat).get() 69 | } 70 | val to_neighborhood = spark.udf.register("neighborhoodFinder", neighborhoodFinder) 71 | 72 | def from_csv(e: Column, schema: StructType, options: Map[String, String]): Column = withExpr { 73 | CsvToStructs(schema, options, e.expr) 74 | } 75 | 76 | spark.streams.addListener(new StreamingMetricsListener()) 77 | 78 | val rideEventHubOptions = EventHubsConf(rideEventHubConnectionString) 79 | .setConsumerGroup(conf.taxiRideConsumerGroup()) 80 | .setStartingPosition(EventPosition.fromStartOfStream) 81 | val rideEvents = spark.readStream 82 | .format("eventhubs") 83 | .options(rideEventHubOptions.toMap) 84 | .load 85 | 86 | val fareEventHubOptions = EventHubsConf(fareEventHubConnectionString) 87 | .setConsumerGroup(conf.taxiFareConsumerGroup()) 88 | .setStartingPosition(EventPosition.fromStartOfStream) 89 | val fareEvents = spark.readStream 90 | .format("eventhubs") 91 | .options(fareEventHubOptions.toMap) 92 | .load 93 | 94 | val transformedRides = rideEvents 95 | .select( 96 | $"body" 97 | .cast(StringType) 98 | .as("messageData"), 99 | from_json($"body".cast(StringType), RideSchema) 100 | .as("ride")) 101 | .transform(ds => { 102 | ds.withColumn( 103 | "errorMessage", 104 | when($"ride".isNull, 105 | lit("Error decoding JSON")) 106 | .otherwise(lit(null)) 107 | ) 108 | }) 109 | 110 | val malformedRides = AppAccumulators.getRideInstance(spark.sparkContext) 111 | 112 | val rides = transformedRides 113 | .filter(r => { 114 | if (r.isNullAt(r.fieldIndex("errorMessage"))) { 115 | true 116 | } 117 | else { 118 | malformedRides.add(1) 119 | false 120 | } 121 | }) 122 | .select( 123 | $"ride.*", 124 | to_neighborhood($"ride.pickupLon", $"ride.pickupLat") 125 | .as("pickupNeighborhood"), 126 | to_neighborhood($"ride.dropoffLon", $"ride.dropoffLat") 127 | .as("dropoffNeighborhood") 128 | ) 129 | .withWatermark("pickupTime", conf.taxiRideWatermarkInterval()) 130 | 131 | val csvOptions = Map("header" -> "true", "multiLine" -> "true") 132 | val transformedFares = fareEvents 133 | .select( 134 | $"body" 135 | .cast(StringType) 136 | .as("messageData"), 137 | from_csv($"body".cast(StringType), FareSchema, csvOptions) 138 | .as("fare")) 139 | .transform(ds => { 140 | ds.withColumn( 141 | "errorMessage", 142 | when($"fare".isNull, 143 | lit("Error decoding CSV")) 144 | .when(to_timestamp($"fare.pickupTimeString", "yyyy-MM-dd HH:mm:ss").isNull, 145 | lit("Error parsing pickupTime")) 146 | .otherwise(lit(null)) 147 | ) 148 | }) 149 | .transform(ds => { 150 | ds.withColumn( 151 | "pickupTime", 152 | when($"fare".isNull, 153 | lit(null)) 154 | .otherwise(to_timestamp($"fare.pickupTimeString", "yyyy-MM-dd HH:mm:ss")) 155 | ) 156 | }) 157 | 158 | 159 | val malformedFares = AppAccumulators.getFareInstance(spark.sparkContext) 160 | 161 | val fares = transformedFares 162 | .filter(r => { 163 | if (r.isNullAt(r.fieldIndex("errorMessage"))) { 164 | true 165 | } 166 | else { 
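                    // The row failed CSV decoding or pickupTime parsing; count it so the value
                    // surfaces through the "metrics.malformedfares" gauge registered above.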
167 | malformedFares.add(1) 168 | false 169 | } 170 | }) 171 | .select( 172 | $"fare.*", 173 | $"pickupTime" 174 | ) 175 | .withWatermark("pickupTime", conf.taxiFareWatermarkInterval()) 176 | 177 | val mergedTaxiTrip = rides.join(fares, Seq("medallion", "hackLicense", "vendorId", "pickupTime")) 178 | 179 | 180 | val maxAvgFarePerNeighborhood = mergedTaxiTrip.selectExpr("medallion", "hackLicense", "vendorId", "pickupTime", "rateCode", "storeAndForwardFlag", "dropoffTime", "passengerCount", "tripTimeInSeconds", "tripDistanceInMiles", "pickupLon", "pickupLat", "dropoffLon", "dropoffLat", "paymentType", "fareAmount", "surcharge", "mtaTax", "tipAmount", "tollsAmount", "totalAmount", "pickupNeighborhood", "dropoffNeighborhood") 181 | .groupBy(window($"pickupTime", conf.windowInterval()), $"pickupNeighborhood") 182 | .agg( 183 | count("*").as("rideCount"), 184 | sum($"fareAmount").as("totalFareAmount"), 185 | sum($"tipAmount").as("totalTipAmount"), 186 | (sum($"fareAmount")/count("*")).as("averageFareAmount"), 187 | (sum($"tipAmount")/count("*")).as("averageTipAmount") 188 | ) 189 | .select($"window.start", $"window.end", $"pickupNeighborhood", $"rideCount", $"totalFareAmount", $"totalTipAmount", $"averageFareAmount", $"averageTipAmount") 190 | 191 | maxAvgFarePerNeighborhood 192 | .writeStream 193 | .queryName("maxAvgFarePerNeighborhood_cassandra_insert") 194 | .outputMode(OutputMode.Append()) 195 | .foreach(new CassandraSinkForeach(connector)) 196 | .start() 197 | .awaitTermination() 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/TryWith.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import scala.util.control.NonFatal 4 | import scala.util.{Failure, Try} 5 | 6 | object TryWith { 7 | def apply[C <: AutoCloseable, R](resource: => C)(f: C => R): Try[R] = 8 | Try(resource).flatMap(resourceInstance => { 9 | try { 10 | val returnValue = f(resourceInstance) 11 | Try(resourceInstance.close()).map(_ => returnValue) 12 | } 13 | catch { 14 | case NonFatal(exceptionInFunction) => 15 | try { 16 | resourceInstance.close() 17 | Failure(exceptionInFunction) 18 | } 19 | catch { 20 | case NonFatal(exceptionInClose) => 21 | exceptionInFunction.addSuppressed(exceptionInClose) 22 | Failure(exceptionInFunction) 23 | } 24 | } 25 | }) 26 | } 27 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.time.{ZoneId, ZonedDateTime} 4 | import java.util.HashMap 5 | 6 | import org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent 7 | 8 | object Utils { 9 | def parsePayload(event: QueryProgressEvent): HashMap[String, AnyRef]={ 10 | val date = java.time.format 11 | .DateTimeFormatter.RFC_1123_DATE_TIME.format(ZonedDateTime.now(ZoneId.of("GMT"))) 12 | 13 | val metrics = new HashMap[String, AnyRef]() 14 | metrics.put("id", event.progress.id) 15 | metrics.put("sink", event.progress.sink) 16 | metrics.put("durationms", event.progress.durationMs.asInstanceOf[AnyRef]) 17 | metrics.put("inputRowsPerSecond", event.progress.inputRowsPerSecond.asInstanceOf[AnyRef]) 18 | metrics.put("procRowsPerSecond", event.progress.processedRowsPerSecond.asInstanceOf[AnyRef]) 19 | metrics.put("inputRows", 
event.progress.numInputRows.asInstanceOf[AnyRef]) 20 | metrics.put("DateValue", date.toString) 21 | 22 | metrics 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/com/microsoft/pnp/package.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft 2 | 3 | import org.apache.spark.sql.types._ 4 | import com.databricks.dbutils_v1.DBUtilsHolder.dbutils.secrets 5 | 6 | package object pnp { 7 | 8 | def getSecret(secretScope: String, secretName: String): String = { 9 | secrets.get(secretScope, secretName) 10 | } 11 | 12 | val RideSchema = new StructType() 13 | .add("rateCode", IntegerType) 14 | .add("storeAndForwardFlag", StringType) 15 | .add("dropoffTime", TimestampType) 16 | .add("passengerCount", IntegerType) 17 | .add("tripTimeInSeconds", DoubleType) 18 | .add("tripDistanceInMiles", DoubleType) 19 | .add("pickupLon", DoubleType) 20 | .add("pickupLat", DoubleType) 21 | .add("dropoffLon", DoubleType) 22 | .add("dropoffLat", DoubleType) 23 | .add("medallion", LongType) 24 | .add("hackLicense", LongType) 25 | .add("vendorId", StringType) 26 | .add("pickupTime", TimestampType) 27 | .add("errorMessage", StringType) 28 | .add("messageData", StringType) 29 | 30 | val FareSchema = new StructType() 31 | .add("medallion", LongType) 32 | .add("hackLicense", LongType) 33 | .add("vendorId",StringType) 34 | .add("pickupTimeString", StringType) 35 | .add("paymentType", StringType) 36 | .add("fareAmount", DoubleType) 37 | .add("surcharge", DoubleType) 38 | .add("mtaTax", DoubleType) 39 | .add("tipAmount", DoubleType) 40 | .add("tollsAmount", DoubleType) 41 | .add("totalAmount", DoubleType) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/metrics/source/AppAccumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.source 2 | import org.apache.spark.SparkContext 3 | import org.apache.spark.util.LongAccumulator 4 | 5 | object AppAccumulators { 6 | @volatile private var fareInstance: LongAccumulator = _ 7 | @volatile private var rideInstance: LongAccumulator = _ 8 | 9 | def getFareInstance(sc: SparkContext): LongAccumulator = { 10 | if (fareInstance == null) { 11 | synchronized { 12 | if (fareInstance == null) { 13 | fareInstance = sc.longAccumulator("MalformedFareCount") 14 | } 15 | } 16 | } 17 | fareInstance 18 | } 19 | 20 | def getRideInstance(sc: SparkContext): LongAccumulator = { 21 | if (rideInstance == null) { 22 | synchronized { 23 | if (rideInstance == null) { 24 | rideInstance = sc.longAccumulator("MalformedRideCount") 25 | } 26 | } 27 | } 28 | rideInstance 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/metrics/source/AppMetrics.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.source 2 | 3 | import com.codahale.metrics.{Gauge, MetricRegistry} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.util.LongAccumulator 6 | 7 | class AppMetrics(sc: SparkContext) extends Source { 8 | override val metricRegistry = new MetricRegistry 9 | override val sourceName = "%s.AppMetrics".format(sc.appName) 10 | 11 | def registerGauge(metricName: String, acc: LongAccumulator) { 12 | val metric = 
new Gauge[Long] { 13 | override def getValue: Long = { 14 | acc.value 15 | } 16 | } 17 | metricRegistry.register(MetricRegistry.name(metricName), metric) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.catalyst.csv 2 | 3 | object CSVExprUtils { 4 | /** 5 | * Filter ignorable rows for CSV iterator (lines empty and starting with `comment`). 6 | * This is currently being used in CSV reading path and CSV schema inference. 7 | */ 8 | def filterCommentAndEmpty(iter: Iterator[String], options: CSVOptions): Iterator[String] = { 9 | iter.filter { line => 10 | line.trim.nonEmpty && !line.startsWith(options.comment.toString) 11 | } 12 | } 13 | 14 | def skipComments(iter: Iterator[String], options: CSVOptions): Iterator[String] = { 15 | if (options.isCommentSet) { 16 | val commentPrefix = options.comment.toString 17 | iter.dropWhile { line => 18 | line.trim.isEmpty || line.trim.startsWith(commentPrefix) 19 | } 20 | } else { 21 | iter.dropWhile(_.trim.isEmpty) 22 | } 23 | } 24 | 25 | /** 26 | * Extracts header and moves iterator forward so that only data remains in it 27 | */ 28 | def extractHeader(iter: Iterator[String], options: CSVOptions): Option[String] = { 29 | val nonEmptyLines = skipComments(iter, options) 30 | if (nonEmptyLines.hasNext) { 31 | Some(nonEmptyLines.next()) 32 | } else { 33 | None 34 | } 35 | } 36 | 37 | /** 38 | * Helper method that converts string representation of a character to actual character. 39 | * It handles some Java escaped strings and throws exception if given string is longer than one 40 | * character. 41 | */ 42 | @throws[IllegalArgumentException] 43 | def toChar(str: String): Char = { 44 | (str: Seq[Char]) match { 45 | case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string") 46 | case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." + 47 | " It has special meaning as beginning of an escape sequence." + 48 | " To get the backslash character, pass a string with two backslashes as the delimiter.") 49 | case Seq(c) => c 50 | case Seq('\\', 't') => '\t' 51 | case Seq('\\', 'r') => '\r' 52 | case Seq('\\', 'b') => '\b' 53 | case Seq('\\', 'f') => '\f' 54 | // In case user changes quote char and uses \" as delimiter in options 55 | case Seq('\\', '\"') => '\"' 56 | case Seq('\\', '\'') => '\'' 57 | case Seq('\\', '\\') => '\\' 58 | case _ if str == """\u0000""" => '\u0000' 59 | case Seq('\\', _) => 60 | throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str") 61 | case _ => 62 | throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str") 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import com.univocity.parsers.csv.CsvParser 21 | import org.apache.spark.internal.Logging 22 | import org.apache.spark.sql.internal.SQLConf 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * Checks that column names in a CSV header and field names in the schema are the same 27 | * by taking into account case sensitivity. 28 | * 29 | * @param schema provided (or inferred) schema to which CSV must conform. 30 | * @param options parsed CSV options. 31 | * @param source name of CSV source that are currently checked. It is used in error messages. 32 | * @param isStartOfFile indicates if the currently processing partition is the start of the file. 33 | * if unknown or not applicable (for instance when the input is a dataset), 34 | * can be omitted. 35 | */ 36 | class CSVHeaderChecker( 37 | schema: StructType, 38 | options: CSVOptions, 39 | source: String, 40 | isStartOfFile: Boolean = false) extends Logging { 41 | 42 | // Indicates if it is set to `false`, comparison of column names and schema field 43 | // names is not case sensitive. 44 | private val caseSensitive = SQLConf.get.caseSensitiveAnalysis 45 | 46 | // Indicates if it is `true`, column names are ignored otherwise the CSV column 47 | // names are checked for conformance to the schema. In the case if 48 | // the column name don't conform to the schema, an exception is thrown. 49 | private val enforceSchema = options.enforceSchema 50 | 51 | /** 52 | * Checks that column names in a CSV header and field names in the schema are the same 53 | * by taking into account case sensitivity. 54 | * 55 | * @param columnNames names of CSV columns that must be checked against to the schema. 56 | */ 57 | private def checkHeaderColumnNames(columnNames: Array[String]): Unit = { 58 | if (columnNames != null) { 59 | val fieldNames = schema.map(_.name).toIndexedSeq 60 | val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) 61 | var errorMessage: Option[String] = None 62 | 63 | if (headerLen == schemaSize) { 64 | var i = 0 65 | while (errorMessage.isEmpty && i < headerLen) { 66 | var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) 67 | if (!caseSensitive) { 68 | // scalastyle:off caselocale 69 | nameInSchema = nameInSchema.toLowerCase 70 | nameInHeader = nameInHeader.toLowerCase 71 | // scalastyle:on caselocale 72 | } 73 | if (nameInHeader != nameInSchema) { 74 | errorMessage = Some( 75 | s"""|CSV header does not conform to the schema. 
76 | | Header: ${columnNames.mkString(", ")} 77 | | Schema: ${fieldNames.mkString(", ")} 78 | |Expected: ${fieldNames(i)} but found: ${columnNames(i)} 79 | |$source""".stripMargin) 80 | } 81 | i += 1 82 | } 83 | } else { 84 | errorMessage = Some( 85 | s"""|Number of column in CSV header is not equal to number of fields in the schema: 86 | | Header length: $headerLen, schema size: $schemaSize 87 | |$source""".stripMargin) 88 | } 89 | 90 | errorMessage.foreach { msg => 91 | if (enforceSchema) { 92 | logWarning(msg) 93 | } else { 94 | throw new IllegalArgumentException(msg) 95 | } 96 | } 97 | } 98 | } 99 | 100 | // This is currently only used to parse CSV from Dataset[String]. 101 | def checkHeaderColumnNames(line: String): Unit = { 102 | if (options.headerFlag) { 103 | val parser = new CsvParser(options.asParserSettings) 104 | checkHeaderColumnNames(parser.parseLine(line)) 105 | } 106 | } 107 | 108 | // This is currently only used to parse CSV with multiLine mode. 109 | private[csv] def checkHeaderColumnNames(tokenizer: CsvParser): Unit = { 110 | assert(options.multiLine, "This method should be executed with multiLine.") 111 | if (options.headerFlag) { 112 | val firstRecord = tokenizer.parseNext() 113 | checkHeaderColumnNames(firstRecord) 114 | } 115 | } 116 | 117 | // This is currently only used to parse CSV with non-multiLine mode. 118 | private[csv] def checkHeaderColumnNames(lines: Iterator[String], tokenizer: CsvParser): Unit = { 119 | assert(!options.multiLine, "This method should not be executed with multiline.") 120 | // Checking that column names in the header are matched to field names of the schema. 121 | // The header will be removed from lines. 122 | // Note: if there are only comments in the first block, the header would probably 123 | // be not extracted. 124 | if (options.headerFlag && isStartOfFile) { 125 | CSVExprUtils.extractHeader(lines, options).foreach { header => 126 | checkHeaderColumnNames(tokenizer.parseLine(header)) 127 | } 128 | } 129 | } 130 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import java.nio.charset.StandardCharsets 21 | import java.util.{Locale, TimeZone} 22 | 23 | import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, UnescapedQuoteHandling} 24 | import org.apache.commons.lang3.time.FastDateFormat 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql.catalyst.util._ 27 | 28 | class CSVOptions( 29 | @transient val parameters: CaseInsensitiveMap[String], 30 | val columnPruning: Boolean, 31 | defaultTimeZoneId: String, 32 | defaultColumnNameOfCorruptRecord: String) 33 | extends Logging with Serializable { 34 | 35 | def this( 36 | parameters: Map[String, String], 37 | columnPruning: Boolean, 38 | defaultTimeZoneId: String, 39 | defaultColumnNameOfCorruptRecord: String = "") = { 40 | this( 41 | CaseInsensitiveMap(parameters), 42 | columnPruning, 43 | defaultTimeZoneId, 44 | defaultColumnNameOfCorruptRecord) 45 | } 46 | 47 | private def getChar(paramName: String, default: Char): Char = { 48 | val paramValue = parameters.get(paramName) 49 | paramValue match { 50 | case None => default 51 | case Some(null) => default 52 | case Some(value) if value.length == 0 => '\u0000' 53 | case Some(value) if value.length == 1 => value.charAt(0) 54 | case _ => throw new RuntimeException(s"$paramName cannot be more than one character") 55 | } 56 | } 57 | 58 | private def getInt(paramName: String, default: Int): Int = { 59 | val paramValue = parameters.get(paramName) 60 | paramValue match { 61 | case None => default 62 | case Some(null) => default 63 | case Some(value) => try { 64 | value.toInt 65 | } catch { 66 | case e: NumberFormatException => 67 | throw new RuntimeException(s"$paramName should be an integer. Found $value") 68 | } 69 | } 70 | } 71 | 72 | private def getBool(paramName: String, default: Boolean = false): Boolean = { 73 | val param = parameters.getOrElse(paramName, default.toString) 74 | if (param == null) { 75 | default 76 | } else if (param.toLowerCase(Locale.ROOT) == "true") { 77 | true 78 | } else if (param.toLowerCase(Locale.ROOT) == "false") { 79 | false 80 | } else { 81 | throw new Exception(s"$paramName flag can be true or false") 82 | } 83 | } 84 | 85 | val delimiter = CSVExprUtils.toChar( 86 | parameters.getOrElse("sep", parameters.getOrElse("delimiter", ","))) 87 | val parseMode: ParseMode = parameters.get("mode").map(ParseMode.fromString).getOrElse(PermissiveMode) 88 | val charset = parameters.getOrElse("encoding", 89 | parameters.getOrElse("charset", StandardCharsets.UTF_8.name())) 90 | 91 | val quote = getChar("quote", '\"') 92 | val escape = getChar("escape", '\\') 93 | val charToEscapeQuoteEscaping = parameters.get("charToEscapeQuoteEscaping") match { 94 | case None => None 95 | case Some(null) => None 96 | case Some(value) if value.length == 0 => None 97 | case Some(value) if value.length == 1 => Some(value.charAt(0)) 98 | case _ => 99 | throw new RuntimeException("charToEscapeQuoteEscaping cannot be more than one character") 100 | } 101 | val comment = getChar("comment", '\u0000') 102 | 103 | val headerFlag = getBool("header") 104 | val inferSchemaFlag = getBool("inferSchema") 105 | val ignoreLeadingWhiteSpaceInRead = getBool("ignoreLeadingWhiteSpace", default = false) 106 | val ignoreTrailingWhiteSpaceInRead = getBool("ignoreTrailingWhiteSpace", default = false) 107 | 108 | // For write, both options were `true` by default. We leave it as `true` for 109 | // backwards compatibility. 
110 | val ignoreLeadingWhiteSpaceFlagInWrite = getBool("ignoreLeadingWhiteSpace", default = true) 111 | val ignoreTrailingWhiteSpaceFlagInWrite = getBool("ignoreTrailingWhiteSpace", default = true) 112 | 113 | val columnNameOfCorruptRecord = 114 | parameters.getOrElse("columnNameOfCorruptRecord", defaultColumnNameOfCorruptRecord) 115 | 116 | val nullValue = parameters.getOrElse("nullValue", "") 117 | 118 | val nanValue = parameters.getOrElse("nanValue", "NaN") 119 | 120 | val positiveInf = parameters.getOrElse("positiveInf", "Inf") 121 | val negativeInf = parameters.getOrElse("negativeInf", "-Inf") 122 | 123 | 124 | val compressionCodec: Option[String] = { 125 | val name = parameters.get("compression").orElse(parameters.get("codec")) 126 | name.map(CompressionCodecs.getCodecClassName) 127 | } 128 | 129 | val timeZone: TimeZone = DateTimeUtils.getTimeZone( 130 | parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) 131 | 132 | // Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe. 133 | val dateFormat: FastDateFormat = 134 | FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US) 135 | 136 | val timestampFormat: FastDateFormat = 137 | FastDateFormat.getInstance( 138 | parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, Locale.US) 139 | 140 | val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) 141 | 142 | val maxColumns = getInt("maxColumns", 20480) 143 | 144 | val maxCharsPerColumn = getInt("maxCharsPerColumn", -1) 145 | 146 | val escapeQuotes = getBool("escapeQuotes", true) 147 | 148 | val quoteAll = getBool("quoteAll", false) 149 | 150 | val inputBufferSize = 128 151 | 152 | val isCommentSet = this.comment != '\u0000' 153 | 154 | val samplingRatio = 155 | parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) 156 | 157 | /** 158 | * Forcibly apply the specified or inferred schema to datasource files. 159 | * If the option is enabled, headers of CSV files will be ignored. 160 | */ 161 | val enforceSchema = getBool("enforceSchema", default = true) 162 | 163 | 164 | /** 165 | * String representation of an empty value in read and in write. 166 | */ 167 | val emptyValue = parameters.get("emptyValue") 168 | /** 169 | * The string is returned when CSV reader doesn't have any characters for input value, 170 | * or an empty quoted string `""`. Default value is empty string. 171 | */ 172 | val emptyValueInRead = emptyValue.getOrElse("") 173 | /** 174 | * The value is used instead of an empty string in write. 
Default value is `""` 175 | */ 176 | val emptyValueInWrite = emptyValue.getOrElse("\"\"") 177 | 178 | def asWriterSettings: CsvWriterSettings = { 179 | val writerSettings = new CsvWriterSettings() 180 | val format = writerSettings.getFormat 181 | format.setDelimiter(delimiter) 182 | format.setQuote(quote) 183 | format.setQuoteEscape(escape) 184 | charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) 185 | format.setComment(comment) 186 | writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) 187 | writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) 188 | writerSettings.setNullValue(nullValue) 189 | writerSettings.setEmptyValue(emptyValueInWrite) 190 | writerSettings.setSkipEmptyLines(true) 191 | writerSettings.setQuoteAllFields(quoteAll) 192 | writerSettings.setQuoteEscapingEnabled(escapeQuotes) 193 | writerSettings 194 | } 195 | 196 | def asParserSettings: CsvParserSettings = { 197 | val settings = new CsvParserSettings() 198 | val format = settings.getFormat 199 | format.setDelimiter(delimiter) 200 | format.setQuote(quote) 201 | format.setQuoteEscape(escape) 202 | charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) 203 | format.setComment(comment) 204 | settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) 205 | settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) 206 | settings.setReadInputOnSeparateThread(false) 207 | settings.setInputBufferSize(inputBufferSize) 208 | settings.setMaxColumns(maxColumns) 209 | settings.setNullValue(nullValue) 210 | settings.setEmptyValue(emptyValueInRead) 211 | settings.setMaxCharsPerColumn(maxCharsPerColumn) 212 | settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) 213 | settings.setLineSeparatorDetectionEnabled(multiLine == true) 214 | 215 | // This is for handling a header, so we'll just blindly skip 216 | if (headerFlag) { 217 | settings.setNumberOfRowsToSkip(1) 218 | } 219 | settings 220 | } 221 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.csv 19 | 20 | import java.io.{InputStream, StringReader} 21 | import java.math.BigDecimal 22 | 23 | import com.univocity.parsers.csv.CsvParser 24 | import org.apache.spark.internal.Logging 25 | import org.apache.spark.sql.catalyst.InternalRow 26 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 27 | import org.apache.spark.sql.catalyst.util.{BadRecordException, DateTimeUtils, FailureSafeParser} 28 | import org.apache.spark.sql.types._ 29 | import org.apache.spark.unsafe.types.UTF8String 30 | 31 | import scala.util.Try 32 | import scala.util.control.NonFatal 33 | 34 | 35 | /** 36 | * Constructs a parser for a given schema that translates CSV data to an [[InternalRow]]. 37 | * 38 | * @param dataSchema The CSV data schema that is specified by the user, or inferred from underlying 39 | * data files. 40 | * @param requiredSchema The schema of the data that should be output for each row. This should be a 41 | * subset of the columns in dataSchema. 42 | * @param options Configuration options for a CSV parser. 43 | */ 44 | class UnivocityParser( 45 | dataSchema: StructType, 46 | requiredSchema: StructType, 47 | val options: CSVOptions) extends Logging { 48 | require(requiredSchema.toSet.subsetOf(dataSchema.toSet), 49 | s"requiredSchema (${requiredSchema.catalogString}) should be the subset of " + 50 | s"dataSchema (${dataSchema.catalogString}).") 51 | 52 | def this(schema: StructType, options: CSVOptions) = this(schema, schema, options) 53 | 54 | // A `ValueConverter` is responsible for converting the given value to a desired type. 55 | private type ValueConverter = String => Any 56 | 57 | // This index is used to reorder parsed tokens 58 | private val tokenIndexArr = 59 | requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f))).toArray 60 | 61 | // When column pruning is enabled, the parser only parses the required columns based on 62 | // their positions in the data schema. 63 | private val parsedSchema = if (options.columnPruning) requiredSchema else dataSchema 64 | 65 | val tokenizer = { 66 | val parserSetting = options.asParserSettings 67 | // When to-be-parsed schema is shorter than the to-be-read data schema, we let Univocity CSV 68 | // parser select a sequence of fields for reading by their positions. 69 | // if (options.columnPruning && requiredSchema.length < dataSchema.length) { 70 | if (parsedSchema.length < dataSchema.length) { 71 | parserSetting.selectIndexes(tokenIndexArr: _*) 72 | } 73 | new CsvParser(parserSetting) 74 | } 75 | 76 | private val row = new GenericInternalRow(requiredSchema.length) 77 | 78 | // Retrieve the raw record string. 79 | private def getCurrentInput: UTF8String = { 80 | UTF8String.fromString(tokenizer.getContext.currentParsedContent().stripLineEnd) 81 | } 82 | 83 | // This parser first picks some tokens from the input tokens, according to the required schema, 84 | // then parse these tokens and put the values in a row, with the order specified by the required 85 | // schema. 86 | // 87 | // For example, let's say there is CSV data as below: 88 | // 89 | // a,b,c 90 | // 1,2,A 91 | // 92 | // So the CSV data schema is: ["a", "b", "c"] 93 | // And let's say the required schema is: ["c", "b"] 94 | // 95 | // with the input tokens, 96 | // 97 | // input tokens - [1, 2, "A"] 98 | // 99 | // Each input token is placed in each output row's position by mapping these. 
In this case, 100 | // 101 | // output row - ["A", 2] 102 | private val valueConverters: Array[ValueConverter] = { 103 | requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray 104 | } 105 | 106 | /** 107 | * Create a converter which converts the string value to a value according to a desired type. 108 | * Currently, we do not support complex types (`ArrayType`, `MapType`, `StructType`). 109 | * 110 | * For other nullable types, returns null if it is null or equals to the value specified 111 | * in `nullValue` option. 112 | */ 113 | def makeConverter( 114 | name: String, 115 | dataType: DataType, 116 | nullable: Boolean = true, 117 | options: CSVOptions): ValueConverter = dataType match { 118 | case _: ByteType => (d: String) => 119 | nullSafeDatum(d, name, nullable, options)(_.toByte) 120 | 121 | case _: ShortType => (d: String) => 122 | nullSafeDatum(d, name, nullable, options)(_.toShort) 123 | 124 | case _: IntegerType => (d: String) => 125 | nullSafeDatum(d, name, nullable, options)(_.toInt) 126 | 127 | case _: LongType => (d: String) => 128 | nullSafeDatum(d, name, nullable, options)(_.toLong) 129 | 130 | case _: FloatType => (d: String) => 131 | nullSafeDatum(d, name, nullable, options) { 132 | case options.nanValue => Float.NaN 133 | case options.negativeInf => Float.NegativeInfinity 134 | case options.positiveInf => Float.PositiveInfinity 135 | case datum => datum.toFloat 136 | } 137 | 138 | case _: DoubleType => (d: String) => 139 | nullSafeDatum(d, name, nullable, options) { 140 | case options.nanValue => Double.NaN 141 | case options.negativeInf => Double.NegativeInfinity 142 | case options.positiveInf => Double.PositiveInfinity 143 | case datum => datum.toDouble 144 | } 145 | 146 | case _: BooleanType => (d: String) => 147 | nullSafeDatum(d, name, nullable, options)(_.toBoolean) 148 | 149 | case dt: DecimalType => (d: String) => 150 | nullSafeDatum(d, name, nullable, options) { datum => 151 | val value = new BigDecimal(datum.replaceAll(",", "")) 152 | Decimal(value, dt.precision, dt.scale) 153 | } 154 | 155 | case _: TimestampType => (d: String) => 156 | nullSafeDatum(d, name, nullable, options) { datum => 157 | // This one will lose microseconds parts. 158 | // See https://issues.apache.org/jira/browse/SPARK-10681. 159 | Try(options.timestampFormat.parse(datum).getTime * 1000L) 160 | } 161 | 162 | case _: DateType => (d: String) => 163 | nullSafeDatum(d, name, nullable, options) { datum => 164 | // This one will lose microseconds parts. 
165 | // See https://issues.apache.org/jira/browse/SPARK-10681.x 166 | Try(DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime)) 167 | } 168 | 169 | case _: StringType => (d: String) => 170 | nullSafeDatum(d, name, nullable, options)(UTF8String.fromString) 171 | 172 | case udt: UserDefinedType[_] => (datum: String) => 173 | makeConverter(name, udt.sqlType, nullable, options) 174 | 175 | // We don't actually hit this exception though, we keep it for understandability 176 | case _ => throw new RuntimeException(s"Unsupported type: ${dataType.typeName}") 177 | } 178 | 179 | private def nullSafeDatum( 180 | datum: String, 181 | name: String, 182 | nullable: Boolean, 183 | options: CSVOptions)(converter: ValueConverter): Any = { 184 | if (datum == options.nullValue || datum == null) { 185 | if (!nullable) { 186 | throw new RuntimeException(s"null value found but field $name is not nullable.") 187 | } 188 | null 189 | } else { 190 | converter.apply(datum) 191 | } 192 | } 193 | 194 | /** 195 | * Parses a single CSV string and turns it into either one resulting row or no row (if the 196 | * the record is malformed). 197 | */ 198 | // We are going to change this to handle headers 199 | def parse(input: String): InternalRow = convert( 200 | Try {tokenizer.parseAll(new StringReader(input)).get(0)}.getOrElse(null) 201 | ) 202 | 203 | private val getToken = if (options.columnPruning) { 204 | (tokens: Array[String], index: Int) => tokens(index) 205 | } else { 206 | (tokens: Array[String], index: Int) => tokens(tokenIndexArr(index)) 207 | } 208 | 209 | private def convert(tokens: Array[String]): InternalRow = { 210 | if (tokens == null) { 211 | throw BadRecordException( 212 | () => getCurrentInput, 213 | () => None, 214 | new RuntimeException("Malformed CSV record")) 215 | } else if (tokens.length != parsedSchema.length) { 216 | // If the number of tokens doesn't match the schema, we should treat it as a malformed record. 217 | // However, we still have chance to parse some of the tokens, by adding extra null tokens in 218 | // the tail if the number is smaller, or by dropping extra tokens if the number is larger. 219 | val checkedTokens = if (parsedSchema.length > tokens.length) { 220 | tokens ++ new Array[String](parsedSchema.length - tokens.length) 221 | } else { 222 | tokens.take(parsedSchema.length) 223 | } 224 | def getPartialResult(): Option[InternalRow] = { 225 | try { 226 | Some(convert(checkedTokens)) 227 | } catch { 228 | case _: BadRecordException => None 229 | } 230 | } 231 | // For records with less or more tokens than the schema, tries to return partial results 232 | // if possible. 233 | throw BadRecordException( 234 | () => getCurrentInput, 235 | () => getPartialResult(), 236 | new RuntimeException("Malformed CSV record")) 237 | } else { 238 | try { 239 | // When the length of the returned tokens is identical to the length of the parsed schema, 240 | // we just need to convert the tokens that correspond to the required columns. 241 | var i = 0 242 | while (i < requiredSchema.length) { 243 | row(i) = valueConverters(i).apply(getToken(tokens, i)) 244 | i += 1 245 | } 246 | row 247 | } catch { 248 | case NonFatal(e) => 249 | // For corrupted records with the number of tokens same as the schema, 250 | // CSV reader doesn't support partial results. All fields other than the field 251 | // configured by `columnNameOfCorruptRecord` are set to `null`. 
252 | throw BadRecordException(() => getCurrentInput, () => None, e) 253 | } 254 | } 255 | } 256 | } 257 | 258 | private[sql] object UnivocityParser { 259 | 260 | /** 261 | * Parses a stream that contains CSV strings and turns it into an iterator of tokens. 262 | */ 263 | def tokenizeStream( 264 | inputStream: InputStream, 265 | shouldDropHeader: Boolean, 266 | tokenizer: CsvParser): Iterator[Array[String]] = { 267 | val handleHeader: () => Unit = 268 | () => if (shouldDropHeader) tokenizer.parseNext 269 | 270 | convertStream(inputStream, tokenizer, handleHeader)(tokens => tokens) 271 | } 272 | 273 | /** 274 | * Parses a stream that contains CSV strings and turns it into an iterator of rows. 275 | */ 276 | def parseStream( 277 | inputStream: InputStream, 278 | parser: UnivocityParser, 279 | headerChecker: CSVHeaderChecker, 280 | schema: StructType): Iterator[InternalRow] = { 281 | val tokenizer = parser.tokenizer 282 | val safeParser = new FailureSafeParser[Array[String]]( 283 | input => Seq(parser.convert(input)), 284 | parser.options.parseMode, 285 | schema, 286 | parser.options.columnNameOfCorruptRecord, 287 | parser.options.multiLine) 288 | 289 | val handleHeader: () => Unit = 290 | () => headerChecker.checkHeaderColumnNames(tokenizer) 291 | 292 | convertStream(inputStream, tokenizer, handleHeader) { tokens => 293 | safeParser.parse(tokens) 294 | }.flatten 295 | } 296 | 297 | private def convertStream[T]( 298 | inputStream: InputStream, 299 | tokenizer: CsvParser, 300 | handleHeader: () => Unit)( 301 | convert: Array[String] => T) = new Iterator[T] { 302 | tokenizer.beginParsing(inputStream) 303 | 304 | // We can handle header here since here the stream is open. 305 | handleHeader() 306 | 307 | private var nextRecord = tokenizer.parseNext() 308 | 309 | override def hasNext: Boolean = nextRecord != null 310 | 311 | override def next(): T = { 312 | if (!hasNext) { 313 | throw new NoSuchElementException("End of stream") 314 | } 315 | val curRecord = convert(nextRecord) 316 | nextRecord = tokenizer.parseNext() 317 | curRecord 318 | } 319 | } 320 | 321 | /** 322 | * Parses an iterator that contains CSV strings and turns it into an iterator of rows. 323 | */ 324 | def parseIterator( 325 | lines: Iterator[String], 326 | parser: UnivocityParser, 327 | headerChecker: CSVHeaderChecker, 328 | schema: StructType): Iterator[InternalRow] = { 329 | headerChecker.checkHeaderColumnNames(lines, parser.tokenizer) 330 | 331 | val options = parser.options 332 | 333 | val filteredLines: Iterator[String] = CSVExprUtils.filterCommentAndEmpty(lines, options) 334 | 335 | val safeParser = new FailureSafeParser[String]( 336 | input => Seq(parser.parse(input)), 337 | parser.options.parseMode, 338 | schema, 339 | parser.options.columnNameOfCorruptRecord, 340 | parser.options.multiLine) 341 | filteredLines.flatMap(safeParser.parse) 342 | } 343 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.expressions 19 | 20 | import org.apache.spark.sql.AnalysisException 21 | import org.apache.spark.sql.catalyst.util.ArrayBasedMapData 22 | import org.apache.spark.sql.types.{MapType, StringType, StructType} 23 | 24 | object ExprUtils { 25 | 26 | def evalSchemaExpr(exp: Expression): StructType = exp match { 27 | case Literal(s, StringType) => StructType.fromDDL(s.toString) 28 | case e => throw new AnalysisException( 29 | s"Schema should be specified in DDL format as a string literal instead of ${e.sql}") 30 | } 31 | 32 | def convertToMapData(exp: Expression): Map[String, String] = exp match { 33 | case m: CreateMap 34 | if m.dataType.acceptsType(MapType(StringType, StringType, valueContainsNull = false)) => 35 | val arrayMap = m.eval().asInstanceOf[ArrayBasedMapData] 36 | ArrayBasedMapData.toScalaMap(arrayMap).map { case (key, value) => 37 | key.toString -> value.toString 38 | } 39 | case m: CreateMap => 40 | throw new AnalysisException( 41 | s"A type of keys and values in map() must be string, but got ${m.dataType.catalogString}") 42 | case _ => 43 | throw new AnalysisException("Must use a map() function for options") 44 | } 45 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.expressions 19 | 20 | import java.io.StringReader 21 | 22 | import org.apache.spark.sql.AnalysisException 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.csv._ 25 | import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback 26 | import org.apache.spark.sql.catalyst.util._ 27 | import org.apache.spark.sql.types._ 28 | import org.apache.spark.unsafe.types.UTF8String 29 | 30 | /** 31 | * Converts a CSV input string to a [[StructType]] with the specified schema. 
32 | */ 33 | // scalastyle:off line.size.limit 34 | @ExpressionDescription( 35 | usage = "_FUNC_(csvStr, schema[, options]) - Returns a struct value with the given `csvStr` and `schema`.", 36 | examples = """ 37 | Examples: 38 | > SELECT _FUNC_('1, 0.8', 'a INT, b DOUBLE'); 39 | {"a":1, "b":0.8} 40 | > SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) 41 | {"time":2015-08-26 00:00:00.0} 42 | """, 43 | since = "3.0.0") 44 | // scalastyle:on line.size.limit 45 | case class CsvToStructs( 46 | schema: StructType, 47 | options: Map[String, String], 48 | child: Expression, 49 | timeZoneId: Option[String] = None) 50 | extends UnaryExpression 51 | with TimeZoneAwareExpression 52 | with CodegenFallback 53 | with ExpectsInputTypes 54 | with NullIntolerant { 55 | 56 | override def nullable: Boolean = child.nullable 57 | 58 | // The CSV input data might be missing certain fields. We force the nullability 59 | // of the user-provided schema to avoid data corruptions. 60 | val nullableSchema: StructType = schema.asNullable 61 | 62 | // Used in `FunctionRegistry` 63 | def this(child: Expression, schema: Expression, options: Map[String, String]) = 64 | this( 65 | schema = ExprUtils.evalSchemaExpr(schema), 66 | options = options, 67 | child = child, 68 | timeZoneId = None) 69 | 70 | def this(child: Expression, schema: Expression) = this(child, schema, Map.empty[String, String]) 71 | 72 | def this(child: Expression, schema: Expression, options: Expression) = 73 | this( 74 | schema = ExprUtils.evalSchemaExpr(schema), 75 | options = ExprUtils.convertToMapData(options), 76 | child = child, 77 | timeZoneId = None) 78 | 79 | // This converts parsed rows to the desired output by the given schema. 80 | @transient 81 | lazy val converter = (rows: Iterator[InternalRow]) => { 82 | if (rows.hasNext) { 83 | val result = rows.next() 84 | // CSV's parser produces one record only. 85 | assert(!rows.hasNext) 86 | result 87 | } else { 88 | throw new IllegalArgumentException("Expected one row from CSV parser.") 89 | } 90 | } 91 | 92 | @transient lazy val parser = { 93 | val parsedOptions = new CSVOptions(options, columnPruning = true, timeZoneId.get) 94 | val mode = parsedOptions.parseMode 95 | if (mode != PermissiveMode && mode != FailFastMode) { 96 | throw new AnalysisException(s"from_csv() doesn't support the ${mode.name} mode. 
" + 97 | s"Acceptable modes are ${PermissiveMode.name} and ${FailFastMode.name}.") 98 | } 99 | val actualSchema = 100 | StructType(nullableSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 101 | val rawParser = new UnivocityParser(actualSchema, actualSchema, parsedOptions) 102 | new FailureSafeParser[String]( 103 | input => { 104 | Seq(rawParser.parse(input)) 105 | }, 106 | mode, 107 | nullableSchema, 108 | parsedOptions.columnNameOfCorruptRecord, 109 | parsedOptions.multiLine) 110 | } 111 | 112 | override def dataType: DataType = nullableSchema 113 | 114 | override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { 115 | copy(timeZoneId = Option(timeZoneId)) 116 | } 117 | 118 | override def nullSafeEval(input: Any): Any = { 119 | val csv = input.asInstanceOf[UTF8String].toString 120 | converter(parser.parse(csv)) 121 | } 122 | 123 | override def inputTypes: Seq[AbstractDataType] = StringType :: Nil 124 | 125 | override def prettyName: String = "from_csv" 126 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.util 19 | 20 | import org.apache.spark.SparkException 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 23 | import org.apache.spark.sql.types.StructType 24 | import org.apache.spark.unsafe.types.UTF8String 25 | 26 | class FailureSafeParser[IN]( 27 | rawParser: IN => Seq[InternalRow], 28 | mode: ParseMode, 29 | schema: StructType, 30 | columnNameOfCorruptRecord: String, 31 | isMultiLine: Boolean) { 32 | 33 | private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) 34 | private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) 35 | private val resultRow = new GenericInternalRow(schema.length) 36 | private val nullResult = new GenericInternalRow(schema.length) 37 | 38 | // This function takes 2 parameters: an optional partial result, and the bad record. If the given 39 | // schema doesn't contain a field for corrupted record, we just return the partial result or a 40 | // row with all fields null. If the given schema contains a field for corrupted record, we will 41 | // set the bad record to this field, and set other fields according to the partial result or null. 
42 | private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { 43 | if (corruptFieldIndex.isDefined) { 44 | (row, badRecord) => { 45 | var i = 0 46 | while (i < actualSchema.length) { 47 | val from = actualSchema(i) 48 | resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull 49 | i += 1 50 | } 51 | resultRow(corruptFieldIndex.get) = badRecord() 52 | resultRow 53 | } 54 | } else { 55 | (row, _) => row.getOrElse(nullResult) 56 | } 57 | } 58 | 59 | private val skipParsing = !isMultiLine && mode == PermissiveMode && schema.isEmpty 60 | 61 | def parse(input: IN): Iterator[InternalRow] = { 62 | try { 63 | if (skipParsing) { 64 | Iterator.single(InternalRow.empty) 65 | } else { 66 | rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) 67 | } 68 | } catch { 69 | case e: BadRecordException => mode match { 70 | case PermissiveMode => 71 | Iterator(toResultRow(e.partialResult(), e.record)) 72 | case DropMalformedMode => 73 | Iterator.empty 74 | case FailFastMode => 75 | throw new SparkException("Malformed records are detected in record parsing. " + 76 | s"Parse Mode: ${FailFastMode.name}.", e.cause) 77 | } 78 | } 79 | } 80 | } -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/SparkSuitBase.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.FunSuite 5 | 6 | abstract class SparkSuiteBase extends FunSuite { 7 | lazy val sparkContext = SparkSuiteBase.sparkContext 8 | 9 | } 10 | 11 | object SparkSuiteBase { 12 | private val master = "local[*]" 13 | private val appName = "data_load_testing" 14 | private lazy val sparkContext: SparkSession = new SparkSession.Builder().appName(appName).master(master).getOrCreate() 15 | 16 | } 17 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/TaxiFareMapperTester.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import org.scalatest.Matchers 4 | 5 | import scala.util.{Failure, Success, Try} 6 | 7 | class TaxiFareMapperTester extends SparkSuiteBase with Matchers { 8 | 9 | /* 10 | test("should_map_fare_pickup_time_to_ride_pickup_time_format") { 11 | val expected = "2013-01-01T00:04:27+00:00" 12 | val inputFarePickUptime = "2013-01-01 00:04:27" 13 | val actual = TaxiFareMapper.mapFarePickUpTimeToRidePickUpTimeFormat(inputFarePickUptime) 14 | assert(actual.contentEquals(expected)) 15 | } 16 | 17 | test("csv string with first line comma separated header fields and second line comma separated value fields should be a valid ") { 18 | 19 | val inputString = "header1,header2,header3\nvalue1,value2,value3" 20 | 21 | var shouldSetTrueInSuccessCase = false 22 | TaxiFareMapper.validateHeaderEmbededCsvString(inputString) match { 23 | 24 | case Success(_) => shouldSetTrueInSuccessCase = true 25 | case Failure(_) => shouldSetTrueInSuccessCase = false 26 | 27 | } 28 | assert(shouldSetTrueInSuccessCase) 29 | } 30 | 31 | test("csv string with only comma separated value fields and no header fields should be a invalid ") { 32 | val inputString = "value1,value2,value3" 33 | 34 | var shouldSetTrueInFailureCase = false 35 | TaxiFareMapper.validateHeaderEmbededCsvString(inputString) match { 36 | case Success(_) => 
shouldSetTrueInFailureCase = false 37 | case Failure(_) => shouldSetTrueInFailureCase = true 38 | } 39 | assert(shouldSetTrueInFailureCase) 40 | } 41 | 42 | test("csv content with less than 11 fields") { 43 | val invalidCsvContent = "2013000717,2013000714,CMT,2013-01-01 00:04:27,CRD,8.5,0.5,0.5,2.37,0" 44 | var shouldSetTrueInFailureCase = false 45 | var actualErrorMessage = "" 46 | 47 | Try(TaxiFareMapper.mapCsvToTaxiFare(invalidCsvContent)) match { 48 | case Success(_) => shouldSetTrueInFailureCase = false 49 | case Failure(exception) => 50 | shouldSetTrueInFailureCase = true 51 | actualErrorMessage = exception.getMessage 52 | } 53 | 54 | val expectedErrorMessage = TaxiFareMapper.invalidTaxiFareCsv 55 | assert(shouldSetTrueInFailureCase) 56 | assert(expectedErrorMessage.contentEquals(actualErrorMessage)) 57 | } 58 | */ 59 | } 60 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/com/microsoft/pnp/TaxiRideMapperTester.scala: -------------------------------------------------------------------------------- 1 | package com.microsoft.pnp 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types.{StringType, StructType, TimestampType} 7 | import org.scalatest.{BeforeAndAfterEach, Matchers} 8 | import org.slf4j.LoggerFactory 9 | 10 | import scala.util.{Failure, Success} 11 | 12 | class TaxiRideMapperTester extends SparkSuiteBase with Matchers with BeforeAndAfterEach { 13 | 14 | 15 | val logger = LoggerFactory.getLogger("TaxiRideMapperTester") 16 | 17 | override def beforeEach(): Unit = { } 18 | 19 | override def afterEach() { } 20 | 21 | /* 22 | test("it should parse valid json and match mapJsonToTaxiRide success case") { 23 | logger.info("it should parse valid json and match mapJsonToTaxiRide success case") 24 | val taxiRideJsonString = "{\"rateCode\":1,\"storeAndForwardFlag\":\"N\",\"dropoffTime\":\"2013-01-01T00:11:20+00:00\",\"passengerCount\":1,\"tripTimeInSeconds\":413.0,\"tripDistanceInMiles\":2.3,\"pickupLon\":-73.97912,\"pickupLat\":40.7623177,\"dropoffLon\":-73.95027,\"dropoffLat\":40.77126,\"medallion\":2013000717,\"hackLicense\":2013000714,\"vendorId\":\"CMT\",\"pickupTime\":\"2013-01-01T00:04:27+00:00\"}" 25 | 26 | var shouldMapToTaxiRide = false 27 | TaxiRideMapper.mapJsonToTaxiRide(taxiRideJsonString) match { 28 | case Success(_) => shouldMapToTaxiRide = true 29 | case Failure(_) => shouldMapToTaxiRide = false 30 | } 31 | 32 | assert(shouldMapToTaxiRide) 33 | println(1) 34 | } 35 | 36 | test("it should parse corrupted taxi ride json and match mapJsonToTaxiRide failure case") { 37 | val taxiRideJsonString = "{\"menu\": {\n \"id\": \"file\",\n \"value\": \"File\",\n \"popup\": {\n \"menuitem\": [\n {\"value\": \"New\", \"onclick\": \"CreateNewDoc()\"},\n {\"value\": \"Open\", \"onclick\": \"OpenDoc()\"},\n {\"value\": \"Close\", \"onclick\": \"CloseDoc()\"}\n ]\n }\n}}" 38 | val expected = "0_0_null_null" 39 | var taxiRide: TaxiRide = null 40 | TaxiRideMapper.mapJsonToTaxiRide(taxiRideJsonString) match { 41 | case Success(value) => taxiRide = value 42 | case Failure(_) => 43 | } 44 | 45 | assert(taxiRide.key.contentEquals(expected)) 46 | println(2) 47 | } 48 | 49 | test("it should parse valid json and match validateJsonString success case") { 50 | val validJson = "{\n \"fruit\": \"Apple\",\n \"size\": \"Large\",\n \"color\": \"Red\"\n}" 51 | 52 | var shouldValidateToTrue = false 53 | TaxiRideMapper.validateJsonString(validJson) match { 54 | case Success(_) 
=> shouldValidateToTrue = true 55 | case Failure(_) => shouldValidateToTrue = false 56 | } 57 | 58 | assert(shouldValidateToTrue) 59 | println(3) 60 | } 61 | 62 | test("it should parse invalid json and match validateJsonString failure case") { 63 | val invalidJson = "some invalid json string" 64 | 65 | var shouldValidateToTrue = false 66 | TaxiRideMapper.validateJsonString(invalidJson) match { 67 | case Success(_) => shouldValidateToTrue = true 68 | case Failure(_) => shouldValidateToTrue = false 69 | } 70 | 71 | assert(!shouldValidateToTrue) 72 | println(4) 73 | } 74 | 75 | test("it should map a valid taxi ride json to a valid enrichedtaxi ride record") { 76 | val rideContent = "{\"rateCode\":1,\"storeAndForwardFlag\":\"N\",\"dropoffTime\":\"2013-01-01T00:11:20+00:00\",\"passengerCount\":1,\"tripTimeInSeconds\":413.0,\"tripDistanceInMiles\":2.3,\"pickupLon\":-73.97912,\"pickupLat\":40.7623177,\"dropoffLon\":-73.95027,\"dropoffLat\":40.77126,\"medallion\":2013000717,\"hackLicense\":2013000714,\"vendorId\":\"CMT\",\"pickupTime\":\"2013-01-01T00:04:27+00:00\"}" 77 | val recordIngestedTime = "2018-08-23 12:44:19.818" 78 | 79 | val rideDataFrameSchema = new StructType() 80 | .add("rideContent", StringType, true) 81 | .add("recordIngestedTime", TimestampType, true) 82 | 83 | val rideData = Seq( 84 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 85 | ) 86 | 87 | import sparkContext.implicits._ 88 | 89 | val rideDataFrame = sparkContext.createDataFrame( 90 | sparkContext.sparkContext.parallelize(rideData), 91 | rideDataFrameSchema 92 | ) 93 | 94 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 95 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 96 | 97 | val expectedCount = 1 98 | val actualCount = enrichedTaxiRideRecords.count() 99 | 100 | assert(actualCount == expectedCount) 101 | println(5) 102 | } 103 | 104 | test("it should map a invalid json string to a invalid enriched taxi ride record") { 105 | val rideContent = "some invalid json string" 106 | val recordIngestedTime = "2018-08-23 12:44:19.818" 107 | 108 | val rideDataFrameSchema = new StructType() 109 | .add("rideContent", StringType, true) 110 | .add("recordIngestedTime", TimestampType, true) 111 | 112 | import sparkContext.implicits._ 113 | val rideData = Seq( 114 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 115 | ) 116 | 117 | val rideDataFrame = sparkContext.createDataFrame( 118 | sparkContext.sparkContext.parallelize(rideData), 119 | rideDataFrameSchema 120 | ) 121 | 122 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 123 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 124 | 125 | val expectedCount = 0 126 | val actualCount = enrichedTaxiRideRecords.count() 127 | 128 | assert(actualCount == expectedCount) 129 | println(6) 130 | } 131 | 132 | test("it should map a valid json string but a corrupted taxiride string to a invalid enriched taxi ride record") { 133 | val rideContent = "{\"menu\": {\n \"id\": \"file\",\n \"value\": \"File\",\n \"popup\": {\n \"menuitem\": [\n {\"value\": \"New\", \"onclick\": \"CreateNewDoc()\"},\n {\"value\": \"Open\", \"onclick\": \"OpenDoc()\"},\n {\"value\": \"Close\", \"onclick\": \"CloseDoc()\"}\n ]\n }\n}}" 134 | val recordIngestedTime = "2018-08-23 12:44:19.818" 135 | 136 | val rideDataFrameSchema = new StructType() 137 | .add("rideContent", StringType, true) 138 | .add("recordIngestedTime", TimestampType, true) 139 | 140 | val 
rideData = Seq( 141 | Row(rideContent, Timestamp.valueOf(recordIngestedTime)) 142 | ) 143 | 144 | import sparkContext.implicits._ 145 | 146 | val rideDataFrame = sparkContext.createDataFrame( 147 | sparkContext.sparkContext.parallelize(rideData), 148 | rideDataFrameSchema 149 | ) 150 | 151 | val enrichedTaxiRideRecords = rideDataFrame.map(row => TaxiRideMapper.mapRowToEncrichedTaxiRideRecord(row)) 152 | .filter(x => x.isValidRecord).as[EnrichedTaxiDataRecord] 153 | 154 | val expectedCount = 0 155 | val actualCount = enrichedTaxiRideRecords.count() 156 | 157 | assert(actualCount == expectedCount) 158 | println(7) 159 | } 160 | */ 161 | } 162 | -------------------------------------------------------------------------------- /azure/AzureDataBricksJob/src/test/scala/org/apache/spark/sql/streaming/UtilsTests.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.streaming 2 | 3 | import java.util.HashMap 4 | import java.util.UUID.randomUUID 5 | 6 | import com.microsoft.pnp.{SparkSuiteBase, Utils} 7 | import org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent 8 | import org.scalatest.Matchers 9 | 10 | class UtilsTests[sql] extends SparkSuiteBase with Matchers { 11 | 12 | test("should_parse_queryprogress_telemetry") { 13 | val guid = randomUUID() 14 | val duration: java.util.Map[String, java.lang.Long] = new HashMap[String, java.lang.Long] 15 | val eventTime: java.util.Map[String, String] = new HashMap[String, String] 16 | 17 | duration.put("addBatch", 100L) 18 | duration.put("getBatch", 200L) 19 | val source: SourceProgress = new SourceProgress("source", "start", "end", 100, 200, 300) 20 | val sourcearr = new Array[SourceProgress](1) 21 | sourcearr(0) = source 22 | 23 | val progressEvent = new QueryProgressEvent( 24 | new StreamingQueryProgress( 25 | guid, guid, 26 | "streamTest", "time", 27 | 10, 10, duration, 28 | eventTime, 29 | null, sourcearr, null, null 30 | ) 31 | ) 32 | 33 | val metrics = Utils.parsePayload(progressEvent) 34 | assert(progressEvent.progress.id === metrics.get("id")) 35 | assert(progressEvent.progress.numInputRows === metrics.get("inputRows")) 36 | assert(progressEvent.progress.processedRowsPerSecond === metrics.get("procRowsPerSecond")) 37 | assert(progressEvent.progress.inputRowsPerSecond === metrics.get("inputRowsPerSecond")) 38 | assert(progressEvent.progress.durationMs.get("addBatch") === 39 | metrics.get("durationms").asInstanceOf[HashMap[String, AnyRef]].get("addBatch")) 40 | assert(progressEvent.progress.durationMs.get("getBatch") === 41 | metrics.get("durationms").asInstanceOf[HashMap[String, AnyRef]].get("getBatch")) 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /azure/deployresources.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "eventHubNamespace": { 6 | "type": "string" 7 | }, 8 | "databricksWorkspaceName": { 9 | "type":"string" 10 | }, 11 | "cosmosDatabaseAccount": { 12 | "type": "string" 13 | }, 14 | "logAnalyticsWorkspaceName": { 15 | "type": "string" 16 | }, 17 | "logAnalyticsWorkspaceRegion": { 18 | "type": "string" 19 | } 20 | }, 21 | "variables": { 22 | "eventHubNames": [ 23 | "taxi-ride-eh", 24 | "taxi-fare-eh" 25 | ], 26 | "copy": [ 27 | { 28 | "name": "eventHubs", 29 | "count": 
"[length(variables('eventHubNames'))]", 30 | "input": { 31 | "name": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')])]", 32 | "consumerGroupName": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')], '/', concat(variables('eventHubNames')[copyIndex('eventHubs')], '-cg'))]", 33 | "authorizationRuleName": "[concat(parameters('eventHubNamespace'), '/', variables('eventHubNames')[copyIndex('eventHubs')], '/', concat(variables('eventHubNames')[copyIndex('eventHubs')], '-ap'))]", 34 | "authorizationRuleResourceId": "[resourceId('Microsoft.EventHub/namespaces/eventHubs/authorizationRules', parameters('eventHubNamespace'), variables('eventHubNames')[copyIndex('eventHubs')], concat(variables('eventHubNames')[copyIndex('eventHubs')], '-ap'))]" 35 | } 36 | } 37 | ], 38 | "dataBricksResourceGroup": "[concat(resourceGroup().name, '-', parameters('databricksWorkspaceName'), '-', uniqueString(resourceGroup().name, '-', parameters('databricksWorkspaceName')))]" 39 | }, 40 | "resources": [ 41 | { 42 | "type": "Microsoft.Operationalinsights/workspaces", 43 | "name": "[parameters('logAnalyticsWorkspaceName')]", 44 | "apiVersion": "2015-11-01-preview", 45 | "location": "[parameters('logAnalyticsWorkspaceRegion')]", 46 | "properties": { 47 | "sku": { 48 | "name": "pergb2018" 49 | }, 50 | "retentionInDays": 30 51 | } 52 | }, 53 | { 54 | "type": "Microsoft.EventHub/namespaces", 55 | "name": "[parameters('eventHubNamespace')]", 56 | "apiVersion": "2017-04-01", 57 | "location": "[resourceGroup().location]", 58 | "sku": { 59 | "name": "Standard", 60 | "tier": "Standard" 61 | } 62 | }, 63 | { 64 | "type": "Microsoft.EventHub/namespaces/eventhubs", 65 | "name": "[variables('eventHubs')[copyIndex()].name]", 66 | "apiVersion": "2017-04-01", 67 | "copy": { 68 | "count": "[length(variables('eventHubs'))]", 69 | "mode": "Parallel", 70 | "name": "eventHubs" 71 | }, 72 | "properties": { 73 | "messageRetentionInDays": 3, 74 | "partitionCount": 8 75 | }, 76 | "dependsOn": [ 77 | "[parameters('eventHubNamespace')]" 78 | ] 79 | }, 80 | { 81 | "type": "Microsoft.EventHub/namespaces/eventhubs/consumergroups", 82 | "name": "[variables('eventHubs')[copyIndex()].consumerGroupName]", 83 | "apiVersion": "2017-04-01", 84 | "copy": { 85 | "count": "[length(variables('eventHubs'))]", 86 | "mode": "Parallel", 87 | "name": "consumerGroups" 88 | }, 89 | "properties": {}, 90 | "dependsOn": [ 91 | "eventHubs" 92 | ] 93 | }, 94 | { 95 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 96 | "name": "[variables('eventHubs')[copyIndex()].authorizationRuleName]", 97 | "apiVersion": "2017-04-01", 98 | "copy": { 99 | "count": "[length(variables('eventHubs'))]", 100 | "mode": "Parallel", 101 | "name": "authorizationRules" 102 | }, 103 | "properties": { 104 | "rights": [ 105 | "Listen", 106 | "Send" 107 | ] 108 | }, 109 | "dependsOn": [ 110 | "consumerGroups" 111 | ] 112 | }, 113 | { 114 | "type": "Microsoft.Databricks/workspaces", 115 | "name": "[parameters('databricksWorkspaceName')]", 116 | "location": "[resourceGroup().location]", 117 | "apiVersion": "2018-04-01", 118 | "sku": { 119 | "name": "premium" 120 | }, 121 | "properties": { 122 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('dataBricksResourceGroup'))]" 123 | } 124 | }, 125 | { 126 | "name": "[parameters('cosmosDatabaseAccount')]", 127 | "type": "Microsoft.DocumentDB/databaseAccounts", 128 | "apiVersion": "2015-04-08", 129 | 
"location": "[resourceGroup().location]", 130 | "kind": "GlobalDocumentDB", 131 | "tags": { 132 | "defaultExperience": "Cassandra" 133 | }, 134 | "properties": { 135 | "databaseAccountOfferType": "Standard", 136 | "locations": [ 137 | { 138 | "locationName": "[resourceGroup().location]", 139 | "failoverPriority": 0 140 | } 141 | ], 142 | "capabilities": [ 143 | { 144 | "name": "EnableCassandra" 145 | } 146 | ] 147 | } 148 | }, 149 | { 150 | "type": "Microsoft.Resources/deployments", 151 | "apiVersion": "2017-05-10", 152 | "name": "outputGeneration", 153 | "properties": { 154 | "mode": "Incremental", 155 | "template": { 156 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 157 | "contentVersion": "1.0.0.0", 158 | "parameters": { 159 | }, 160 | "variables": { 161 | }, 162 | "resources": [ 163 | ], 164 | "outputs": { 165 | "cosmosDb": { 166 | "type": "object", 167 | "value": { 168 | "username": "[substring(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, 8, sub(indexOf(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, '.'), 8))]", 169 | "hostName": "[substring(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint, 8, sub(length(reference(parameters('cosmosDatabaseAccount')).cassandraEndpoint), 13))]", 170 | "secret": "[listKeys(parameters('cosmosDatabaseAccount'), '2015-04-08').primaryMasterKey]" 171 | } 172 | }, 173 | "eventHubs": { 174 | "type": "object", 175 | "value": { 176 | "[variables('eventHubNames')[0]]": "[listKeys(variables('eventHubs')[0].authorizationRuleResourceId, '2017-04-01').primaryConnectionString]", 177 | "[variables('eventHubNames')[1]]": "[listKeys(variables('eventHubs')[1].authorizationRuleResourceId, '2017-04-01').primaryConnectionString]" 178 | } 179 | }, 180 | "logAnalytics": { 181 | "type": "object", 182 | "value": { 183 | "workspaceId": "[reference(parameters('logAnalyticsWorkspaceName')).customerId]", 184 | "secret": "[listKeys(parameters('logAnalyticsWorkspaceName'), '2015-11-01-preview').primarySharedKey]" 185 | } 186 | } 187 | } 188 | } 189 | }, 190 | "dependsOn": [ 191 | "authorizationRules", 192 | "[parameters('cosmosDatabaseAccount')]", 193 | "[parameters('databricksWorkspaceName')]" 194 | ] 195 | } 196 | ], 197 | "outputs": { 198 | "cosmosDb": { 199 | "type": "object", 200 | "value": "[reference('outputGeneration').outputs.cosmosDb.value]" 201 | }, 202 | "logAnalytics": { 203 | "type": "object", 204 | "value": "[reference('outputGeneration').outputs.logAnalytics.value]" 205 | }, 206 | "eventHubs": { 207 | "type": "object", 208 | "value": "[reference('outputGeneration').outputs.eventHubs.value]" 209 | } 210 | } 211 | } -------------------------------------------------------------------------------- /azure/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.microsoft.pnp 8 | azure-databricks-ra 9 | 1.0-SNAPSHOT 10 | pom 11 | 12 | AzureDataBricksJob 13 | 14 | 15 | 1.8 16 | 1.8 17 | 2.12.12 18 | 2.12 19 | 3.0.1 20 | UTF-8 21 | UTF-8 22 | 3.0.9 23 | 1.7.30 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | provided 32 | 33 | 34 | org.apache.spark 35 | spark-core_${scala.compat.version} 36 | ${spark.version} 37 | provided 38 | 39 | 40 | org.apache.spark 41 | spark-sql_${scala.compat.version} 42 | ${spark.version} 43 | provided 44 | 45 | 46 | org.apache.spark 47 | spark-streaming_${scala.compat.version} 48 | ${spark.version} 49 | provided 50 | 51 | 52 | org.slf4j 53 | 
slf4j-api 54 | ${slf4j.version} 55 | provided 56 | 57 | 58 | org.scalatest 59 | scalatest_${scala.compat.version} 60 | ${scalatest.version} 61 | test 62 | 63 | 64 | 65 | 66 | 67 | org.scala-lang 68 | scala-library 69 | 70 | 71 | org.apache.spark 72 | spark-core_${scala.compat.version} 73 | 74 | 75 | org.apache.spark 76 | spark-sql_${scala.compat.version} 77 | 78 | 79 | org.apache.spark 80 | spark-streaming_${scala.compat.version} 81 | 82 | 83 | org.slf4j 84 | slf4j-api 85 | 86 | 87 | junit 88 | junit 89 | 4.13.2 90 | test 91 | 92 | 93 | org.scalatest 94 | scalatest_${scala.compat.version} 95 | test 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | net.alchim31.maven 104 | scala-maven-plugin 105 | 3.4.2 106 | 107 | 108 | 109 | compile 110 | testCompile 111 | 112 | 113 | ${scala.version} 114 | ${scala.compat.version} 115 | 116 | -target:jvm-${maven.compiler.target} 117 | -dependencyfile 118 | ${project.build.directory}/.scala_dependencies 119 | 120 | 121 | -source 122 | ${maven.compiler.source} 123 | -target 124 | ${maven.compiler.target} 125 | 126 | 127 | 128 | 129 | 130 | 131 | org.apache.maven.plugins 132 | maven-compiler-plugin 133 | 3.8.0 134 | 135 | 136 | 137 | compile 138 | testCompile 139 | 140 | 141 | 142 | 143 | ${maven.compiler.source} 144 | ${maven.compiler.target} 145 | 146 | -Xlint 147 | 148 | 149 | 150 | 151 | org.apache.maven.plugins 152 | maven-dependency-plugin 153 | 3.0.2 154 | 155 | 156 | copy-dependencies 157 | package 158 | 159 | copy-dependencies 160 | 161 | 162 | runtime 163 | ${project.build.directory} 164 | 165 | 166 | 167 | 168 | 169 | org.apache.maven.plugins 170 | maven-surefire-plugin 171 | 2.22.0 172 | 173 | 174 | 175 | 176 | 177 | 178 | org.apache.maven.plugins 179 | maven-shade-plugin 180 | 3.1.1 181 | 182 | 183 | package 184 | 185 | shade 186 | 187 | 188 | true 189 | 190 | 191 | *:* 192 | 193 | META-INF/*.SF 194 | META-INF/*.DSA 195 | META-INF/*.RSA 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | org.scalatest 205 | scalatest-maven-plugin 206 | 2.0.0 207 | 208 | ${project.build.directory}/surefire-reports 209 | . 
210 | TestSuiteReport.txt 211 | 212 | 213 | 214 | test 215 | 216 | test 217 | 218 | 219 | 220 | 221 | 222 | org.apache.maven.plugins 223 | maven-clean-plugin 224 | 3.1.0 225 | 226 | 227 | auto-clean 228 | initialize 229 | 230 | clean 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | net.alchim31.maven 241 | scala-maven-plugin 242 | 243 | 244 | org.apache.maven.plugins 245 | maven-compiler-plugin 246 | 247 | 248 | org.apache.maven.plugins 249 | maven-dependency-plugin 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-surefire-plugin 254 | 255 | 256 | org.apache.maven.plugins 257 | maven-shade-plugin 258 | 259 | 260 | org.apache.maven.plugins 261 | maven-clean-plugin 262 | 263 | 264 | org.scalatest 265 | scalatest-maven-plugin 266 | 267 | 268 | 269 | -------------------------------------------------------------------------------- /onprem/DataLoader/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to find out which attributes exist for C# debugging 3 | // Use hover for the description of the existing attributes 4 | // For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": ".NET Core Launch (console)", 9 | "type": "coreclr", 10 | "request": "launch", 11 | "preLaunchTask": "build", 12 | "program": "${workspaceRoot}/bin/Debug/netcoreapp2.0/taxi.dll", 13 | "args": [], 14 | "cwd": "${workspaceRoot}", 15 | "stopAtEntry": false, 16 | "console": "internalConsole", 17 | "env": { 18 | "RIDE_EVENT_HUB": "", 19 | "FARE_EVENT_HUB": "", 20 | "RIDE_DATA_FILE_PATH": "", 21 | "MINUTES_TO_LEAD": "", 22 | "PUSH_RIDE_DATA_FIRST": "" 23 | } 24 | }, 25 | { 26 | "name": ".NET Core Attach", 27 | "type": "coreclr", 28 | "request": "attach", 29 | "processId": "${command:pickProcess}" 30 | } 31 | ,] 32 | } 33 | -------------------------------------------------------------------------------- /onprem/DataLoader/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "build", 6 | "command": "dotnet", 7 | "type": "process", 8 | "group": { 9 | "kind": "build", 10 | "isDefault": true 11 | }, 12 | "args": [ 13 | "build", 14 | "${workspaceFolder}/taxi.csproj" 15 | ], 16 | "problemMatcher": "$msCompile" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /onprem/DataLoader/DataFormat.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | public enum DataFormat 4 | { 5 | Csv, 6 | Json 7 | } 8 | } -------------------------------------------------------------------------------- /onprem/DataLoader/ObjectPool.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Collections.Concurrent; 5 | 6 | public class ObjectPool 7 | where T: class 8 | { 9 | private BlockingCollection _pool = new BlockingCollection(); 10 | private Func _factory; 11 | private int _poolSize; 12 | 13 | public ObjectPool(Func factory, int poolSize = 10) 14 | { 15 | _factory = factory ?? 
throw new ArgumentNullException(nameof(factory)); 16 | _poolSize = poolSize; 17 | Initialize(); 18 | } 19 | 20 | private void Initialize() 21 | { 22 | for (int i = 0; i < _poolSize; i++) 23 | { 24 | _pool.Add(new ObjectPoolObject(_factory(), this)); 25 | } 26 | } 27 | 28 | public ObjectPoolObject GetObject() 29 | { 30 | return _pool.Take(); 31 | } 32 | 33 | private void Return(ObjectPoolObject obj) 34 | { 35 | if (obj == null) 36 | { 37 | throw new ArgumentNullException(nameof(obj)); 38 | } 39 | 40 | _pool.Add(obj); 41 | } 42 | 43 | public class ObjectPoolObject : IDisposable 44 | { 45 | private T _obj; 46 | private ObjectPool _objectPool; 47 | 48 | internal ObjectPoolObject(T obj, ObjectPool objectPool) 49 | { 50 | _obj = obj ?? throw new ArgumentNullException(nameof(obj)); 51 | _objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); 52 | } 53 | 54 | public void Dispose() 55 | { 56 | _objectPool.Return(this); 57 | } 58 | 59 | public T Value 60 | { 61 | get => _obj; 62 | } 63 | 64 | public static explicit operator T(ObjectPoolObject poolObject) 65 | { 66 | return poolObject._obj; 67 | } 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /onprem/DataLoader/Program.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Collections.Concurrent; 5 | using System.Collections.Generic; 6 | using System.IO; 7 | using System.IO.Compression; 8 | using System.Linq; 9 | using System.Text; 10 | using System.Threading; 11 | using System.Threading.Tasks; 12 | using Microsoft.Azure.EventHubs; 13 | using Newtonsoft.Json; 14 | using System.Threading.Tasks.Dataflow; 15 | 16 | 17 | class Program 18 | { 19 | 20 | private static CancellationTokenSource cts; 21 | private static async Task ReadData(ICollection pathList, Func factory, 22 | ObjectPool pool, int randomSeed, AsyncConsole console, int waittime, DataFormat dataFormat) 23 | where T : TaxiData 24 | { 25 | 26 | 27 | if (pathList == null) 28 | { 29 | throw new ArgumentNullException(nameof(pathList)); 30 | } 31 | 32 | if (factory == null) 33 | { 34 | throw new ArgumentNullException(nameof(factory)); 35 | } 36 | 37 | if (pool == null) 38 | { 39 | throw new ArgumentNullException(nameof(pool)); 40 | } 41 | 42 | if (console == null) 43 | { 44 | throw new ArgumentNullException(nameof(console)); 45 | } 46 | 47 | if (waittime > 0) 48 | { 49 | TimeSpan span = TimeSpan.FromMilliseconds(waittime); 50 | await Task.Delay(span); 51 | } 52 | 53 | string typeName = typeof(T).Name; 54 | Random random = new Random(randomSeed); 55 | 56 | // buffer block that holds the messages . consumer will fetch records from this block asynchronously. 57 | BufferBlock buffer = new BufferBlock(new DataflowBlockOptions() 58 | { 59 | BoundedCapacity = 100000 60 | }); 61 | 62 | // consumer that sends the data to event hub asynchronoulsy. 
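// The pipeline below is a TPL Dataflow producer/consumer pair: the file reader pushes parsed
// records into the bounded BufferBlock declared above, and the linked ActionBlock drains it,
// sending each record to Event Hubs through a pooled EventHubClient. PropagateCompletion lets
// buffer.Complete() flow through to the consumer; BoundedCapacity (100000) applies back-pressure
// on the reader, MaxDegreeOfParallelism (100) bounds concurrent sends, and a faulted send cancels
// the shared CancellationTokenSource so both sides stop.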
63 | var consumer = new ActionBlock<T>( 64 | (t) => 65 | { 66 | using (var client = pool.GetObject()) 67 | { 68 | return client.Value.SendAsync(new EventData(Encoding.UTF8.GetBytes( 69 | t.GetData(dataFormat))), t.PartitionKey).ContinueWith( 70 | async task => 71 | { 72 | cts.Cancel(); 73 | await console.WriteLine(task.Exception.InnerException.Message); 74 | await console.WriteLine($"event hub client failed for {typeName}"); 75 | } 76 | , TaskContinuationOptions.OnlyOnFaulted 77 | ); 78 | } 79 | }, 80 | new ExecutionDataflowBlockOptions 81 | { 82 | BoundedCapacity = 100000, 83 | CancellationToken = cts.Token, 84 | MaxDegreeOfParallelism = 100, 85 | } 86 | ); 87 | 88 | // link the buffer to the consumer. 89 | buffer.LinkTo(consumer, new DataflowLinkOptions() 90 | { 91 | PropagateCompletion = true 92 | }); 93 | 94 | long messages = 0; 95 | 96 | List<Task> taskList = new List<Task>(); 97 | 98 | var readTask = Task.Factory.StartNew( 99 | async () => 100 | { 101 | // iterate through the path list and act on each file from here on 102 | foreach (var path in pathList) 103 | { 104 | using (var archive = new ZipArchive(File.OpenRead(path), 105 | ZipArchiveMode.Read)) 106 | { 107 | foreach (var entry in archive.Entries) 108 | { 109 | using (var reader = new StreamReader(entry.Open())) 110 | { 111 | 112 | var header = reader.ReadLines() 113 | .First(); 114 | // read the remaining data lines 115 | var lines = reader.ReadLines() 116 | .Skip(1); 117 | 118 | 119 | // for each line, send to the event hub 120 | foreach (var line in lines) 121 | { 122 | // proceed only if the previous send operation was successful; 123 | // cancellation is requested if a send fails. 124 | if (cts.IsCancellationRequested) 125 | { 126 | break; 127 | } 128 | await buffer.SendAsync(factory(line, header)).ConfigureAwait(false); 129 | if (++messages % 10000 == 0) 130 | { 131 | // brief random delay after every 10000 buffered messages to pace the producer 132 | await Task.Delay(random.Next(100, 1000)) 133 | .ConfigureAwait(false); 134 | await console.WriteLine($"Created {messages} records for {typeName}").ConfigureAwait(false); 135 | } 136 | 137 | } 138 | } 139 | 140 | if (cts.IsCancellationRequested) 141 | { 142 | break; 143 | } 144 | } 145 | 146 | if (cts.IsCancellationRequested) 147 | { 148 | break; 149 | } 150 | } 151 | 152 | buffer.Complete(); 153 | await Task.WhenAll(buffer.Completion, consumer.Completion); 154 | await console.WriteLine($"Created total {messages} records for {typeName}").ConfigureAwait(false); 155 | } 156 | } 157 | ).Unwrap().ContinueWith( 158 | async task => 159 | { 160 | cts.Cancel(); 161 | await console.WriteLine($"failed to read files for {typeName}").ConfigureAwait(false); 162 | await console.WriteLine(task.Exception.InnerException.Message).ConfigureAwait(false); 163 | } 164 | , TaskContinuationOptions.OnlyOnFaulted 165 | ); 166 | 167 | 168 | // await consumer and reader completion. If sending fails at any point, 169 | // the exception is thrown and caught.
This cancels the read operation and stops all further work. 170 | 171 | try 172 | { 173 | await Task.WhenAll(consumer.Completion, readTask); 174 | } 175 | catch (Exception ex) 176 | { 177 | cts.Cancel(); 178 | await console.WriteLine(ex.Message).ConfigureAwait(false); 179 | await console.WriteLine($"failed to send files for {typeName}").ConfigureAwait(false); 180 | throw; 181 | } 182 | 183 | } 184 | 185 | 186 | private static (string RideConnectionString, 187 | string FareConnectionString, 188 | ICollection<string> RideDataFiles, 189 | ICollection<string> TripDataFiles, 190 | int MillisecondsToRun, 191 | int MillisecondsToLead, 192 | bool sendRideDataFirst) ParseArguments() 193 | { 194 | 195 | var rideConnectionString = Environment.GetEnvironmentVariable("RIDE_EVENT_HUB"); 196 | var fareConnectionString = Environment.GetEnvironmentVariable("FARE_EVENT_HUB"); 197 | var rideDataFilePath = Environment.GetEnvironmentVariable("RIDE_DATA_FILE_PATH"); 198 | var numberOfMillisecondsToRun = (int.TryParse(Environment.GetEnvironmentVariable("SECONDS_TO_RUN"), out int outputSecondToRun) ? outputSecondToRun : 0) * 1000; 199 | var numberOfMillisecondsToLead = (int.TryParse(Environment.GetEnvironmentVariable("MINUTES_TO_LEAD"), out int outputMinutesToLead) ? outputMinutesToLead : 0) * 60000; 200 | var pushRideDataFirst = bool.TryParse(Environment.GetEnvironmentVariable("PUSH_RIDE_DATA_FIRST"), out Boolean outputPushRideDataFirst) ? outputPushRideDataFirst : false; 201 | 202 | if (string.IsNullOrWhiteSpace(rideConnectionString)) 203 | { 204 | throw new ArgumentException("rideConnectionString must be provided"); 205 | } 206 | 207 | if (string.IsNullOrWhiteSpace(fareConnectionString)) 208 | { 209 | throw new ArgumentException("fareConnectionString must be provided"); 210 | } 211 | 212 | if (string.IsNullOrWhiteSpace(rideDataFilePath)) 213 | { 214 | throw new ArgumentException("rideDataFilePath must be provided"); 215 | } 216 | 217 | if (!Directory.Exists(rideDataFilePath)) 218 | { 219 | throw new ArgumentException("ride file path does not exist"); 220 | } 221 | // get only the ride files in order. trip_data_1.zip gets read before trip_data_2.zip 222 | var rideDataFiles = Directory.EnumerateFiles(rideDataFilePath) 223 | .Where(p => Path.GetFileNameWithoutExtension(p).Contains("trip_data")) 224 | .OrderBy(p => 225 | { 226 | var filename = Path.GetFileNameWithoutExtension(p); 227 | var indexString = filename.Substring(filename.LastIndexOf('_') + 1); 228 | var index = int.TryParse(indexString, out int i) ? i : throw new ArgumentException("tripdata file must be named in format trip_data_*.zip"); 229 | return index; 230 | }).ToArray(); 231 | 232 | // get only the fare files in order 233 | var fareDataFiles = Directory.EnumerateFiles(rideDataFilePath) 234 | .Where(p => Path.GetFileNameWithoutExtension(p).Contains("trip_fare")) 235 | .OrderBy(p => 236 | { 237 | var filename = Path.GetFileNameWithoutExtension(p); 238 | var indexString = filename.Substring(filename.LastIndexOf('_') + 1); 239 | var index = int.TryParse(indexString, out int i) ?
i : throw new ArgumentException("tripfare file must be named in format trip_fare_*.zip"); 240 | return index; 241 | }).ToArray(); 242 | 243 | if (rideDataFiles.Length == 0) 244 | { 245 | throw new ArgumentException($"trip data files at {rideDataFilePath} does not exist"); 246 | } 247 | 248 | if (fareDataFiles.Length == 0) 249 | { 250 | throw new ArgumentException($"fare data files at {rideDataFilePath} does not exist"); 251 | } 252 | 253 | return (rideConnectionString, fareConnectionString, rideDataFiles, fareDataFiles, numberOfMillisecondsToRun, numberOfMillisecondsToLead, pushRideDataFirst); 254 | } 255 | 256 | 257 | // blocking collection that helps to print to console the messages on progress on the read and send of files to event hub. 258 | private class AsyncConsole 259 | { 260 | private BlockingCollection _blockingCollection = new BlockingCollection(); 261 | private CancellationToken _cancellationToken; 262 | private Task _writerTask; 263 | 264 | public AsyncConsole(CancellationToken cancellationToken = default(CancellationToken)) 265 | { 266 | _cancellationToken = cancellationToken; 267 | _writerTask = Task.Factory.StartNew((state) => 268 | { 269 | var token = (CancellationToken)state; 270 | string msg; 271 | while (!token.IsCancellationRequested) 272 | { 273 | if (_blockingCollection.TryTake(out msg, 500)) 274 | { 275 | Console.WriteLine(msg); 276 | } 277 | } 278 | 279 | while (_blockingCollection.TryTake(out msg, 100)) 280 | { 281 | Console.WriteLine(msg); 282 | } 283 | }, _cancellationToken, TaskCreationOptions.LongRunning); 284 | } 285 | 286 | public Task WriteLine(string toWrite) 287 | { 288 | _blockingCollection.Add(toWrite); 289 | return Task.FromResult(0); 290 | } 291 | 292 | public Task WriterTask 293 | { 294 | get { return _writerTask; } 295 | } 296 | } 297 | 298 | // start of the read task 299 | public static async Task Main(string[] args) 300 | { 301 | try 302 | { 303 | var arguments = ParseArguments(); 304 | var rideClient = EventHubClient.CreateFromConnectionString( 305 | arguments.RideConnectionString 306 | ); 307 | var fareClient = EventHubClient.CreateFromConnectionString( 308 | arguments.FareConnectionString 309 | ); 310 | 311 | cts = arguments.MillisecondsToRun == 0 ? 
new CancellationTokenSource() : new CancellationTokenSource(arguments.MillisecondsToRun); 312 | 313 | Console.CancelKeyPress += (s, e) => 314 | { 315 | //Console.WriteLine("Cancelling data generation"); 316 | cts.Cancel(); 317 | e.Cancel = true; 318 | }; 319 | 320 | 321 | AsyncConsole console = new AsyncConsole(cts.Token); 322 | 323 | var rideClientPool = new ObjectPool(() => EventHubClient.CreateFromConnectionString(arguments.RideConnectionString), 100); 324 | var fareClientPool = new ObjectPool(() => EventHubClient.CreateFromConnectionString(arguments.FareConnectionString), 100); 325 | 326 | 327 | var numberOfMillisecondsToLead = arguments.MillisecondsToLead; 328 | var pushRideDataFirst = arguments.sendRideDataFirst; 329 | 330 | var rideTaskWaitTime = 0; 331 | var fareTaskWaitTime = 0; 332 | 333 | if (numberOfMillisecondsToLead > 0) 334 | { 335 | if (!pushRideDataFirst) 336 | { 337 | rideTaskWaitTime = numberOfMillisecondsToLead; 338 | } 339 | else 340 | { 341 | fareTaskWaitTime = numberOfMillisecondsToLead; 342 | } 343 | } 344 | 345 | 346 | var rideTask = ReadData(arguments.RideDataFiles, 347 | TaxiRide.FromString, rideClientPool, 100, console, 348 | rideTaskWaitTime, DataFormat.Json); 349 | 350 | var fareTask = ReadData(arguments.TripDataFiles, 351 | TaxiFare.FromString, fareClientPool, 200, console, 352 | fareTaskWaitTime, DataFormat.Csv); 353 | 354 | 355 | await Task.WhenAll(rideTask, fareTask, console.WriterTask); 356 | Console.WriteLine("Data generation complete"); 357 | } 358 | catch (Exception ex) 359 | { 360 | Console.WriteLine(ex.Message); 361 | Console.WriteLine("Data generation failed"); 362 | return 1; 363 | } 364 | 365 | return 0; 366 | } 367 | } 368 | } -------------------------------------------------------------------------------- /onprem/DataLoader/StreamReaderExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using Microsoft.Azure.EventHubs; 6 | 7 | namespace Taxi 8 | { 9 | public static class StreamReaderExtensions 10 | { 11 | public static IEnumerable ReadLines(this StreamReader reader) 12 | { 13 | if (reader == null) 14 | { 15 | throw new ArgumentNullException(nameof(reader)); 16 | } 17 | 18 | string line = null; 19 | while ((line = reader.ReadLine()) != null) 20 | { 21 | yield return line; 22 | } 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiData.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public abstract class TaxiData 10 | { 11 | public TaxiData() 12 | { 13 | } 14 | 15 | [JsonProperty] 16 | public long Medallion { get; set; } 17 | 18 | [JsonProperty] 19 | public long HackLicense { get; set; } 20 | 21 | [JsonProperty] 22 | public string VendorId { get; set; } 23 | 24 | [JsonProperty] 25 | public DateTimeOffset PickupTime { get; set; } 26 | 27 | [JsonIgnore] 28 | public string PartitionKey 29 | { 30 | get => $"{Medallion}_{HackLicense}_{VendorId}"; 31 | } 32 | 33 | [JsonIgnore] 34 | protected string CsvHeader { get; set; } 35 | 36 | 37 | [JsonIgnore] 38 | protected string CsvString { get; set; } 39 | 40 | public string GetData(DataFormat dataFormat) 41 | { 42 | if (dataFormat == 
DataFormat.Csv) 43 | { 44 | return $"{CsvHeader}\r\n{CsvString}"; 45 | } 46 | else if (dataFormat == DataFormat.Json) 47 | { 48 | return JsonConvert.SerializeObject(this); 49 | } 50 | else 51 | { 52 | throw new ArgumentException($"Invalid DataFormat: {dataFormat}"); 53 | } 54 | } 55 | } 56 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiFare.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public class TaxiFare : TaxiData 10 | 11 | { 12 | public TaxiFare() 13 | { 14 | } 15 | 16 | [JsonProperty] 17 | public string PaymentType { get; set; } 18 | 19 | [JsonProperty] 20 | public float FareAmount { get; set; } 21 | 22 | [JsonProperty] 23 | public float Surcharge { get; set; } 24 | 25 | [JsonProperty("mtaTax")] 26 | public float MTATax { get; set; } 27 | 28 | [JsonProperty] 29 | public float TipAmount { get; set; } 30 | 31 | [JsonProperty] 32 | public float TollsAmount { get; set; } 33 | 34 | [JsonProperty] 35 | public float TotalAmount { get; set; } 36 | 37 | public static TaxiFare FromString(string line,string header) 38 | { 39 | if (string.IsNullOrWhiteSpace(line)) 40 | { 41 | throw new ArgumentException($"{nameof(line)} cannot be null, empty, or only whitespace"); 42 | } 43 | 44 | string[] tokens = line.Split(','); 45 | if (tokens.Length != 11) 46 | { 47 | throw new ArgumentException($"Invalid record: {line}"); 48 | } 49 | 50 | var fare = new TaxiFare(); 51 | fare.CsvString = line; 52 | fare.CsvHeader = header; 53 | try 54 | { 55 | fare.Medallion = long.Parse(tokens[0]); 56 | fare.HackLicense = long.Parse(tokens[1]); 57 | fare.VendorId = tokens[2]; 58 | fare.PickupTime = DateTimeOffset.ParseExact( 59 | tokens[3], "yyyy-MM-dd HH:mm:ss", 60 | CultureInfo.InvariantCulture, 61 | DateTimeStyles.AssumeUniversal); 62 | fare.PaymentType = tokens[4]; 63 | fare.FareAmount = float.TryParse(tokens[5], out float result) ? result : 0.0f; 64 | fare.Surcharge = float.TryParse(tokens[6], out result) ? result : 0.0f; 65 | fare.MTATax = float.TryParse(tokens[7], out result) ? result : 0.0f; 66 | fare.TipAmount = float.TryParse(tokens[8], out result) ? result : 0.0f; 67 | fare.TollsAmount = float.TryParse(tokens[9], out result) ? result : 0.0f; 68 | fare.TotalAmount = float.TryParse(tokens[10], out result) ? 
result : 0.0f; 69 | return fare; 70 | } 71 | catch (Exception ex) 72 | { 73 | throw new ArgumentException($"Invalid record: {line}", ex); 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /onprem/DataLoader/TaxiRide.cs: -------------------------------------------------------------------------------- 1 | namespace Taxi 2 | { 3 | using System; 4 | using System.Globalization; 5 | using Newtonsoft.Json; 6 | using Newtonsoft.Json.Serialization; 7 | 8 | [JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] 9 | public class TaxiRide : TaxiData 10 | 11 | { 12 | public TaxiRide() 13 | { 14 | } 15 | 16 | [JsonProperty] 17 | public int RateCode { get; set; } 18 | 19 | [JsonProperty] 20 | public string StoreAndForwardFlag { get; set; } 21 | 22 | [JsonProperty] 23 | public DateTimeOffset DropoffTime { get; set; } 24 | 25 | [JsonProperty] 26 | public int PassengerCount { get; set; } 27 | 28 | [JsonProperty] 29 | public float TripTimeInSeconds { get; set; } 30 | 31 | [JsonProperty] 32 | public float TripDistanceInMiles { get; set; } 33 | 34 | [JsonProperty] 35 | public float PickupLon { get; set; } 36 | 37 | [JsonProperty] 38 | public float PickupLat { get; set; } 39 | 40 | [JsonProperty] 41 | public float DropoffLon { get; set; } 42 | 43 | [JsonProperty] 44 | public float DropoffLat { get; set; } 45 | 46 | public static TaxiRide FromString(string line,string header) 47 | { 48 | if (string.IsNullOrWhiteSpace(line)) 49 | { 50 | throw new ArgumentException($"{nameof(line)} cannot be null, empty, or only whitespace"); 51 | } 52 | 53 | string[] tokens = line.Split(','); 54 | if (tokens.Length != 14) 55 | { 56 | throw new ArgumentException($"Invalid record: {line}"); 57 | } 58 | 59 | var ride = new TaxiRide(); 60 | ride.CsvString = line; 61 | ride.CsvHeader = header; 62 | try 63 | { 64 | ride.Medallion = long.Parse(tokens[0]); 65 | ride.HackLicense = long.Parse(tokens[1]); 66 | ride.VendorId = tokens[2]; 67 | ride.RateCode = int.Parse(tokens[3]); 68 | ride.StoreAndForwardFlag = tokens[4]; 69 | ride.PickupTime = DateTimeOffset.ParseExact( 70 | tokens[5], "yyyy-MM-dd HH:mm:ss", 71 | CultureInfo.InvariantCulture, 72 | DateTimeStyles.AssumeUniversal); 73 | ride.DropoffTime = DateTimeOffset.ParseExact( 74 | tokens[6], "yyyy-MM-dd HH:mm:ss", 75 | CultureInfo.InvariantCulture, 76 | DateTimeStyles.AssumeUniversal); 77 | ride.PassengerCount = int.Parse(tokens[7]); 78 | ride.TripTimeInSeconds = float.Parse(tokens[8]); 79 | ride.TripDistanceInMiles = float.Parse(tokens[9]); 80 | 81 | ride.PickupLon = float.TryParse(tokens[10], out float result) ? result : 0.0f; 82 | ride.PickupLat = float.TryParse(tokens[11], out result) ? result : 0.0f; 83 | ride.DropoffLon = float.TryParse(tokens[12], out result) ? result : 0.0f; 84 | ride.DropoffLat = float.TryParse(tokens[13], out result) ? 
result : 0.0f; 85 | return ride; 86 | } 87 | catch (Exception ex) 88 | { 89 | throw new ArgumentException($"Invalid record: {line}", ex); 90 | } 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /onprem/DataLoader/taxi.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.1 6 | latest 7 | win10-x64 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /onprem/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/dotnet/core/sdk:3.1 as build 2 | RUN apt-get update 3 | RUN apt-get install -y git 4 | RUN git clone --recursive https://github.com/mspnp/azure-databricks-streaming-analytics.git && cd azure-databricks-streaming-analytics && git fetch && git checkout master 5 | WORKDIR azure-databricks-streaming-analytics/onprem/DataLoader 6 | RUN dotnet build 7 | RUN dotnet publish -f netcoreapp3.1 -c Release 8 | FROM mcr.microsoft.com/dotnet/core/runtime:3.1 AS runtime 9 | WORKDIR DataLoader 10 | COPY --from=build azure-databricks-streaming-analytics/onprem/DataLoader/bin/Release/netcoreapp3.1/publish . 11 | ENTRYPOINT ["dotnet" , "taxi.dll"] 12 | -------------------------------------------------------------------------------- /onprem/main.env: -------------------------------------------------------------------------------- 1 | RIDE_EVENT_HUB= 2 | FARE_EVENT_HUB= 3 | RIDE_DATA_FILE_PATH=/DataFile/FOIL2013 4 | MINUTES_TO_LEAD=0 5 | PUSH_RIDE_DATA_FIRST=false 6 | --------------------------------------------------------------------------------
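Usage note: a minimal sketch of how the data loader might be built and run with the Dockerfile and main.env above, assuming the trip_data_*.zip and trip_fare_*.zip archives sit in a local folder; the host path /data/FOIL2013 and the image tag taxi-dataloader are illustrative choices rather than names from this repository, and the two Event Hub connection strings in main.env must be filled in first.

docker build -t taxi-dataloader ./onprem
docker run --rm --env-file ./onprem/main.env -v /data/FOIL2013:/DataFile/FOIL2013 taxi-dataloader

The -v mount targets /DataFile/FOIL2013 because that is the RIDE_DATA_FILE_PATH configured in main.env.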