├── .gitignore
├── LICENSE
├── README.md
├── arm-templates
│   ├── azuredeploy.json
│   └── generated-data-factory
│       └── arm_template.zip
├── azure-pipelines.yml
├── azure-pipelines
│   ├── ReleasePipeline.json
│   ├── deploy-resources.sh
│   ├── install-databricks-client.sh
│   ├── provision-data-factory-without-zip
│   │   ├── generate-adf-template.jq
│   │   └── provision-data-factory.sh
│   ├── provision-data-factory.sh
│   ├── provision-databricks.sh
│   └── run-integration-test.sh
├── datafactory
│   ├── linkedService
│   │   ├── AzureDatabricks.json
│   │   └── AzureKeyVault.json
│   └── pipeline
│       └── Apply ML model pipeline.json
├── java-library
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── cloudarchitected
│       │               └── databricks_devops_tutorial
│       │                   └── model
│       │                       └── BikeShareModelFactory.java
│       └── test
│           ├── java
│           │   └── com
│           │       └── cloudarchitected
│           │           └── databricks_devops_tutorial
│           │               └── model
│           │                   └── BikeShareModelFactoryTest.java
│           └── resources
│               └── hour.csv
└── notebooks
    ├── bikesharing-apply-model.scala
    ├── bikesharing-inttest.scala
    ├── bikesharing-prep.scala
    └── bikesharing-train-model.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.tar.gz
19 | *.rar
20 |
21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
22 | hs_err_pid*
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # databricks-devops-tutorial
2 | Databricks Azure DevOps Tutorial
3 |
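4 | ## Repository layout
5 |
6 | * `java-library/` – Spark ML code (`BikeShareModelFactory`) built and unit-tested with Maven.
7 | * `notebooks/` – Databricks notebooks for data preparation, model training, scoring and an integration test.
8 | * `arm-templates/` – ARM template for the Data Factory, Key Vault and storage account, plus the exported Data Factory template.
9 | * `datafactory/` – Data Factory linked service and pipeline definitions.
10 | * `azure-pipelines.yml`, `azure-pipelines/` – Azure DevOps build definition and release scripts.
11 |
12 | As a quick local check, the library can be built and tested before pushing; this is only a sketch and assumes a local JDK 8 and Maven installation (the CI pipeline runs the same `package` goal):
13 |
14 | ```bash
15 | # Build the model-factory JAR and run its JUnit tests.
16 | mvn -f java-library/pom.xml package
17 | ```
18 |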
--------------------------------------------------------------------------------
/arm-templates/azuredeploy.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "location": {
6 | "type": "String",
7 | "defaultValue": "[resourceGroup().location]"
8 | },
9 | "dataFactoryName": {
10 | "type": "String"
11 | },
12 | "keyVaultName": {
13 | "type": "string"
14 | },
15 | "storageAccountName": {
16 | "type": "string"
17 | },
18 | "keyVaultWriterPrincipalId": {
19 | "type": "string"
20 | }
21 | },
22 | "resources": [
23 | {
24 | "type": "Microsoft.DataFactory/factories",
25 | "name": "[parameters('dataFactoryName')]",
26 | "apiVersion": "2018-06-01",
27 | "location": "[parameters('location')]",
28 | "identity": {
29 | "type": "SystemAssigned"
30 | }
31 | },
32 | {
33 | "apiVersion": "2016-10-01",
34 | "name": "[parameters('keyVaultName')]",
35 | "location": "[parameters('location')]",
36 | "type": "Microsoft.KeyVault/vaults",
37 | "properties": {
38 | "accessPolicies": [
39 | {
40 | "objectId": "[parameters('keyVaultWriterPrincipalId')]",
41 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]",
42 | "permissions": {
43 | "secrets": [
44 | "Get",
45 | "List",
46 | "Set",
47 | "Delete"
48 | ]
49 | }
50 | },
51 | {
52 | "objectId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.principalId]",
53 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]",
54 | "permissions": {
55 | "secrets": [
56 | "Get",
57 | "List"
58 | ]
59 | },
60 | "applicationId": null
61 | }
62 | ],
63 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]",
64 | "sku": {
65 | "name": "standard",
66 | "family": "A"
67 | }
68 | }
69 | },
70 | {
71 | "name": "[parameters('storageAccountName')]",
72 | "type": "Microsoft.Storage/storageAccounts",
73 | "apiVersion": "2018-07-01",
74 | "location": "[parameters('location')]",
75 | "properties": {
76 | "accessTier": "Hot",
77 | "supportsHttpsTrafficOnly": true,
78 | "isHnsEnabled": true
79 | },
80 | "sku": {
81 | "name": "Standard_LRS"
82 | },
83 | "kind": "StorageV2"
84 | }
85 | ],
86 | "outputs": {
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/arm-templates/generated-data-factory/arm_template.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/algattik/databricks-devops-tutorial/b8bef9b36fa089972ca622e36b86d23cf0a08a3b/arm-templates/generated-data-factory/arm_template.zip
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | trigger:
2 | - master
3 |
4 | pool:
5 |   vmImage: 'Ubuntu-16.04'
6 |
7 | steps:
8 | - task: Maven@3
9 |   inputs:
10 |     mavenPomFile: 'java-library/pom.xml'
11 |     mavenOptions: '-Xmx3072m'
12 |     javaHomeOption: 'JDKVersion'
13 |     jdkVersionOption: '1.8'
14 |     jdkArchitectureOption: 'x64'
15 |     publishJUnitResults: true
16 |     testResultsFiles: '**/surefire-reports/TEST-*.xml'
17 |     goals: 'package'
18 |
19 | - task: CopyFiles@2
20 |   displayName: 'Copy Files to: $(build.artifactstagingdirectory)'
21 |   inputs:
22 |     SourceFolder: '$(system.defaultworkingdirectory)'
23 |     Contents: |
24 |       **/*.jar
25 |       notebooks/*
26 |       arm-templates/**
27 |       azure-pipelines/**
28 |     TargetFolder: '$(build.artifactstagingdirectory)'
29 |
30 | - task: PublishBuildArtifacts@1
31 |   displayName: 'Publish Artifact: drop'
32 |   inputs:
33 |     PathtoPublish: '$(build.artifactstagingdirectory)'
34 |
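35 | # The published 'drop' artifact is consumed by the release pipeline
36 | # (azure-pipelines/ReleasePipeline.json), whose DEV and TEST stages run the
37 | # shell scripts under azure-pipelines/.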
--------------------------------------------------------------------------------
/azure-pipelines/ReleasePipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "source": 2,
3 | "revision": 53,
4 | "description": null,
5 | "createdBy": {
6 | "displayName": "Alexandre Gattiker",
7 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1",
8 | "_links": {
9 | "avatar": {
10 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
11 | }
12 | },
13 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1",
14 | "uniqueName": "gattiker@example.com",
15 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1",
16 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
17 | },
18 | "createdOn": "2019-03-30T04:00:29.750Z",
19 | "modifiedBy": {
20 | "displayName": "Alexandre Gattiker",
21 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1",
22 | "_links": {
23 | "avatar": {
24 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
25 | }
26 | },
27 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1",
28 | "uniqueName": "gattiker@example.com",
29 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1",
30 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
31 | },
32 | "modifiedOn": "2019-03-30T20:47:38.863Z",
33 | "isDeleted": false,
34 | "variables": {
35 | "DATABRICKS_HOST": {
36 | "value": "https://westeurope.azuredatabricks.net"
37 | },
38 | "DATABRICKS_TOKEN": {
39 | "value": ""
40 | },
41 | "LOCATION": {
42 | "value": "westeurope"
43 | }
44 | },
45 | "variableGroups": [],
46 | "environments": [
47 | {
48 | "id": 1,
49 | "name": "DEV",
50 | "rank": 1,
51 | "owner": {
52 | "displayName": "Alexandre Gattiker",
53 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1",
54 | "_links": {
55 | "avatar": {
56 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
57 | }
58 | },
59 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1",
60 | "uniqueName": "gattiker@example.com",
61 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1",
62 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
63 | },
64 | "variables": {
65 | "DATA_FACTORY_NAME": {
66 | "value": "databrickscicdtut01dev"
67 | },
68 | "KEY_VAULT_NAME": {
69 | "value": "databrickscicdtut01dev"
70 | },
71 | "RESOURCE_GROUP_NAME": {
72 | "value": "databrickscicdtut01dev"
73 | },
74 | "STORAGE_ACCOUNT_NAME": {
75 | "value": "databrickscicdtut01dev"
76 | }
77 | },
78 | "variableGroups": [],
79 | "preDeployApprovals": {
80 | "approvals": [
81 | {
82 | "rank": 1,
83 | "isAutomated": true,
84 | "isNotificationOn": false,
85 | "id": 1
86 | }
87 | ],
88 | "approvalOptions": {
89 | "requiredApproverCount": null,
90 | "releaseCreatorCanBeApprover": false,
91 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false,
92 | "enforceIdentityRevalidation": false,
93 | "timeoutInMinutes": 0,
94 | "executionOrder": 1
95 | }
96 | },
97 | "deployStep": {
98 | "id": 2
99 | },
100 | "postDeployApprovals": {
101 | "approvals": [
102 | {
103 | "rank": 1,
104 | "isAutomated": true,
105 | "isNotificationOn": false,
106 | "id": 3
107 | }
108 | ],
109 | "approvalOptions": {
110 | "requiredApproverCount": null,
111 | "releaseCreatorCanBeApprover": false,
112 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false,
113 | "enforceIdentityRevalidation": false,
114 | "timeoutInMinutes": 0,
115 | "executionOrder": 2
116 | }
117 | },
118 | "deployPhases": [
119 | {
120 | "deploymentInput": {
121 | "parallelExecution": {
122 | "parallelExecutionType": 0
123 | },
124 | "skipArtifactsDownload": false,
125 | "artifactsDownloadInput": {
126 | "downloadInputs": [
127 | {
128 | "alias": "_devopstutorial",
129 | "artifactType": "Build",
130 | "artifactDownloadMode": "All",
131 | "artifactItems": []
132 | }
133 | ]
134 | },
135 | "queueId": 107,
136 | "demands": [],
137 | "enableAccessToken": false,
138 | "timeoutInMinutes": 0,
139 | "jobCancelTimeoutInMinutes": 1,
140 | "condition": "succeeded()",
141 | "overrideInputs": {}
142 | },
143 | "rank": 1,
144 | "phaseType": 1,
145 | "name": "Agent job",
146 | "refName": null,
147 | "workflowTasks": [
148 | {
149 | "environment": {},
150 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
151 | "version": "1.*",
152 | "name": "Install Databricks client",
153 | "refName": "",
154 | "enabled": true,
155 | "alwaysRun": false,
156 | "continueOnError": false,
157 | "timeoutInMinutes": 0,
158 | "definitionType": "task",
159 | "overrideInputs": {},
160 | "condition": "succeeded()",
161 | "inputs": {
162 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
163 | "scriptLocation": "scriptPath",
164 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/install-databricks-client.sh",
165 | "inlineScript": "",
166 | "args": "",
167 | "addSpnToEnvironment": "false",
168 | "useGlobalConfig": "false",
169 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
170 | "failOnStandardError": "false"
171 | }
172 | },
173 | {
174 | "environment": {},
175 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
176 | "version": "1.*",
177 | "name": "Deploy resources",
178 | "refName": "",
179 | "enabled": true,
180 | "alwaysRun": false,
181 | "continueOnError": false,
182 | "timeoutInMinutes": 0,
183 | "definitionType": "task",
184 | "overrideInputs": {},
185 | "condition": "succeeded()",
186 | "inputs": {
187 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
188 | "scriptLocation": "scriptPath",
189 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/deploy-resources.sh",
190 | "inlineScript": "",
191 | "args": "",
192 | "addSpnToEnvironment": "true",
193 | "useGlobalConfig": "false",
194 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
195 | "failOnStandardError": "false"
196 | }
197 | },
198 | {
199 | "environment": {},
200 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
201 | "version": "1.*",
202 | "name": "Provision Databricks",
203 | "refName": "",
204 | "enabled": true,
205 | "alwaysRun": false,
206 | "continueOnError": false,
207 | "timeoutInMinutes": 0,
208 | "definitionType": "task",
209 | "overrideInputs": {},
210 | "condition": "succeeded()",
211 | "inputs": {
212 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
213 | "scriptLocation": "scriptPath",
214 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-databricks.sh",
215 | "inlineScript": "",
216 | "args": "",
217 | "addSpnToEnvironment": "false",
218 | "useGlobalConfig": "false",
219 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
220 | "failOnStandardError": "false"
221 | }
222 | }
223 | ]
224 | }
225 | ],
226 | "environmentOptions": {
227 | "emailNotificationType": "OnlyOnFailure",
228 | "emailRecipients": "release.environment.owner;release.creator",
229 | "skipArtifactsDownload": false,
230 | "timeoutInMinutes": 0,
231 | "enableAccessToken": false,
232 | "publishDeploymentStatus": true,
233 | "badgeEnabled": false,
234 | "autoLinkWorkItems": false,
235 | "pullRequestDeploymentEnabled": false
236 | },
237 | "demands": [],
238 | "conditions": [
239 | {
240 | "name": "ReleaseStarted",
241 | "conditionType": 1,
242 | "value": ""
243 | }
244 | ],
245 | "executionPolicy": {
246 | "concurrencyCount": 1,
247 | "queueDepthCount": 0
248 | },
249 | "schedules": [],
250 | "currentRelease": {
251 | "id": 39,
252 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/releases/39",
253 | "_links": {}
254 | },
255 | "retentionPolicy": {
256 | "daysToKeep": 30,
257 | "releasesToKeep": 3,
258 | "retainBuild": true
259 | },
260 | "processParameters": {},
261 | "properties": {},
262 | "preDeploymentGates": {
263 | "id": 0,
264 | "gatesOptions": null,
265 | "gates": []
266 | },
267 | "postDeploymentGates": {
268 | "id": 0,
269 | "gatesOptions": null,
270 | "gates": []
271 | },
272 | "environmentTriggers": [],
273 | "badgeUrl": "https://vsrm.dev.azure.com/gattiker/_apis/public/Release/badge/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/1/1"
274 | },
275 | {
276 | "id": 5,
277 | "name": "TEST",
278 | "rank": 2,
279 | "owner": {
280 | "displayName": "Alexandre Gattiker",
281 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1",
282 | "_links": {
283 | "avatar": {
284 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
285 | }
286 | },
287 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1",
288 | "uniqueName": "gattiker@example.com",
289 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1",
290 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3"
291 | },
292 | "variables": {
293 | "DATA_FACTORY_NAME": {
294 | "value": "databrickscicdtut01test"
295 | },
296 | "KEY_VAULT_NAME": {
297 | "value": "databrickscicdtut01test"
298 | },
299 | "RESOURCE_GROUP_NAME": {
300 | "value": "databrickscicdtut01test"
301 | },
302 | "STORAGE_ACCOUNT_NAME": {
303 | "value": "databrickscicdtut01test"
304 | }
305 | },
306 | "variableGroups": [],
307 | "preDeployApprovals": {
308 | "approvals": [
309 | {
310 | "rank": 1,
311 | "isAutomated": true,
312 | "isNotificationOn": false,
313 | "id": 13
314 | }
315 | ],
316 | "approvalOptions": {
317 | "requiredApproverCount": null,
318 | "releaseCreatorCanBeApprover": false,
319 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false,
320 | "enforceIdentityRevalidation": false,
321 | "timeoutInMinutes": 0,
322 | "executionOrder": 1
323 | }
324 | },
325 | "deployStep": {
326 | "id": 14
327 | },
328 | "postDeployApprovals": {
329 | "approvals": [
330 | {
331 | "rank": 1,
332 | "isAutomated": true,
333 | "isNotificationOn": false,
334 | "id": 15
335 | }
336 | ],
337 | "approvalOptions": {
338 | "requiredApproverCount": null,
339 | "releaseCreatorCanBeApprover": false,
340 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false,
341 | "enforceIdentityRevalidation": false,
342 | "timeoutInMinutes": 0,
343 | "executionOrder": 2
344 | }
345 | },
346 | "deployPhases": [
347 | {
348 | "deploymentInput": {
349 | "parallelExecution": {
350 | "parallelExecutionType": 0
351 | },
352 | "skipArtifactsDownload": false,
353 | "artifactsDownloadInput": {
354 | "downloadInputs": [
355 | {
356 | "alias": "_devopstutorial",
357 | "artifactType": "Build",
358 | "artifactDownloadMode": "All",
359 | "artifactItems": []
360 | }
361 | ]
362 | },
363 | "queueId": 107,
364 | "demands": [],
365 | "enableAccessToken": false,
366 | "timeoutInMinutes": 0,
367 | "jobCancelTimeoutInMinutes": 1,
368 | "condition": "succeeded()",
369 | "overrideInputs": {}
370 | },
371 | "rank": 1,
372 | "phaseType": 1,
373 | "name": "Agent job",
374 | "refName": null,
375 | "workflowTasks": [
376 | {
377 | "environment": {},
378 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
379 | "version": "1.*",
380 | "name": "Install Databricks client",
381 | "refName": "",
382 | "enabled": true,
383 | "alwaysRun": false,
384 | "continueOnError": false,
385 | "timeoutInMinutes": 0,
386 | "definitionType": "task",
387 | "overrideInputs": {},
388 | "condition": "succeeded()",
389 | "inputs": {
390 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
391 | "scriptLocation": "scriptPath",
392 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/install-databricks-client.sh",
393 | "inlineScript": "",
394 | "args": "",
395 | "addSpnToEnvironment": "false",
396 | "useGlobalConfig": "false",
397 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
398 | "failOnStandardError": "false"
399 | }
400 | },
401 | {
402 | "environment": {},
403 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
404 | "version": "1.*",
405 | "name": "Deploy resources",
406 | "refName": "",
407 | "enabled": true,
408 | "alwaysRun": false,
409 | "continueOnError": false,
410 | "timeoutInMinutes": 0,
411 | "definitionType": "task",
412 | "overrideInputs": {},
413 | "condition": "succeeded()",
414 | "inputs": {
415 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
416 | "scriptLocation": "scriptPath",
417 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/deploy-resources.sh",
418 | "inlineScript": "",
419 | "args": "",
420 | "addSpnToEnvironment": "true",
421 | "useGlobalConfig": "false",
422 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
423 | "failOnStandardError": "false"
424 | }
425 | },
426 | {
427 | "environment": {},
428 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
429 | "version": "1.*",
430 | "name": "Provision Databricks",
431 | "refName": "",
432 | "enabled": true,
433 | "alwaysRun": false,
434 | "continueOnError": false,
435 | "timeoutInMinutes": 0,
436 | "definitionType": "task",
437 | "overrideInputs": {},
438 | "condition": "succeeded()",
439 | "inputs": {
440 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
441 | "scriptLocation": "scriptPath",
442 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-databricks.sh",
443 | "inlineScript": "",
444 | "args": "",
445 | "addSpnToEnvironment": "false",
446 | "useGlobalConfig": "false",
447 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
448 | "failOnStandardError": "false"
449 | }
450 | },
451 | {
452 | "environment": {},
453 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
454 | "version": "1.*",
455 | "name": "Provision Data Factory",
456 | "refName": "",
457 | "enabled": true,
458 | "alwaysRun": false,
459 | "continueOnError": false,
460 | "timeoutInMinutes": 0,
461 | "definitionType": "task",
462 | "overrideInputs": {},
463 | "condition": "succeeded()",
464 | "inputs": {
465 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
466 | "scriptLocation": "scriptPath",
467 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-data-factory.sh",
468 | "inlineScript": "",
469 | "args": "",
470 | "addSpnToEnvironment": "false",
471 | "useGlobalConfig": "false",
472 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
473 | "failOnStandardError": "false"
474 | }
475 | },
476 | {
477 | "environment": {},
478 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815",
479 | "version": "1.*",
480 | "name": "Run integration test",
481 | "refName": "",
482 | "enabled": true,
483 | "alwaysRun": false,
484 | "continueOnError": false,
485 | "timeoutInMinutes": 0,
486 | "definitionType": "task",
487 | "overrideInputs": {},
488 | "condition": "succeeded()",
489 | "inputs": {
490 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373",
491 | "scriptLocation": "scriptPath",
492 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/run-integration-test.sh",
493 | "inlineScript": "",
494 | "args": "",
495 | "addSpnToEnvironment": "false",
496 | "useGlobalConfig": "false",
497 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop",
498 | "failOnStandardError": "false"
499 | }
500 | }
501 | ]
502 | }
503 | ],
504 | "environmentOptions": {
505 | "emailNotificationType": "OnlyOnFailure",
506 | "emailRecipients": "release.environment.owner;release.creator",
507 | "skipArtifactsDownload": false,
508 | "timeoutInMinutes": 0,
509 | "enableAccessToken": false,
510 | "publishDeploymentStatus": true,
511 | "badgeEnabled": false,
512 | "autoLinkWorkItems": false,
513 | "pullRequestDeploymentEnabled": false
514 | },
515 | "demands": [],
516 | "conditions": [
517 | {
518 | "name": "DEV",
519 | "conditionType": 2,
520 | "value": "4"
521 | }
522 | ],
523 | "executionPolicy": {
524 | "concurrencyCount": 1,
525 | "queueDepthCount": 0
526 | },
527 | "schedules": [],
528 | "currentRelease": {
529 | "id": 39,
530 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/releases/39",
531 | "_links": {}
532 | },
533 | "retentionPolicy": {
534 | "daysToKeep": 30,
535 | "releasesToKeep": 3,
536 | "retainBuild": true
537 | },
538 | "processParameters": {},
539 | "properties": {},
540 | "preDeploymentGates": {
541 | "id": 0,
542 | "gatesOptions": null,
543 | "gates": []
544 | },
545 | "postDeploymentGates": {
546 | "id": 0,
547 | "gatesOptions": null,
548 | "gates": []
549 | },
550 | "environmentTriggers": [],
551 | "badgeUrl": "https://vsrm.dev.azure.com/gattiker/_apis/public/Release/badge/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/1/5"
552 | }
553 | ],
554 | "artifacts": [
555 | {
556 | "sourceId": "1c9c3807-115e-4c4b-9b83-9dec8b6a8a36:3",
557 | "type": "Build",
558 | "alias": "_devopstutorial",
559 | "definitionReference": {
560 | "artifactSourceDefinitionUrl": {
561 | "id": "https://dev.azure.com/gattiker/_permalink/_build/index?collectionId=8eb8c206-18a2-429f-8229-28fde5b90a93&projectId=1c9c3807-115e-4c4b-9b83-9dec8b6a8a36&definitionId=3",
562 | "name": ""
563 | },
564 | "defaultVersionBranch": {
565 | "id": "",
566 | "name": ""
567 | },
568 | "defaultVersionSpecific": {
569 | "id": "",
570 | "name": ""
571 | },
572 | "defaultVersionTags": {
573 | "id": "",
574 | "name": ""
575 | },
576 | "defaultVersionType": {
577 | "id": "latestType",
578 | "name": "Latest"
579 | },
580 | "definition": {
581 | "id": "3",
582 | "name": "databricks-devops-tutorial-Maven-CI"
583 | },
584 | "definitions": {
585 | "id": "",
586 | "name": ""
587 | },
588 | "IsMultiDefinitionType": {
589 | "id": "False",
590 | "name": "False"
591 | },
592 | "project": {
593 | "id": "1c9c3807-115e-4c4b-9b83-9dec8b6a8a36",
594 | "name": "databricks-devops-tutorial"
595 | },
596 | "repository": {
597 | "id": "",
598 | "name": ""
599 | }
600 | },
601 | "isPrimary": true,
602 | "isRetained": false
603 | }
604 | ],
605 | "triggers": [],
606 | "releaseNameFormat": "Release-$(rev:r)",
607 | "tags": [],
608 | "pipelineProcess": {
609 | "type": 1
610 | },
611 | "properties": {
612 | "DefinitionCreationSource": {
613 | "$type": "System.String",
614 | "$value": "ReleaseNew"
615 | }
616 | },
617 | "id": 1,
618 | "name": "ReleasePipeline",
619 | "path": "\\",
620 | "projectReference": null,
621 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/definitions/1",
622 | "_links": {
623 | "self": {
624 | "href": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/definitions/1"
625 | },
626 | "web": {
627 | "href": "https://dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_release?definitionId=1"
628 | }
629 | }
630 | }
631 |
--------------------------------------------------------------------------------
/azure-pipelines/deploy-resources.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # $servicePrincipalId is exported by the Azure CLI task when "addSpnToEnvironment"
5 | # is enabled (see ReleasePipeline.json). Resolve its object ID so the ARM template
6 | # can grant it write access to Key Vault secrets.
7 | SvcPrincipalApplicationId=$servicePrincipalId
8 | SvcPrincipalObjectId=$(az ad sp show --id $SvcPrincipalApplicationId | jq -r .objectId)
9 |
10 | # Create the resource group and deploy the Data Factory, Key Vault and storage account.
11 | az group create --name $RESOURCE_GROUP_NAME --location $LOCATION
12 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file arm-templates/azuredeploy.json --parameters dataFactoryName=$DATA_FACTORY_NAME keyVaultName=$KEY_VAULT_NAME storageAccountName=$STORAGE_ACCOUNT_NAME keyVaultWriterPrincipalId=$SvcPrincipalObjectId
13 |
--------------------------------------------------------------------------------
/azure-pipelines/install-databricks-client.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Install the Databricks CLI on the hosted agent and put it on the PATH.
5 | sudo apt-get install -y python3-setuptools
6 | pip3 install wheel
7 | pip3 install databricks-cli
8 | sudo ln -s /home/vsts/.local/bin/databricks /usr/local/bin/databricks
9 |
10 | # Smoke test; DATABRICKS_HOST and DATABRICKS_TOKEN are supplied as release pipeline variables.
11 | databricks clusters list
12 |
--------------------------------------------------------------------------------
/azure-pipelines/provision-data-factory-without-zip/generate-adf-template.jq:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.0",
4 | "resources": [
5 | .[]
6 | | .name = env.DATA_FACTORY_NAME + "/" + .name
7 | | .apiVersion = "2018-06-01"
8 | | if(.properties.type=="AzureKeyVault") then
9 | .properties.typeProperties.baseUrl="https://" + env.KEY_VAULT_NAME + ".vault.azure.net/"
10 | else .
11 | end
12 | ]
13 | }
14 |
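15 | # Usage sketch (mirrors provision-data-factory.sh; assumes DATA_FACTORY_NAME and
16 | # KEY_VAULT_NAME are exported and the command is run from the repository root):
17 | #   jq -sf azure-pipelines/provision-data-factory-without-zip/generate-adf-template.jq \
18 | #     "datafactory/linkedService/AzureKeyVault.json"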
--------------------------------------------------------------------------------
/azure-pipelines/provision-data-factory-without-zip/provision-data-factory.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | export KEY_VAULT_NAME=databrickscicdtutd06test
5 | export RESOURCE_GROUP_NAME=databrickscicdtutd06test
6 | export DATA_FACTORY_NAME=databrickscicdtutd06test
7 |
8 | COUNTER=0
9 | for template in \
10 | "datafactory/linkedService/AzureKeyVault.json" \
11 | "datafactory/linkedService/AzureDatabricks.json" \
12 | "datafactory/pipeline/Apply ML model pipeline.json" \
13 | ; do
14 | let COUNTER=COUNTER+1
15 | mkdir -p datafactory-generated
16 | basename=$(basename "$template")
17 | generated_file="datafactory-generated/$basename"
18 | jq -sf azure-pipelines/provision-data-factory-without-zip/generate-adf-template.jq "$template" > "$generated_file"
19 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file "$generated_file" --name "$DATA_FACTORY_NAME-$COUNTER"
20 | done
21 |
--------------------------------------------------------------------------------
/azure-pipelines/provision-data-factory.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Unpack the generated Data Factory ARM template.
5 | unzip -od generated arm-templates/generated-data-factory/arm_template.zip
6 |
7 | # Deploy it, pointing the Key Vault linked service at this environment's vault.
8 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file generated/arm_template.json --parameters factoryName=$DATA_FACTORY_NAME AzureKeyVault_properties_typeProperties_baseUrl=https://$KEY_VAULT_NAME.vault.azure.net/
9 |
--------------------------------------------------------------------------------
/azure-pipelines/provision-databricks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Store the Databricks personal access token in Key Vault so the Data Factory
5 | # linked service (AzureDatabricks.json) can retrieve it at runtime.
6 | az keyvault secret set --vault-name $KEY_VAULT_NAME --name AzureDatabricksToken --value $DATABRICKS_TOKEN
7 |
8 | # Expose the storage account key to notebooks through a Databricks secret scope.
9 | storageAccountKey=$(az storage account keys list -g $RESOURCE_GROUP_NAME -n $STORAGE_ACCOUNT_NAME --query "[0].value" | tr -d '"')
10 |
11 | if ! databricks secrets list-scopes --output JSON | jq -e '.scopes[] | select (.name == "bikeshare")'; then
12 |   databricks secrets create-scope --scope bikeshare --initial-manage-principal "users"
13 | fi
14 | databricks secrets write --scope bikeshare --key storagekey --string-value $storageAccountKey
15 |
16 | # Upload the model library JAR so job runs can attach it as a cluster library.
17 | databricks fs cp --overwrite java-library/target/*.jar dbfs:/model-factory.jar
18 |
--------------------------------------------------------------------------------
/azure-pipelines/run-integration-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Deploy the notebooks to the workspace, overwriting any previous copy.
5 | databricks workspace import_dir -o notebooks /devops-deployed
6 |
7 | # Submit a one-time run of the integration test notebook on a new cluster,
8 | # attaching the model library JAR uploaded by provision-databricks.sh.
9 | run=$(databricks runs submit --json '{
10 |   "name": "IntegrationTest",
11 |   "new_cluster": {
12 |     "spark_version": "5.2.x-scala2.11",
13 |     "node_type_id": "Standard_DS3_v2",
14 |     "num_workers": 1
15 |   },
16 |   "libraries": [
17 |     {
18 |       "jar": "dbfs:/model-factory.jar"
19 |     }
20 |   ],
21 |   "timeout_seconds": 1200,
22 |   "notebook_task": {
23 |     "notebook_path": "/devops-deployed/bikesharing-inttest",
24 |     "base_parameters": {
25 |       "output": "abfss://bikeshare@'$STORAGE_ACCOUNT_NAME'.dfs.core.windows.net/predictions/int-test"
26 |     }
27 |   }
28 | }')
29 | run_id=$(echo $run | jq .run_id)
30 |
31 | # Poll until the run reaches its terminal state.
32 | until [ "$(echo $run | jq -r .state.life_cycle_state)" = "TERMINATED" ]; do
33 |   echo "Waiting for run completion..."
34 |   sleep 5
35 |   run=$(databricks runs get --run-id $run_id)
36 |   echo $run | jq .run_page_url
37 | done
38 |
39 | # Output to log
40 | echo $run | jq .
41 |
42 | # Fail stage if not successful
43 | test "$(echo $run | jq -r .state.result_state)" = "SUCCESS"
44 |
--------------------------------------------------------------------------------
/datafactory/linkedService/AzureDatabricks.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "AzureDatabricks",
3 | "properties": {
4 | "type": "AzureDatabricks",
5 | "typeProperties": {
6 | "domain": "https://westeurope.azuredatabricks.net",
7 | "accessToken": {
8 | "type": "AzureKeyVaultSecret",
9 | "store": {
10 | "referenceName": "AzureKeyVault",
11 | "type": "LinkedServiceReference"
12 | },
13 | "secretName": "AzureDatabricksToken"
14 | },
15 | "newClusterNodeType": "Standard_DS3_v2",
16 | "newClusterNumOfWorker": "1",
17 | "newClusterVersion": "5.2.x-scala2.11"
18 | }
19 | },
20 | "type": "Microsoft.DataFactory/factories/linkedservices"
21 | }
--------------------------------------------------------------------------------
/datafactory/linkedService/AzureKeyVault.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "AzureKeyVault",
3 | "type": "Microsoft.DataFactory/factories/linkedservices",
4 | "properties": {
5 | "type": "AzureKeyVault",
6 | "typeProperties": {
7 | "baseUrl": "https://nameofyourkeyvault.vault.azure.net/"
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/datafactory/pipeline/Apply ML model pipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Apply ML model pipeline",
3 | "properties": {
4 | "activities": [
5 | {
6 | "name": "Apply ML model",
7 | "type": "DatabricksNotebook",
8 | "policy": {
9 | "timeout": "0.01:00:00",
10 | "retry": 0,
11 | "retryIntervalInSeconds": 30,
12 | "secureOutput": false,
13 | "secureInput": false
14 | },
15 | "typeProperties": {
16 | "notebookPath": "/devops-deployed/bikesharing-apply-model"
17 | },
18 | "linkedServiceName": {
19 | "referenceName": "AzureDatabricks",
20 | "type": "LinkedServiceReference"
21 | }
22 | }
23 | ]
24 | },
25 | "type": "Microsoft.DataFactory/factories/pipelines"
26 | }
27 |
--------------------------------------------------------------------------------
/java-library/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |
7 |   <groupId>com.cloudarchitected.databricks-devops-tutorial</groupId>
8 |   <artifactId>model-factory</artifactId>
9 |   <version>1.0-SNAPSHOT</version>
10 |
11 |   <dependencies>
12 |     <dependency>
13 |       <groupId>org.apache.spark</groupId>
14 |       <artifactId>spark-core_2.11</artifactId>
15 |       <version>2.4.0</version>
16 |     </dependency>
17 |     <dependency>
18 |       <groupId>org.apache.spark</groupId>
19 |       <artifactId>spark-mllib_2.11</artifactId>
20 |       <version>2.4.0</version>
21 |     </dependency>
22 |     <dependency>
23 |       <groupId>org.apache.spark</groupId>
24 |       <artifactId>spark-sql_2.11</artifactId>
25 |       <version>2.4.0</version>
26 |     </dependency>
27 |     <dependency>
28 |       <groupId>junit</groupId>
29 |       <artifactId>junit</artifactId>
30 |       <version>4.12</version>
31 |       <scope>test</scope>
32 |     </dependency>
33 |   </dependencies>
34 |
35 |   <properties>
36 |     <maven.compiler.source>1.8</maven.compiler.source>
37 |     <maven.compiler.target>1.8</maven.compiler.target>
38 |   </properties>
39 | </project>
--------------------------------------------------------------------------------
/java-library/src/main/java/com/cloudarchitected/databricks_devops_tutorial/model/BikeShareModelFactory.java:
--------------------------------------------------------------------------------
1 | package com.cloudarchitected.databricks_devops_tutorial.model;
2 |
3 | import org.apache.spark.ml.Pipeline;
4 | import org.apache.spark.ml.PipelineModel;
5 | import org.apache.spark.ml.PipelineStage;
6 | import org.apache.spark.ml.evaluation.RegressionEvaluator;
7 | import org.apache.spark.ml.feature.VectorAssembler;
8 | import org.apache.spark.ml.regression.LinearRegression;
9 | import org.apache.spark.sql.DataFrameNaFunctions;
10 | import org.apache.spark.sql.Dataset;
11 | import org.apache.spark.sql.Row;
12 |
13 | public class BikeShareModelFactory {
14 |
15 |
16 | private int maxIterations = 10;
17 |
18 | public void setMaxIterations(int maxIterations) {
19 | this.maxIterations = maxIterations;
20 | }
21 |
22 | public PipelineModel buildModel(Dataset<Row> training) {
23 |
24 | VectorAssembler assembler = new VectorAssembler()
25 | .setInputCols(new String[]{"season","hr","holiday","weekday","workingday","weathersit","temp","atemp","hum","windspeed"})
26 | .setOutputCol("features");
27 |
28 | LinearRegression lr = new LinearRegression()
29 | .setLabelCol("cnt")
30 | .setMaxIter(maxIterations)
31 | .setRegParam(0.3)
32 | .setElasticNetParam(0.8);
33 |
34 | Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{assembler, lr});
35 | PipelineModel fitted = pipeline.fit(training);
36 |
37 | return fitted;
38 |
39 |
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/java-library/src/test/java/com/cloudarchitected/databricks_devops_tutorial/model/BikeShareModelFactoryTest.java:
--------------------------------------------------------------------------------
1 | package com.cloudarchitected.databricks_devops_tutorial.model;
2 |
3 | import org.apache.spark.ml.PipelineModel;
4 | import org.apache.spark.ml.PipelineStage;
5 | import org.apache.spark.ml.linalg.Vectors;
6 | import org.apache.spark.ml.regression.LinearRegressionModel;
7 | import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
8 | import org.apache.spark.sql.Dataset;
9 | import org.apache.spark.sql.Row;
10 | import org.apache.spark.sql.SparkSession;
11 | import org.junit.Test;
12 |
13 | import java.io.IOException;
14 | import java.nio.file.Files;
15 | import java.nio.file.Path;
16 | import java.nio.file.StandardCopyOption;
17 |
18 | import static org.junit.Assert.assertEquals;
19 |
20 |
21 | public class BikeShareModelFactoryTest {
22 |
23 | @Test
24 | public void modelBuilding() throws IOException {
25 | //Class under test
26 | BikeShareModelFactory tester = new BikeShareModelFactory();
27 | tester.setMaxIterations(1);
28 |
29 | SparkSession spark = SparkSession
30 | .builder()
31 | .appName("Java Spark SQL Example")
32 | .config("spark.master", "local")
33 | .config("spark.driver.bindAddress", "127.0.0.1")
34 | .getOrCreate();
35 |
36 | Path tmpFile = Files.createTempFile("bike_data", ".csv");
37 | // This tells JVM to delete the file on JVM exit.
38 | tmpFile.toFile().deleteOnExit();
39 | // writing sample data
40 | Files.copy(
41 | getClass().getClassLoader().getResourceAsStream("hour.csv"),
42 | tmpFile,
43 | StandardCopyOption.REPLACE_EXISTING);
44 |
45 |
46 | // Load training data.
47 | Dataset<Row> training = spark.read().option("header", true).option("inferSchema", true).csv(tmpFile.toString());
48 | PipelineModel pipeline = tester.buildModel(training);
49 |
50 | PipelineStage[] stages = pipeline.stages();
51 | LinearRegressionModel lrModel = (LinearRegressionModel) stages[stages.length - 1];
52 |
53 | // Print the coefficients and intercept for linear regression.
54 | System.out.println("Coefficients: "
55 | + lrModel.coefficients() + " Intercept: " + lrModel.intercept());
56 |
57 | // Summarize the model over the training set and print out some metrics.
58 | LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
59 | System.out.println("numIterations: " + trainingSummary.totalIterations());
60 | System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
61 | trainingSummary.residuals().show();
62 | System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
63 | System.out.println("r2: " + trainingSummary.r2());
64 |
65 | // Smoke check: RMSE on this tiny sample should fall within a broad plausible range (0-20).
66 | assertEquals(10, trainingSummary.rootMeanSquaredError(), 10);
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/java-library/src/test/resources/hour.csv:
--------------------------------------------------------------------------------
1 | instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2 | 1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0,3,13,16
3 | 2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0,8,32,40
4 | 3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0,5,27,32
5 | 4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0,3,10,13
6 | 5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0,0,1,1
7 | 6,2011-01-01,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,0,1,1
8 | 7,2011-01-01,1,0,1,6,0,6,0,1,0.22,0.2727,0.8,0,2,0,2
9 | 8,2011-01-01,1,0,1,7,0,6,0,1,0.2,0.2576,0.86,0,1,2,3
10 | 9,2011-01-01,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0,1,7,8
11 | 10,2011-01-01,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0,8,6,14
12 |
13 |
--------------------------------------------------------------------------------
/notebooks/bikesharing-apply-model.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | dbutils.widgets.text("dataset", "/bikesharing-tutorial/holdout.parquet", "Input dataset")
3 | dbutils.widgets.text("model", "/bikesharing-tutorial/bikesharing.model", "Model")
4 | // Use e.g. abfss://bikeshare@algattikerdevopsstore.dfs.core.windows.net/predictions/scratch
5 | dbutils.widgets.text("output", "/bikesharing-tutorial/predictions", "Output")
6 |
7 | // COMMAND ----------
8 |
9 | val outputDir = dbutils.widgets.get("output")
10 | spark.conf.set("fs.azure.account.key", dbutils.secrets.get(scope = "bikeshare", key = "storagekey"))
11 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true")
12 | dbutils.fs.mkdirs(outputDir)
13 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "false")
14 |
15 |
16 | // COMMAND ----------
17 |
18 | import spark.implicits._
19 | import org.apache.spark.sql.types._
20 | import org.apache.spark.sql.functions._
21 |
22 | import org.apache.spark.ml.PipelineModel
23 |
24 | val model = PipelineModel.load(dbutils.widgets.get("model"))
25 |
26 | val testData = spark.read.parquet(dbutils.widgets.get("dataset"))
27 |
28 | val predictions = model.transform(testData)
29 |
30 |
31 |
32 | predictions.createOrReplaceGlobalTempView("bikesharing_predictions")
33 |
34 | display(predictions)
35 |
36 | // COMMAND ----------
37 |
38 | predictions.write.mode("overwrite").format("json").save(outputDir)
39 |
40 | // COMMAND ----------
41 |
42 | dbutils.notebook.exit("SUCCESS")
--------------------------------------------------------------------------------
/notebooks/bikesharing-inttest.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // Use e.g. abfss://bikeshare@algattikerdevopsstore.dfs.core.windows.net/predictions/scratch
3 | dbutils.widgets.text("output", "/bikesharing-tutorial/inttest-predictions", "Output")
4 |
5 | // COMMAND ----------
6 |
7 | spark.conf.set("fs.azure.account.key", dbutils.secrets.get(scope = "bikeshare", key = "storagekey"))
8 |
9 |
10 | // COMMAND ----------
11 |
12 | val test_dir = "/bikesharing-tutorial/integration-test/"
13 | dbutils.fs.mkdirs(test_dir)
14 |
15 | // COMMAND ----------
16 |
17 | import spark.implicits._
18 | import org.apache.spark.sql.types._
19 | import org.apache.spark.sql.functions._
20 |
21 | // Load training data.
22 | val schema = StructType(Seq(
23 | StructField("instant",IntegerType),
24 | StructField("dteday",TimestampType),
25 | StructField("season",IntegerType),
26 | StructField("yr",IntegerType),
27 | StructField("mnth",IntegerType),
28 | StructField("hr",IntegerType),
29 | StructField("holiday",IntegerType),
30 | StructField("weekday",IntegerType),
31 | StructField("workingday",IntegerType),
32 | StructField("weathersit",IntegerType),
33 | StructField("temp",DoubleType),
34 | StructField("atemp",DoubleType),
35 | StructField("hum",DoubleType),
36 | StructField("windspeed",DoubleType),
37 | StructField("casual",IntegerType),
38 | StructField("registered",IntegerType),
39 | StructField("cnt",IntegerType)
40 | ))
41 |
42 | // Load training data.
43 | val dataset = spark.read
44 | .schema(schema)
45 | .option("header", true)
46 | .csv("/databricks-datasets/bikeSharing/data-001/hour.csv")
47 | .cache
48 |
49 | val lines = dataset.withColumn("line", monotonically_increasing_id).cache
50 |
51 | lines.where('line.between( 0, 9)).write.mode("overwrite").save(test_dir + "train.parquet")
52 | lines.where('line.between(10,19)).write.mode("overwrite").save(test_dir + "test.parquet")
53 | lines.where('line.between(20,29)).write.mode("overwrite").save(test_dir + "holdout.parquet")
54 |
55 | // COMMAND ----------
56 |
57 | dbutils.notebook.run("bikesharing-train-model", /* timeout_seconds = */ 120, Map(
58 | "train" -> (test_dir + "train.parquet"),
59 | "test" -> (test_dir + "test.parquet"),
60 | "model" -> (test_dir + "bikesharing.model")
61 | )
62 | )
63 |
64 | // COMMAND ----------
65 |
66 | val predictionsDir = dbutils.widgets.get("output")
67 | dbutils.notebook.run("bikesharing-apply-model", /* timeout_seconds = */ 120, Map(
68 | "model" -> (test_dir + "bikesharing.model"),
69 | "dataset" -> (test_dir + "holdout.parquet"),
70 | "output" -> predictionsDir
71 | )
72 | )
73 |
74 | // COMMAND ----------
75 |
76 | import org.apache.spark.ml.evaluation.RegressionEvaluator
77 |
78 | val predictions = spark.read.json(predictionsDir)
79 |
80 | val evaluator = new RegressionEvaluator()
81 | .setMetricName("rmse")
82 | .setLabelCol("cnt")
83 | .setPredictionCol("prediction")
84 | val rmse = evaluator.evaluate(predictions)
85 |
86 | println("RMSE on test data = " + rmse)
87 |
88 | assert(predictions.count == 10)
89 | assert(rmse < 100)
90 |
91 | // COMMAND ----------
92 |
93 |
--------------------------------------------------------------------------------
/notebooks/bikesharing-prep.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | import spark.implicits._
3 | import org.apache.spark.sql.types._
4 | import org.apache.spark.sql.functions._
5 |
6 | val schema = StructType(Seq(
7 | StructField("instant",IntegerType),
8 | StructField("dteday",TimestampType),
9 | StructField("season",IntegerType),
10 | StructField("yr",IntegerType),
11 | StructField("mnth",IntegerType),
12 | StructField("hr",IntegerType),
13 | StructField("holiday",IntegerType),
14 | StructField("weekday",IntegerType),
15 | StructField("workingday",IntegerType),
16 | StructField("weathersit",IntegerType),
17 | StructField("temp",DoubleType),
18 | StructField("atemp",DoubleType),
19 | StructField("hum",DoubleType),
20 | StructField("windspeed",DoubleType),
21 | StructField("casual",IntegerType),
22 | StructField("registered",IntegerType),
23 | StructField("cnt",IntegerType)
24 | ))
25 |
26 | // Load training data.
27 | val dataset = spark.read
28 | .schema(schema)
29 | .option("header", true)
30 | .csv("/databricks-datasets/bikeSharing/data-001/hour.csv")
31 | .cache
32 |
33 | // Split into training, test and holdout set
34 | val splitsRDD = dataset.rdd.randomSplit(Array(0.8, 0.1, 0.1), 0)
35 | val splits = splitsRDD.map(spark.createDataFrame(_, dataset.schema))
36 |
37 | // Save the training, test and holdout splits; the holdout set is kept for later scoring
38 | splits(0).write.mode("overwrite").save("/bikesharing-tutorial/train.parquet")
39 | splits(1).write.mode("overwrite").save("/bikesharing-tutorial/test.parquet")
40 | splits(2).write.mode("overwrite").save("/bikesharing-tutorial/holdout.parquet")
41 |
42 | // COMMAND ----------
43 |
44 | dataset.schema
45 |
46 | // COMMAND ----------
47 |
48 |
--------------------------------------------------------------------------------
/notebooks/bikesharing-train-model.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | dbutils.widgets.text("train", "/bikesharing-tutorial/train.parquet", "Train dataset")
3 | dbutils.widgets.text("test", "/bikesharing-tutorial/test.parquet", "Test dataset")
4 | dbutils.widgets.text("model", "/bikesharing-tutorial/bikesharing.model", "Output model")
5 |
6 | // COMMAND ----------
7 |
8 | import spark.implicits._
9 | import org.apache.spark.sql.types._
10 | import org.apache.spark.sql.functions._
11 |
12 | // Load training data.
13 | val dataset = spark.read.parquet(dbutils.widgets.get("train"))
14 |
15 | val modelFactory = new com.cloudarchitected.databricks_devops_tutorial.model.BikeShareModelFactory
16 | val pipeline = modelFactory.buildModel(dataset)
17 |
18 | // COMMAND ----------
19 |
20 | import org.apache.spark.ml.evaluation.RegressionEvaluator
21 | val test = spark.read.parquet(dbutils.widgets.get("test"))
22 | val predictions = pipeline.transform(test)
23 | val evaluator = new RegressionEvaluator()
24 | .setMetricName("rmse")
25 | .setLabelCol("cnt")
26 | .setPredictionCol("prediction")
27 | val rmse = evaluator.evaluate(predictions)
28 |
29 | println("RMSE on test data = " + rmse)
30 |
31 | pipeline.write.overwrite().save(dbutils.widgets.get("model"))
32 |
33 | // COMMAND ----------
34 |
35 | display(predictions)
36 |
37 | // COMMAND ----------
38 |
39 | dbutils.notebook.exit("SUCCESS")
--------------------------------------------------------------------------------