├── .gitignore ├── LICENSE ├── README.md ├── arm-templates ├── azuredeploy.json └── generated-data-factory │ └── arm_template.zip ├── azure-pipelines.yml ├── azure-pipelines ├── ReleasePipeline.json ├── deploy-resources.sh ├── install-databricks-client.sh ├── provision-data-factory-without-zip │ ├── generate-adf-template.jq │ └── provision-data-factory.sh ├── provision-data-factory.sh ├── provision-databricks.sh └── run-integration-test.sh ├── datafactory ├── linkedService │ ├── AzureDatabricks.json │ └── AzureKeyVault.json └── pipeline │ └── Apply ML model pipeline.json ├── java-library ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── cloudarchitected │ │ └── databricks_devops_tutorial │ │ └── model │ │ └── BikeShareModelFactory.java │ └── test │ ├── java │ └── com │ │ └── cloudarchitected │ │ └── databricks_devops_tutorial │ │ └── model │ │ └── BikeShareModelFactoryTest.java │ └── resources │ └── hour.csv └── notebooks ├── bikesharing-apply-model.scala ├── bikesharing-inttest.scala ├── bikesharing-prep.scala └── bikesharing-train-model.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databricks-devops-tutorial 2 | Databricks Azure DevOps Tutorial 3 | -------------------------------------------------------------------------------- /arm-templates/azuredeploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "location": { 6 | "type": "String", 7 | "defaultValue": "[resourceGroup().location]" 8 | }, 9 | "dataFactoryName": { 10 | "type": "String" 11 | }, 12 | "keyVaultName": { 13 | "type": "string" 14 | }, 15 | "storageAccountName": { 16 | "type": "string" 17 | }, 18 | "keyVaultWriterPrincipalId": { 19 | "type": "string" 20 | } 21 | }, 22 | "resources": [ 23 | { 24 | "type": "Microsoft.DataFactory/factories", 25 | "name": "[parameters('dataFactoryName')]", 26 | "apiVersion": "2018-06-01", 27 | "location": "[parameters('location')]", 28 | "identity": { 29 | "type": "SystemAssigned" 30 | } 31 | }, 32 | { 33 | "apiVersion": "2016-10-01", 34 | "name": "[parameters('keyVaultName')]", 35 | "location": "[parameters('location')]", 36 | "type": "Microsoft.KeyVault/vaults", 37 | "properties": { 38 | "accessPolicies": [ 39 | { 40 | "objectId": "[parameters('keyVaultWriterPrincipalId')]", 41 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]", 42 | "permissions": { 43 | "secrets": [ 44 | "Get", 45 | "List", 46 | "Set", 47 | "Delete" 48 | ] 49 | } 50 | }, 51 | { 52 | "objectId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.principalId]", 53 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]", 54 | "permissions": { 55 | "secrets": [ 56 | "Get", 57 | "List" 58 | ] 59 | }, 60 | "applicationId": null 61 | } 62 | ], 63 | "tenantId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.tenantId]", 64 | "sku": { 65 | "name": "standard", 66 | "family": "A" 67 | } 68 | } 69 | }, 70 | { 71 | "name": "[parameters('storageAccountName')]", 72 | "type": "Microsoft.Storage/storageAccounts", 73 | "apiVersion": "2018-07-01", 74 | "location": "[parameters('location')]", 75 | "properties": { 76 | "accessTier": "Hot", 77 | "supportsHttpsTrafficOnly": true, 78 | "isHnsEnabled": true 79 | }, 80 | "sku": { 81 | "name": "Standard_LRS" 82 | }, 83 | "kind": "StorageV2" 84 | } 85 | ], 86 | "outputs": { 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /arm-templates/generated-data-factory/arm_template.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/databricks-devops-tutorial/b8bef9b36fa089972ca622e36b86d23cf0a08a3b/arm-templates/generated-data-factory/arm_template.zip -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - master 3 | 4 | pool: 5 | vmImage: 'Ubuntu-16.04' 6 | 7 | steps: 8 | - task: Maven@3 9 | inputs: 10 | mavenPomFile: 
'java-library/pom.xml' 11 | mavenOptions: '-Xmx3072m' 12 | javaHomeOption: 'JDKVersion' 13 | jdkVersionOption: '1.8' 14 | jdkArchitectureOption: 'x64' 15 | publishJUnitResults: true 16 | testResultsFiles: '**/surefire-reports/TEST-*.xml' 17 | goals: 'package' 18 | 19 | - task: CopyFiles@2 20 | displayName: 'Copy Files to: $(build.artifactstagingdirectory)' 21 | inputs: 22 | SourceFolder: '$(system.defaultworkingdirectory)' 23 | Contents: | 24 | **/*.jar 25 | notebooks/* 26 | arm-templates/** 27 | azure-pipelines/** 28 | TargetFolder: '$(build.artifactstagingdirectory)' 29 | 30 | - task: PublishBuildArtifacts@1 31 | displayName: 'Publish Artifact: drop' 32 | inputs: 33 | PathtoPublish: '$(build.artifactstagingdirectory)' 34 | -------------------------------------------------------------------------------- /azure-pipelines/ReleasePipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": 2, 3 | "revision": 53, 4 | "description": null, 5 | "createdBy": { 6 | "displayName": "Alexandre Gattiker", 7 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1", 8 | "_links": { 9 | "avatar": { 10 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 11 | } 12 | }, 13 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1", 14 | "uniqueName": "gattiker@example.com", 15 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1", 16 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 17 | }, 18 | "createdOn": "2019-03-30T04:00:29.750Z", 19 | "modifiedBy": { 20 | "displayName": "Alexandre Gattiker", 21 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1", 22 | "_links": { 23 | "avatar": { 24 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 25 | } 26 | }, 27 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1", 28 | "uniqueName": "gattiker@example.com", 29 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1", 30 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 31 | }, 32 | "modifiedOn": "2019-03-30T20:47:38.863Z", 33 | "isDeleted": false, 34 | "variables": { 35 | "DATABRICKS_HOST": { 36 | "value": "https://westeurope.azuredatabricks.net" 37 | }, 38 | "DATABRICKS_TOKEN": { 39 | "value": "" 40 | }, 41 | "LOCATION": { 42 | "value": "westeurope" 43 | } 44 | }, 45 | "variableGroups": [], 46 | "environments": [ 47 | { 48 | "id": 1, 49 | "name": "DEV", 50 | "rank": 1, 51 | "owner": { 52 | "displayName": "Alexandre Gattiker", 53 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1", 54 | "_links": { 55 | "avatar": { 56 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 57 | } 58 | }, 59 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1", 60 | "uniqueName": "gattiker@example.com", 61 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1", 62 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 63 | }, 64 | "variables": { 65 | "DATA_FACTORY_NAME": { 66 | "value": 
"databrickscicdtut01dev" 67 | }, 68 | "KEY_VAULT_NAME": { 69 | "value": "databrickscicdtut01dev" 70 | }, 71 | "RESOURCE_GROUP_NAME": { 72 | "value": "databrickscicdtut01dev" 73 | }, 74 | "STORAGE_ACCOUNT_NAME": { 75 | "value": "databrickscicdtut01dev" 76 | } 77 | }, 78 | "variableGroups": [], 79 | "preDeployApprovals": { 80 | "approvals": [ 81 | { 82 | "rank": 1, 83 | "isAutomated": true, 84 | "isNotificationOn": false, 85 | "id": 1 86 | } 87 | ], 88 | "approvalOptions": { 89 | "requiredApproverCount": null, 90 | "releaseCreatorCanBeApprover": false, 91 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false, 92 | "enforceIdentityRevalidation": false, 93 | "timeoutInMinutes": 0, 94 | "executionOrder": 1 95 | } 96 | }, 97 | "deployStep": { 98 | "id": 2 99 | }, 100 | "postDeployApprovals": { 101 | "approvals": [ 102 | { 103 | "rank": 1, 104 | "isAutomated": true, 105 | "isNotificationOn": false, 106 | "id": 3 107 | } 108 | ], 109 | "approvalOptions": { 110 | "requiredApproverCount": null, 111 | "releaseCreatorCanBeApprover": false, 112 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false, 113 | "enforceIdentityRevalidation": false, 114 | "timeoutInMinutes": 0, 115 | "executionOrder": 2 116 | } 117 | }, 118 | "deployPhases": [ 119 | { 120 | "deploymentInput": { 121 | "parallelExecution": { 122 | "parallelExecutionType": 0 123 | }, 124 | "skipArtifactsDownload": false, 125 | "artifactsDownloadInput": { 126 | "downloadInputs": [ 127 | { 128 | "alias": "_devopstutorial", 129 | "artifactType": "Build", 130 | "artifactDownloadMode": "All", 131 | "artifactItems": [] 132 | } 133 | ] 134 | }, 135 | "queueId": 107, 136 | "demands": [], 137 | "enableAccessToken": false, 138 | "timeoutInMinutes": 0, 139 | "jobCancelTimeoutInMinutes": 1, 140 | "condition": "succeeded()", 141 | "overrideInputs": {} 142 | }, 143 | "rank": 1, 144 | "phaseType": 1, 145 | "name": "Agent job", 146 | "refName": null, 147 | "workflowTasks": [ 148 | { 149 | "environment": {}, 150 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 151 | "version": "1.*", 152 | "name": "Install Databricks client", 153 | "refName": "", 154 | "enabled": true, 155 | "alwaysRun": false, 156 | "continueOnError": false, 157 | "timeoutInMinutes": 0, 158 | "definitionType": "task", 159 | "overrideInputs": {}, 160 | "condition": "succeeded()", 161 | "inputs": { 162 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 163 | "scriptLocation": "scriptPath", 164 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/install-databricks-client.sh", 165 | "inlineScript": "", 166 | "args": "", 167 | "addSpnToEnvironment": "false", 168 | "useGlobalConfig": "false", 169 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 170 | "failOnStandardError": "false" 171 | } 172 | }, 173 | { 174 | "environment": {}, 175 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 176 | "version": "1.*", 177 | "name": "Deploy resources", 178 | "refName": "", 179 | "enabled": true, 180 | "alwaysRun": false, 181 | "continueOnError": false, 182 | "timeoutInMinutes": 0, 183 | "definitionType": "task", 184 | "overrideInputs": {}, 185 | "condition": "succeeded()", 186 | "inputs": { 187 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 188 | "scriptLocation": "scriptPath", 189 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/deploy-resources.sh", 190 | "inlineScript": "", 191 | "args": "", 192 | "addSpnToEnvironment": "true", 193 | 
"useGlobalConfig": "false", 194 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 195 | "failOnStandardError": "false" 196 | } 197 | }, 198 | { 199 | "environment": {}, 200 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 201 | "version": "1.*", 202 | "name": "Provision Databricks", 203 | "refName": "", 204 | "enabled": true, 205 | "alwaysRun": false, 206 | "continueOnError": false, 207 | "timeoutInMinutes": 0, 208 | "definitionType": "task", 209 | "overrideInputs": {}, 210 | "condition": "succeeded()", 211 | "inputs": { 212 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 213 | "scriptLocation": "scriptPath", 214 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-databricks.sh", 215 | "inlineScript": "", 216 | "args": "", 217 | "addSpnToEnvironment": "false", 218 | "useGlobalConfig": "false", 219 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 220 | "failOnStandardError": "false" 221 | } 222 | } 223 | ] 224 | } 225 | ], 226 | "environmentOptions": { 227 | "emailNotificationType": "OnlyOnFailure", 228 | "emailRecipients": "release.environment.owner;release.creator", 229 | "skipArtifactsDownload": false, 230 | "timeoutInMinutes": 0, 231 | "enableAccessToken": false, 232 | "publishDeploymentStatus": true, 233 | "badgeEnabled": false, 234 | "autoLinkWorkItems": false, 235 | "pullRequestDeploymentEnabled": false 236 | }, 237 | "demands": [], 238 | "conditions": [ 239 | { 240 | "name": "ReleaseStarted", 241 | "conditionType": 1, 242 | "value": "" 243 | } 244 | ], 245 | "executionPolicy": { 246 | "concurrencyCount": 1, 247 | "queueDepthCount": 0 248 | }, 249 | "schedules": [], 250 | "currentRelease": { 251 | "id": 39, 252 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/releases/39", 253 | "_links": {} 254 | }, 255 | "retentionPolicy": { 256 | "daysToKeep": 30, 257 | "releasesToKeep": 3, 258 | "retainBuild": true 259 | }, 260 | "processParameters": {}, 261 | "properties": {}, 262 | "preDeploymentGates": { 263 | "id": 0, 264 | "gatesOptions": null, 265 | "gates": [] 266 | }, 267 | "postDeploymentGates": { 268 | "id": 0, 269 | "gatesOptions": null, 270 | "gates": [] 271 | }, 272 | "environmentTriggers": [], 273 | "badgeUrl": "https://vsrm.dev.azure.com/gattiker/_apis/public/Release/badge/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/1/1" 274 | }, 275 | { 276 | "id": 5, 277 | "name": "TEST", 278 | "rank": 2, 279 | "owner": { 280 | "displayName": "Alexandre Gattiker", 281 | "url": "https://app.vssps.visualstudio.com/A89f9c878-a0cf-4c2b-bb3e-0da77e418adf/_apis/Identities/57487cca-e71e-46d8-b117-b7b0e99912f1", 282 | "_links": { 283 | "avatar": { 284 | "href": "https://dev.azure.com/gattiker/_apis/GraphProfile/MemberAvatars/aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 285 | } 286 | }, 287 | "id": "57487cca-e71e-46d8-b117-b7b0e99912f1", 288 | "uniqueName": "gattiker@example.com", 289 | "imageUrl": "https://dev.azure.com/gattiker/_api/_common/identityImage?id=57487cca-e71e-46d8-b117-b7b0e99912f1", 290 | "descriptor": "aad.NzQwYTM1MTctMjUxZC03ZTM3LWFjZmMtM2E2ODU5NmE3NTg3" 291 | }, 292 | "variables": { 293 | "DATA_FACTORY_NAME": { 294 | "value": "databrickscicdtut01test" 295 | }, 296 | "KEY_VAULT_NAME": { 297 | "value": "databrickscicdtut01test" 298 | }, 299 | "RESOURCE_GROUP_NAME": { 300 | "value": "databrickscicdtut01test" 301 | }, 302 | "STORAGE_ACCOUNT_NAME": { 303 | "value": "databrickscicdtut01test" 304 | } 305 | }, 306 | "variableGroups": [], 307 | 
"preDeployApprovals": { 308 | "approvals": [ 309 | { 310 | "rank": 1, 311 | "isAutomated": true, 312 | "isNotificationOn": false, 313 | "id": 13 314 | } 315 | ], 316 | "approvalOptions": { 317 | "requiredApproverCount": null, 318 | "releaseCreatorCanBeApprover": false, 319 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false, 320 | "enforceIdentityRevalidation": false, 321 | "timeoutInMinutes": 0, 322 | "executionOrder": 1 323 | } 324 | }, 325 | "deployStep": { 326 | "id": 14 327 | }, 328 | "postDeployApprovals": { 329 | "approvals": [ 330 | { 331 | "rank": 1, 332 | "isAutomated": true, 333 | "isNotificationOn": false, 334 | "id": 15 335 | } 336 | ], 337 | "approvalOptions": { 338 | "requiredApproverCount": null, 339 | "releaseCreatorCanBeApprover": false, 340 | "autoTriggeredAndPreviousEnvironmentApprovedCanBeSkipped": false, 341 | "enforceIdentityRevalidation": false, 342 | "timeoutInMinutes": 0, 343 | "executionOrder": 2 344 | } 345 | }, 346 | "deployPhases": [ 347 | { 348 | "deploymentInput": { 349 | "parallelExecution": { 350 | "parallelExecutionType": 0 351 | }, 352 | "skipArtifactsDownload": false, 353 | "artifactsDownloadInput": { 354 | "downloadInputs": [ 355 | { 356 | "alias": "_devopstutorial", 357 | "artifactType": "Build", 358 | "artifactDownloadMode": "All", 359 | "artifactItems": [] 360 | } 361 | ] 362 | }, 363 | "queueId": 107, 364 | "demands": [], 365 | "enableAccessToken": false, 366 | "timeoutInMinutes": 0, 367 | "jobCancelTimeoutInMinutes": 1, 368 | "condition": "succeeded()", 369 | "overrideInputs": {} 370 | }, 371 | "rank": 1, 372 | "phaseType": 1, 373 | "name": "Agent job", 374 | "refName": null, 375 | "workflowTasks": [ 376 | { 377 | "environment": {}, 378 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 379 | "version": "1.*", 380 | "name": "Install Databricks client", 381 | "refName": "", 382 | "enabled": true, 383 | "alwaysRun": false, 384 | "continueOnError": false, 385 | "timeoutInMinutes": 0, 386 | "definitionType": "task", 387 | "overrideInputs": {}, 388 | "condition": "succeeded()", 389 | "inputs": { 390 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 391 | "scriptLocation": "scriptPath", 392 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/install-databricks-client.sh", 393 | "inlineScript": "", 394 | "args": "", 395 | "addSpnToEnvironment": "false", 396 | "useGlobalConfig": "false", 397 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 398 | "failOnStandardError": "false" 399 | } 400 | }, 401 | { 402 | "environment": {}, 403 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 404 | "version": "1.*", 405 | "name": "Deploy resources", 406 | "refName": "", 407 | "enabled": true, 408 | "alwaysRun": false, 409 | "continueOnError": false, 410 | "timeoutInMinutes": 0, 411 | "definitionType": "task", 412 | "overrideInputs": {}, 413 | "condition": "succeeded()", 414 | "inputs": { 415 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 416 | "scriptLocation": "scriptPath", 417 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/deploy-resources.sh", 418 | "inlineScript": "", 419 | "args": "", 420 | "addSpnToEnvironment": "true", 421 | "useGlobalConfig": "false", 422 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 423 | "failOnStandardError": "false" 424 | } 425 | }, 426 | { 427 | "environment": {}, 428 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 429 | "version": "1.*", 430 | "name": "Provision 
Databricks", 431 | "refName": "", 432 | "enabled": true, 433 | "alwaysRun": false, 434 | "continueOnError": false, 435 | "timeoutInMinutes": 0, 436 | "definitionType": "task", 437 | "overrideInputs": {}, 438 | "condition": "succeeded()", 439 | "inputs": { 440 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 441 | "scriptLocation": "scriptPath", 442 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-databricks.sh", 443 | "inlineScript": "", 444 | "args": "", 445 | "addSpnToEnvironment": "false", 446 | "useGlobalConfig": "false", 447 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 448 | "failOnStandardError": "false" 449 | } 450 | }, 451 | { 452 | "environment": {}, 453 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 454 | "version": "1.*", 455 | "name": "Provision Data Factory", 456 | "refName": "", 457 | "enabled": true, 458 | "alwaysRun": false, 459 | "continueOnError": false, 460 | "timeoutInMinutes": 0, 461 | "definitionType": "task", 462 | "overrideInputs": {}, 463 | "condition": "succeeded()", 464 | "inputs": { 465 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 466 | "scriptLocation": "scriptPath", 467 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/provision-data-factory.sh", 468 | "inlineScript": "", 469 | "args": "", 470 | "addSpnToEnvironment": "false", 471 | "useGlobalConfig": "false", 472 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 473 | "failOnStandardError": "false" 474 | } 475 | }, 476 | { 477 | "environment": {}, 478 | "taskId": "46e4be58-730b-4389-8a2f-ea10b3e5e815", 479 | "version": "1.*", 480 | "name": "Run integration test", 481 | "refName": "", 482 | "enabled": true, 483 | "alwaysRun": false, 484 | "continueOnError": false, 485 | "timeoutInMinutes": 0, 486 | "definitionType": "task", 487 | "overrideInputs": {}, 488 | "condition": "succeeded()", 489 | "inputs": { 490 | "connectedServiceNameARM": "ef084d52-d369-4aab-87b4-3a550e592373", 491 | "scriptLocation": "scriptPath", 492 | "scriptPath": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop/azure-pipelines/run-integration-test.sh", 493 | "inlineScript": "", 494 | "args": "", 495 | "addSpnToEnvironment": "false", 496 | "useGlobalConfig": "false", 497 | "cwd": "$(System.DefaultWorkingDirectory)/_devopstutorial/drop", 498 | "failOnStandardError": "false" 499 | } 500 | } 501 | ] 502 | } 503 | ], 504 | "environmentOptions": { 505 | "emailNotificationType": "OnlyOnFailure", 506 | "emailRecipients": "release.environment.owner;release.creator", 507 | "skipArtifactsDownload": false, 508 | "timeoutInMinutes": 0, 509 | "enableAccessToken": false, 510 | "publishDeploymentStatus": true, 511 | "badgeEnabled": false, 512 | "autoLinkWorkItems": false, 513 | "pullRequestDeploymentEnabled": false 514 | }, 515 | "demands": [], 516 | "conditions": [ 517 | { 518 | "name": "DEV", 519 | "conditionType": 2, 520 | "value": "4" 521 | } 522 | ], 523 | "executionPolicy": { 524 | "concurrencyCount": 1, 525 | "queueDepthCount": 0 526 | }, 527 | "schedules": [], 528 | "currentRelease": { 529 | "id": 39, 530 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/releases/39", 531 | "_links": {} 532 | }, 533 | "retentionPolicy": { 534 | "daysToKeep": 30, 535 | "releasesToKeep": 3, 536 | "retainBuild": true 537 | }, 538 | "processParameters": {}, 539 | "properties": {}, 540 | "preDeploymentGates": { 541 | "id": 0, 542 | "gatesOptions": 
null, 543 | "gates": [] 544 | }, 545 | "postDeploymentGates": { 546 | "id": 0, 547 | "gatesOptions": null, 548 | "gates": [] 549 | }, 550 | "environmentTriggers": [], 551 | "badgeUrl": "https://vsrm.dev.azure.com/gattiker/_apis/public/Release/badge/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/1/5" 552 | } 553 | ], 554 | "artifacts": [ 555 | { 556 | "sourceId": "1c9c3807-115e-4c4b-9b83-9dec8b6a8a36:3", 557 | "type": "Build", 558 | "alias": "_devopstutorial", 559 | "definitionReference": { 560 | "artifactSourceDefinitionUrl": { 561 | "id": "https://dev.azure.com/gattiker/_permalink/_build/index?collectionId=8eb8c206-18a2-429f-8229-28fde5b90a93&projectId=1c9c3807-115e-4c4b-9b83-9dec8b6a8a36&definitionId=3", 562 | "name": "" 563 | }, 564 | "defaultVersionBranch": { 565 | "id": "", 566 | "name": "" 567 | }, 568 | "defaultVersionSpecific": { 569 | "id": "", 570 | "name": "" 571 | }, 572 | "defaultVersionTags": { 573 | "id": "", 574 | "name": "" 575 | }, 576 | "defaultVersionType": { 577 | "id": "latestType", 578 | "name": "Latest" 579 | }, 580 | "definition": { 581 | "id": "3", 582 | "name": "databricks-devops-tutorial-Maven-CI" 583 | }, 584 | "definitions": { 585 | "id": "", 586 | "name": "" 587 | }, 588 | "IsMultiDefinitionType": { 589 | "id": "False", 590 | "name": "False" 591 | }, 592 | "project": { 593 | "id": "1c9c3807-115e-4c4b-9b83-9dec8b6a8a36", 594 | "name": "databricks-devops-tutorial" 595 | }, 596 | "repository": { 597 | "id": "", 598 | "name": "" 599 | } 600 | }, 601 | "isPrimary": true, 602 | "isRetained": false 603 | } 604 | ], 605 | "triggers": [], 606 | "releaseNameFormat": "Release-$(rev:r)", 607 | "tags": [], 608 | "pipelineProcess": { 609 | "type": 1 610 | }, 611 | "properties": { 612 | "DefinitionCreationSource": { 613 | "$type": "System.String", 614 | "$value": "ReleaseNew" 615 | } 616 | }, 617 | "id": 1, 618 | "name": "ReleasePipeline", 619 | "path": "\\", 620 | "projectReference": null, 621 | "url": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/definitions/1", 622 | "_links": { 623 | "self": { 624 | "href": "https://vsrm.dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_apis/Release/definitions/1" 625 | }, 626 | "web": { 627 | "href": "https://dev.azure.com/gattiker/1c9c3807-115e-4c4b-9b83-9dec8b6a8a36/_release?definitionId=1" 628 | } 629 | } 630 | } 631 | -------------------------------------------------------------------------------- /azure-pipelines/deploy-resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SvcPrincipalApplicationId=$servicePrincipalId 5 | SvcPrincipalObjectId=$(az ad sp show --id $SvcPrincipalApplicationId | jq -r .objectId) 6 | 7 | 8 | az group create --name $RESOURCE_GROUP_NAME --location $LOCATION 9 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file arm-templates/azuredeploy.json --parameters dataFactoryName=$DATA_FACTORY_NAME keyVaultName=$KEY_VAULT_NAME storageAccountName=$STORAGE_ACCOUNT_NAME keyVaultWriterPrincipalId=$SvcPrincipalObjectId 10 | 11 | -------------------------------------------------------------------------------- /azure-pipelines/install-databricks-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | sudo apt-get install -y python3-setuptools 5 | pip3 install wheel 6 | pip3 install databricks-cli 7 | sudo ln -s /home/vsts/.local/bin/databricks /usr/local/bin/databricks 8 | databricks clusters list 
9 | 10 | -------------------------------------------------------------------------------- /azure-pipelines/provision-data-factory-without-zip/generate-adf-template.jq: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "resources": [ 5 | .[] 6 | | .name = env.DATA_FACTORY_NAME + "/" + .name 7 | | .apiVersion = "2018-06-01" 8 | | if(.properties.type=="AzureKeyVault") then 9 | .properties.typeProperties.baseUrl="https://" + env.KEY_VAULT_NAME + ".vault.azure.net/" 10 | else . 11 | end 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /azure-pipelines/provision-data-factory-without-zip/provision-data-factory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export KEY_VAULT_NAME=databrickscicdtutd06test 5 | export RESOURCE_GROUP_NAME=databrickscicdtutd06test 6 | export DATA_FACTORY_NAME=databrickscicdtutd06test 7 | 8 | COUNTER=0 9 | for template in \ 10 | "datafactory/linkedService/AzureKeyVault.json" \ 11 | "datafactory/linkedService/AzureDatabricks.json" \ 12 | "datafactory/pipeline/Apply ML model pipeline.json" \ 13 | ; do 14 | let COUNTER=COUNTER+1 15 | mkdir -p datafactory-generated 16 | basename=$(basename "$template") 17 | generated_file="datafactory-generated/$basename" 18 | jq -sf azure-pipelines/provision-data-factory-without-zip/generate-adf-template.jq "$template" > "$generated_file" 19 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file "$generated_file" --name "$DATA_FACTORY_NAME-$COUNTER" 20 | done 21 | -------------------------------------------------------------------------------- /azure-pipelines/provision-data-factory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | unzip -od generated arm-templates/generated-data-factory/arm_template.zip 5 | 6 | az group deployment create -g $RESOURCE_GROUP_NAME --template-file generated/arm_template.json --parameters factoryName=$DATA_FACTORY_NAME AzureKeyVault_properties_typeProperties_baseUrl=https://$KEY_VAULT_NAME.vault.azure.net/ 7 | 8 | -------------------------------------------------------------------------------- /azure-pipelines/provision-databricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | az keyvault secret set --vault-name $KEY_VAULT_NAME --name AzureDatabricksToken --value $DATABRICKS_TOKEN 5 | 6 | storageAccountKey=$(az storage account keys list -g $RESOURCE_GROUP_NAME -n $STORAGE_ACCOUNT_NAME --query "[0].value" | tr -d '"') 7 | 8 | if ! 
databricks secrets list-scopes --output JSON | jq -e '.scopes[] | select (.name == "bikeshare")'; then 9 | databricks secrets create-scope --scope bikeshare --initial-manage-principal "users" 10 | fi 11 | databricks secrets write --scope bikeshare --key storagekey --string-value $storageAccountKey 12 | 13 | databricks fs cp --overwrite java-library/target/*.jar dbfs:/model-factory.jar 14 | 15 | 16 | -------------------------------------------------------------------------------- /azure-pipelines/run-integration-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | databricks workspace import_dir -o notebooks /devops-deployed 5 | 6 | run=$(databricks runs submit --json '{ 7 | "name": "IntegrationTest", 8 | "new_cluster": { 9 | "spark_version": "5.2.x-scala2.11", 10 | "node_type_id": "Standard_DS3_v2", 11 | "num_workers": 1 12 | }, 13 | "libraries": [ 14 | { 15 | "jar": "dbfs:/model-factory.jar" 16 | } 17 | ], 18 | "timeout_seconds": 1200, 19 | "notebook_task": { 20 | "notebook_path": "/devops-deployed/bikesharing-inttest", 21 | "base_parameters": { 22 | "output": "abfss://bikeshare@'$STORAGE_ACCOUNT_NAME'.dfs.core.windows.net/predictions/int-test" 23 | } 24 | } 25 | }') 26 | run_id=$(echo $run | jq .run_id) 27 | 28 | until [ "$(echo $run | jq -r .state.life_cycle_state)" = "TERMINATED" ]; do echo Waiting for run completion...; sleep 5; run=$(databricks runs get --run-id $run_id); echo $run | jq .run_page_url; done 29 | 30 | #Output to log 31 | echo $run | jq . 32 | 33 | # Fail stage if not successful 34 | test $(echo $run | jq -r .state.result_state) = "SUCCESS" 35 | 36 | -------------------------------------------------------------------------------- /datafactory/linkedService/AzureDatabricks.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks", 3 | "properties": { 4 | "type": "AzureDatabricks", 5 | "typeProperties": { 6 | "domain": "https://westeurope.azuredatabricks.net", 7 | "accessToken": { 8 | "type": "AzureKeyVaultSecret", 9 | "store": { 10 | "referenceName": "AzureKeyVault", 11 | "type": "LinkedServiceReference" 12 | }, 13 | "secretName": "AzureDatabricksToken" 14 | }, 15 | "newClusterNodeType": "Standard_DS3_v2", 16 | "newClusterNumOfWorker": "1", 17 | "newClusterVersion": "5.2.x-scala2.11" 18 | } 19 | }, 20 | "type": "Microsoft.DataFactory/factories/linkedservices" 21 | } -------------------------------------------------------------------------------- /datafactory/linkedService/AzureKeyVault.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureKeyVault", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureKeyVault", 6 | "typeProperties": { 7 | "baseUrl": "https://nameofyourkeyvault.vault.azure.net/" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /datafactory/pipeline/Apply ML model pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Apply ML model pipeline", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Apply ML model", 7 | "type": "DatabricksNotebook", 8 | "policy": { 9 | "timeout": "0.01:00:00", 10 | "retry": 0, 11 | "retryIntervalInSeconds": 30, 12 | "secureOutput": false, 13 | "secureInput": false 14 | }, 15 | "typeProperties": { 16 | "notebookPath": "/devops-deployed/bikesharing-apply-model" 
17 |                 },
18 |                 "linkedServiceName": {
19 |                     "referenceName": "AzureDatabricks",
20 |                     "type": "LinkedServiceReference"
21 |                 }
22 |             }
23 |         ]
24 |     },
25 |     "type": "Microsoft.DataFactory/factories/pipelines"
26 | }
27 | 
-------------------------------------------------------------------------------- /java-library/pom.xml: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 | 
7 |     <groupId>com.cloudarchitected.databricks-devops-tutorial</groupId>
8 |     <artifactId>model-factory</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 | 
11 | 
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.apache.spark</groupId>
15 |             <artifactId>spark-core_2.11</artifactId>
16 |             <version>2.4.0</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.spark</groupId>
20 |             <artifactId>spark-mllib_2.11</artifactId>
21 |             <version>2.4.0</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>org.apache.spark</groupId>
25 |             <artifactId>spark-sql_2.11</artifactId>
26 |             <version>2.4.0</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>junit</groupId>
30 |             <artifactId>junit</artifactId>
31 |             <version>4.12</version>
32 |             <scope>test</scope>
33 |         </dependency>
34 |     </dependencies>
35 | 
36 |     <properties>
37 |         <maven.compiler.source>1.8</maven.compiler.source>
38 |         <maven.compiler.target>1.8</maven.compiler.target>
39 |     </properties>
40 | </project>
-------------------------------------------------------------------------------- /java-library/src/main/java/com/cloudarchitected/databricks_devops_tutorial/model/BikeShareModelFactory.java: --------------------------------------------------------------------------------
1 | package com.cloudarchitected.databricks_devops_tutorial.model;
2 | 
3 | import org.apache.spark.ml.Pipeline;
4 | import org.apache.spark.ml.PipelineModel;
5 | import org.apache.spark.ml.PipelineStage;
6 | import org.apache.spark.ml.evaluation.RegressionEvaluator;
7 | import org.apache.spark.ml.feature.VectorAssembler;
8 | import org.apache.spark.ml.regression.LinearRegression;
9 | import org.apache.spark.sql.DataFrameNaFunctions;
10 | import org.apache.spark.sql.Dataset;
11 | import org.apache.spark.sql.Row;
12 | 
13 | public class BikeShareModelFactory {
14 | 
15 | 
16 |     private int maxIterations = 10;
17 | 
18 |     public void setMaxIterations(int maxIterations) {
19 |         this.maxIterations = maxIterations;
20 |     }
21 | 
22 |     public PipelineModel buildModel(Dataset<Row> training) {
23 | 
24 |         VectorAssembler assembler = new VectorAssembler()
25 |                 .setInputCols(new String[]{"season","hr","holiday","weekday","workingday","weathersit","temp","atemp","hum","windspeed"})
26 |                 .setOutputCol("features");
27 | 
28 |         LinearRegression lr = new LinearRegression()
29 |                 .setLabelCol("cnt")
30 |                 .setMaxIter(maxIterations)
31 |                 .setRegParam(0.3)
32 |                 .setElasticNetParam(0.8);
33 | 
34 |         Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{assembler, lr});
35 |         PipelineModel fitted = pipeline.fit(training);
36 | 
37 |         return fitted;
38 | 
39 | 
40 |     }
41 | }
42 | 
-------------------------------------------------------------------------------- /java-library/src/test/java/com/cloudarchitected/databricks_devops_tutorial/model/BikeShareModelFactoryTest.java: --------------------------------------------------------------------------------
1 | package com.cloudarchitected.databricks_devops_tutorial.model;
2 | 
3 | import org.apache.spark.ml.PipelineModel;
4 | import org.apache.spark.ml.PipelineStage;
5 | import org.apache.spark.ml.linalg.Vectors;
6 | import org.apache.spark.ml.regression.LinearRegressionModel;
7 | import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
8 | import org.apache.spark.sql.Dataset;
9 | import org.apache.spark.sql.Row;
10 | import org.apache.spark.sql.SparkSession;
11 | import org.junit.Test;
12 | 
13 | import java.io.IOException;
14 | import java.nio.file.Files;
15 | import java.nio.file.Path;
16 | import java.nio.file.StandardCopyOption;
17 | 
18 | import static org.junit.Assert.assertEquals;
19 | 
20 | 
21 | public class BikeShareModelFactoryTest {
22 | 
23 |     @Test
24 |     public void modelBuilding() throws IOException {
25 |         //Class under test
26 |         BikeShareModelFactory tester = new BikeShareModelFactory();
27 |         tester.setMaxIterations(1);
28 | 
29 |         SparkSession spark = SparkSession
30 |                 .builder()
31 |                 .appName("Java Spark SQL Example")
32 |                 .config("spark.master", "local")
33 |                 .config("spark.driver.bindAddress", "127.0.0.1")
34 |                 .getOrCreate();
35 | 
36 |         Path tmpFile = Files.createTempFile("bike_data", ".csv");
37 |         // This tells JVM to delete the file on JVM exit.
38 |         tmpFile.toFile().deleteOnExit();
39 |         // writing sample data
40 |         Files.copy(
41 |                 getClass().getClassLoader().getResourceAsStream("hour.csv"),
42 |                 tmpFile,
43 |                 StandardCopyOption.REPLACE_EXISTING);
44 | 
45 | 
46 |         // Load training data.
47 |         Dataset<Row> training = spark.read().option("header", true).option("inferSchema", true).csv(tmpFile.toString());
48 |         PipelineModel pipeline = tester.buildModel(training);
49 | 
50 |         PipelineStage[] stages = pipeline.stages();
51 |         LinearRegressionModel lrModel = (LinearRegressionModel) stages[stages.length - 1];
52 | 
53 |         // Print the coefficients and intercept for linear regression.
54 |         System.out.println("Coefficients: "
55 |                 + lrModel.coefficients() + " Intercept: " + lrModel.intercept());
56 | 
57 |         // Summarize the model over the training set and print out some metrics.
58 |         LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
59 |         System.out.println("numIterations: " + trainingSummary.totalIterations());
60 |         System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
61 |         trainingSummary.residuals().show();
62 |         System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
63 |         System.out.println("r2: " + trainingSummary.r2());
64 | 
65 |         // assert statements
66 |         assertEquals(10, trainingSummary.rootMeanSquaredError(), 10);
67 |     }
68 | }
69 | 
-------------------------------------------------------------------------------- /java-library/src/test/resources/hour.csv: --------------------------------------------------------------------------------
1 | instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2 | 1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0,3,13,16
3 | 2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0,8,32,40
4 | 3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0,5,27,32
5 | 4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0,3,10,13
6 | 5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0,0,1,1
7 | 6,2011-01-01,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,0,1,1
8 | 7,2011-01-01,1,0,1,6,0,6,0,1,0.22,0.2727,0.8,0,2,0,2
9 | 8,2011-01-01,1,0,1,7,0,6,0,1,0.2,0.2576,0.86,0,1,2,3
10 | 9,2011-01-01,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0,1,7,8
11 | 10,2011-01-01,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0,8,6,14
12 | 
13 | 
-------------------------------------------------------------------------------- /notebooks/bikesharing-apply-model.scala: --------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | dbutils.widgets.text("dataset", "/bikesharing-tutorial/holdout.parquet", "Input dataset")
3 | dbutils.widgets.text("model", "/bikesharing-tutorial/bikesharing.model", "Model")
4 | // Use e.g. 
abfss://bikeshare@algattikerdevopsstore.dfs.core.windows.net/predictions/scratch 5 | dbutils.widgets.text("output", "/bikesharing-tutorial/predictions", "Output") 6 | 7 | // COMMAND ---------- 8 | 9 | val outputDir = dbutils.widgets.get("output") 10 | spark.conf.set("fs.azure.account.key", dbutils.secrets.get(scope = "bikeshare", key = "storagekey")) 11 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true") 12 | dbutils.fs.mkdirs(outputDir) 13 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "false") 14 | 15 | 16 | // COMMAND ---------- 17 | 18 | import spark.implicits._ 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.sql.functions._ 21 | 22 | import org.apache.spark.ml.PipelineModel 23 | 24 | val model = PipelineModel.load(dbutils.widgets.get("model")) 25 | 26 | val testData = spark.read.parquet(dbutils.widgets.get("dataset")) 27 | 28 | val predictions = model.transform(testData) 29 | 30 | 31 | 32 | predictions.createOrReplaceGlobalTempView("bikesharing_predictions") 33 | 34 | display(predictions) 35 | 36 | // COMMAND ---------- 37 | 38 | predictions.write.mode("overwrite").format("json").save(outputDir) 39 | 40 | // COMMAND ---------- 41 | 42 | dbutils.notebook.exit("SUCCESS") -------------------------------------------------------------------------------- /notebooks/bikesharing-inttest.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // Use e.g. abfss://bikeshare@algattikerdevopsstore.dfs.core.windows.net/predictions/scratch 3 | dbutils.widgets.text("output", "/bikesharing-tutorial/inttest-predictions", "Output") 4 | 5 | // COMMAND ---------- 6 | 7 | spark.conf.set("fs.azure.account.key", dbutils.secrets.get(scope = "bikeshare", key = "storagekey")) 8 | 9 | 10 | // COMMAND ---------- 11 | 12 | val test_dir = "/bikesharing-tutorial/integration-test/" 13 | dbutils.fs.mkdirs(test_dir) 14 | 15 | // COMMAND ---------- 16 | 17 | import spark.implicits._ 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.sql.functions._ 20 | 21 | // Load training data. 22 | val schema = StructType(Seq( 23 | StructField("instant",IntegerType), 24 | StructField("dteday",TimestampType), 25 | StructField("season",IntegerType), 26 | StructField("yr",IntegerType), 27 | StructField("mnth",IntegerType), 28 | StructField("hr",IntegerType), 29 | StructField("holiday",IntegerType), 30 | StructField("weekday",IntegerType), 31 | StructField("workingday",IntegerType), 32 | StructField("weathersit",IntegerType), 33 | StructField("temp",DoubleType), 34 | StructField("atemp",DoubleType), 35 | StructField("hum",DoubleType), 36 | StructField("windspeed",DoubleType), 37 | StructField("casual",IntegerType), 38 | StructField("registered",IntegerType), 39 | StructField("cnt",IntegerType) 40 | )) 41 | 42 | // Load training data. 
43 | val dataset = spark.read 44 | .schema(schema) 45 | .option("header", true) 46 | .csv("/databricks-datasets/bikeSharing/data-001/hour.csv") 47 | .cache 48 | 49 | val lines = dataset.withColumn("line", monotonically_increasing_id).cache 50 | 51 | lines.where('line.between( 0, 9)).write.mode("overwrite").save(test_dir + "train.parquet") 52 | lines.where('line.between(10,19)).write.mode("overwrite").save(test_dir + "test.parquet") 53 | lines.where('line.between(20,29)).write.mode("overwrite").save(test_dir + "holdout.parquet") 54 | 55 | // COMMAND ---------- 56 | 57 | dbutils.notebook.run("bikesharing-train-model", /* timeout_seconds = */ 120, Map( 58 | "train" -> (test_dir + "train.parquet"), 59 | "test" -> (test_dir + "test.parquet"), 60 | "model" -> (test_dir + "bikesharing.model") 61 | ) 62 | ) 63 | 64 | // COMMAND ---------- 65 | 66 | val predictionsDir = dbutils.widgets.get("output") 67 | dbutils.notebook.run("bikesharing-apply-model", /* timeout_seconds = */ 120, Map( 68 | "model" -> (test_dir + "bikesharing.model"), 69 | "dataset" -> (test_dir + "holdout.parquet"), 70 | "output" -> predictionsDir 71 | ) 72 | ) 73 | 74 | // COMMAND ---------- 75 | 76 | import org.apache.spark.ml.evaluation.RegressionEvaluator 77 | 78 | val predictions = spark.read.json(predictionsDir) 79 | 80 | val evaluator = new RegressionEvaluator() 81 | .setMetricName("rmse") 82 | .setLabelCol("cnt") 83 | .setPredictionCol("prediction") 84 | val rmse = evaluator.evaluate(predictions) 85 | 86 | println("RMSE on test data = " + rmse) 87 | 88 | assert(predictions.count == 10) 89 | assert(rmse < 100) 90 | 91 | // COMMAND ---------- 92 | 93 | -------------------------------------------------------------------------------- /notebooks/bikesharing-prep.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import spark.implicits._ 3 | import org.apache.spark.sql.types._ 4 | import org.apache.spark.sql.functions._ 5 | 6 | val schema = StructType(Seq( 7 | StructField("instant",IntegerType), 8 | StructField("dteday",TimestampType), 9 | StructField("season",IntegerType), 10 | StructField("yr",IntegerType), 11 | StructField("mnth",IntegerType), 12 | StructField("hr",IntegerType), 13 | StructField("holiday",IntegerType), 14 | StructField("weekday",IntegerType), 15 | StructField("workingday",IntegerType), 16 | StructField("weathersit",IntegerType), 17 | StructField("temp",DoubleType), 18 | StructField("atemp",DoubleType), 19 | StructField("hum",DoubleType), 20 | StructField("windspeed",DoubleType), 21 | StructField("casual",IntegerType), 22 | StructField("registered",IntegerType), 23 | StructField("cnt",IntegerType) 24 | )) 25 | 26 | // Load training data. 
27 | val dataset = spark.read
28 |   .schema(schema)
29 |   .option("header", true)
30 |   .csv("/databricks-datasets/bikeSharing/data-001/hour.csv")
31 |   .cache
32 | 
33 | // Split into training, test and holdout set
34 | val splitsRDD = dataset.rdd.randomSplit(Array(0.8, 0.1, 0.1), 0)
35 | val splits = splitsRDD.map(spark.createDataFrame(_, dataset.schema))
36 | 
37 | // Save the train and test sets for model training, and the holdout set for future prediction
38 | splits(0).write.mode("overwrite").save("/bikesharing-tutorial/train.parquet")
39 | splits(1).write.mode("overwrite").save("/bikesharing-tutorial/test.parquet")
40 | splits(2).write.mode("overwrite").save("/bikesharing-tutorial/holdout.parquet")
41 | 
42 | // COMMAND ----------
43 | 
44 | dataset.schema
45 | 
46 | // COMMAND ----------
47 | 
48 | 
-------------------------------------------------------------------------------- /notebooks/bikesharing-train-model.scala: --------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | dbutils.widgets.text("train", "/bikesharing-tutorial/train.parquet", "Train dataset")
3 | dbutils.widgets.text("test", "/bikesharing-tutorial/test.parquet", "Test dataset")
4 | dbutils.widgets.text("model", "/bikesharing-tutorial/bikesharing.model", "Output model")
5 | 
6 | // COMMAND ----------
7 | 
8 | import spark.implicits._
9 | import org.apache.spark.sql.types._
10 | import org.apache.spark.sql.functions._
11 | 
12 | // Load training data.
13 | val dataset = spark.read.parquet(dbutils.widgets.get("train"))
14 | 
15 | val modelFactory = new com.cloudarchitected.databricks_devops_tutorial.model.BikeShareModelFactory
16 | val pipeline = modelFactory.buildModel(dataset)
17 | 
18 | // COMMAND ----------
19 | 
20 | import org.apache.spark.ml.evaluation.RegressionEvaluator
21 | val test = spark.read.parquet(dbutils.widgets.get("test"))
22 | val predictions = pipeline.transform(test)
23 | val evaluator = new RegressionEvaluator()
24 |   .setMetricName("rmse")
25 |   .setLabelCol("cnt")
26 |   .setPredictionCol("prediction")
27 | val rmse = evaluator.evaluate(predictions)
28 | 
29 | println("RMSE on test data = " + rmse)
30 | 
31 | pipeline.write.overwrite().save(dbutils.widgets.get("model"))
32 | 
33 | // COMMAND ----------
34 | 
35 | display(predictions)
36 | 
37 | // COMMAND ----------
38 | 
39 | dbutils.notebook.exit("SUCCESS")
--------------------------------------------------------------------------------