├── .docs ├── adf_pipeline.png ├── arch1_admin.png ├── arch2_infra.png ├── arch0_pipeline.png └── arch3_workspace.png ├── terraform ├── tests │ ├── test.tfvars │ ├── README.md │ ├── destroy-test.sh │ ├── databricks │ │ └── cluster-policy │ │ │ └── policy.json │ ├── run-test.sh │ └── azure │ │ ├── service-principal │ │ └── main.tf │ │ ├── resource-group │ │ └── main.tf │ │ ├── azure-devops-pipeline │ │ └── main.tf │ │ ├── key-vault │ │ └── main.tf │ │ └── data-factory │ │ └── main.tf ├── modules │ ├── azure │ │ ├── data-factory │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── variables.tf │ │ │ └── README.md │ │ ├── key-vault │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── README.md │ │ │ ├── variables.tf │ │ │ └── main.tf │ │ ├── databricks-vnet │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ └── README.md │ │ ├── resource-group │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── README.md │ │ ├── storage-account │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── README.md │ │ │ ├── main.tf │ │ │ └── variables.tf │ │ ├── service-principal │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── README.md │ │ │ └── main.tf │ │ ├── databricks-workspace │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── main.tf │ │ │ └── README.md │ │ ├── azure-devops-pipeline │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── variables.tf │ │ │ └── README.md │ │ └── azure-devops-project │ │ │ ├── versions.tf │ │ │ ├── outputs.tf │ │ │ ├── variables.tf │ │ │ ├── README.md │ │ │ └── main.tf │ └── databricks │ │ ├── cluster-policy │ │ ├── versions.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ ├── README.md │ │ └── main.tf │ │ └── azure-groups-sync │ │ ├── versions.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── README.md └── deployments │ ├── README.md │ ├── workspace-bootstrap │ ├── policies.tf │ ├── mounts.tf │ ├── secrets.tf │ ├── notebooks.tf │ ├── principals.tf │ └── main.tf │ ├── test.tfvars │ ├── azure-infrastructure │ ├── main.tf │ ├── databricks-workspace.tf │ └── variables.tf │ └── destroy-deployment.sh ├── .gitignore ├── pipelines ├── templates │ ├── configure-python.yml │ ├── cluster-policy-single-node.json │ ├── deploy-notebooks.yml │ ├── set-databricks-permission.yml │ ├── run-notebook-job.yml │ ├── deploy-instance-pool.yml │ ├── get-workspace-login.yml │ └── terraform-azure.yml └── vars.yml ├── scripts ├── get_arm_output.ps1 ├── add_key_vault_policy.sh ├── add_secret_to_key_vault.sh ├── create_adls_filesystem.sh ├── add_secret_to_secret_scope.sh ├── get_data_factory_identity.ps1 ├── get_workspace_object_id.sh ├── create_secret_scope.sh ├── get_access_token.sh ├── create_secret_scope_acl.sh ├── get_instance_pool.sh ├── add_role_assignment.sh ├── add_workspace_permission.sh ├── run_submit_notebook.py ├── wait_for_job_run.py ├── add_api_permission.sh ├── get_workspace_url.sh ├── create_service_principal.sh ├── create_instance_pool.py ├── get_object_details.sh ├── create_cluster_policy.sh ├── create_cluster.py ├── azdo_extension.sh ├── azdo_project.sh └── sync_group.sh ├── notebooks ├── pipeline │ ├── 01-create-database.sql │ ├── 02-source-to-bronze.py │ └── 03-bronze-to-silver.py └── shared │ ├── print-current-user.py │ └── mount-adls-gen-2.py ├── arm ├── azure-data-factory-with-key-vault.json ├── azure-data-lake-gen-2.json └── databricks-workspace-with-vnet-injection.json └── admin ├── setup-with-terraform.sh └── vars.sh 
/.docs/adf_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandruanghel/azdo-databricks/HEAD/.docs/adf_pipeline.png -------------------------------------------------------------------------------- /.docs/arch1_admin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandruanghel/azdo-databricks/HEAD/.docs/arch1_admin.png -------------------------------------------------------------------------------- /.docs/arch2_infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandruanghel/azdo-databricks/HEAD/.docs/arch2_infra.png -------------------------------------------------------------------------------- /.docs/arch0_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandruanghel/azdo-databricks/HEAD/.docs/arch0_pipeline.png -------------------------------------------------------------------------------- /.docs/arch3_workspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandruanghel/azdo-databricks/HEAD/.docs/arch3_workspace.png -------------------------------------------------------------------------------- /terraform/tests/test.tfvars: -------------------------------------------------------------------------------- 1 | #resource_group_name = "tftest-rg" 2 | #databricks_workspace_name = "tftest-defaults" 3 | -------------------------------------------------------------------------------- /terraform/modules/azure/data-factory/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /terraform/modules/azure/key-vault/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-vnet/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /terraform/modules/azure/resource-group/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /terraform/modules/azure/storage-account/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | } 
10 | } 11 | -------------------------------------------------------------------------------- /terraform/modules/databricks/cluster-policy/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | databricks = { 6 | source = "databricks/databricks" 7 | version = ">= 1.24.1" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform 2 | **/.terraform/* 3 | **/.terraform.lock.hcl 4 | # .tfstate files 5 | **/*.tfstate 6 | **/*.tfstate.* 7 | # Crash log files 8 | **/crash.log 9 | 10 | # Ignore Mac .DS_Store files 11 | .DS_Store 12 | 13 | # Ignored vscode files 14 | .vscode/ 15 | 16 | # Ignore idea 17 | .idea/ 18 | -------------------------------------------------------------------------------- /terraform/modules/azure/storage-account/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Storage Account." 3 | value = azurerm_storage_account.this.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Storage Account." 8 | value = azurerm_storage_account.this.name 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/azure/service-principal/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azuread = { 6 | source = "hashicorp/azuread" 7 | version = ">= 2.41" 8 | } 9 | random = { 10 | source = "hashicorp/random" 11 | version = ">= 3" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-workspace/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | random = { 10 | source = "hashicorp/random" 11 | version = ">= 3" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-pipeline/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azuredevops = { 6 | source = "microsoft/azuredevops" 7 | version = ">= 0.9" 8 | } 9 | random = { 10 | source = "hashicorp/random" 11 | version = ">= 3" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/databricks/azure-groups-sync/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 1.5.7" 3 | 4 | required_providers { 5 | azuread = { 6 | source = "hashicorp/azuread" 7 | version = ">= 2.41" 8 | } 9 | databricks = { 10 | source = "databricks/databricks" 11 | version = ">= 1.24.1" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-project/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "~> 
1.5.7" 3 | 4 | required_providers { 5 | azurerm = { 6 | source = "hashicorp/azurerm" 7 | version = ">= 3.72" 8 | } 9 | azuredevops = { 10 | source = "microsoft/azuredevops" 11 | version = ">= 0.9" 12 | } 13 | random = { 14 | source = "hashicorp/random" 15 | version = ">= 3" 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /terraform/modules/azure/resource-group/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Resource Group." 3 | value = azurerm_resource_group.this.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Resource Group." 8 | value = azurerm_resource_group.this.name 9 | } 10 | 11 | output "location" { 12 | description = "The location of the Resource Group." 13 | value = azurerm_resource_group.this.location 14 | } 15 | -------------------------------------------------------------------------------- /pipelines/templates/configure-python.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that prepares the Python environment on the current Agent. 3 | # 4 | 5 | steps: 6 | - task: UsePythonVersion@0 7 | displayName: 'Use Python 3.x' 8 | inputs: 9 | versionSpec: '3.x' 10 | addToPath: true 11 | architecture: 'x64' 12 | 13 | - task: Bash@3 14 | displayName: 'Pip install dependencies' 15 | inputs: 16 | targetType: 'inline' 17 | script: | 18 | _pip="$(command -v pip || command -v pip3)" 19 | ${_pip} install requests 20 | -------------------------------------------------------------------------------- /terraform/modules/databricks/cluster-policy/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the cluster policy in the Databricks workspace." 3 | value = databricks_cluster_policy.this.id 4 | } 5 | 6 | output "details" { 7 | description = "Details about the cluster policy." 8 | value = databricks_cluster_policy.this 9 | } 10 | 11 | output "permissions" { 12 | description = "List with the cluster policy permissions." 13 | value = length(databricks_permissions.policy) > 0 ? databricks_permissions.policy[0] : null 14 | } 15 | -------------------------------------------------------------------------------- /terraform/tests/README.md: -------------------------------------------------------------------------------- 1 | # terraform-tests 2 | 3 | ----------- 4 | 5 | Tests for the Terraform modules. Each subfolder corresponds directly to a `modules` subfolder. 6 | 7 | This doesn't do automatic assertions yet, it simply makes sure the Terraform module can execute with all parameters. 8 | 9 | ## Usage 10 | 11 | To run a test: `./run-test.sh [subfolder]` 12 | 13 | ``` 14 | ./run-test.sh azure/databricks-workspace 15 | ``` 16 | 17 | To destroy a test: `./destroy-test.sh [subfolder]` 18 | 19 | ``` 20 | ./destroy-test.sh azure/databricks-workspace 21 | ``` 22 | -------------------------------------------------------------------------------- /terraform/modules/azure/key-vault/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Azure Key Vault." 3 | value = azurerm_key_vault.this.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Azure Key Vault." 8 | value = azurerm_key_vault.this.name 9 | } 10 | 11 | output "uri" { 12 | description = "The URI of the Azure Key Vault." 
13 | value = azurerm_key_vault.this.vault_uri 14 | } 15 | 16 | output "policy" { 17 | description = "The Azure Key Vault policy ID." 18 | value = azurerm_key_vault_access_policy.creator.id 19 | } 20 | -------------------------------------------------------------------------------- /terraform/modules/azure/service-principal/outputs.tf: -------------------------------------------------------------------------------- 1 | output "object_id" { 2 | description = "The AD Object ID of the Service Principal." 3 | value = azuread_service_principal.sp.object_id 4 | } 5 | 6 | output "application_id" { 7 | description = "The Application ID (Client ID) of the Service Principal." 8 | value = azuread_service_principal.sp.application_id 9 | } 10 | 11 | output "secret" { 12 | description = "The Password / Secret (Client Secret) of the Service Principal." 13 | value = azuread_application_password.sp.value 14 | sensitive = true 15 | } 16 | -------------------------------------------------------------------------------- /terraform/tests/destroy-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Destroys a test by running terraform destroy. 4 | # 5 | 6 | # Debug 7 | #set -x 8 | #export TF_LOG="DEBUG" 9 | 10 | # Local variables 11 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 12 | _realpath="$(command -v realpath || echo _realpath)" 13 | _this_script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 14 | _scripts_dir=${_this_script_dir}/../../scripts 15 | 16 | # Run test 17 | source "${_scripts_dir}/terraform_azure.sh" destroy "$@" -var-file="${_this_script_dir}/test.tfvars" -auto-approve 18 | echo 19 | -------------------------------------------------------------------------------- /terraform/deployments/README.md: -------------------------------------------------------------------------------- 1 | # terraform-deployments 2 | 3 | ----------- 4 | 5 | These are larger Terraform deployments that build a specific infrastructure: 6 | 7 | - `azure-infrastructure`: Azure infrastructure for the data pipeline and project 8 | - `workspace-bootstrap`: Databricks workspace bootstrap 9 | 10 | ## Usage 11 | 12 | To run a deployment: `./run-deployment.sh [subfolder]` 13 | 14 | ``` 15 | ./run-deployment.sh azure-infrastructure 16 | ``` 17 | 18 | To destroy a deployment: `./destroy-deployment.sh [subfolder]` 19 | 20 | ``` 21 | ./destroy-deployment.sh azure-infrastructure 22 | ``` 23 | -------------------------------------------------------------------------------- /scripts/get_arm_output.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Extracts a variable from the json ARM output. 3 | It uses simple positional arguments. 4 | Returns the variable value in the Azure Pipelines format (using the same variable name). 
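Example (illustrative; assumes the ARM deployment outputs were captured in a pipeline variable named armOutputs): .\get_arm_output.ps1 "$(armOutputs)" storageAccountName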
5 | #> 6 | Param( 7 | [Parameter(Mandatory = $True, Position = 1)] [string] $armOutputJson, 8 | [Parameter(Mandatory = $True, Position = 2)] [string] $varName 9 | ) 10 | 11 | $armOutput = ConvertFrom-Json "$armOutputJson" 12 | $value = $armOutput.$varName.value 13 | 14 | If ( [string]::IsNullOrEmpty($value)) 15 | { 16 | throw "Variable value is NULL or EMPTY: " + $armOutputJson 17 | exit 1 18 | } 19 | 20 | Write-Host "##vso[task.setvariable variable=$varName;]$value" 21 | -------------------------------------------------------------------------------- /terraform/modules/databricks/azure-groups-sync/outputs.tf: -------------------------------------------------------------------------------- 1 | output "databricks_users" { 2 | description = "The details of the Databricks users." 3 | value = databricks_user.users 4 | } 5 | 6 | output "databricks_service_principals" { 7 | description = "The details of the Databricks service principals." 8 | value = databricks_service_principal.sps 9 | } 10 | 11 | output "databricks_groups" { 12 | description = "The details of the Databricks groups." 13 | value = databricks_group.groups 14 | } 15 | 16 | output "databricks_groups_membership" { 17 | description = "The Databricks IDs for the groups and their members." 18 | value = databricks_group_member.all 19 | } 20 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-project/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Azure DevOps project." 3 | value = azuredevops_project.this.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Azure DevOps project." 8 | value = azuredevops_project.this.name 9 | } 10 | 11 | output "service_endpoints" { 12 | description = "The IDs of the service endpoints." 13 | sensitive = true 14 | value = { 15 | for endpoint in try(coalescelist(flatten([ 16 | azuredevops_serviceendpoint_github.endpoints, 17 | azuredevops_serviceendpoint_azurerm.endpoints 18 | ])), []) : 19 | endpoint.service_endpoint_name => endpoint.id 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /terraform/modules/azure/data-factory/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Azure Data Factory." 3 | value = azurerm_data_factory.this.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Azure Data Factory." 8 | value = azurerm_data_factory.this.name 9 | } 10 | 11 | output "principal_id" { 12 | description = "The ID of the Azure Data Factory Managed Identity in Azure Active Directory." 13 | value = azurerm_data_factory.this.identity[0].principal_id 14 | } 15 | 16 | output "key_vault_linked_services" { 17 | description = "Details of the Azure Data Factory linked Key Vault services." 18 | value = azurerm_data_factory_linked_service_key_vault.key_vaults[*] 19 | } 20 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-pipeline/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The ID of the Azure DevOps pipeline definition." 3 | value = azuredevops_build_definition.pipeline.id 4 | } 5 | 6 | output "name" { 7 | description = "The name of the Azure DevOps pipeline definition." 
8 | value = azuredevops_build_definition.pipeline.name 9 | } 10 | 11 | output "path" { 12 | description = "The path of the Azure DevOps pipeline definition in the repository." 13 | value = join("/", [var.github_repo_url, var.pipeline_path]) 14 | } 15 | 16 | output "revision" { 17 | description = "The revision of the Azure DevOps pipeline definition." 18 | value = azuredevops_build_definition.pipeline.revision 19 | } 20 | -------------------------------------------------------------------------------- /notebooks/pipeline/01-create-database.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC # Create Database 4 | 5 | -- COMMAND ---------- 6 | 7 | -- MAGIC %md 8 | -- MAGIC #### Set widgets 9 | 10 | -- COMMAND ---------- 11 | 12 | CREATE WIDGET TEXT DATABASE_NAME DEFAULT "default"; 13 | CREATE WIDGET TEXT DATABASE_LOCATION DEFAULT "dbfs:/user/hive/warehouse" 14 | 15 | -- COMMAND ---------- 16 | 17 | -- MAGIC %md 18 | -- MAGIC #### Create database 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE DATABASE IF NOT EXISTS $DATABASE_NAME LOCATION '$DATABASE_LOCATION' 23 | 24 | -- COMMAND ---------- 25 | 26 | -- MAGIC %md 27 | -- MAGIC #### Verify database 28 | 29 | -- COMMAND ---------- 30 | 31 | DESCRIBE DATABASE EXTENDED $DATABASE_NAME 32 | -------------------------------------------------------------------------------- /terraform/modules/azure/service-principal/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" { 2 | type = string 3 | description = "The display name for the App Registration." 4 | } 5 | 6 | variable "owners" { 7 | type = list(string) 8 | description = "(Optional) A list of Azure AD Object IDs that will be granted ownership of the application." 9 | default = [] 10 | } 11 | 12 | variable "api_permissions" { 13 | type = list(string) 14 | description = "(Optional) A list of API Permissions that should be assigned to this App (with admin consent)." 15 | default = [] 16 | } 17 | 18 | variable "secret_expiration" { 19 | type = string 20 | description = "(Optional) A relative duration for which the password is valid. Default is 8760h (1 year)." 21 | default = "8760h" 22 | } 23 | -------------------------------------------------------------------------------- /terraform/tests/databricks/cluster-policy/policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_type": { 3 | "type": "fixed", 4 | "value": "all-purpose" 5 | }, 6 | "spark_conf.spark.databricks.cluster.profile": { 7 | "type": "fixed", 8 | "value": "singleNode", 9 | "hidden": true 10 | }, 11 | "num_workers": { 12 | "type": "fixed", 13 | "value": 0, 14 | "hidden": true 15 | }, 16 | "autotermination_minutes": { 17 | "type": "fixed", 18 | "value": 60, 19 | "hidden": true 20 | }, 21 | "custom_tags.PolicyName": { 22 | "type": "fixed", 23 | "value": "Single Node", 24 | "hidden": true 25 | }, 26 | "spark_version": { 27 | "type": "regex", 28 | "pattern": "13.3.x-([cg]pu-ml-)?scala2.12" 29 | }, 30 | "docker_image.url": { 31 | "type": "forbidden", 32 | "hidden": true 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-pipeline/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure DevOps pipeline (hosted on GitHub) with optional variables. 
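 * For example (illustrative values), a github_repo_url of "https://github.com/example-org/example-repo" yields repo_id = "example-org/example-repo" via the regex in the repository block below.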
3 | */ 4 | resource "azuredevops_build_definition" "pipeline" { 5 | project_id = var.project_id 6 | name = var.pipeline_name 7 | 8 | ci_trigger { 9 | use_yaml = true 10 | } 11 | 12 | repository { 13 | repo_type = "GitHub" 14 | repo_id = regex("^https://[w.]*github.com/(?P<repo_id>[^/?#]+/[^/?#]+)", var.github_repo_url)["repo_id"] 15 | branch_name = var.github_branch 16 | yml_path = var.pipeline_path 17 | service_connection_id = var.github_endpoint_id 18 | } 19 | 20 | dynamic "variable" { 21 | for_each = var.pipeline_variables 22 | content { 23 | name = variable.key 24 | value = variable.value 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /terraform/tests/run-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Tests a Terraform module by running Terraform on a test folder 4 | # 5 | 6 | # Debug 7 | #set -x 8 | #export TF_LOG="DEBUG" 9 | 10 | # Local variables 11 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 12 | _realpath="$(command -v realpath || echo _realpath )" 13 | _this_script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 14 | _scripts_dir=${_this_script_dir}/../../scripts 15 | 16 | # Run Terraform to build any test dependencies 17 | source "${_scripts_dir}/terraform_azure.sh" apply "$@" -var-file="${_this_script_dir}/test.tfvars" -auto-approve -target=null_resource.test_dependencies || exit 1 18 | echo 19 | 20 | # Run test 21 | cd "${_this_script_dir}" || exit 1 22 | source "${_scripts_dir}/terraform_azure.sh" apply "$@" -var-file="${_this_script_dir}/test.tfvars" -auto-approve 23 | echo 24 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/policies.tf: -------------------------------------------------------------------------------- 1 | ### Databricks Cluster Policies 2 | 3 | # Deploy the Single Node Cluster Policy 4 | # Give CAN_USE on the Single Node Cluster Policy to the Project group 5 | module "databricks_policy_single_node" { 6 | source = "../../modules/databricks/cluster-policy" 7 | policy_name = "Single Node Cluster" 8 | CAN_USE = [{ principal = var.PROJECT_GROUP_NAME, type = "group" }] 9 | policy_overrides_file = var.DATABRICKS_CLUSTER_POLICY_LOCATION 10 | depends_on = [module.project_group_sync] 11 | } 12 | 13 | # Terraform output 14 | output "databricks_policies" { 15 | value = { 16 | single_node = { 17 | id = module.databricks_policy_single_node.id 18 | details = module.databricks_policy_single_node.details 19 | permissions = module.databricks_policy_single_node.permissions 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /terraform/modules/databricks/azure-groups-sync/variables.tf: -------------------------------------------------------------------------------- 1 | variable "groups" { 2 | type = list(string) 3 | description = "The list of groups to be synced." 4 | } 5 | 6 | variable "workspace_access" { 7 | type = list(string) 8 | description = "(Optional) A list of groups that should have access to Databricks Workspace." 9 | default = [] 10 | } 11 | 12 | variable "databricks_sql_access" { 13 | type = list(string) 14 | description = "(Optional) A list of groups that should have access to Databricks SQL." 
15 | default = [] 16 | } 17 | 18 | variable "allow_cluster_create" { 19 | type = list(string) 20 | description = "(Optional) A list of groups that should have cluster create privileges." 21 | default = [] 22 | } 23 | 24 | variable "allow_instance_pool_create" { 25 | type = list(string) 26 | description = "(Optional) A list of groups that should have instance pool create privileges." 27 | default = [] 28 | } 29 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-project/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project_name" { 2 | type = string 3 | description = "The name of the Azure DevOps project." 4 | } 5 | 6 | variable "github_endpoints" { 7 | type = list(string) 8 | description = "(Optional) A list of GitHub endpoints to be created." 9 | default = [] 10 | } 11 | 12 | variable "github_pat" { 13 | type = string 14 | sensitive = true 15 | description = "(Optional) The GitHub Personal Access Token. If not set, it will use the AZDO_GITHUB_SERVICE_CONNECTION_PAT environment variable." 16 | default = null 17 | } 18 | 19 | variable "arm_endpoints" { 20 | type = list(object({ 21 | name = string 22 | client_id = string 23 | client_secret = string 24 | }) 25 | ) 26 | sensitive = true 27 | description = "(Optional) A list of ARM endpoints to be created. These must have a Name, a Service Principal Client ID and a Client Secret." 28 | default = [] 29 | } 30 | -------------------------------------------------------------------------------- /terraform/modules/azure/resource-group/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group." 4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "Azure location for the Resource Group." 9 | } 10 | 11 | variable "owners" { 12 | type = list(string) 13 | description = "(Optional) A list of Object IDs that should have the Owner role over the Resource Group." 14 | default = [] 15 | } 16 | 17 | variable "contributors" { 18 | type = list(string) 19 | description = "(Optional) A list of Object IDs that should have the Contributor role over the Resource Group." 20 | default = [] 21 | } 22 | 23 | variable "readers" { 24 | type = list(string) 25 | description = "(Optional) A list of Object IDs that should have the Reader role over the Resource Group." 26 | default = [] 27 | } 28 | 29 | variable "tags" { 30 | type = map(string) 31 | description = "(Optional) A mapping of tags to assign to the Resource Group." 32 | default = {} 33 | } 34 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-project/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure DevOps Project with optional service endpoints (AzureRM or GitHub). 
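A minimal, illustrative call of this module (the source path, names and IDs are placeholders; `github_pat` falls back to the `AZDO_GITHUB_SERVICE_CONNECTION_PAT` environment variable when not set):

```
module "azure_devops_project" {
  source           = "../../modules/azure/azure-devops-project"
  project_name     = "example-project"
  github_endpoints = ["example-github-endpoint"]
  arm_endpoints = [{
    name          = "example-arm-endpoint"
    client_id     = "00000000-0000-0000-0000-000000000000"
    client_secret = var.example_client_secret
  }]
}
```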
4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |------------------|------------------------------------------|----------------|---------|:--------:| 9 | | project_name | The name of the Azure DevOps project | `string` | n/a | yes | 10 | | github_endpoints | A list of GitHub endpoints to be created | `list(string)` | `[]` | no | 11 | | github_pat | The GitHub Personal Access Token | `string` | `null` | no | 12 | | arm_endpoints | A list of ARM endpoints to be created | `list(object)` | `[]` | no | 13 | 14 | ## Outputs 15 | 16 | | Name | Description | 17 | |-------------------|--------------------------------------| 18 | | id | The ID of the Azure DevOps project | 19 | | name | The name of the Azure DevOps project | 20 | | service_endpoints | The IDs of the service endpoints | 21 | -------------------------------------------------------------------------------- /terraform/modules/azure/data-factory/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Data Factory with optional Key Vault linked services. 3 | */ 4 | data "azurerm_resource_group" "this" { 5 | name = var.resource_group_name 6 | } 7 | 8 | locals { 9 | location = var.azure_location == null ? data.azurerm_resource_group.this.location : var.azure_location 10 | 11 | tags = { 12 | ManagedBy = "Terraform" 13 | } 14 | } 15 | 16 | resource "azurerm_data_factory" "this" { 17 | name = var.data_factory_name 18 | location = local.location 19 | resource_group_name = data.azurerm_resource_group.this.name 20 | public_network_enabled = true 21 | tags = merge(local.tags, var.tags) 22 | identity { 23 | type = "SystemAssigned" 24 | } 25 | } 26 | 27 | resource "azurerm_data_factory_linked_service_key_vault" "key_vaults" { 28 | count = length(var.key_vault_ids) 29 | name = element(split("/", var.key_vault_ids[count.index]), length(split("/", var.key_vault_ids[count.index]))-1) 30 | data_factory_id = azurerm_data_factory.this.id 31 | key_vault_id = var.key_vault_ids[count.index] 32 | } 33 | -------------------------------------------------------------------------------- /terraform/modules/azure/data-factory/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group in which the resources should exist." 4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "(Optional) Azure location in which the resources should exist. If not set, it will use the location of the Resource Group." 9 | default = null 10 | } 11 | 12 | variable "data_factory_name" { 13 | type = string 14 | description = "The name of the Azure Data Factory." 15 | 16 | validation { 17 | condition = length(var.data_factory_name) >= 3 && length(var.data_factory_name) <= 63 18 | error_message = "The name of the Azure Data Factory must be between 3 and 63 characters." 19 | } 20 | } 21 | 22 | variable "key_vault_ids" { 23 | type = list(string) 24 | description = "(Optional) A list of Azure Key Vault IDs to be used for creating linked services." 25 | default = [] 26 | } 27 | 28 | variable "tags" { 29 | type = map(string) 30 | description = "(Optional) A mapping of tags to assign to the resources." 
31 | default = {} 32 | } 33 | -------------------------------------------------------------------------------- /terraform/modules/azure/resource-group/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Resource Group with optional IAM roles 3 | */ 4 | locals { 5 | tags = { 6 | ManagedBy = "Terraform" 7 | } 8 | } 9 | 10 | resource "azurerm_resource_group" "this" { 11 | name = var.resource_group_name 12 | location = var.azure_location 13 | tags = merge(local.tags, var.tags) 14 | } 15 | 16 | resource "azurerm_role_assignment" "owners" { 17 | count = length(var.owners) 18 | scope = azurerm_resource_group.this.id 19 | role_definition_name = "Owner" 20 | principal_id = var.owners[count.index] 21 | } 22 | 23 | resource "azurerm_role_assignment" "contributors" { 24 | count = length(var.contributors) 25 | scope = azurerm_resource_group.this.id 26 | role_definition_name = "Contributor" 27 | principal_id = var.contributors[count.index] 28 | } 29 | 30 | resource "azurerm_role_assignment" "readers" { 31 | count = length(var.readers) 32 | scope = azurerm_resource_group.this.id 33 | role_definition_name = "Reader" 34 | principal_id = var.readers[count.index] 35 | } 36 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-workspace/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "The Azure Resource ID of the Databricks workspace." 3 | value = azurerm_databricks_workspace.this.id 4 | } 5 | 6 | output "workspace_name" { 7 | description = "The name of the Databricks workspace." 8 | value = azurerm_databricks_workspace.this.name 9 | } 10 | 11 | output "workspace_id" { 12 | description = "The unique identifier of the Databricks workspace in Databricks control plane." 13 | value = azurerm_databricks_workspace.this.workspace_id 14 | } 15 | 16 | output "workspace_url" { 17 | description = "The workspace URL which is of the format 'adb-{workspace_id}.{random}.azuredatabricks.net'." 18 | value = azurerm_databricks_workspace.this.workspace_url 19 | } 20 | 21 | output "managed_resource_group_name" { 22 | description = "The name of the Managed Resource Group for managed Databricks resources." 23 | value = azurerm_databricks_workspace.this.managed_resource_group_name 24 | } 25 | 26 | output "managed_resource_group_id" { 27 | description = "The Azure Resource ID of the Managed Resource Group." 28 | value = azurerm_databricks_workspace.this.managed_resource_group_id 29 | } 30 | -------------------------------------------------------------------------------- /scripts/add_key_vault_policy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adds an access policy to a Key Vault (using 'az keyvault set-policy'). 4 | # It uses simple positional arguments. 
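# Example invocation (values are illustrative):
#   ./add_key_vault_policy.sh example-key-vault 00000000-0000-0000-0000-000000000000 "get list"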
5 | # 6 | 7 | # Required parameters 8 | _key_vault_name=${1} 9 | _sp_client_id=${2} 10 | 11 | # Optional parameters 12 | _secret_permissions=${3:-"get list"} 13 | 14 | # Local variables 15 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 16 | _realpath="$(command -v realpath || echo _realpath )" 17 | _script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 18 | 19 | _usage() { 20 | echo -e "Usage: ${0} <key_vault_name> <sp_client_id> [secret_permissions]" 21 | exit 1 22 | } 23 | 24 | # Parameters check 25 | [ -z "${_key_vault_name}" ] && _usage 26 | [ -z "${_sp_client_id}" ] && _usage 27 | 28 | # Get the Object ID of Service Principal 29 | source "${_script_dir}/get_object_details.sh" "${_sp_client_id}" 30 | sp_object_id=${object_id} 31 | 32 | # Use the az cli command to add the policy to the Key Vault 33 | echo -e "Adding a read-only policy for Service Principal ${_sp_client_id} to Key Vault ${_key_vault_name}" 34 | az keyvault set-policy --name "${_key_vault_name}" \ 35 | --object-id "${sp_object_id}" \ 36 | --secret-permissions ${_secret_permissions} || exit 1 37 | -------------------------------------------------------------------------------- /pipelines/templates/cluster-policy-single-node.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_type": { 3 | "type": "fixed", 4 | "value": "all-purpose" 5 | }, 6 | "spark_conf.spark.databricks.cluster.profile": { 7 | "type": "fixed", 8 | "value": "singleNode", 9 | "hidden": true 10 | }, 11 | "num_workers": { 12 | "type": "fixed", 13 | "value": 0, 14 | "hidden": true 15 | }, 16 | "spark_conf.spark.databricks.passthrough.enabled": { 17 | "type": "fixed", 18 | "value": "true", 19 | "hidden": false 20 | }, 21 | "autotermination_minutes": { 22 | "type": "fixed", 23 | "value": 60, 24 | "hidden": true 25 | }, 26 | "custom_tags.PolicyName": { 27 | "type": "fixed", 28 | "value": "Single Node", 29 | "hidden": true 30 | }, 31 | "spark_version": { 32 | "type": "regex", 33 | "pattern": "13.3.x-([cg]pu-ml-)?scala2.12" 34 | }, 35 | "docker_image.url": { 36 | "type": "forbidden", 37 | "hidden": true 38 | }, 39 | "instance_pool_id": { 40 | "type": "unlimited", 41 | "isOptional": false 42 | }, 43 | "cluster_log_conf.path": { 44 | "type": "unlimited", 45 | "defaultValue": "dbfs:/cluster-logs", 46 | "isOptional": false, 47 | "hidden": false 48 | }, 49 | "cluster_log_conf.type": { 50 | "type": "fixed", 51 | "value": "DBFS", 52 | "hidden": false 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /scripts/add_secret_to_key_vault.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adds a secret to a Key Vault (using 'az keyvault secret set'). 4 | # It uses simple positional arguments. 
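# Example invocation (values are illustrative):
#   ./add_secret_to_key_vault.sh example-key-vault example-secret "s3cr3t-value" 1 azdo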
5 | # 6 | 7 | # Required parameters 8 | _key_vault_name=${1} 9 | _secret_name=${2} 10 | _secret_value=${3} 11 | 12 | # Optional parameters 13 | _years_valid=${4:-"1"} 14 | _credential_description=${5:-"azdo"} 15 | 16 | # Local variables 17 | _date="$(date -d "+${_years_valid} years" +%Y-%m-%d'T'%H:%M:%S'Z' 2> /dev/null || date -j -v "+${_years_valid}y" +%Y-%m-%d'T'%H:%M:%S'Z' 2> /dev/null )" 18 | [ -z "${_date}" ] && exit 1 19 | 20 | _usage() { 21 | echo -e "Usage: ${0} <key_vault_name> <secret_name> <secret_value> [years_valid] [credential_description]" 22 | exit 1 23 | } 24 | 25 | # Parameters check 26 | [ -z "${_key_vault_name}" ] && _usage 27 | [ -z "${_secret_name}" ] && _usage 28 | [ -z "${_secret_value}" ] && _usage 29 | 30 | # Use the az cli command to create/update a secret in the Key Vault 31 | echo -e "Storing the secret \"${_secret_name}\" (expiring on \"${_date}\") in Key Vault \"${_key_vault_name}\"" 32 | az keyvault secret set --name "${_secret_name}" \ 33 | --vault-name "${_key_vault_name}" \ 34 | --value "${_secret_value}" \ 35 | --description "${_credential_description}" \ 36 | --expires "${_date}" > /dev/null || exit 1 37 | echo -e "Secret stored in Key Vault" 38 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-pipeline/variables.tf: -------------------------------------------------------------------------------- 1 | variable "pipeline_name" { 2 | type = string 3 | description = "The name of the Azure DevOps pipeline." 4 | } 5 | 6 | variable "pipeline_path" { 7 | type = string 8 | description = "The path in the GitHub repo to the pipelines YAML file." 9 | } 10 | 11 | variable "project_id" { 12 | type = string 13 | description = "The ID of the Azure DevOps project." 14 | } 15 | 16 | variable "github_endpoint_id" { 17 | type = string 18 | description = "The ID of the GitHub service endpoint." 19 | } 20 | 21 | variable "github_repo_url" { 22 | type = string 23 | description = "The URL used by the GitHub service endpoint and pipeline." 24 | 25 | validation { 26 | condition = length(regex("^https://[w.]*github.com/(?P<repo_id>[^/?#]+/[^/?#]+)", var.github_repo_url)) == 1 27 | error_message = "This must be a valid URL to a GitHub repository (https://github.com/<organization>/<repository>)." 28 | } 29 | } 30 | 31 | variable "github_branch" { 32 | type = string 33 | description = "(Optional) Branch name for which the pipeline will be configured. Default is master." 34 | default = "master" 35 | } 36 | 37 | variable "pipeline_variables" { 38 | type = map(string) 39 | description = "(Optional) A map of variable names and values that should be set on the pipeline." 40 | default = {} 41 | } 42 | -------------------------------------------------------------------------------- /scripts/create_adls_filesystem.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Creates a file system for an Azure Data Lake Storage Gen2 account (using 'az storage fs'). 4 | # It uses simple positional arguments. 
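# Example invocation (values are illustrative):
#   ./create_adls_filesystem.sh examplestorageaccount example-filesystem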
5 | # 6 | 7 | # Required parameters 8 | _storage_account_name=${1} 9 | _storage_filesystem_name=${2} 10 | 11 | _usage() { 12 | echo -e "Usage: ${0} <storage_account_name> <storage_filesystem_name>" 13 | exit 1 14 | } 15 | 16 | # Parameters check 17 | [ -z "${_storage_account_name}" ] && _usage 18 | [ -z "${_storage_filesystem_name}" ] && _usage 19 | 20 | # Use the az cli command to create the ADLS Filesystem 21 | echo -e "Creating the Filesystem \"${_storage_filesystem_name}\" in the Storage Account \"${_storage_account_name}\"" 22 | _filesystem=$(az storage fs show --account-name "${_storage_account_name}" \ 23 | --name "${_storage_filesystem_name}" \ 24 | --auth-mode login --timeout 30) 25 | 26 | if [ -z "${_filesystem}" ] && ! az storage fs create --account-name "${_storage_account_name}" \ 27 | --name "${_storage_filesystem_name}" \ 28 | --auth-mode login --timeout 30 > /dev/null; then 29 | echo -e "ERROR: Filesystem \"${_storage_filesystem_name}\" was not created successfully" 30 | exit 1 31 | else 32 | echo -e "Filesystem \"${_storage_filesystem_name}\" created successfully or already exists" 33 | fi 34 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/mounts.tf: -------------------------------------------------------------------------------- 1 | ### Databricks DBFS Mounts 2 | 3 | # Generate a mountpoint name if one was not provided 4 | # The name would be /mnt/<storage_account_name>-<container_name> 5 | locals { 6 | PROJECT_MOUNT_POINT = var.PROJECT_MOUNT_POINT == null ? "${var.STORAGE_ACCOUNT_NAME}-${var.PROJECT_CONTAINER_NAME}" : basename(var.PROJECT_MOUNT_POINT) 7 | } 8 | 9 | # Mount the ADLS Gen2 Project Filesystem using the latest Client Secret of the data pipeline Service Principal 10 | resource "databricks_mount" "project" { 11 | cluster_id = databricks_cluster.shared_autoscaling.id 12 | name = local.PROJECT_MOUNT_POINT 13 | 14 | abfs { 15 | container_name = var.PROJECT_CONTAINER_NAME 16 | storage_account_name = var.STORAGE_ACCOUNT_NAME 17 | tenant_id = data.azurerm_client_config.current.tenant_id 18 | client_id = data.azuread_service_principal.data_pipeline.application_id 19 | client_secret_scope = databricks_secret_scope.main.name 20 | client_secret_key = var.SECRET_NAME_CLIENT_SECRET 21 | initialize_file_system = true 22 | } 23 | 24 | depends_on = [databricks_secret.sp_client_secret, databricks_cluster.shared_autoscaling] 25 | } 26 | 27 | # Terraform output 28 | output "databricks_mounts" { 29 | value = { 30 | project = { 31 | id = databricks_mount.project.id 32 | details = databricks_mount.project 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /terraform/modules/databricks/cluster-policy/variables.tf: -------------------------------------------------------------------------------- 1 | variable "policy_name" { 2 | type = string 3 | description = "Cluster policy name." 4 | } 5 | 6 | variable "CAN_USE" { 7 | type = list(object({ 8 | principal = string 9 | type = string 10 | }) 11 | ) 12 | description = "(Optional) Objects of principals that should have CAN_USE permission on the policy." 13 | default = [] 14 | } 15 | 16 | variable "default_spark_version_regex" { 17 | type = string 18 | description = "(Optional) The default policy Spark version regex. Default is `.*-scala2.12`" 19 | default = ".*-scala2.12" 20 | } 21 | 22 | variable "default_autotermination_minutes" { 23 | type = number 24 | description = "(Optional) The default policy cluster autotermination in minutes. Default is 120 minutes." 
25 | default = 120 26 | } 27 | 28 | variable "default_cluster_log_path" { 29 | type = string 30 | description = "(Optional) The default policy location to deliver Spark driver, worker, and event logs. Default is `dbfs:/cluster-logs`." 31 | default = "dbfs:/cluster-logs" 32 | } 33 | 34 | variable "policy_overrides_file" { 35 | type = string 36 | description = "(Optional) The path to a json file containing any cluster policy overrides." 37 | default = null 38 | } 39 | 40 | variable "policy_overrides_object" { 41 | description = "(Optional) Cluster policy overrides defined as object." 42 | default = {} 43 | } 44 | -------------------------------------------------------------------------------- /notebooks/shared/print-current-user.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ### Show current user (API method) 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC ##### Get the API url and bearer token 9 | 10 | # COMMAND ---------- 11 | 12 | API_URL = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) 13 | TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %md 18 | # MAGIC ##### Call the SCIM API 19 | 20 | # COMMAND ---------- 21 | 22 | import pprint 23 | 24 | import requests 25 | 26 | response = requests.get( 27 | API_URL + '/api/2.0/preview/scim/v2/Me', 28 | headers={"Authorization": "Bearer " + TOKEN} 29 | ) 30 | 31 | if response.status_code == 200: 32 | pprint.pprint(response.json()) 33 | userName = response.json()["userName"] 34 | print(userName) 35 | else: 36 | print("Error: %s: %s" % (response.json()["error_code"], response.json()["message"])) 37 | 38 | # COMMAND ---------- 39 | 40 | # MAGIC %md 41 | # MAGIC ### Show current user (SQL method) 42 | 43 | # COMMAND ---------- 44 | 45 | spark.conf.set("spark.databricks.userInfoFunctions.enabled", True) 46 | userName = spark.sql("SELECT current_user as user").collect()[0].user 47 | 48 | print(userName) 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC ### Exit notebook with a value 54 | 55 | # COMMAND ---------- 56 | 57 | dbutils.notebook.exit(userName) 58 | -------------------------------------------------------------------------------- /terraform/deployments/test.tfvars: -------------------------------------------------------------------------------- 1 | STORAGE_ACCOUNT_NAME = "dlsuniquetftest0" 2 | PIPELINE_CONTAINER_NAME = "tftest-pipeline" 3 | PROJECT_CONTAINER_NAME = "tftest-project" 4 | DATA_FACTORY_NAME = "tftest-adf-unique0" 5 | DATABRICKS_WORKSPACE_NAME = "tftest-adb-workspace" 6 | DATABRICKS_PRICING_TIER = "trial" 7 | DATABRICKS_VNET_NAME = "tftest-adb-vnet" 8 | DATABRICKS_VNET_CIDR = "192.168.0.0/24" 9 | DATABRICKS_PRIVATE_SUBNET_NAME = "tftest-private-subnet" 10 | DATABRICKS_PRIVATE_SUBNET_CIDR = "192.168.0.0/25" 11 | DATABRICKS_PUBLIC_SUBNET_NAME = "tftest-public-subnet" 12 | DATABRICKS_PUBLIC_SUBNET_CIDR = "192.168.0.128/25" 13 | DATABRICKS_NSG_NAME = "tftest-adb-nsg" 14 | DATABRICKS_SECRET_SCOPE_NAME = "databricks-secret-scope" 15 | DATABRICKS_JOBS_POOL_NAME = "Jobs Pool" 16 | DATABRICKS_JOBS_POOL_NODE_TYPE = "Standard_F4s_v2" 17 | DATABRICKS_SHARED_POOL_NAME = "Shared Pool" 18 | DATABRICKS_SHARED_POOL_NODE_TYPE = "Standard_D4ds_v4" 19 | DATABRICKS_SHARED_CLUSTER_NAME = "Shared Autoscaling" 20 | DATABRICKS_SPARK_VERSION = "13.3.x-scala2.12" 21 | 
DATABRICKS_CLUSTER_POLICY_LOCATION = "../../../pipelines/templates/cluster-policy-single-node.json" 22 | NOTEBOOKS_SHARED_SOURCE_LOCATION = "../../../notebooks/shared" 23 | NOTEBOOKS_SHARED_WORKSPACE_FOLDER = "/Shared/generic" 24 | NOTEBOOKS_PROJECT_WORKSPACE_FOLDER = "/Project" 25 | NOTEBOOKS_PIPELINE_WORKSPACE_FOLDER = "/Pipeline" 26 | -------------------------------------------------------------------------------- /terraform/modules/azure/service-principal/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Service Principal with optional App Owners and API Permissions (including admin-consent). 4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |-------------------|-----------------------------------------------------------------------------------------|----------------|---------|:--------:| 9 | | name | The display name for the App Registration | `string` | n/a | yes | 10 | | owners | A list of Azure AD Object IDs that will be granted ownership of the application | `list(string)` | `[]` | no | 11 | | api_permissions | A list of API Permissions that should be assigned to this App (including admin consent) | `list(string)` | `[]` | no | 12 | | secret_expiration | A relative duration for which the Password is valid | `string` | `8760h` | no | 13 | 14 | ## Outputs 15 | 16 | | Name | Description | 17 | |----------------|----------------------------------------------------------------| 18 | | object_id | The AD Object ID of the Service Principal | 19 | | application_id | The Application ID (Client ID) of the Service Principal | 20 | | secret | The Password / Secret (Client Secret) of the Service Principal | 21 | -------------------------------------------------------------------------------- /scripts/add_secret_to_secret_scope.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adds a secret to a Databricks Secret Scope using the Secrets API (https://docs.databricks.com/dev-tools/api/latest/secrets.html). 4 | # It uses simple positional arguments. 
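# Example invocation (values are illustrative):
#   ./add_secret_to_secret_scope.sh https://adb-1234567890123456.7.azuredatabricks.net "$DATABRICKS_AAD_TOKEN" example-scope example-secret "s3cr3t-value"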
5 | # 6 | 7 | # Required parameters 8 | _workspace_url=${1} 9 | _access_token=${2} 10 | _secret_scope_name=${3} 11 | _secret_name=${4} 12 | _secret_value=${5} 13 | 14 | _usage() { 15 | echo -e "Usage: ${0} <workspace_url> <access_token> <secret_scope_name> <secret_name> <secret_value>" 16 | exit 1 17 | } 18 | 19 | # Parameters check 20 | [ -z "${_workspace_url}" ] && _usage 21 | [ -z "${_access_token}" ] && _usage 22 | [ -z "${_secret_scope_name}" ] && _usage 23 | [ -z "${_secret_name}" ] && _usage 24 | [ -z "${_secret_value}" ] && _usage 25 | 26 | # Set the payload 27 | payload=' 28 | { 29 | "scope": "'${_secret_scope_name}'", 30 | "key": "'${_secret_name}'", 31 | "string_value": "'${_secret_value}'" 32 | } 33 | ' 34 | 35 | # Call the Databricks Secrets API 36 | echo -e "Storing the secret \"${_secret_name}\" in the Secret Scope \"${_secret_scope_name}\"" 37 | _response=$(curl -sS --request POST \ 38 | --header "Authorization: Bearer ${_access_token}" \ 39 | --header "Content-Type: application/json" \ 40 | "${_workspace_url}/api/2.0/secrets/put" \ 41 | -d "${payload}") 42 | 43 | # Return ok if there is no error code 44 | if [ "${_response}" == "{}" ]; then 45 | echo -e "Secret \"${_secret_name}\" stored" 46 | else 47 | echo "${_response}" 48 | exit 1 49 | fi 50 | -------------------------------------------------------------------------------- /terraform/modules/azure/key-vault/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Key Vault. 4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |----------------------------|------------------------------------------------------------------------|---------------|------------|:--------:| 9 | | resource_group_name | The name of the Resource Group in which the resources should exist | `string` | n/a | yes | 10 | | azure_location | Azure location in which the resources should exist | `string` | `null` | no | 11 | | key_vault_name | The name of the Azure Key Vault | `string` | n/a | yes | 12 | | sku_name | The name of the SKU used for this Key Vault | `string` | `standard` | no | 13 | | soft_delete_retention_days | The number of days that items should be retained for once soft-deleted | `number` | `7` | no | 14 | | tags | A mapping of tags to assign to the resources | `map(string)` | `{}` | no | 15 | 16 | ## Outputs 17 | 18 | | Name | Description | 19 | |--------|---------------------------------| 20 | | id | The ID of the Azure Key Vault | 21 | | name | The name of the Azure Key Vault | 22 | | uri | The URI of the Azure Key Vault | 23 | | policy | The Azure Key Vault policy ID | 24 | -------------------------------------------------------------------------------- /scripts/get_data_factory_identity.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Retrieves the Azure Object ID of a Data Factory Managed Identity. 3 | It uses simple positional arguments. 4 | Returns the service principal client id as a variable called adfPrincipalId in the Azure Pipelines format. 5 | Returns the service principal object id as a variable called adfObjectId in the Azure Pipelines format. 
6 | #> 7 | Param( 8 | [Parameter(Mandatory = $True, Position = 1)] [string] $resourceGroup, 9 | [Parameter(Mandatory = $True, Position = 2)] [string] $dataFactoryName 10 | ) 11 | 12 | # Get the Object ID of the Azure Data Factory Managed Identity (Identity.PrincipalId returns the Object ID) 13 | $AzureDataFactory = Get-AzDataFactoryV2 -ResourceGroupName $resourceGroup -Name $dataFactoryName 14 | $AzureDataFactoryObjectId = $AzureDataFactory.Identity.PrincipalId 15 | 16 | If ( [string]::IsNullOrEmpty($AzureDataFactoryObjectId)) 17 | { 18 | throw "AzureDataFactoryObjectId is NULL or EMPTY: " + $AzureDataFactory.DataFactoryId 19 | exit 1 20 | } 21 | 22 | # Get the Application ID from the Object ID 23 | $AzureDataFactoryPrincipal = Get-AzADServicePrincipal -ObjectId $AzureDataFactoryObjectId 24 | $AzureDataFactoryPrincipalId = $AzureDataFactoryPrincipal.AppId 25 | 26 | If ( [string]::IsNullOrEmpty($AzureDataFactoryPrincipalId)) 27 | { 28 | throw "AzureDataFactoryPrincipalId is NULL or EMPTY, Object ID: " + $AzureDataFactoryObjectId 29 | exit 1 30 | } 31 | 32 | Write-Host "##vso[task.setvariable variable=adfPrincipalId;]$AzureDataFactoryPrincipalId" 33 | Write-Host "##vso[task.setvariable variable=adfObjectId;]$AzureDataFactoryObjectId" 34 | -------------------------------------------------------------------------------- /terraform/deployments/azure-infrastructure/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Builds the Azure infrastructure for the data pipeline and project. 3 | */ 4 | 5 | provider "azurerm" { 6 | features { 7 | key_vault { 8 | purge_soft_delete_on_destroy = true 9 | } 10 | } 11 | skip_provider_registration = true 12 | } 13 | 14 | terraform { 15 | required_version = "~> 1.5.7" 16 | 17 | backend "azurerm" {} 18 | 19 | required_providers { 20 | azuread = { 21 | source = "hashicorp/azuread" 22 | version = "~> 2" 23 | } 24 | azurerm = { 25 | source = "hashicorp/azurerm" 26 | version = "~> 3" 27 | } 28 | random = { 29 | source = "hashicorp/random" 30 | version = "~> 3" 31 | } 32 | } 33 | } 34 | 35 | 36 | ### Data Sources 37 | 38 | # Get information about the AzureRM provider 39 | data "azurerm_client_config" "current" {} 40 | 41 | # Get information about the pre-provisioned Service Principal 42 | data "azuread_service_principal" "data_service_principal" { 43 | application_id = var.DATA_SERVICE_PRINCIPAL_CLIENT_ID 44 | } 45 | 46 | # Get information about the pre-provisioned Project group 47 | data "azuread_group" "project_group" { 48 | display_name = var.PROJECT_GROUP_NAME 49 | security_enabled = true 50 | } 51 | 52 | # Get information about the pre-provisioned Resource Group 53 | data "azurerm_resource_group" "main" { 54 | name = var.RESOURCE_GROUP_NAME 55 | } 56 | 57 | # Get information about the pre-provisioned Key Vault 58 | data "azurerm_key_vault" "main" { 59 | name = var.KEY_VAULT_NAME 60 | resource_group_name = var.RESOURCE_GROUP_NAME 61 | } 62 | -------------------------------------------------------------------------------- /terraform/modules/azure/resource-group/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Resource Group with optional IAM roles attached to it. 
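A minimal, illustrative call of this module (the source path, names and Object IDs are placeholders):

```
module "resource_group" {
  source              = "../../modules/azure/resource-group"
  resource_group_name = "example-rg"
  azure_location      = "westeurope"
  contributors        = ["00000000-0000-0000-0000-000000000000"]
  tags                = { Environment = "example" }
}
```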
4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |---------------------|------------------------------------------------------------------------------------|----------------|---------|:--------:| 9 | | resource_group_name | The name of the Resource Group | `string` | n/a | yes | 10 | | azure_location | Azure location for the Resource Group | `string` | n/a | yes | 11 | | owners | A list of Object IDs that should have the Owner role over the Resource Group | `list(string)` | `[]` | no | 12 | | contributors | A list of Object IDs that should have the Contributor role over the Resource Group | `list(string)` | `[]` | no | 13 | | readers | A list of Object IDs that should have the Reader role over the Resource Group | `list(string)` | `[]` | no | 14 | | tags | A mapping of tags to assign to the Resource Group | `map(string)` | `{}` | no | 15 | 16 | ## Outputs 17 | 18 | | Name | Description | 19 | |----------|------------------------------------| 20 | | id | The ID of the Resource Group | 21 | | name | The name of the Resource Group | 22 | | location | The location of the Resource Group | 23 | -------------------------------------------------------------------------------- /scripts/get_workspace_object_id.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Retrieves a Databricks Object ID using the Workspace API (https://docs.databricks.com/dev-tools/api/latest/workspace.html). 4 | # It uses simple positional arguments. 5 | # Returns the Object ID as a variable called workspaceObjectId in the Azure Pipelines format. 6 | # 7 | 8 | # Required parameters 9 | _workspace_url=${1} 10 | _access_token=${2} 11 | _object_path=${3} 12 | 13 | # Local variables 14 | _python="$(command -v python || command -v python3)" 15 | 16 | _usage() { 17 | echo -e "Usage: ${0} " 18 | exit 1 19 | } 20 | 21 | # Parameters check 22 | [ -z "${_workspace_url}" ] && _usage 23 | [ -z "${_access_token}" ] && _usage 24 | [ -z "${_object_path}" ] && _usage 25 | 26 | # Call the Databricks workspace API 27 | echo -e "Getting the Object ID of \"${_object_path}\"" 28 | _response=$(curl -sS --request GET \ 29 | --header "Authorization: Bearer ${_access_token}" \ 30 | --header "Content-Type: application/json" \ 31 | "${_workspace_url}/api/2.0/workspace/get-status?path=${_object_path}") 32 | 33 | # Extract the Object ID from response 34 | object_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["object_id"])' 2> /dev/null) 35 | [ -z "${object_id}" ] && { echo "${_response}"; exit 1; } 36 | echo -e "Got the Object ID: ${object_id}" 37 | 38 | # Pass the variables to Azure Pipelines 39 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 40 | [ -n "${object_id}" ] && echo "##vso[task.setvariable variable=workspaceObjectId;issecret=false]${object_id}" 41 | fi 42 | -------------------------------------------------------------------------------- /pipelines/templates/deploy-notebooks.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that deploys a folder with Databricks notebooks to a Databricks workspace. 
3 | # 4 | 5 | parameters: 6 | - name: databricksWorkspaceUrl 7 | displayName: 'Azure Databricks Workspace Url' 8 | type: string 9 | 10 | - name: accessToken 11 | displayName: 'Azure AD Access Token' 12 | type: string 13 | 14 | - name: notebooksSourceLocation 15 | displayName: 'Location of notebooks to be deployed' 16 | type: string 17 | 18 | - name: notebooksWorkspaceFolder 19 | displayName: 'Databricks notebooks folder' 20 | type: string 21 | 22 | 23 | steps: 24 | - task: UsePythonVersion@0 25 | displayName: 'Use Python 3.x' 26 | inputs: 27 | versionSpec: '3.x' 28 | addToPath: true 29 | architecture: 'x64' 30 | 31 | - task: Bash@3 32 | displayName: 'Install the databricks-cli' 33 | inputs: 34 | targetType: 'inline' 35 | script: | 36 | _pip="$(command -v pip || command -v pip3)" 37 | ${_pip} install databricks-cli 38 | 39 | - task: Bash@3 40 | displayName: 'Deploy notebooks to workspace' 41 | inputs: 42 | targetType: 'inline' 43 | script: | 44 | export LC_ALL=C.UTF-8 45 | export LANG=C.UTF-8 46 | _command="databricks workspace import_dir -o -e ${NOTEBOOKS_SRC} ${NOTEBOOKS_DEST}" 47 | echo "Running: \'${_command}\'" 48 | ${_command} 49 | env: 50 | DATABRICKS_HOST: ${{ parameters.databricksWorkspaceUrl }} 51 | DATABRICKS_TOKEN: ${{ parameters.accessToken }} 52 | NOTEBOOKS_SRC: ${{ parameters.notebooksSourceLocation }} 53 | NOTEBOOKS_DEST: ${{ parameters.notebooksWorkspaceFolder }} 54 | -------------------------------------------------------------------------------- /scripts/create_secret_scope.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Creates a Databricks Secret Scope using the Secrets API (https://docs.databricks.com/dev-tools/api/latest/secrets.html). 4 | # It uses simple positional arguments. 5 | # 6 | 7 | # Required parameters 8 | _workspace_url=${1} 9 | _access_token=${2} 10 | _secret_scope_name=${3} 11 | 12 | # Local variables 13 | _python="$(command -v python || command -v python3)" 14 | 15 | # Optional parameters 16 | initial_manage_principal=${4} 17 | 18 | _usage() { 19 | echo -e "Usage: ${0} <workspace_url> <access_token> <secret_scope_name> [initial_manage_principal]" 20 | exit 1 21 | } 22 | 23 | # Parameters check 24 | [ -z "${_workspace_url}" ] && _usage 25 | [ -z "${_access_token}" ] && _usage 26 | [ -z "${_secret_scope_name}" ] && _usage 27 | 28 | # Set the payload 29 | payload=' 30 | { 31 | "scope": "'${_secret_scope_name}'" 32 | '$([ -n "${initial_manage_principal}" ] && echo ',"initial_manage_principal": "'${initial_manage_principal}'"')' 33 | } 34 | ' 35 | 36 | # Call the Databricks Secrets API 37 | echo -e "Creating the Secret Scope \"${_secret_scope_name}\"" 38 | _response=$(curl -sS --request POST \ 39 | --header "Authorization: Bearer ${_access_token}" \ 40 | --header "Content-Type: application/json" \ 41 | "${_workspace_url}/api/2.0/secrets/scopes/create" \ 42 | -d "${payload}") 43 | _error_code=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["error_code"])' 2> /dev/null) 44 | 45 | if [ "${_response}" == "{}" ] || [ "${_error_code}" == "RESOURCE_ALREADY_EXISTS" ]; then 46 | echo -e "Secret Scope \"${_secret_scope_name}\" created or already exists" 47 | else 48 | echo "${_response}" 49 | exit 1 50 | fi 51 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-vnet/outputs.tf: -------------------------------------------------------------------------------- 1 | output "virtual_network_id" { 2 | description = "The ID of the Virtual Network."
3 | value = azurerm_virtual_network.databricks_vnet.id 4 | } 5 | 6 | output "virtual_network_name" { 7 | description = "The name of the Virtual Network." 8 | value = azurerm_virtual_network.databricks_vnet.name 9 | } 10 | 11 | output "private_subnet_id" { 12 | description = "The ID of the Private Subnet within the Virtual Network." 13 | value = azurerm_subnet.databricks_private_subnet.id 14 | } 15 | 16 | output "private_subnet_name" { 17 | description = "The name of the Private Subnet within the Virtual Network." 18 | value = azurerm_subnet.databricks_private_subnet.name 19 | } 20 | 21 | output "public_subnet_id" { 22 | description = "The ID of the Public Subnet within the Virtual Network." 23 | value = azurerm_subnet.databricks_public_subnet.id 24 | } 25 | 26 | output "public_subnet_name" { 27 | description = "The name of the Public Subnet within the Virtual Network." 28 | value = azurerm_subnet.databricks_public_subnet.name 29 | } 30 | 31 | output "network_security_group_id" { 32 | description = "The ID of the Databricks Network Security Group attached to the subnets." 33 | value = azurerm_virtual_network.databricks_vnet.id 34 | } 35 | 36 | output "nat_gateway_id" { 37 | description = "The ID of the NAT gateway attached to the subnets." 38 | value = length(azurerm_nat_gateway.databricks) > 0 ? azurerm_nat_gateway.databricks[0].id : null 39 | 40 | } 41 | 42 | output "nat_public_ip_id" { 43 | description = "The ID of the NAT gateway public IP." 44 | value = length(azurerm_public_ip.databricks) > 0 ? azurerm_public_ip.databricks[0].id : null 45 | } 46 | -------------------------------------------------------------------------------- /terraform/modules/azure/key-vault/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group in which the resources should exist." 4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "(Optional) Azure location in which the resources should exist. If not set, it will use the location of the Resource Group." 9 | default = null 10 | } 11 | 12 | variable "key_vault_name" { 13 | type = string 14 | description = "The name of the Azure Key Vault." 15 | 16 | validation { 17 | condition = length(var.key_vault_name) >= 3 && length(var.key_vault_name) <= 24 18 | error_message = "The name of the Key Vault must be between 3 and 24 characters." 19 | } 20 | } 21 | 22 | variable "sku_name" { 23 | type = string 24 | description = "(Optional) The name of the SKU used for this Key Vault. Possible values are standard and premium. Default is standard." 25 | default = "standard" 26 | 27 | validation { 28 | condition = contains(["premium", "standard"], var.sku_name) 29 | error_message = "Possible values are standard and premium." 30 | } 31 | } 32 | 33 | variable "soft_delete_retention_days" { 34 | type = number 35 | description = "(Optional) The number of days that items should be retained for once soft-deleted. Default is 7 days." 36 | default = 7 37 | 38 | validation { 39 | condition = var.soft_delete_retention_days >= 7 && var.soft_delete_retention_days <= 90 40 | error_message = "This value can be between 7 and 90 days." 41 | } 42 | } 43 | 44 | variable "tags" { 45 | type = map(string) 46 | description = "(Optional) A mapping of tags to assign to the resources." 
47 | default = {} 48 | } 49 | -------------------------------------------------------------------------------- /terraform/modules/azure/data-factory/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Data Factory with optional Key Vault linked services. 4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |---------------------|-----------------------------------------------------------------------|----------------|---------|:--------:| 9 | | resource_group_name | The name of the Resource Group in which the resources should exist | `string` | n/a | yes | 10 | | azure_location | Azure location in which the resources should exist | `string` | `null` | no | 11 | | data_factory_name | The name of the Azure Data Factory | `string` | n/a | yes | 12 | | key_vault_ids | A list of Azure Key Vault IDs to be used for creating linked services | `list(string)` | `[]` | no | 13 | | tags | A mapping of tags to assign to the resources | `map(string)` | `{}` | no | 14 | 15 | ## Outputs 16 | 17 | | Name | Description | 18 | |---------------------------|-----------------------------------------------------------------------------| 19 | | id | The ID of the Azure Data Factory | 20 | | name | The name of the Azure Data Factory | 21 | | principal_id | The ID of the Azure Data Factory Managed Identity in Azure Active Directory | 22 | | key_vault_linked_services | Details of the Azure Data Factory linked Key Vault services | 23 | -------------------------------------------------------------------------------- /notebooks/shared/mount-adls-gen-2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ### Set mount config 4 | 5 | # COMMAND ---------- 6 | 7 | configs = { 8 | "fs.azure.account.auth.type": "OAuth", 9 | "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", 10 | "fs.azure.account.oauth2.client.id": dbutils.widgets.get("spClientId"), 11 | "fs.azure.account.oauth2.client.secret": dbutils.secrets.get( 12 | scope=dbutils.widgets.get("secretScopeName"), 13 | key=dbutils.widgets.get("secretNameClientSecret"), 14 | ), 15 | "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/" 16 | + dbutils.widgets.get("tenantId") 17 | + "/oauth2/token", 18 | } 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md 23 | # MAGIC ### Mount with the config 24 | 25 | # COMMAND ---------- 26 | 27 | mountPoint = dbutils.widgets.get("mountPoint") 28 | mountSource = ( 29 | "abfss://" 30 | + dbutils.widgets.get("storageContainerName") 31 | + "@" 32 | + dbutils.widgets.get("storageAccountName") 33 | + ".dfs.core.windows.net/" 34 | ) 35 | 36 | if mountPoint in list(map(lambda m: m.mountPoint, dbutils.fs.mounts())): 37 | try: 38 | dbutils.fs.ls(mountPoint) 39 | except: 40 | dbutils.fs.unmount(mountPoint) 41 | 42 | if mountPoint not in list(map(lambda m: m.mountPoint, dbutils.fs.mounts())): 43 | spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true") 44 | dbutils.fs.mount(source=mountSource, mount_point=mountPoint, extra_configs=configs) 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md 49 | # MAGIC ### Exit notebook 50 | 51 | # COMMAND ---------- 52 | 53 | dbutils.notebook.exit( 54 | '{"mountSource": "' + mountSource + '", "mountPoint": "' + mountPoint + '"}' 55 | ) 56 | 
-------------------------------------------------------------------------------- /scripts/get_access_token.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Generates an Azure Active Directory Token of the current az cli login (using 'az account get-access-token'). 4 | # If optional positional arguments are used, it will login with those credentials first. 5 | # Returns the access token as a variable called accessToken in the Azure Pipelines format. 6 | # 7 | 8 | # Optional parameters - if not set it will use the Databricks Resource ID and the existing CLI login 9 | _azure_resource=${1:-"2ff814a6-3304-4ab8-85cb-cd0e6f879c1d"} 10 | _sp_client_id=${2:-${ARM_CLIENT_ID}} 11 | _sp_client_secret=${3:-${ARM_CLIENT_SECRET}} 12 | _tenant_id=${4:-${ARM_TENANT_ID:-${tenantId}}} 13 | 14 | # Local variables 15 | _python="$(command -v python || command -v python3)" 16 | 17 | # Log in as service principal with Azure CLI (if parameters were defined) 18 | if [ -n "${_sp_client_id}" ] && [ -n "${_sp_client_secret}" ] && [ -n "${_tenant_id}" ]; then 19 | echo -e "Will use the Service Principal ${_sp_client_id} and its Secret to authenticate to Azure RM" 20 | 21 | # Log out 22 | az logout 23 | 24 | # Log in with the details from parameters 25 | echo -e "Logging in as: ${_sp_client_id}" 26 | az login --service-principal --username "${_sp_client_id}" --password "${_sp_client_secret}" --tenant "${_tenant_id}" --allow-no-subscriptions || exit 1 27 | fi 28 | 29 | # Use the az cli command to get the token 30 | echo "Getting the AAD Access Token" 31 | access_token=$(az account get-access-token --resource="${_azure_resource}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["accessToken"])') 32 | [ -z "${access_token}" ] && exit 1 33 | echo "Got the AAD access token" 34 | 35 | # Pass the variables to Azure Pipelines 36 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 37 | [ -n "${access_token}" ] && echo "##vso[task.setvariable variable=accessToken;issecret=true]${access_token}" || exit 1 38 | fi 39 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-project/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure DevOps project with optional service endpoints (AzureRM or GitHub). 
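 *
 * Example usage (a sketch only; the project name, endpoint names, and credential references below are
 * placeholders, not values from this repository):
 *
 *   module "azure_devops_project" {
 *     source           = "../../modules/azure/azure-devops-project"
 *     project_name     = "example-project"
 *     github_pat       = var.github_pat
 *     github_endpoints = ["example-github-connection"]
 *     arm_endpoints    = [{ name = "example-arm-connection", client_id = var.client_id, client_secret = var.client_secret }]
 *   }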
3 | */ 4 | data "azurerm_client_config" "current" {} 5 | 6 | data "azurerm_subscription" "current" { subscription_id = data.azurerm_client_config.current.subscription_id } 7 | 8 | resource "azuredevops_project" "this" { 9 | name = var.project_name 10 | visibility = "private" 11 | version_control = "Git" 12 | work_item_template = "Agile" 13 | } 14 | 15 | resource "azuredevops_serviceendpoint_github" "endpoints" { 16 | count = length(var.github_endpoints) 17 | project_id = azuredevops_project.this.id 18 | service_endpoint_name = var.github_endpoints[count.index] 19 | 20 | auth_personal { 21 | # Also can be set with AZDO_GITHUB_SERVICE_CONNECTION_PAT environment variable 22 | personal_access_token = var.github_pat 23 | } 24 | } 25 | 26 | resource "azuredevops_serviceendpoint_azurerm" "endpoints" { 27 | count = length(var.arm_endpoints) 28 | project_id = azuredevops_project.this.id 29 | service_endpoint_name = var.arm_endpoints[count.index].name 30 | credentials { 31 | serviceprincipalid = var.arm_endpoints[count.index].client_id 32 | serviceprincipalkey = var.arm_endpoints[count.index].client_secret 33 | } 34 | azurerm_spn_tenantid = data.azurerm_client_config.current.tenant_id 35 | azurerm_subscription_id = data.azurerm_client_config.current.subscription_id 36 | azurerm_subscription_name = data.azurerm_subscription.current.display_name 37 | } 38 | 39 | resource "azuredevops_pipeline_authorization" "enable_for_all" { 40 | count = length(var.arm_endpoints) 41 | project_id = azuredevops_project.this.id 42 | resource_id = azuredevops_serviceendpoint_azurerm.endpoints[count.index].id 43 | type = "endpoint" 44 | } 45 | -------------------------------------------------------------------------------- /terraform/modules/azure/key-vault/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Key Vault. 3 | * Gives "Key, Secret, & Certificate Management" policies to the creator. 4 | */ 5 | data "azurerm_client_config" "current" {} 6 | 7 | data "azurerm_resource_group" "this" { 8 | name = var.resource_group_name 9 | } 10 | 11 | locals { 12 | location = var.azure_location == null ? 
data.azurerm_resource_group.this.location : var.azure_location 13 | 14 | tags = { 15 | ManagedBy = "Terraform" 16 | } 17 | } 18 | 19 | resource "azurerm_key_vault" "this" { 20 | name = var.key_vault_name 21 | location = local.location 22 | resource_group_name = data.azurerm_resource_group.this.name 23 | tenant_id = data.azurerm_client_config.current.tenant_id 24 | soft_delete_retention_days = var.soft_delete_retention_days 25 | purge_protection_enabled = false 26 | sku_name = var.sku_name 27 | tags = merge(local.tags, var.tags) 28 | } 29 | 30 | resource "azurerm_key_vault_access_policy" "creator" { 31 | key_vault_id = azurerm_key_vault.this.id 32 | tenant_id = data.azurerm_client_config.current.tenant_id 33 | object_id = data.azurerm_client_config.current.object_id 34 | certificate_permissions = [ 35 | "Get", "List", "Delete", "Create", "Import", "Update", "ManageContacts", "GetIssuers", "ListIssuers", "SetIssuers", 36 | "DeleteIssuers", "ManageIssuers", "Recover", "Purge" 37 | ] 38 | key_permissions = [ 39 | "Get", "Create", "Delete", "List", "Update", "Import", "Backup", "Restore", "Recover", "Purge" 40 | ] 41 | secret_permissions = ["Get", "List", "Set", "Delete", "Backup", "Restore", "Recover", "Purge"] 42 | storage_permissions = [ 43 | "Get", "List", "Delete", "Set", "Update", "RegenerateKey", "SetSAS", "ListSAS", "GetSAS", "DeleteSAS", "Purge" 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /scripts/create_secret_scope_acl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Creates a Databricks Secret Scope ACL using the Secrets API (https://docs.databricks.com/dev-tools/api/latest/secrets.html). 4 | # It uses simple positional arguments. 
5 | # 6 | 7 | # Required parameters 8 | _workspace_url=${1} 9 | _access_token=${2} 10 | _secret_scope_name=${3} 11 | _principal=${4} 12 | _permission=${5} 13 | 14 | # Local variables 15 | _python="$(command -v python || command -v python3)" 16 | 17 | _usage() { 18 | echo -e "Usage: ${0} " 19 | exit 1 20 | } 21 | 22 | # Parameters check 23 | [ -z "${_workspace_url}" ] && _usage 24 | [ -z "${_access_token}" ] && _usage 25 | [ -z "${_secret_scope_name}" ] && _usage 26 | [ -z "${_principal}" ] && _usage 27 | [ -z "${_permission}" ] && _usage 28 | 29 | # Set the payload 30 | payload=' 31 | { 32 | "scope": "'${_secret_scope_name}'", 33 | "principal": "'${_principal}'", 34 | "permission": "'${_permission}'" 35 | } 36 | ' 37 | 38 | # Call the Databricks Secrets API 39 | echo -e "Adding the \"${_permission}\" permission to principal \"${_principal}\" to \"${_secret_scope_name}\" secret scope" 40 | _response=$(curl -sS --request POST \ 41 | --header "Authorization: Bearer ${_access_token}" \ 42 | --header "Content-Type: application/json" \ 43 | "${_workspace_url}/api/2.0/secrets/acls/put" \ 44 | -d "${payload}") 45 | 46 | # Get the error code 47 | _error_code=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["error_code"])' 2> /dev/null) 48 | 49 | # Return ok if ACL was created or already exists 50 | if [ "${_response}" == "{}" ] || [ "${_error_code}" == "RESOURCE_ALREADY_EXISTS" ]; then 51 | echo -e "ACL \"${_permission}\" added or already exists" 52 | else 53 | echo "${_response}" 54 | exit 1 55 | fi 56 | -------------------------------------------------------------------------------- /terraform/modules/azure/azure-devops-pipeline/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure DevOps Pipeline (hosted on GitHub) with optional variables. 
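## Usage

A minimal invocation sketch (the names, IDs, paths, and repository URL are illustrative placeholders):

```hcl
module "azure_devops_pipeline" {
  source             = "../../modules/azure/azure-devops-pipeline"
  pipeline_name      = "example-pipeline"
  pipeline_path      = "pipelines/azure-pipelines.yml"
  project_id         = "00000000-0000-0000-0000-000000000000"
  github_endpoint_id = "00000000-0000-0000-0000-000000000000"
  github_repo_url    = "https://github.com/example-org/example-repo"
  github_branch      = "main"

  pipeline_variables = {
    exampleVariable = "example-value"
  }
}
```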
4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |--------------------|------------------------------------------------------------------------|---------------|----------|:--------:| 9 | | pipeline_name | The name of the Azure DevOps pipeline | `string` | n/a | yes | 10 | | pipeline_path | The path in the GitHub repo to the pipelines YAML file | `string` | n/a | yes | 11 | | project_id | The ID of the Azure DevOps project | `string` | n/a | yes | 12 | | github_endpoint_id | The ID of the GitHub service endpoint | `string` | n/a | yes | 13 | | github_repo_url | The URL used by the GitHub service endpoint and pipeline | `string` | n/a | yes | 14 | | github_branch | Branch name for which the pipeline will be configured | `string` | `master` | no | 15 | | pipeline_variables | A map of variable names and values that should be set on the pipeline | `map(string)` | `{}` | no | 16 | 17 | ## Outputs 18 | 19 | | Name | Description | 20 | |----------|------------------------------------------------------------------------| 21 | | id | The ID of the Azure DevOps pipeline | 22 | | name | The name of the Azure DevOps pipeline | 23 | | path | Full Git path to the yaml file of the Azure DevOps pipeline definition | 24 | | revision | The revision of the Azure DevOps pipeline | 25 | -------------------------------------------------------------------------------- /scripts/get_instance_pool.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Retrieves a Databricks Instance Pool ID using the Instance Pools API (https://docs.databricks.com/dev-tools/api/latest/instance-pools.html). 4 | # It uses simple positional arguments. 5 | # Returns the pool id as a variable called databricksPoolId in the Azure Pipelines format.
6 | # 7 | 8 | # Required parameters 9 | _workspace_url=${1} 10 | _access_token=${2} 11 | _instance_pool_name=${3} 12 | 13 | # Local variables 14 | _python="$(command -v python || command -v python3)" 15 | 16 | _usage() { 17 | echo -e "Usage: ${0} " 18 | exit 1 19 | } 20 | 21 | # Parameters check 22 | [ -z "${_workspace_url}" ] && _usage 23 | [ -z "${_access_token}" ] && _usage 24 | [ -z "${_instance_pool_name}" ] && _usage 25 | 26 | 27 | # Call the Databricks Instance Pools API 28 | echo -e "Getting the Instance Pool ID of Pool \"${_instance_pool_name}\"" 29 | _response=$(curl -sS --request GET \ 30 | --header "Authorization: Bearer ${_access_token}" \ 31 | --header "Content-Type: application/json" \ 32 | "${_workspace_url}/api/2.0/instance-pools/list") 33 | 34 | # Extract the Pool ID from response 35 | if [ -z "${_response}" ] || [ "${_response}" == "{}" ]; then 36 | echo -e "Instance Pool \"${_instance_pool_name}\" not found" 37 | exit 1 38 | else 39 | instance_pool_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print([ p["instance_pool_id"] for p in json.load(sys.stdin)["instance_pools"] if p["instance_pool_name"] == "'"${_instance_pool_name}"'" ][0])') 40 | [ -z "${instance_pool_id}" ] && { echo "${_response}"; exit 1; } 41 | echo -e "Got the Instance Pool ID: ${instance_pool_id}" 42 | fi 43 | 44 | # Pass the variables to Azure Pipelines 45 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 46 | [ -n "${instance_pool_id}" ] && echo "##vso[task.setvariable variable=databricksPoolId;issecret=false]${instance_pool_id}" 47 | fi 48 | -------------------------------------------------------------------------------- /pipelines/templates/set-databricks-permission.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that adds a Databricks workspace permission using the Permissions API. 
3 | # 4 | 5 | parameters: 6 | - name: databricksWorkspaceUrl 7 | displayName: 'Azure Databricks Workspace Url' 8 | type: string 9 | 10 | - name: accessToken 11 | displayName: 'Azure AD Access Token' 12 | type: string 13 | 14 | - name: databricksResourceType 15 | displayName: 'Type of the Azure Databricks resource' 16 | type: string 17 | 18 | - name: databricksResourceId 19 | displayName: 'Id of the Azure Databricks resource' 20 | type: string 21 | 22 | - name: databricksPrincipalType 23 | displayName: 'One of user, group or service_principal' 24 | type: string 25 | values: 26 | - user 27 | - group 28 | - service_principal 29 | 30 | - name: databricksPrincipalId 31 | displayName: 'The name of the User, Group or Service Principal' 32 | type: string 33 | 34 | - name: databricksPermissionLevel 35 | displayName: 'Permission Level of the Principal over the Resource' 36 | type: string 37 | 38 | - name: scriptsLocation 39 | displayName: 'Location of Scripts' 40 | type: string 41 | 42 | 43 | steps: 44 | - task: Bash@3 45 | displayName: 'Set ${{ parameters.databricksPermissionLevel }} on a ${{ parameters.databricksResourceType }} resource type to ${{ parameters.databricksPrincipalId }}' 46 | inputs: 47 | targetType: 'filePath' 48 | filePath: '${{ parameters.scriptsLocation }}/add_workspace_permission.sh' 49 | arguments: '"${{ parameters.databricksWorkspaceUrl }}" 50 | "${{ parameters.accessToken }}" 51 | "${{ parameters.databricksResourceType }}" 52 | "${{ parameters.databricksResourceId }}" 53 | "${{ parameters.databricksPrincipalType }}" 54 | "${{ parameters.databricksPrincipalId }}" 55 | "${{ parameters.databricksPermissionLevel }}"' 56 | -------------------------------------------------------------------------------- /scripts/add_role_assignment.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Creates a new Azure role assignment for a user, group, or service principal (using 'az role assignment'). 4 | # It uses simple positional arguments. 5 | # 6 | 7 | # Required parameters 8 | _role_name=${1} 9 | _principal_id=${2} 10 | _resource_id=${3} 11 | 12 | # Local variables 13 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 14 | _realpath="$(command -v realpath || echo _realpath )" 15 | _script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 16 | 17 | _usage() { 18 | echo -e "Usage: ${0} " 19 | exit 1 20 | } 21 | 22 | # Parameters check 23 | [ -z "${_role_name}" ] && _usage 24 | [ -z "${_principal_id}" ] && _usage 25 | [ -z "${_resource_id}" ] && _usage 26 | 27 | # Get the Object details 28 | source "${_script_dir}/get_object_details.sh" "${_principal_id}" 29 | object_id=${object_id} 30 | object_type=${object_type} 31 | 32 | # Get the Resource ID 33 | if [[ "${_resource_id}" =~ "/subscriptions" ]]; then 34 | resource_scope="${_resource_id}" 35 | else 36 | echo -e "Getting the scope for RG \"${_resource_id}\"" 37 | resource_scope=$(az group show --name "${_resource_id}" --query id --output tsv) 38 | [ -z "${resource_scope}" ] && exit 1 39 | fi 40 | 41 | # Use the az cli command to assign the role on the resource 42 | echo -e "Assigning the \"${_role_name}\" role to \"${_principal_id}\" on the Resource \"${resource_scope}\"" 43 | if ! 
az role assignment create --role "${_role_name}" \ 44 | --assignee-object-id "${object_id}" \ 45 | --assignee-principal-type "${object_type}" \ 46 | --scope "${resource_scope}" \ 47 | > /dev/null; then 48 | echo -e "ERROR: Failed to assign the \"${_role_name}\" role on \"${_resource_id}\"" 49 | exit 1 50 | else 51 | echo -e "\"${_role_name}\" role successfully assigned on \"${_resource_id}\"" 52 | fi 53 | -------------------------------------------------------------------------------- /notebooks/pipeline/02-source-to-bronze.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Bronze pipeline 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC #### Set widgets 9 | 10 | # COMMAND ---------- 11 | 12 | dbutils.widgets.removeAll() 13 | 14 | # COMMAND ---------- 15 | 16 | dbutils.widgets.text("sourcePath", "/databricks-datasets/weather/high_temps") 17 | dbutils.widgets.text("bronzeTable", "default.bronze") 18 | 19 | # COMMAND ---------- 20 | 21 | sourcePath = dbutils.widgets.get("sourcePath") 22 | bronzeTable = dbutils.widgets.get("bronzeTable") 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %md 27 | # MAGIC #### Check source data 28 | 29 | # COMMAND ---------- 30 | 31 | dbutils.fs.ls(sourcePath) 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC #### Read source data 37 | 38 | # COMMAND ---------- 39 | 40 | csvDF = ( 41 | spark.read.option( # The DataFrameReader 42 | "header", "true" 43 | ) # Use first line of all files as header 44 | .option("sep", ",") # Use comma delimiter (default) 45 | .option("inferSchema", "true") # Automatically infer schema 46 | .csv(sourcePath) # Creates a DataFrame from CSV after reading in the file(s) 47 | ) 48 | 49 | # COMMAND ---------- 50 | 51 | csvDF.printSchema() 52 | 53 | # COMMAND ---------- 54 | 55 | display(csvDF.limit(5)) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md 60 | # MAGIC #### Write to Delta Bronze table 61 | 62 | # COMMAND ---------- 63 | 64 | (csvDF.write.format("delta").mode("append").saveAsTable(bronzeTable)) 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC #### Optimize Delta table 70 | 71 | # COMMAND ---------- 72 | 73 | spark.sql("OPTIMIZE {}".format(bronzeTable)) 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %md 78 | # MAGIC #### Verify Delta table 79 | 80 | # COMMAND ---------- 81 | 82 | display(spark.sql("DESCRIBE DETAIL {}".format(bronzeTable))) 83 | 84 | # COMMAND ---------- 85 | 86 | display(spark.sql("SELECT COUNT(*) FROM {}".format(bronzeTable))) 87 | -------------------------------------------------------------------------------- /terraform/modules/databricks/azure-groups-sync/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Syncs a list of groups from the Azure AD Tenant to the Databricks workspace. 4 | 5 | The groups and users/service principals must already exist in the Azure AD Tenant. 6 | 7 | Does not support nested groups. 
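A minimal invocation sketch (the group names are illustrative placeholders; the groups must already exist in Azure AD):

```hcl
module "groups_sync" {
  source                = "../../modules/databricks/azure-groups-sync"
  groups                = ["example-project-group", "example-admins-group"]
  workspace_access      = ["example-project-group", "example-admins-group"]
  databricks_sql_access = ["example-project-group"]
  allow_cluster_create  = ["example-admins-group"]
}
```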
8 | 9 | This module will control both the groups and members: 10 | 11 | - if a user is removed from all groups it will also be removed from the Databricks workspace 12 | 13 | ## Inputs 14 | 15 | | Name | Description | Type | Default | Required | 16 | |----------------------------|----------------------------------------------------------------------|----------------|---------|:--------:| 17 | | groups | The list of groups to be synced | `list(string)` | n/a | yes | 18 | | workspace_access | A list of groups that should have access to Databricks Workspace | `list(string)` | `[]` | no | 19 | | databricks_sql_access | A list of groups that should have access to Databricks SQL | `list(string)` | `[]` | no | 20 | | allow_cluster_create | A sublist of groups that should have cluster create privileges | `list(string)` | `[]` | no | 21 | | allow_instance_pool_create | A sublist of groups that should have instance pool create privileges | `list(string)` | `[]` | no | 22 | 23 | ## Outputs 24 | 25 | | Name | Description | 26 | |-------------------------------|-----------------------------------------------------| 27 | | databricks_users | The details of the Databricks users | 28 | | databricks_service_principals | The details of the Databricks service principals | 29 | | databricks_groups | The details of the Databricks groups | 30 | | databricks_groups_membership | The Databricks IDs for the groups and their members | 31 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/secrets.tf: -------------------------------------------------------------------------------- 1 | ## Databricks Secrets 2 | 3 | # Create the Databricks Secret Scope 4 | resource "databricks_secret_scope" "main" { 5 | name = var.DATABRICKS_SECRET_SCOPE_NAME 6 | } 7 | 8 | # Give READ on the Secret Scope to the Azure Data Factory Managed Identity 9 | resource "databricks_secret_acl" "data_factory_principal" { 10 | principal = data.azuread_service_principal.data_factory.application_id 11 | permission = "READ" 12 | scope = databricks_secret_scope.main.name 13 | depends_on = [databricks_service_principal.data_factory] 14 | } 15 | 16 | # Give WRITE on the Secret Scope to the data pipeline Service Principal 17 | resource "databricks_secret_acl" "data_pipeline_principal" { 18 | principal = data.azuread_service_principal.data_pipeline.application_id 19 | permission = "WRITE" 20 | scope = databricks_secret_scope.main.name 21 | depends_on = [databricks_service_principal.data_pipeline] 22 | } 23 | 24 | # Get the data pipeline Service Principal Client Secret from the Key Vault 25 | data "azurerm_key_vault_secret" "sp_client_secret" { 26 | name = var.SECRET_NAME_CLIENT_SECRET 27 | key_vault_id = data.azurerm_key_vault.main.id 28 | } 29 | 30 | # Add the secret from Key Vault to the Databricks Secret Scope 31 | # This needs to be done until Key Vault backed Secret Scopes are supported with Service Principals 32 | resource "databricks_secret" "sp_client_secret" { 33 | key = var.SECRET_NAME_CLIENT_SECRET 34 | string_value = data.azurerm_key_vault_secret.sp_client_secret.value 35 | scope = databricks_secret_scope.main.id 36 | } 37 | 38 | # Terraform output 39 | output "databricks_secret_scopes" { 40 | value = { 41 | main = databricks_secret_scope.main 42 | } 43 | } 44 | 45 | output "data_pipeline_secrets" { 46 | value = { 47 | secret_scope_name = databricks_secret_scope.main.name 48 | key_vault_name = data.azurerm_key_vault.main.name 49 | secret_name_client_secret = 
databricks_secret.sp_client_secret.key 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scripts/add_workspace_permission.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adds a Databricks workspace permission using the Permissions API (https://docs.databricks.com/dev-tools/api/latest/permissions.html). 4 | # It uses simple positional arguments. 5 | # 6 | 7 | # Required parameters 8 | _workspace_url=${1} 9 | _access_token=${2} 10 | _resource_type=${3} 11 | _resource_id=${4} 12 | _principal_type=${5} 13 | _principal_id=${6} 14 | _permission=${7} 15 | 16 | # Local variables 17 | _python="$(command -v python || command -v python3)" 18 | 19 | _usage() { 20 | echo -e "Usage: ${0} [user, group, service_principal] " 21 | exit 1 22 | } 23 | 24 | # Parameters check 25 | [ -z "${_workspace_url}" ] && _usage 26 | [ -z "${_access_token}" ] && _usage 27 | [ -z "${_resource_type}" ] && _usage 28 | [ -z "${_resource_id}" ] && _usage 29 | [ -z "${_principal_type}" ] && _usage 30 | [ -z "${_principal_id}" ] && _usage 31 | [ -z "${_permission}" ] && _usage 32 | 33 | # Set the payload 34 | payload=' 35 | { 36 | "access_control_list": [ 37 | { 38 | "'${_principal_type}'_name": "'${_principal_id}'", 39 | "permission_level": "'${_permission}'" 40 | } 41 | ] 42 | } 43 | ' 44 | 45 | # Call the Databricks Permissions API 46 | echo -e "Setting the \"${_permission}\" permission to principal \"${_principal_id}\" on \"${_resource_type}\" \"${_resource_id}\"" 47 | _response=$(curl -sS --request PATCH \ 48 | --header "Authorization: Bearer ${_access_token}" \ 49 | --header "Content-Type: application/json" \ 50 | "${_workspace_url}/api/2.0/preview/permissions/${_resource_type}/${_resource_id}" \ 51 | -d "${payload}") 52 | 53 | # Get the error code 54 | _error_code=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["error_code"])' 2> /dev/null) 55 | 56 | # Return ok if there is no error code 57 | if [ "${_response}" == "{}" ] || [ -z "${_error_code}" ]; then 58 | echo -e "Permission level \"${_permission}\" set" 59 | else 60 | echo "${_response}" 61 | exit 1 62 | fi 63 | -------------------------------------------------------------------------------- /terraform/modules/azure/storage-account/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Storage Account (either Blob or ADLS) with optional parameters. 
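## Usage

A minimal invocation sketch for an ADLS Gen 2 account (the names and containers are illustrative placeholders):

```hcl
module "storage_account" {
  source                 = "../../modules/azure/storage-account"
  resource_group_name    = "example-rg"
  storage_account_name   = "exampledatalake01"
  hierarchical_namespace = true
  storage_containers     = ["bronze", "silver"]

  tags = { Project = "example" }
}
```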
4 | 5 | ## Inputs 6 | 7 | | Name | Description | Type | Default | Required | 8 | |--------------------------|--------------------------------------------------------------------------|----------------|---------------|:--------:| 9 | | resource_group_name | The name of the Resource Group in which the resources should exist | `string` | n/a | yes | 10 | | azure_location | Azure location in which the resources should exist | `string` | `null` | no | 11 | | storage_account_name | The name of the Storage Account | `string` | n/a | yes | 12 | | hierarchical_namespace | Set to true for an Azure Data Lake Gen 2 Storage Account | `bool` | `false` | no | 13 | | storage_containers | A list of containers to be created within the Storage Account | `list(string)` | `["default"]` | no | 14 | | account_replication_type | The type of replication to use for the Storage Account | `string` | `LRS` | no | 15 | | allowed_subnet_ids | The virtual network subnet IDs allowed to connect to the Storage Account | `list(string)` | `[]` | no | 16 | | allowed_ips | The IPs allowed to connect to the Storage Account | `list(string)` | `[]` | no | 17 | | network_default_action | Specifies the default action of allow or deny when no other rules match | `string` | `Allow` | no | 18 | | tags | A mapping of tags to assign to the resources | `map(string)` | `{}` | no | 19 | 20 | ## Outputs 21 | 22 | | Name | Description | 23 | |------|---------------------------------| 24 | | id | The ID of the Storage Account | 25 | | name | The name of the Storage Account | 26 | -------------------------------------------------------------------------------- /scripts/run_submit_notebook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python script that starts a Databricks Job using the Runs submit API (https://docs.databricks.com/dev-tools/api/latest/jobs.html#runs-submit). 4 | It uses simple positional arguments. 5 | It returns the run_id as a variable called notebookRunId in the Azure Pipelines format. 
6 | """ 7 | import json 8 | import sys 9 | 10 | import requests 11 | 12 | 13 | def main(): 14 | workspace_url = sys.argv[1] 15 | access_token = sys.argv[2] 16 | pool_or_node_type_id = sys.argv[3] 17 | num_workers = int(sys.argv[4]) 18 | spark_version = sys.argv[5] 19 | notebook_path = sys.argv[6] 20 | notebook_parameters = sys.argv[7] 21 | 22 | base_url = '{0}/api/2.0'.format(workspace_url.rstrip("/")) 23 | headers = { 24 | "Content-Type": "application/json", 25 | "Authorization": "Bearer " + access_token 26 | } 27 | payload = { 28 | "new_cluster": { 29 | "num_workers": num_workers, 30 | "spark_version": spark_version, 31 | "instance_pool_id": pool_or_node_type_id 32 | }, 33 | "notebook_task": { 34 | "notebook_path": notebook_path, 35 | "base_parameters": json.loads(notebook_parameters) 36 | } 37 | } 38 | 39 | all_node_types = requests.get(url=base_url + '/clusters/list-node-types', headers=headers).json() 40 | for node_type in all_node_types["node_types"]: 41 | if pool_or_node_type_id == node_type["node_type_id"]: 42 | payload["node_type_id"] = pool_or_node_type_id 43 | payload.pop("instance_pool_id") 44 | break 45 | 46 | response = requests.post(url=base_url + '/jobs/runs/submit', headers=headers, json=payload) 47 | if response.status_code == requests.codes.ok: 48 | run_id = response.json()['run_id'] 49 | print("run_id: {0}".format(run_id)) 50 | 51 | # Pass the variables to Azure Pipelines 52 | print("##vso[task.setvariable variable=notebookRunId;issecret=false]{0}".format(run_id)) 53 | return 54 | else: 55 | return response.text 56 | 57 | 58 | if __name__ == '__main__': 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /scripts/wait_for_job_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python script that waits for a Databricks Job to complete using the Runs get output API (https://docs.databricks.com/dev-tools/api/latest/jobs.html#runs-get-output). 4 | It uses simple positional arguments. 5 | It waits for a maximum of 15 minutes by default. 6 | It prints the notebook_output if one exists. 7 | """ 8 | import sys 9 | import time 10 | 11 | import requests 12 | 13 | 14 | def main(): 15 | workspace_url = sys.argv[1] 16 | access_token = sys.argv[2] 17 | run_id = sys.argv[3] 18 | run_wait_time = 900 # 15 minutes 19 | 20 | url = '{0}/api/2.0/jobs/runs/get-output?run_id={1}'.format(workspace_url.rstrip("/"), run_id) 21 | headers = { 22 | "Content-Type": "application/json", 23 | "Authorization": "Bearer " + access_token 24 | } 25 | 26 | current_run_time = 0 27 | response = requests.get(url=url, headers=headers) 28 | while current_run_time < run_wait_time: 29 | if response.status_code == requests.codes.ok: 30 | response_json = response.json() 31 | run_state = response_json["metadata"]["state"]["life_cycle_state"] 32 | if run_state == "INTERNAL_ERROR" or run_state == "SKIPPED": 33 | return run_state 34 | if run_state == "TERMINATED": 35 | result_state = response_json["metadata"]["state"]["result_state"] 36 | if result_state != "SUCCESS": 37 | return result_state 38 | if "notebook_output" in response_json.keys(): 39 | print(response_json["notebook_output"]["result"]) 40 | return 41 | current_run_time += 10 42 | print("Current state: " + str(run_state) + ". 
Sleeping for 10 seconds") 43 | print("Remaining: " + str(run_wait_time - current_run_time) + " seconds" + "\n") 44 | time.sleep(10) 45 | 46 | response = requests.get(url=url, headers=headers) 47 | else: 48 | return "Error " + str(response.status_code) + ":\n" + response.text 49 | else: 50 | return response.text 51 | 52 | 53 | if __name__ == '__main__': 54 | sys.exit(main()) 55 | -------------------------------------------------------------------------------- /pipelines/templates/run-notebook-job.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that starts a Runs submit job with a Databricks notebook and waits for it to finish 3 | # 4 | 5 | parameters: 6 | - name: databricksWorkspaceUrl 7 | displayName: 'Azure Databricks Workspace Url' 8 | type: string 9 | 10 | - name: accessToken 11 | displayName: 'Azure AD Access Token' 12 | type: string 13 | 14 | - name: databricksClusterSparkVersion 15 | displayName: 'Azure Databricks Cluster Spark Version' 16 | type: string 17 | default: '13.3.x-scala2.12' 18 | 19 | - name: databricksClusterNodeTypeOrPool 20 | displayName: 'Azure Databricks Cluster Node Type or Instance Pool Id' 21 | type: string 22 | 23 | - name: databricksClusterNumWorkers 24 | displayName: 'Number of worker nodes of the Databricks Cluster' 25 | type: number 26 | default: 1 27 | 28 | - name: notebookPath 29 | displayName: 'Databricks Notebook Path' 30 | type: string 31 | 32 | - name: notebookParameters 33 | displayName: 'Parameters of the Databricks Notebook' 34 | type: string 35 | default: '' 36 | 37 | - name: scriptsLocation 38 | displayName: 'Location of Scripts' 39 | type: string 40 | 41 | 42 | steps: 43 | - task: PythonScript@0 44 | displayName: 'Start Notebook Job with ${{ parameters.notebookPath }}' 45 | inputs: 46 | scriptSource: 'filePath' 47 | scriptPath: '${{ parameters.scriptsLocation }}/run_submit_notebook.py' 48 | arguments: '"${{ parameters.databricksWorkspaceUrl }}" 49 | "${{ parameters.accessToken }}" 50 | "${{ parameters.databricksClusterNodeTypeOrPool }}" 51 | "${{ parameters.databricksClusterNumWorkers }}" 52 | "${{ parameters.databricksClusterSparkVersion }}" 53 | "${{ parameters.notebookPath }}" 54 | "${{ parameters.notebookParameters }}"' 55 | 56 | - task: PythonScript@0 57 | displayName: 'Wait for Job to complete' 58 | inputs: 59 | scriptSource: 'filePath' 60 | scriptPath: '${{ parameters.scriptsLocation }}/wait_for_job_run.py' 61 | arguments: '"${{ parameters.databricksWorkspaceUrl }}" "${{ parameters.accessToken }}" "$(notebookRunId)"' 62 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/notebooks.tf: -------------------------------------------------------------------------------- 1 | ### Databricks Folders and Notebooks 2 | 3 | # Deploy Databricks generic notebooks in a Shared location 4 | resource "databricks_notebook" "shared" { 5 | for_each = fileset(var.NOTEBOOKS_SHARED_SOURCE_LOCATION, "*") 6 | source = "${var.NOTEBOOKS_SHARED_SOURCE_LOCATION}/${each.value}" 7 | path = "${var.NOTEBOOKS_SHARED_WORKSPACE_FOLDER}/${replace(each.value, "/(\\..*)$/", "")}" 8 | } 9 | 10 | # Create an empty workspace folder for the Pipeline notebooks 11 | resource "databricks_directory" "pipeline_folder" { 12 | path = var.NOTEBOOKS_PIPELINE_WORKSPACE_FOLDER 13 | } 14 | 15 | resource "databricks_permissions" "pipeline_folder" { 16 | directory_path = databricks_directory.pipeline_folder.path 17 | 18 | access_control { 19 | service_principal_name 
= data.azuread_service_principal.data_pipeline.application_id 20 | permission_level = "CAN_MANAGE" 21 | } 22 | 23 | access_control { 24 | service_principal_name = data.azuread_service_principal.data_factory.application_id 25 | permission_level = "CAN_RUN" 26 | } 27 | 28 | depends_on = [ 29 | databricks_directory.pipeline_folder, databricks_service_principal.data_factory, 30 | databricks_service_principal.data_pipeline 31 | ] 32 | } 33 | 34 | # Create an empty workspace folder for the Project notebooks 35 | resource "databricks_directory" "project_folder" { 36 | path = var.NOTEBOOKS_PROJECT_WORKSPACE_FOLDER 37 | } 38 | 39 | resource "databricks_permissions" "project_folder" { 40 | directory_path = databricks_directory.project_folder.path 41 | 42 | access_control { 43 | group_name = var.PROJECT_GROUP_NAME 44 | permission_level = "CAN_MANAGE" 45 | } 46 | 47 | depends_on = [databricks_directory.project_folder, module.project_group_sync] 48 | } 49 | 50 | # Terraform output 51 | output "databricks_folders" { 52 | value = { 53 | pipeline_folder = databricks_directory.pipeline_folder.path 54 | project_folder = databricks_directory.project_folder.path 55 | shared_folder = { 56 | path = var.NOTEBOOKS_SHARED_WORKSPACE_FOLDER 57 | notebook_path_list = toset([for path, details in databricks_notebook.shared : details]) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /terraform/modules/azure/storage-account/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Storage Account with containers and network rules 3 | */ 4 | data "azurerm_resource_group" "this" { 5 | name = var.resource_group_name 6 | } 7 | 8 | locals { 9 | location = var.azure_location == null ? data.azurerm_resource_group.this.location : var.azure_location 10 | 11 | tags = { 12 | ManagedBy = "Terraform" 13 | } 14 | } 15 | 16 | resource "azurerm_storage_account" "this" { 17 | name = var.storage_account_name 18 | location = local.location 19 | resource_group_name = data.azurerm_resource_group.this.name 20 | account_kind = "StorageV2" 21 | account_tier = "Standard" 22 | account_replication_type = var.account_replication_type 23 | access_tier = "Hot" 24 | is_hns_enabled = var.hierarchical_namespace 25 | enable_https_traffic_only = true 26 | tags = merge(local.tags, var.tags) 27 | } 28 | 29 | resource "azurerm_storage_container" "default" { 30 | count = var.hierarchical_namespace == false ? length(var.storage_containers) : 0 31 | name = var.storage_containers[count.index] 32 | storage_account_name = azurerm_storage_account.this.name 33 | container_access_type = "private" 34 | depends_on = [azurerm_storage_account.this] 35 | } 36 | 37 | resource "azurerm_storage_data_lake_gen2_filesystem" "default" { 38 | count = var.hierarchical_namespace == true ? 
length(var.storage_containers) : 0 39 | name = var.storage_containers[count.index] 40 | storage_account_id = azurerm_storage_account.this.id 41 | depends_on = [azurerm_storage_account.this] 42 | } 43 | 44 | resource "azurerm_storage_account_network_rules" "default" { 45 | storage_account_id = azurerm_storage_account.this.id 46 | default_action = var.network_default_action 47 | ip_rules = var.allowed_ips 48 | virtual_network_subnet_ids = var.allowed_subnet_ids 49 | bypass = ["Logging", "Metrics", "AzureServices"] 50 | depends_on = [azurerm_storage_container.default, azurerm_storage_data_lake_gen2_filesystem.default] 51 | } 52 | -------------------------------------------------------------------------------- /pipelines/templates/deploy-instance-pool.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that deploys a Databricks Instance Pool. 3 | # 4 | 5 | parameters: 6 | - name: databricksWorkspaceUrl 7 | displayName: 'Azure Databricks Workspace Url' 8 | type: string 9 | 10 | - name: accessToken 11 | displayName: 'Azure AD Access Token' 12 | type: string 13 | 14 | - name: databricksPoolName 15 | displayName: 'Name of the Azure Databricks Instance Pool' 16 | type: string 17 | 18 | - name: databricksPoolNodeType 19 | displayName: 'Azure Databricks Node Type' 20 | type: string 21 | 22 | - name: databricksPoolNodeAvailability 23 | displayName: 'Spot or On Demand instances' 24 | type: string 25 | default: 'ON_DEMAND_AZURE' 26 | values: 27 | - SPOT_AZURE 28 | - ON_DEMAND_AZURE 29 | 30 | - name: databricksPoolMinIdleInstances 31 | displayName: 'The minimum number of idle instances maintained by the pool' 32 | type: number 33 | default: 0 34 | 35 | - name: databricksPoolIdleInstanceAutotermination 36 | displayName: 'The number of minutes that idle instances are maintained by the pool before being terminated' 37 | type: number 38 | default: 30 39 | 40 | - name: databricksPoolSparkVersion 41 | displayName: 'Azure Databricks Spark Version' 42 | type: string 43 | default: '13.3.x-scala2.12' 44 | 45 | - name: scriptsLocation 46 | displayName: 'Location of Scripts' 47 | type: string 48 | 49 | 50 | steps: 51 | - task: PythonScript@0 52 | displayName: 'Create Databricks Instance Pool ${{ parameters.databricksPoolName }}' 53 | inputs: 54 | scriptSource: 'filePath' 55 | scriptPath: '${{ parameters.scriptsLocation }}/create_instance_pool.py' 56 | arguments: '"${{ parameters.databricksWorkspaceUrl }}" 57 | "${{ parameters.accessToken }}" 58 | "${{ parameters.databricksPoolName }}" 59 | "${{ parameters.databricksPoolNodeType }}" 60 | "${{ parameters.databricksPoolMinIdleInstances }}" 61 | "${{ parameters.databricksPoolIdleInstanceAutotermination }}" 62 | "${{ parameters.databricksPoolSparkVersion }}" 63 | "${{ parameters.databricksPoolNodeAvailability }}"' 64 | -------------------------------------------------------------------------------- /pipelines/templates/get-workspace-login.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that retrieves the Azure Databricks workspace URL and an AAD Access Token that can be used to access it. 3 | # By default, it uses the Azure Pipelines Principal that is automatically available when using an AzureCLI task. 4 | # If the 'servicePrincipalClientId' and 'servicePrincipalClientSecret' parameters are set, then the script logs in with these credentials before generating the token. 
5 | # 6 | 7 | parameters: 8 | - name: serviceConnection 9 | displayName: 'Azure Resource Manager service connection' 10 | type: string 11 | 12 | - name: resourceGroupName 13 | displayName: 'Azure Databricks Resource Group Name' 14 | type: string 15 | 16 | - name: databricksWorkspaceName 17 | displayName: 'Azure Databricks Workspace Name' 18 | type: string 19 | 20 | - name: databricksUniqueId 21 | displayName: 'Databricks Unique Id' 22 | type: string 23 | default: '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d' 24 | 25 | - name: servicePrincipalClientId 26 | displayName: '(Optional) Service Principal Client Id to be used for login' 27 | type: string 28 | default: '' 29 | 30 | - name: servicePrincipalClientSecret 31 | displayName: '(Optional) Service Principal Client Secret to be used for login' 32 | type: string 33 | default: '' 34 | 35 | - name: scriptsLocation 36 | displayName: 'Location of Scripts' 37 | type: string 38 | 39 | 40 | steps: 41 | - task: AzureCLI@2 42 | displayName: 'Get Databricks workspace URL' 43 | inputs: 44 | azureSubscription: '${{ parameters.serviceConnection }}' 45 | scriptType: 'bash' 46 | scriptPath: '${{ parameters.scriptsLocation }}/get_workspace_url.sh' 47 | arguments: '"${{ parameters.resourceGroupName }}" "${{ parameters.databricksWorkspaceName }}"' 48 | 49 | - task: AzureCLI@2 50 | displayName: 'Get AAD Access Token' 51 | inputs: 52 | azureSubscription: '${{ parameters.serviceConnection }}' 53 | addSpnToEnvironment: true 54 | scriptType: 'bash' 55 | scriptPath: '${{ parameters.scriptsLocation }}/get_access_token.sh' 56 | arguments: '"${{ parameters.databricksUniqueId }}" "${{ parameters.servicePrincipalClientId }}" "${{ parameters.servicePrincipalClientSecret }}"' 57 | -------------------------------------------------------------------------------- /terraform/deployments/azure-infrastructure/databricks-workspace.tf: -------------------------------------------------------------------------------- 1 | ### Databricks workspace 2 | 3 | # Deploy a Virtual Network for Databricks 4 | module "databricks_vnet" { 5 | source = "../../modules/azure/databricks-vnet" 6 | azure_location = data.azurerm_resource_group.main.location 7 | resource_group_name = data.azurerm_resource_group.main.name 8 | virtual_network_name = var.DATABRICKS_VNET_NAME 9 | virtual_network_cidr = var.DATABRICKS_VNET_CIDR 10 | network_security_group_name = var.DATABRICKS_NSG_NAME 11 | private_subnet_name = var.DATABRICKS_PRIVATE_SUBNET_NAME 12 | private_subnet_cidr = var.DATABRICKS_PRIVATE_SUBNET_CIDR 13 | public_subnet_name = var.DATABRICKS_PUBLIC_SUBNET_NAME 14 | public_subnet_cidr = var.DATABRICKS_PUBLIC_SUBNET_CIDR 15 | service_endpoints = ["Microsoft.Storage", "Microsoft.AzureActiveDirectory"] 16 | use_nat_gateway = tobool(var.DATABRICKS_DISABLE_PUBLIC_IP) 17 | tags = var.deployment_tags 18 | } 19 | 20 | # Deploy the Databricks workspace with the custom VNet 21 | module "databricks_workspace_vnet_injection" { 22 | source = "../../modules/azure/databricks-workspace" 23 | azure_location = data.azurerm_resource_group.main.location 24 | resource_group_name = data.azurerm_resource_group.main.name 25 | workspace_name = var.DATABRICKS_WORKSPACE_NAME 26 | pricing_tier = var.DATABRICKS_PRICING_TIER 27 | virtual_network_name = module.databricks_vnet.virtual_network_name 28 | private_subnet_name = module.databricks_vnet.private_subnet_name 29 | public_subnet_name = module.databricks_vnet.public_subnet_name 30 | disable_public_ip = tobool(var.DATABRICKS_DISABLE_PUBLIC_IP) 31 | tags = var.deployment_tags 32 | 
depends_on = [module.databricks_vnet] 33 | } 34 | 35 | 36 | ### Terraform output 37 | 38 | output "databricks_workspace" { 39 | value = { 40 | id = module.databricks_workspace_vnet_injection.id 41 | url = module.databricks_workspace_vnet_injection.workspace_url 42 | managed_resource_group_id = module.databricks_workspace_vnet_injection.managed_resource_group_id 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /terraform/modules/databricks/cluster-policy/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates a Databricks cluster policy with optional CAN_USE permissions. 4 | 5 | The policy will be created by [merging](https://www.terraform.io/docs/language/functions/merge.html) the following 6 | sources (in this order): 7 | 8 | 1. a default policy definition with optional variables 9 | 2. a policy json file 10 | 3. policy overrides as a Terraform object 11 | 12 | ## Inputs 13 | 14 | | Name | Description | Type | Default | Required | 15 | |---------------------------------|-----------------------------------------------------------------------------|----------------|----------------------|:--------:| 16 | | policy_name | Cluster policy name | `string` | n/a | yes | 17 | | CAN_USE | Objects of principals that should have CAN_USE permission on the policy | `list(object)` | `[]` | no | 18 | | default_spark_version_regex | The default policy Spark version regex | `string` | `.*-scala2.12` | no | 19 | | default_autotermination_minutes | The default policy cluster autotermination in minutes | `number` | `120` | no | 20 | | default_cluster_log_path | The default policy location to deliver Spark driver, worker, and event logs | `string` | `dbfs:/cluster-logs` | no | 21 | | policy_overrides_file | The path to a json file containing any cluster policy overrides | `string` | `null` | no | 22 | | policy_overrides_object | Cluster policy overrides defined as object | `object` | `{}` | no | 23 | 24 | ## Outputs 25 | 26 | | Name | Description | 27 | |-------------|----------------------------------------------------------| 28 | | id | The ID of the cluster policy in the Databricks workspace | 29 | | details | Details about the cluster policy | 30 | | permissions | List with the cluster policy permissions | 31 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/principals.tf: -------------------------------------------------------------------------------- 1 | ### Databricks Principals 2 | 3 | # Add the Azure Data Factory Service Principal to the Databricks workspace 4 | # The Service Principal must have 'allow_cluster_create' in order to create new jobs clusters as policies are not supported by ADF 5 | resource "databricks_service_principal" "data_factory" { 6 | application_id = data.azuread_service_principal.data_factory.application_id 7 | display_name = data.azuread_service_principal.data_factory.display_name 8 | external_id = data.azuread_service_principal.data_factory.object_id 9 | workspace_access = true 10 | databricks_sql_access = true 11 | allow_cluster_create = true 12 | active = true 13 | force = true 14 | disable_as_user_deletion = true 15 | } 16 | 17 | # Add the data pipeline Service Principal to the Databricks workspace 18 | resource "databricks_service_principal" "data_pipeline" { 19 | application_id = data.azuread_service_principal.data_pipeline.application_id 20 | display_name = 
data.azuread_service_principal.data_pipeline.display_name 21 | external_id = data.azuread_service_principal.data_pipeline.object_id 22 | workspace_access = true 23 | databricks_sql_access = true 24 | allow_cluster_create = true 25 | active = true 26 | force = true 27 | disable_as_user_deletion = true 28 | } 29 | 30 | # Sync the AD Project group with the Databricks workspace 31 | module "project_group_sync" { 32 | source = "../../modules/databricks/azure-groups-sync" 33 | groups = [var.PROJECT_GROUP_NAME] 34 | workspace_access = [var.PROJECT_GROUP_NAME] 35 | databricks_sql_access = [var.PROJECT_GROUP_NAME] 36 | } 37 | 38 | # Terraform output 39 | output "databricks_principals" { 40 | value = { 41 | data_factory = { 42 | id = databricks_service_principal.data_factory.id 43 | display_name = data.azuread_service_principal.data_factory.display_name 44 | } 45 | data_pipeline = { 46 | id = databricks_service_principal.data_pipeline.id 47 | display_name = data.azuread_service_principal.data_pipeline.display_name 48 | } 49 | project_group = module.project_group_sync.databricks_groups 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /arm/azure-data-factory-with-key-vault.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "factoryName": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "The name of the Data Factory to create." 9 | } 10 | }, 11 | "location": { 12 | "type": "string", 13 | "defaultValue": "[resourceGroup().location]", 14 | "metadata": { 15 | "description": "Specifies the Azure location where the data factory should be created." 16 | } 17 | }, 18 | "keyVaultName": { 19 | "type": "string", 20 | "metadata": { 21 | "description": "Name of the Azure Key Vault." 22 | } 23 | }, 24 | "keyVaultUrl": { 25 | "type": "string", 26 | "metadata": { 27 | "description": "URL of the Azure Key Vault." 
28 | } 29 | } 30 | }, 31 | "variables": { 32 | "dataFactoryId": "[resourceId('Microsoft.DataFactory/factories', parameters('factoryName'))]" 33 | }, 34 | "resources": [ 35 | { 36 | "name": "[parameters('factoryName')]", 37 | "type": "Microsoft.DataFactory/factories", 38 | "apiVersion": "2018-06-01", 39 | "location": "[parameters('location')]", 40 | "identity": { 41 | "type": "SystemAssigned" 42 | }, 43 | "properties": {} 44 | }, 45 | { 46 | "name": "[concat(parameters('factoryName'), '/', parameters('keyVaultName'))]", 47 | "type": "Microsoft.DataFactory/factories/linkedServices", 48 | "apiVersion": "2018-06-01", 49 | "properties": { 50 | "annotations": [ 51 | ], 52 | "type": "AzureKeyVault", 53 | "typeProperties": { 54 | "baseUrl": "[parameters('keyVaultUrl')]" 55 | } 56 | }, 57 | "dependsOn": [ 58 | "[variables('dataFactoryId')]" 59 | ] 60 | } 61 | ], 62 | "outputs": { 63 | "dataFactoryId": { 64 | "type": "string", 65 | "value": "[variables('dataFactoryId')]" 66 | }, 67 | "dataFactorySystemIdentity": { 68 | "type": "string", 69 | "value": "[reference(variables('dataFactoryId'), '2018-06-01', 'Full').identity.principalId]" 70 | }, 71 | "keyVaultServiceName": { 72 | "type": "string", 73 | "value": "[parameters('keyVaultName')]" 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /scripts/add_api_permission.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adds an API Permission to a Service Principal (https://docs.microsoft.com/en-us/graph/permissions-reference). 4 | # Works with either Azure Active Directory Graph API or Microsoft Graph API. 5 | # It uses simple positional arguments. 6 | # 7 | 8 | # Required parameters 9 | _api_permission=${1} 10 | _sp_client_id=${2} 11 | 12 | # Local variables 13 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 14 | _realpath="$(command -v realpath || echo _realpath )" 15 | _script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 16 | _python="$(command -v python || command -v python3)" 17 | _msGraphResourceId="00000003-0000-0000-c000-000000000000" # Microsoft Graph API ID 18 | 19 | _usage() { 20 | echo -e "Usage: ${0} " 21 | exit 1 22 | } 23 | 24 | # Parameters check 25 | [ -z "${_api_permission}" ] && _usage 26 | [ -z "${_sp_client_id}" ] && _usage 27 | 28 | # Get the API Permission ID 29 | echo -e "Getting the ID for \"${_api_permission}\"" 30 | api_permission_id=$(az ad sp show --id ${_msGraphResourceId} --query "appRoles[?value=='${_api_permission}']" \ 31 | | ${_python} -c 'import sys,json; print(json.load(sys.stdin)[0]["id"])') 32 | [ -z "${api_permission_id}" ] && exit 1 33 | echo -e "Got the API Permission ID: \"${api_permission_id}\"" 34 | 35 | # Use the az cli command to set the API Permission 36 | echo -e "Setting the API Permission on the Service Principal \"${_sp_client_id}\"" 37 | timer=0 38 | while [ ${timer} -lt 100 ]; do 39 | az ad app permission add --id "${_sp_client_id}" \ 40 | --api "${_msGraphResourceId}" \ 41 | --api-permissions "${api_permission_id}=Role" && break 42 | echo -e "Principal \"${_sp_client_id}\" might not be accessible yet, sleeping for 10 seconds" 43 | sleep 10 && timer=$((timer+10)) && (exit 1) 44 | done || { echo "ERROR: Timed out waiting"; exit 1; } 45 | 46 | 47 | # az ad app permission grant --id "${_sp_client_id}" \ 48 | # --api "${_adGraphResourceId}" \ 49 | # --scope "${api_permission_id}" || exit 1 50 | 51 | # Grant admin-consent using the CLI command 
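# (Note: 'az ad app permission admin-consent' grants tenant-wide consent for the Graph app roles
#  requested above; it requires directory admin rights and can take a moment to propagate, hence
#  the sleep below. A manual check afterwards could be: az ad app permission list --id "${_sp_client_id}")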
52 | echo -e "Granting admin-consent to the Service Principal \"${_sp_client_id}\"" 53 | sleep 10 54 | az ad app permission admin-consent --id "${_sp_client_id}" || exit 1 55 | -------------------------------------------------------------------------------- /scripts/get_workspace_url.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Retrieves a Databricks workspace URL using the Azure CLI ('az resource show'). 4 | # It uses simple positional arguments. 5 | # Returns the workspace URL as a variable called databricksWorkspaceUrl in the Azure Pipelines format. 6 | # Returns the workspace ID as a variable called databricksWorkspaceId in the Azure Pipelines format. 7 | # Returns the workspace hostname as a variable called databricksWorkspaceHostname in the Azure Pipelines format. 8 | # 9 | 10 | # Required parameters 11 | _resource_group_name=${1} 12 | _workspace_name=${2} 13 | 14 | # Local variables 15 | _python="$(command -v python || command -v python3)" 16 | 17 | _usage() { 18 | echo -e "Usage: ${0} " 19 | exit 1 20 | } 21 | 22 | # Parameters check 23 | [ -z "${_resource_group_name}" ] && _usage 24 | [ -z "${_workspace_name}" ] && _usage 25 | 26 | # Use the az cli command 27 | echo -e "Getting the URL of Workspace ${_workspace_name} from Resource Group ${_resource_group_name}" 28 | _response=$(az resource show --name "${_workspace_name}" \ 29 | --resource-type "Microsoft.Databricks/workspaces" \ 30 | --resource-group "${_resource_group_name}" \ 31 | --output json) 32 | [ -z "${_response}" ] && exit 1 33 | 34 | # Get the Databricks workspace URL from response 35 | workspace_hostname=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["properties"]["workspaceUrl"])') 36 | [ -z "${workspace_hostname}" ] && { echo "${_response}"; exit 1; } 37 | workspace_url="https://${workspace_hostname}" 38 | echo -e "Got the URL: ${workspace_url}" 39 | 40 | # Get the Databricks workspace Resource ID from response 41 | workspace_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["id"])') 42 | [ -z "${workspace_id}" ] && { echo "${_response}"; exit 1; } 43 | 44 | # Pass the variables to Azure Pipelines 45 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 46 | [ -n "${workspace_id}" ] && echo "##vso[task.setvariable variable=databricksWorkspaceId;issecret=false]${workspace_id}" 47 | [ -n "${workspace_hostname}" ] && echo "##vso[task.setvariable variable=databricksWorkspaceHostname;issecret=false]${workspace_hostname}" 48 | [ -n "${workspace_url}" ] && echo "##vso[task.setvariable variable=databricksWorkspaceUrl;issecret=false]${workspace_url}" 49 | fi 50 | -------------------------------------------------------------------------------- /pipelines/vars.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | DATA_SERVICE_PRINCIPAL_CLIENT_ID: $(provisionedServicePrincipalClientId) # provisioned by the admin setup 3 | PROJECT_GROUP_NAME: $(provisionedProjectGroupName) # provisioned by the admin setup 4 | RESOURCE_GROUP_NAME: $(provisionedResourceGroupName) # provisioned by the admin setup 5 | KEY_VAULT_NAME: $(provisionedKeyVaultName) # provisioned by the admin setup 6 | SECRET_NAME_CLIENT_SECRET: $(provisionedSecretName) # provisioned by the admin setup 7 | STORAGE_ACCOUNT_NAME: 'dlsdatabricksunique123' # must be unique in Azure 8 | PIPELINE_CONTAINER_NAME: 'pipeline' 9 | PROJECT_CONTAINER_NAME: 'project' 10 | DATA_FACTORY_NAME: 
'adf-databricks-unique123' # must be unique in Azure 11 | DATA_FACTORY_PIPELINE_NAME: 'databricks-pipeline' 12 | DATABRICKS_WORKSPACE_NAME: 'databricks-workspace' 13 | DATABRICKS_PRICING_TIER: 'premium' 14 | DATABRICKS_VNET_NAME: 'databricks-vnet' 15 | DATABRICKS_VNET_CIDR: '10.179.0.0/16' 16 | DATABRICKS_PRIVATE_SUBNET_NAME: 'private-subnet' 17 | DATABRICKS_PRIVATE_SUBNET_CIDR: '10.179.0.0/18' 18 | DATABRICKS_PUBLIC_SUBNET_NAME: 'public-subnet' 19 | DATABRICKS_PUBLIC_SUBNET_CIDR: '10.179.64.0/18' 20 | DATABRICKS_NSG_NAME: 'databricks-nsg' 21 | DATABRICKS_DISABLE_PUBLIC_IP: false 22 | DATABRICKS_SECRET_SCOPE_NAME: 'databricks-secret-scope' 23 | DATABRICKS_JOBS_POOL_NAME: 'Jobs Pool' 24 | DATABRICKS_JOBS_POOL_NODE_TYPE: 'Standard_F4s_v2' 25 | DATABRICKS_SHARED_POOL_NAME: 'Shared Pool' 26 | DATABRICKS_SHARED_POOL_NODE_TYPE: 'Standard_D4ds_v4' 27 | DATABRICKS_SHARED_CLUSTER_NAME: 'Shared Autoscaling' 28 | DATABRICKS_SPARK_VERSION: '13.3.x-scala2.12' 29 | DATABRICKS_CLUSTER_POLICY_LOCATION: '$(System.DefaultWorkingDirectory)/pipelines/templates/cluster-policy-single-node.json' 30 | NOTEBOOKS_SHARED_SOURCE_LOCATION: '$(System.DefaultWorkingDirectory)/notebooks/shared' 31 | NOTEBOOKS_SHARED_WORKSPACE_FOLDER: '/Shared/generic' 32 | NOTEBOOKS_PROJECT_WORKSPACE_FOLDER: '/Project' 33 | NOTEBOOKS_PIPELINE_SOURCE_LOCATION: '$(System.DefaultWorkingDirectory)/notebooks/pipeline' 34 | NOTEBOOKS_PIPELINE_WORKSPACE_FOLDER: '/Pipeline' 35 | -------------------------------------------------------------------------------- /terraform/deployments/workspace-bootstrap/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Bootstraps the Databricks workspace for the data pipeline and project. 3 | */ 4 | 5 | provider "azurerm" { 6 | features {} 7 | skip_provider_registration = true 8 | } 9 | 10 | terraform { 11 | required_version = "~> 1.5.7" 12 | 13 | backend "azurerm" {} 14 | 15 | required_providers { 16 | azuread = { 17 | source = "hashicorp/azuread" 18 | version = "~> 2" 19 | } 20 | azurerm = { 21 | source = "hashicorp/azurerm" 22 | version = "~> 3" 23 | } 24 | random = { 25 | source = "hashicorp/random" 26 | version = "~> 3" 27 | } 28 | databricks = { 29 | source = "databricks/databricks" 30 | version = "~> 1.24.1" 31 | } 32 | } 33 | } 34 | 35 | 36 | ### Data Sources 37 | 38 | # Get information about the AzureRM provider 39 | data "azurerm_client_config" "current" {} 40 | 41 | # Get information about the pre-provisioned Resource Group 42 | data "azurerm_resource_group" "main" { 43 | name = var.RESOURCE_GROUP_NAME 44 | } 45 | 46 | # Get information about the pre-provisioned Project group 47 | data "azuread_group" "project_group" { 48 | display_name = var.PROJECT_GROUP_NAME 49 | security_enabled = true 50 | } 51 | 52 | # Get information about the pre-provisioned Service Principal 53 | data "azuread_service_principal" "data_pipeline" { 54 | application_id = var.DATA_SERVICE_PRINCIPAL_CLIENT_ID 55 | } 56 | 57 | # Get information about the Azure Key Vault 58 | data "azurerm_key_vault" "main" { 59 | name = var.KEY_VAULT_NAME 60 | resource_group_name = data.azurerm_resource_group.main.name 61 | } 62 | 63 | # Get information about the Databricks workspace 64 | data "azurerm_databricks_workspace" "main" { 65 | name = var.DATABRICKS_WORKSPACE_NAME 66 | resource_group_name = data.azurerm_resource_group.main.name 67 | } 68 | 69 | # Get information about the Azure Data Factory 70 | data "azurerm_data_factory" "main" { 71 | name = var.DATA_FACTORY_NAME 72 | 
resource_group_name = data.azurerm_resource_group.main.name 73 | } 74 | 75 | # Get the Azure Data Factory Service Principal ID of the Managed Identity Object ID 76 | data "azuread_service_principal" "data_factory" { 77 | object_id = data.azurerm_data_factory.main.identity[0].principal_id 78 | } 79 | 80 | # Configure the Databricks Terraform provider 81 | provider "databricks" { 82 | host = data.azurerm_databricks_workspace.main.workspace_url 83 | } 84 | -------------------------------------------------------------------------------- /scripts/create_service_principal.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Creates an Azure Service Principal (using 'az ad sp create-for-rbac'). 4 | # It uses simple positional arguments. 5 | # Returns the service principal client id as a variable called spClientId in the Azure Pipelines format. 6 | # Returns the service principal object id as a variable called spObjectId in the Azure Pipelines format. 7 | # Returns the service principal client secret as a variable called spClientSecret in the Azure Pipelines format. 8 | # 9 | 10 | # Required parameters 11 | _sp_registration_name=${1} 12 | 13 | # Local variables 14 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 15 | _realpath="$(command -v realpath || echo _realpath )" 16 | _script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 17 | _python="$(command -v python || command -v python3)" 18 | 19 | _usage() { 20 | echo -e "Usage: ${0} " 21 | exit 1 22 | } 23 | 24 | # Parameters check 25 | [ -z "${_sp_registration_name}" ] && _usage 26 | 27 | # Create the Service Principal 28 | echo -e "Creating a Service Principal named \"${_sp_registration_name}\"" 29 | _response=$(az ad sp create-for-rbac --name "${_sp_registration_name}") 30 | [ -z "${_response}" ] && exit 1 31 | 32 | # Extract the Service Principal Client ID from the response 33 | echo -e "Extracting the Service Principal Client ID and Client Secret from the JSON response" 34 | sp_client_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["appId"])') 35 | [ -z "${sp_client_id}" ] && { echo "${_response}"; exit 1; } 36 | echo -e "Got the Client ID: \"${sp_client_id}\"" 37 | 38 | # Extract the Service Principal Secret from the response 39 | sp_client_secret=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["password"])') 40 | [ -z "${sp_client_secret}" ] && exit 1 41 | echo -e "Got the Client Secret" 42 | 43 | # Get the Object ID of the Service Principal 44 | source "${_script_dir}/get_object_details.sh" "${sp_client_id}" 45 | sp_object_id=${object_id} 46 | 47 | # Pass the variables to Azure Pipelines 48 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 49 | [ -n "${sp_client_id}" ] && echo "##vso[task.setvariable variable=spClientId;issecret=false]${sp_client_id}" || exit 1 50 | [ -n "${sp_object_id}" ] && echo "##vso[task.setvariable variable=spObjectId;issecret=false]${sp_object_id}" || exit 1 51 | [ -n "${sp_client_secret}" ] && echo "##vso[task.setvariable variable=spClientSecret;issecret=true]${sp_client_secret}" || exit 1 52 | fi 53 | -------------------------------------------------------------------------------- /notebooks/pipeline/03-bronze-to-silver.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Silver pipeline 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC #### 
Set widgets 9 | 10 | # COMMAND ---------- 11 | 12 | dbutils.widgets.text("bronzeTable", "default.bronze") 13 | dbutils.widgets.text("silverTable", "default.silver") 14 | 15 | # COMMAND ---------- 16 | 17 | bronzeTable = dbutils.widgets.get("bronzeTable") 18 | silverTable = dbutils.widgets.get("silverTable") 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md 23 | # MAGIC #### Check Bronze table 24 | 25 | # COMMAND ---------- 26 | 27 | display(spark.sql("DESCRIBE DETAIL {}".format(bronzeTable))) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC #### Read Bronze table 33 | 34 | # COMMAND ---------- 35 | 36 | bronzeDF = spark.read.table(bronzeTable) 37 | 38 | # COMMAND ---------- 39 | 40 | bronzeDF.printSchema() 41 | 42 | # COMMAND ---------- 43 | 44 | # MAGIC %md 45 | # MAGIC #### Transform Bronze data 46 | 47 | # COMMAND ---------- 48 | 49 | from pyspark.sql import functions as F 50 | 51 | transformedDF = bronzeDF.withColumnRenamed("temp", "temperature").withColumn( 52 | "date", F.to_date(F.col("date"), "yyyy-MM-dd") 53 | ) 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC #### Create the Delta Silver table if not exists 59 | 60 | # COMMAND ---------- 61 | 62 | (transformedDF.limit(0).write.format("delta").mode("ignore").saveAsTable(silverTable)) 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md 67 | # MAGIC #### Write to Delta Silver table 68 | # MAGIC Using `MERGE` to avoid duplicates 69 | 70 | # COMMAND ---------- 71 | 72 | from delta.tables import * 73 | 74 | deltaSilverTable = DeltaTable.forName(spark, silverTable) 75 | 76 | ( 77 | deltaSilverTable.alias("silver") 78 | .merge(transformedDF.alias("updates"), "silver.date = updates.date") 79 | .whenNotMatchedInsertAll() 80 | .execute() 81 | ) 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md 86 | # MAGIC #### Optimize Delta table 87 | 88 | # COMMAND ---------- 89 | 90 | spark.sql("OPTIMIZE {}".format(silverTable)) 91 | 92 | # COMMAND ---------- 93 | 94 | # MAGIC %md 95 | # MAGIC #### Verify Delta table 96 | 97 | # COMMAND ---------- 98 | 99 | display(spark.sql("DESCRIBE DETAIL {}".format(silverTable))) 100 | 101 | # COMMAND ---------- 102 | 103 | display(spark.sql("SELECT COUNT(*) FROM {}".format(silverTable))) 104 | 105 | # COMMAND ---------- 106 | -------------------------------------------------------------------------------- /terraform/modules/azure/service-principal/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Service Principal with optional App Owners and API Permissions (including admin-consent). 3 | */ 4 | data "azurerm_client_config" "current" {} 5 | 6 | # Get the well-known application IDs for APIs published by Microsoft 7 | data "azuread_application_published_app_ids" "well_known" {} 8 | 9 | # Get all information about the Microsoft Graph API 10 | data "azuread_service_principal" "msgraph" { 11 | application_id = data.azuread_application_published_app_ids.well_known.result.MicrosoftGraph 12 | } 13 | 14 | # Create the Azure App Registration 15 | resource "azuread_application" "sp" { 16 | display_name = var.name 17 | identifier_uris = ["api://${var.name}"] 18 | owners = var.owners 19 | 20 | # Use the app_role_ids mapping to get the app role IDs of the required API Permissions 21 | dynamic required_resource_access { 22 | for_each = length(var.api_permissions) > 0 ? 
[1] : [] 23 | content { 24 | resource_app_id = data.azuread_service_principal.msgraph.application_id 25 | dynamic resource_access { 26 | for_each = var.api_permissions 27 | content { 28 | id = data.azuread_service_principal.msgraph.app_role_ids[resource_access.value] 29 | type = "Role" 30 | } 31 | } 32 | } 33 | } 34 | 35 | web { 36 | homepage_url = "https://${var.name}" 37 | redirect_uris = [] 38 | 39 | implicit_grant { 40 | access_token_issuance_enabled = false 41 | } 42 | } 43 | } 44 | 45 | # Create the Service Principal associated with the App Registration 46 | resource "azuread_service_principal" "sp" { 47 | application_id = azuread_application.sp.application_id 48 | app_role_assignment_required = false 49 | depends_on = [azuread_application.sp] 50 | } 51 | 52 | # Grant admin-consent for the requested API Permissions 53 | resource "azuread_app_role_assignment" "admin_consent" { 54 | for_each = toset(var.api_permissions) 55 | 56 | app_role_id = data.azuread_service_principal.msgraph.app_role_ids[each.value] 57 | principal_object_id = azuread_service_principal.sp.object_id 58 | resource_object_id = data.azuread_service_principal.msgraph.object_id 59 | } 60 | 61 | # Create the Service Principal client secret 62 | resource "azuread_application_password" "sp" { 63 | application_object_id = azuread_application.sp.id 64 | display_name = "Managed by Terraform" 65 | end_date_relative = var.secret_expiration 66 | depends_on = [azuread_service_principal.sp] 67 | } 68 | -------------------------------------------------------------------------------- /arm/azure-data-lake-gen-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "storageAccountName": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "Specifies the name of the storage account." 9 | } 10 | }, 11 | "storageAccountType": { 12 | "type": "string", 13 | "defaultValue": "Standard_LRS", 14 | "allowedValues": [ 15 | "Standard_LRS", 16 | "Standard_GRS", 17 | "Standard_RAGRS", 18 | "Standard_ZRS", 19 | "Standard_GZRS", 20 | "Standard_RAGZRS" 21 | ], 22 | "metadata": { 23 | "description": "Storage Account type." 24 | } 25 | }, 26 | "location": { 27 | "type": "string", 28 | "defaultValue": "[resourceGroup().location]", 29 | "metadata": { 30 | "description": "Location for all resources." 31 | } 32 | }, 33 | "isHnsEnabled": { 34 | "type": "bool", 35 | "defaultValue": true, 36 | "metadata": { 37 | "description": "Account ADLS Gen2 hierarchical namespace enabled if set to true." 38 | } 39 | }, 40 | "supportsHttpsTrafficOnly": { 41 | "type": "bool", 42 | "defaultValue": true, 43 | "metadata": { 44 | "description": "Allows https traffic only to storage service if sets to true." 45 | } 46 | }, 47 | "accessTier": { 48 | "type": "string", 49 | "defaultValue": "Hot", 50 | "allowedValues": [ 51 | "Hot", 52 | "Cold" 53 | ], 54 | "metadata": { 55 | "description": "The account Access Tier is the default tier that is inferred by any blob without an explicitly set tier." 
56 | } 57 | } 58 | }, 59 | "resources": [ 60 | { 61 | "name": "[parameters('storageAccountName')]", 62 | "type": "Microsoft.Storage/storageAccounts", 63 | "apiVersion": "2019-06-01", 64 | "location": "[parameters('location')]", 65 | "sku": { 66 | "name": "[parameters('storageAccountType')]", 67 | "tier": "Standard" 68 | }, 69 | "kind": "StorageV2", 70 | "properties": { 71 | "isHnsEnabled": "[parameters('isHnsEnabled')]", 72 | "supportsHttpsTrafficOnly": "[parameters('supportsHttpsTrafficOnly')]", 73 | "accessTier": "[parameters('accessTier')]" 74 | } 75 | } 76 | ], 77 | "outputs": { 78 | "storageAccountId": { 79 | "type": "string", 80 | "value": "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName'))]" 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /terraform/tests/azure/service-principal/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests for the service-principal module 3 | */ 4 | provider "azurerm" { 5 | features {} 6 | } 7 | 8 | terraform { 9 | required_version = "~> 1.5.7" 10 | 11 | required_providers { 12 | azuread = { 13 | source = "hashicorp/azuread" 14 | version = "~> 2" 15 | } 16 | random = { 17 | source = "hashicorp/random" 18 | version = "~> 3" 19 | } 20 | } 21 | } 22 | 23 | # Minimum of variables required for the test 24 | variable "azure_location" { default = "westeurope" } 25 | 26 | # Create a random string for test uniqueness 27 | resource "random_string" "suffix" { 28 | length = 10 29 | numeric = true 30 | lower = true 31 | upper = false 32 | special = false 33 | } 34 | 35 | # Set the rest of the test variables using the random string 36 | locals { 37 | spn_defaults = "tftestspn-defaults-${random_string.suffix.result}" 38 | spn_api_permissions = "tftestspn-api-${random_string.suffix.result}" 39 | spn_with_owner = "tftestspn-owner-${random_string.suffix.result}" 40 | } 41 | 42 | # Marker for test dependencies 43 | resource "null_resource" "test_dependencies" { 44 | triggers = { 45 | random_string = random_string.suffix.result 46 | } 47 | depends_on = [random_string.suffix] 48 | } 49 | 50 | # Create a Service Principal with defaults 51 | module "test_sp_defaults" { 52 | source = "../../../modules/azure/service-principal" 53 | name = local.spn_defaults 54 | } 55 | 56 | # Create a Service Principal with API Permissions 57 | module "test_sp_api_permissions" { 58 | source = "../../../modules/azure/service-principal" 59 | name = local.spn_api_permissions 60 | api_permissions = ["User.Read.All", "GroupMember.Read.All", "Application.Read.All"] 61 | } 62 | 63 | # Create a Service Principal with an Owner 64 | module "test_sp_with_owner" { 65 | source = "../../../modules/azure/service-principal" 66 | name = local.spn_with_owner 67 | owners = [module.test_sp_defaults.object_id] 68 | } 69 | 70 | # Terraform output 71 | output "service_principal_tests" { 72 | value = { 73 | test_sp_defaults = { 74 | object_id = module.test_sp_defaults.object_id 75 | application_id = module.test_sp_defaults.application_id 76 | } 77 | test_sp_api_permissions = { 78 | object_id = module.test_sp_api_permissions.object_id 79 | application_id = module.test_sp_api_permissions.application_id 80 | } 81 | test_sp_with_owner = { 82 | object_id = module.test_sp_with_owner.object_id 83 | application_id = module.test_sp_with_owner.application_id 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- 
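# A minimal manual run of the service-principal test above (a sketch, assuming an authenticated
# 'az login' session with rights to create app registrations):
#   cd terraform/tests/azure/service-principal
#   terraform init
#   terraform apply -auto-approve
#   terraform destroy -auto-approve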
/terraform/modules/azure/storage-account/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group in which the resources should exist." 4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "(Optional) Azure location in which the resources should exist. If not set, it will use the location of the Resource Group." 9 | default = null 10 | } 11 | 12 | variable "storage_account_name" { 13 | type = string 14 | description = "The name of the Storage Account." 15 | 16 | validation { 17 | condition = length(var.storage_account_name) >= 3 && length(var.storage_account_name) <= 24 18 | error_message = "The name of the Storage Account must be between 3 and 24 characters." 19 | } 20 | } 21 | 22 | variable "hierarchical_namespace" { 23 | type = bool 24 | description = "(Optional) Set to true for an Azure Data Lake Gen 2 Storage Account. Default is false." 25 | default = false 26 | } 27 | 28 | variable "storage_containers" { 29 | type = list(string) 30 | description = "(Optional) A list of containers to be created within the Storage Account. By default it will create a container called default." 31 | default = ["default"] 32 | } 33 | 34 | variable "account_replication_type" { 35 | type = string 36 | description = "(Optional) The type of replication to use for the Storage Account. Default is LRS." 37 | default = "LRS" 38 | 39 | validation { 40 | condition = contains(["LRS", "GRS", "RAGRS", "ZRS", "GZRS", "RAGZRS"], var.account_replication_type) 41 | error_message = "Valid options are LRS, GRS, RAGRS, ZRS, GZRS and RAGZRS." 42 | } 43 | } 44 | 45 | variable "allowed_subnet_ids" { 46 | type = list(string) 47 | description = "(Optional) The virtual network subnet IDs allowed to connect to the Storage Account." 48 | default = [] 49 | } 50 | 51 | variable "allowed_ips" { 52 | type = list(string) 53 | description = "(Optional) The IPs allowed to connect to the Storage Account." 54 | default = [] 55 | } 56 | 57 | variable "network_default_action" { 58 | type = string 59 | description = "(Optional) Specifies the default action of allow or deny when no other rules match. Default is Allow." 60 | default = "Allow" 61 | 62 | validation { 63 | condition = contains(["Allow", "Deny"], var.network_default_action) 64 | error_message = "Valid options are Deny or Allow." 65 | } 66 | } 67 | 68 | variable "tags" { 69 | type = map(string) 70 | description = "(Optional) A mapping of tags to assign to the resources." 71 | default = {} 72 | } 73 | -------------------------------------------------------------------------------- /terraform/modules/databricks/cluster-policy/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates a Databricks cluster policy with optional CAN_USE permissions. 
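 * (Illustrative CAN_USE value, matching how access_control is built below; the names are placeholders:
 *   CAN_USE = [{ type = "group", principal = "data-engineers" }, { type = "service_principal", principal = "<application-id>" }] )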
3 | * The policy will be created by merging the following sources (in this order): 4 | * 1) a default policy definition with optional variables 5 | * 2) a policy json file 6 | * 3) policy overrides as a Terraform object 7 | */ 8 | 9 | # Define default rules that will be applied to every policy unless overridden 10 | locals { 11 | default_policy = { 12 | "spark_version" : { 13 | "type" : "regex", 14 | "pattern" : var.default_spark_version_regex, 15 | "hidden" : false 16 | }, 17 | "autotermination_minutes" : { 18 | "type" : "fixed", 19 | "value" : var.default_autotermination_minutes, 20 | "hidden" : false 21 | }, 22 | "cluster_log_conf.path" : { 23 | "type" : "unlimited", 24 | "defaultValue" : var.default_cluster_log_path, 25 | "isOptional" : false, 26 | "hidden" : false 27 | }, 28 | "cluster_log_conf.type" : { 29 | "type" : "fixed", 30 | "value" : "DBFS", 31 | "hidden" : false 32 | }, 33 | "custom_tags.PolicyName" : { 34 | "type" : "fixed", 35 | "value" : var.policy_name, 36 | "hidden" : false 37 | }, 38 | "docker_image.url" : { 39 | "type" : "forbidden", 40 | "hidden" : true 41 | } 42 | } 43 | } 44 | 45 | # Read the policy json file if one was defined 46 | data "local_file" "policy_definition" { 47 | count = try(fileexists(var.policy_overrides_file), false) ? 1 : 0 48 | filename = var.policy_overrides_file 49 | } 50 | 51 | resource "databricks_cluster_policy" "this" { 52 | name = var.policy_name 53 | 54 | # Merge the local default rules with the other policy overrides passed from the variables 55 | definition = jsonencode(merge(local.default_policy, 56 | try(jsondecode(base64decode(data.local_file.policy_definition[0].content_base64)), {}), 57 | var.policy_overrides_object)) 58 | } 59 | 60 | # Create the policy 61 | resource "databricks_permissions" "policy" { 62 | count = length(var.CAN_USE) > 0 ? 1 : 0 63 | cluster_policy_id = databricks_cluster_policy.this.id 64 | 65 | dynamic "access_control" { 66 | for_each = toset(var.CAN_USE) 67 | content { 68 | user_name = access_control.value.type == "user" ? access_control.value.principal : "" 69 | group_name = access_control.value.type == "group" ? access_control.value.principal : "" 70 | service_principal_name = access_control.value.type == "service_principal" ? access_control.value.principal : "" 71 | permission_level = "CAN_USE" 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-workspace/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group in which the resources should exist." 4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "(Optional) Azure location in which the resources should exist. If not set, it will use the location of the Resource Group." 9 | default = null 10 | } 11 | 12 | variable "workspace_name" { 13 | type = string 14 | description = "The name of the Azure Databricks workspace resource." 15 | 16 | validation { 17 | condition = length(var.workspace_name) >= 3 && length(var.workspace_name) <= 30 18 | error_message = "The name of the Databricks workspace must be between 3 and 30 characters." 19 | } 20 | } 21 | 22 | variable "managed_resource_group_name" { 23 | type = string 24 | description = "(Optional) The name of the Resource Group where Azure should place the managed Databricks resources. This should not already exist." 
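  # (When left null, main.tf falls back to a generated name of the form "databricks-rg-<workspace_name>-<random suffix>".)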
25 | default = null 26 | } 27 | 28 | variable "pricing_tier" { 29 | type = string 30 | description = "(Optional) The pricing tier to use for the Databricks workspace. Possible values are standard, premium, or trial. Default is premium." 31 | default = "premium" 32 | 33 | validation { 34 | condition = contains(["premium", "standard", "trial"], var.pricing_tier) 35 | error_message = "The Azure Databricks Pricing Tier must be set to premium, standard or trial." 36 | } 37 | } 38 | 39 | variable "virtual_network_name" { 40 | type = string 41 | description = "(Optional) The Azure Resource ID of the Virtual Network for VNet injection. If not set, a new Virtual Network will be created in the Managed Resource Group." 42 | default = null 43 | } 44 | 45 | variable "private_subnet_name" { 46 | type = string 47 | description = "(Optional) The name of the Private Subnet within the Virtual Network. Default is private-subnet." 48 | default = "private-subnet" 49 | } 50 | 51 | variable "public_subnet_name" { 52 | type = string 53 | description = "(Optional) The name of the Public Subnet within the Virtual Network. Default is public-subnet." 54 | default = "public-subnet" 55 | } 56 | 57 | variable "disable_public_ip" { 58 | type = bool 59 | description = "(Optional) Set to true to deploy the workspace with Secure Cluster Connectivity (No Public IP) enabled. Default is false." 60 | default = false 61 | } 62 | 63 | variable "tags" { 64 | type = map(string) 65 | description = "(Optional) A mapping of tags to assign to the resources." 66 | default = {} 67 | } 68 | -------------------------------------------------------------------------------- /scripts/create_instance_pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python script that creates a Databricks Instance Pool using the Instance Pools API (https://docs.databricks.com/dev-tools/api/latest/instance-pools.html). 4 | It uses simple positional arguments. 5 | It returns the Instance Pool ID as a variable called databricksPoolId in the Azure Pipelines format. 
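Illustrative invocation (positional argument order as parsed in main(); the values are placeholders
based on this repo's pipeline variables):
  python create_instance_pool.py https://adb-123.azuredatabricks.net <access_token> "Jobs Pool" Standard_F4s_v2 0 30 13.3.x-scala2.12 SPOT_AZURE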
6 | """ 7 | import sys 8 | 9 | import requests 10 | 11 | 12 | def main(): 13 | workspace_url = sys.argv[1] 14 | access_token = sys.argv[2] 15 | instance_pool_name = sys.argv[3] 16 | node_type_id = sys.argv[4] 17 | min_idle_instances = sys.argv[5] 18 | idle_instance_autotermination_minutes = sys.argv[6] 19 | preloaded_spark_version = sys.argv[7] 20 | azure_availability = sys.argv[8] 21 | 22 | base_url = '{0}/api/2.0/instance-pools'.format(workspace_url.rstrip("/")) 23 | headers = { 24 | "Content-Type": "application/json", 25 | "Authorization": "Bearer " + access_token 26 | } 27 | payload = { 28 | "instance_pool_name": instance_pool_name, 29 | "node_type_id": node_type_id, 30 | "min_idle_instances": min_idle_instances, 31 | "idle_instance_autotermination_minutes": idle_instance_autotermination_minutes, 32 | "preloaded_spark_versions": [preloaded_spark_version], 33 | "azure_attributes": { 34 | "availability": azure_availability 35 | } 36 | } 37 | 38 | # Set the spot settings if availability was set to SPOT_AZURE, otherwise set ON_DEMAND 39 | if payload["azure_attributes"]["availability"] == "SPOT_AZURE": 40 | payload["azure_attributes"]["spot_bid_max_price"] = -1 41 | else: 42 | payload["azure_attributes"]["availability"] = "ON_DEMAND_AZURE" 43 | 44 | url = base_url + '/create' 45 | all_pools = requests.get(url=base_url + '/list', headers=headers).json() 46 | if "instance_pools" in all_pools: 47 | for pool in all_pools["instance_pools"]: 48 | if instance_pool_name == pool["instance_pool_name"]: 49 | payload["instance_pool_id"] = pool["instance_pool_id"] 50 | url = base_url + '/edit' 51 | break 52 | 53 | response = requests.post(url=url, headers=headers, json=payload) 54 | if response.status_code == requests.codes.ok: 55 | if "instance_pool_id" in payload: 56 | instance_pool_id = payload["instance_pool_id"] 57 | else: 58 | instance_pool_id = response.json()['instance_pool_id'] 59 | 60 | # Pass the variables to Azure Pipelines 61 | print("##vso[task.setvariable variable=databricksPoolId;issecret=false]{0}".format(instance_pool_id)) 62 | return 63 | else: 64 | return response.text 65 | 66 | 67 | if __name__ == '__main__': 68 | sys.exit(main()) 69 | -------------------------------------------------------------------------------- /scripts/get_object_details.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Retrieves the Azure Object ID of the current az cli login (using 'az ad sp show'). 4 | # It can retrieve a normal user or a service principal or a group. 5 | # If an optional argument is used, it will retrieve the Object ID of the value of the argument. 6 | # Returns the object id as a variable called objectId in the Azure Pipelines format. 7 | # Returns the object type as a variable called objectType in the Azure Pipelines format. 
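# Illustrative usage (either form; the second exposes object_id and object_type to the calling script):
#   ./get_object_details.sh <service-principal-client-id>
#   source ./get_object_details.sh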
8 | # 9 | 10 | # Optional parameters - if not set it will use Azure DevOps Principal already logged in 11 | _principal_id=${1} 12 | 13 | # Local variables 14 | _python="$(command -v python || command -v python3)" 15 | 16 | # Parameters check 17 | if [ -z "${_principal_id}" ]; then 18 | echo -e "The Principal was not defined, using the cli user" 19 | az_account=$(az account show) 20 | [ -z "${az_account}" ] && exit 1 21 | 22 | _principal_id=$(echo "${az_account}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["user"]["name"])' 2> /dev/null) 23 | echo -e "Got the Principal: \"${_principal_id}\"" 24 | fi 25 | 26 | # Use the az cli command to get the object details 27 | echo -e "Getting the Object ID for \"${_principal_id}\"" 28 | timer=0 29 | while [ ${timer} -lt 100 ]; do 30 | _response=$(az ad sp show --id "${_principal_id}" || az ad user show --id "${_principal_id}" || az ad group show --group "${_principal_id}") 31 | [ -n "${_response}" ] && break 32 | echo -e "Principal \"${_principal_id}\" might not be accessible yet, sleeping for 10 seconds" 33 | sleep 10 && timer=$((timer+10)) && (exit 1) 34 | done || { echo "ERROR: Timed out waiting"; exit 1; } 35 | 36 | # Extract the object ID from response 37 | object_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["id"])') 38 | [ -z "${object_id}" ] && { echo "${_response}"; exit 1; } 39 | echo -e "Got the Object ID: \"${object_id}\"" 40 | 41 | # Extract the object type from response 42 | odata_context=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["@odata.context"])') 43 | if [[ "${odata_context}" =~ "servicePrincipals" ]]; then 44 | object_type="ServicePrincipal" 45 | elif [[ "${odata_context}" =~ "users" ]]; then 46 | object_type="User" 47 | elif [[ "${odata_context}" =~ "groups" ]]; then 48 | object_type="Group" 49 | fi 50 | [ -z "${object_type}" ] && { echo "${_response}"; exit 1; } 51 | echo -e "Got the Object Type: \"${object_type}\"" 52 | 53 | # Pass the variables to Azure Pipelines 54 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 55 | [ -n "${object_id}" ] && echo "##vso[task.setvariable variable=objectId;issecret=false]${object_id}" || exit 1 56 | [ -n "${object_type}" ] && echo "##vso[task.setvariable variable=objectType;issecret=false]${object_type}" || exit 1 57 | fi 58 | -------------------------------------------------------------------------------- /pipelines/templates/terraform-azure.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Template that runs Terraform commands on a directory. 
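# Illustrative use from another pipeline (parameter names match the ones declared below; values are placeholders):
#   steps:
#     - template: templates/terraform-azure.yml
#       parameters:
#         serviceConnection: 'my-arm-service-connection'
#         terraformWorkingDirectory: '$(System.DefaultWorkingDirectory)/terraform/deployments/azure-infrastructure'
#         terraformBackendResourceGroupName: 'tfstate-rg'
#         terraformBackendStorageAccountName: 'tfstatestorage'
#         terraformBackendContainerName: 'tfstate'
#         terraformBackendKey: 'infrastructure.tfstate'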
3 | # 4 | 5 | parameters: 6 | - name: serviceConnection 7 | displayName: 'Azure Resource Manager service connection' 8 | type: string 9 | 10 | - name: terraformVersion 11 | displayName: 'Terraform version to install and use' 12 | type: string 13 | default: '1.5.7' 14 | 15 | - name: terraformWorkingDirectory 16 | displayName: 'Folder path containing the Terraform files' 17 | type: string 18 | 19 | - name: terraformBackendStorageAccountName 20 | displayName: 'The name of the Storage Account for Terraform Backend' 21 | type: string 22 | 23 | - name: terraformBackendResourceGroupName 24 | displayName: 'The name of the Resource Group in which the Terraform Storage Account exists' 25 | type: string 26 | 27 | - name: terraformBackendContainerName 28 | displayName: 'The name of the Storage Container used by Terraform State files' 29 | type: string 30 | 31 | - name: terraformBackendKey 32 | displayName: 'The name of the Blob used for storing Terraform State file' 33 | type: string 34 | 35 | 36 | steps: 37 | - task: TerraformInstaller@1 38 | displayName: 'Install Terraform Version ${{ parameters.terraformVersion }}' 39 | inputs: 40 | terraformVersion: ${{ parameters.terraformVersion }} 41 | 42 | - task: TerraformTaskV4@4 43 | displayName: 'Run terraform init' 44 | inputs: 45 | provider: 'azurerm' 46 | command: 'init' 47 | commandOptions: '-upgrade=true -input=false' 48 | workingDirectory: ${{ parameters.terraformWorkingDirectory }} 49 | backendServiceArm: ${{ parameters.serviceConnection }} 50 | backendAzureRmResourceGroupName: ${{ parameters.terraformBackendResourceGroupName }} 51 | backendAzureRmStorageAccountName: ${{ parameters.terraformBackendStorageAccountName }} 52 | backendAzureRmContainerName: ${{ parameters.terraformBackendContainerName }} 53 | backendAzureRmKey: ${{ parameters.terraformBackendKey }} 54 | 55 | - task: TerraformTaskV4@4 56 | displayName: 'Run terraform validate' 57 | inputs: 58 | provider: 'azurerm' 59 | command: 'validate' 60 | workingDirectory: ${{ parameters.terraformWorkingDirectory }} 61 | 62 | - task: TerraformTaskV4@4 63 | displayName: 'Run terraform plan' 64 | inputs: 65 | provider: 'azurerm' 66 | command: 'plan' 67 | commandOptions: '-input=false -out=tfplan.out' 68 | workingDirectory: ${{ parameters.terraformWorkingDirectory }} 69 | environmentServiceNameAzureRM: ${{ parameters.serviceConnection }} 70 | 71 | - task: TerraformTaskV4@4 72 | displayName: 'Run terraform apply' 73 | inputs: 74 | provider: 'azurerm' 75 | command: 'apply' 76 | commandOptions: '-input=false -parallelism=3 -auto-approve tfplan.out' 77 | workingDirectory: ${{ parameters.terraformWorkingDirectory }} 78 | environmentServiceNameAzureRM: ${{ parameters.serviceConnection }} 79 | -------------------------------------------------------------------------------- /terraform/tests/azure/resource-group/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests for the resource-group module 3 | */ 4 | provider "azurerm" { 5 | features {} 6 | } 7 | 8 | terraform { 9 | required_version = "~> 1.5.7" 10 | 11 | required_providers { 12 | azuread = { 13 | source = "hashicorp/azuread" 14 | version = "~> 2" 15 | } 16 | azurerm = { 17 | source = "hashicorp/azurerm" 18 | version = "~> 3" 19 | } 20 | random = { 21 | source = "hashicorp/random" 22 | version = "~> 3" 23 | } 24 | } 25 | } 26 | 27 | # Minimum of variables required for the test 28 | variable "azure_location" { default = "westeurope" } 29 | variable "resource_group_name" { default = null } 30 | 31 | # 
Create a random string for test uniqueness 32 | resource "random_string" "suffix" { 33 | length = 10 34 | numeric = true 35 | lower = true 36 | upper = false 37 | special = false 38 | } 39 | 40 | # Set the rest of the test variables using the random string 41 | locals { 42 | resource_group_defaults = var.resource_group_name == null ? "tftest-rg-${random_string.suffix.result}" : var.resource_group_name 43 | resource_group_with_roles = "tftest-rg-roles-${random_string.suffix.result}" 44 | spn = "tftestspn1${random_string.suffix.result}" 45 | custom_tags = { Purpose = "Terraform-test-${random_string.suffix.result}" } 46 | } 47 | 48 | # Create a test app registration 49 | resource "azuread_application" "test_app" { 50 | display_name = "TF Test ${local.spn}" 51 | identifier_uris = ["api://${local.spn}"] 52 | } 53 | 54 | # Create a test service principal 55 | resource "azuread_service_principal" "test_sp" { 56 | application_id = azuread_application.test_app.application_id 57 | depends_on = [azuread_application.test_app] 58 | } 59 | 60 | # Build a Resource Group with default parameters 61 | module "test_resource_group_defaults" { 62 | source = "../../../modules/azure/resource-group" 63 | azure_location = var.azure_location 64 | resource_group_name = local.resource_group_defaults 65 | } 66 | 67 | # Build a Resource Group with roles 68 | module "test_resource_group_with_roles" { 69 | source = "../../../modules/azure/resource-group" 70 | azure_location = var.azure_location 71 | resource_group_name = local.resource_group_with_roles 72 | readers = [azuread_service_principal.test_sp.object_id] 73 | contributors = [azuread_service_principal.test_sp.object_id] 74 | owners = [azuread_service_principal.test_sp.object_id] 75 | tags = local.custom_tags 76 | } 77 | 78 | # Terraform output 79 | output "resource_group_tests" { 80 | value = { 81 | test_resource_group_defaults = { 82 | id = module.test_resource_group_defaults.id 83 | name = module.test_resource_group_defaults.name 84 | location = module.test_resource_group_defaults.location 85 | } 86 | test_resource_group_with_roles = { 87 | id = module.test_resource_group_with_roles.id 88 | name = module.test_resource_group_with_roles.name 89 | location = module.test_resource_group_with_roles.location 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-workspace/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an Azure Databricks workspace with optional VNet injection (https://docs.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/vnet-inject). 3 | */ 4 | data "azurerm_resource_group" "databricks" { 5 | name = var.resource_group_name 6 | } 7 | 8 | data "azurerm_virtual_network" "databricks_vnet" { 9 | count = var.virtual_network_name == null ? 0 : 1 10 | name = var.virtual_network_name 11 | resource_group_name = data.azurerm_resource_group.databricks.name 12 | } 13 | 14 | data "azurerm_subnet" "databricks_public_subnet" { 15 | count = var.virtual_network_name == null ? 0 : 1 16 | name = var.public_subnet_name 17 | virtual_network_name = data.azurerm_virtual_network.databricks_vnet[0].name 18 | resource_group_name = data.azurerm_resource_group.databricks.name 19 | } 20 | 21 | data "azurerm_subnet" "databricks_private_subnet" { 22 | count = var.virtual_network_name == null ? 
0 : 1 23 | name = var.private_subnet_name 24 | virtual_network_name = data.azurerm_virtual_network.databricks_vnet[0].name 25 | resource_group_name = data.azurerm_resource_group.databricks.name 26 | } 27 | 28 | resource "random_string" "rg_suffix" { 29 | length = 10 30 | numeric = true 31 | lower = true 32 | upper = false 33 | special = false 34 | 35 | keepers = { 36 | workspace_name = var.workspace_name 37 | } 38 | } 39 | 40 | locals { 41 | location = var.azure_location == null ? data.azurerm_resource_group.databricks.location : var.azure_location 42 | managed_resource_group_name = var.managed_resource_group_name == null ? "databricks-rg-${var.workspace_name}-${random_string.rg_suffix.result}" : var.managed_resource_group_name 43 | 44 | tags = { 45 | WorkspaceManagedBy = "Terraform" 46 | } 47 | } 48 | 49 | resource "azurerm_databricks_workspace" "this" { 50 | name = var.workspace_name 51 | location = local.location 52 | resource_group_name = data.azurerm_resource_group.databricks.name 53 | sku = var.pricing_tier 54 | managed_resource_group_name = local.managed_resource_group_name 55 | tags = merge(local.tags, var.tags) 56 | 57 | dynamic "custom_parameters" { 58 | for_each = var.virtual_network_name == null ? [] : [1] 59 | content { 60 | no_public_ip = var.disable_public_ip 61 | virtual_network_id = data.azurerm_virtual_network.databricks_vnet[0].id 62 | private_subnet_name = var.private_subnet_name 63 | public_subnet_name = var.public_subnet_name 64 | 65 | private_subnet_network_security_group_association_id = data.azurerm_subnet.databricks_private_subnet[0].id 66 | public_subnet_network_security_group_association_id = data.azurerm_subnet.databricks_public_subnet[0].id 67 | } 68 | } 69 | } 70 | 71 | # Wait for 5 minutes to allow permissions and network rules to propagate to the Managed Resource Group 72 | resource "time_sleep" "wait_5_min" { 73 | create_duration = "300s" 74 | depends_on = [azurerm_databricks_workspace.this] 75 | } 76 | -------------------------------------------------------------------------------- /terraform/deployments/destroy-deployment.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Destroys a deployment by running terraform destroy. 
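# (Note: expects the admin setup state at ../../admin/terraform/terraform.tfstate to exist;
#  any extra arguments are forwarded to 'terraform destroy' together with test.tfvars.)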
4 | # 5 | 6 | # Debug 7 | #set -x 8 | #export TF_LOG="DEBUG" 9 | 10 | # Local variables 11 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 12 | _realpath="$(command -v realpath || echo _realpath )" 13 | _this_script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 14 | _scripts_dir=${_this_script_dir}/../../scripts 15 | _python="$(command -v python || command -v python3)" 16 | 17 | # Extract variables from terraform admin output 18 | echo -e "Extracting variables from terraform admin output" 19 | tf_admin_output=$(terraform output -json -state "${_this_script_dir}/../../admin/terraform/terraform.tfstate") 20 | if [ -z "${tf_admin_output}" ] || [ "${tf_admin_output}" == "{}" ]; then 21 | echo -e "Could not extract the required variables from the admin setup state (\"${_this_script_dir}/../../admin/terraform/terraform.tfstate\")" 22 | echo -e "\"${_this_script_dir}/../../admin/setup-with-terraform.sh\" must be run first" 23 | exit 1 24 | fi 25 | 26 | ## for the provisioned resources 27 | export TF_VAR_DATA_SERVICE_PRINCIPAL_CLIENT_ID=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["data_service_principal_application_id"])') 28 | export TF_VAR_PROJECT_GROUP_NAME=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["project_group_name"])') 29 | export TF_VAR_RESOURCE_GROUP_NAME=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["resource_group_name"])') 30 | export TF_VAR_KEY_VAULT_NAME=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["key_vault_name"])') 31 | export TF_VAR_SECRET_NAME_CLIENT_SECRET=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["secret_name"])') 32 | export ARM_CLIENT_ID=$(echo "${tf_admin_output}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["databricks_resources"]["value"]["infra_service_principal_application_id"])') 33 | # 34 | ## Generate a new client secret for the infra Service Principal to simulate the Azure DevOps environment 35 | #echo -e "Generating a new client secret for the infra Service Principal" 36 | #ARM_CLIENT_SECRET=$(az ad sp credential reset --id "${ARM_CLIENT_ID}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["password"])') 37 | #[ -z "${ARM_CLIENT_SECRET}" ] && exit 1 38 | #export ARM_CLIENT_SECRET 39 | # 40 | #ARM_TENANT_ID=$(az account show | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["tenantId"])') 41 | #[ -z "${ARM_TENANT_ID}" ] && exit 1 42 | #export ARM_TENANT_ID 43 | # 44 | #ARM_SUBSCRIPTION_ID=$(az account show | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["id"])' 2> /dev/null) 45 | #[ -z "${ARM_SUBSCRIPTION_ID}" ] && exit 1 46 | #export ARM_SUBSCRIPTION_ID 47 | 48 | # Wait until the secret is active 49 | #echo -e "Sleeping for 6 minutes to allow enough time for the secret to propagate" && sleep 360 50 | 51 | # Destroy test 52 | echo -e "Destroying the test" 53 | source "${_scripts_dir}/terraform_azure.sh" destroy "$@" -var-file="${_this_script_dir}/test.tfvars" 54 | echo 55 | -------------------------------------------------------------------------------- /scripts/create_cluster_policy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 
| # 3 | # Creates a Databricks Cluster Policy using the Cluster Policies API (https://docs.databricks.com/dev-tools/api/latest/policies.html). 4 | # It uses simple positional arguments. 5 | # Returns the Policy ID as a variable called databricksPolicyId in the Azure Pipelines format. 6 | # 7 | 8 | # Required parameters 9 | _workspace_url=${1} 10 | _access_token=${2} 11 | _policy_name=${3} 12 | _policy_file=${4} 13 | 14 | # Local variables 15 | _python="$(command -v python || command -v python3)" 16 | 17 | _usage() { 18 | echo -e "Usage: ${0} " 19 | exit 1 20 | } 21 | 22 | # Parameters check 23 | [ -z "${_workspace_url}" ] && _usage 24 | [ -z "${_access_token}" ] && _usage 25 | [ -z "${_policy_name}" ] && _usage 26 | [ -z "${_policy_file}" ] && _usage 27 | 28 | policy_definition=$(cat "${_policy_file}") 29 | [ -z "${policy_definition}" ] && exit 1 30 | 31 | # Call the policies API to get all existing policies 32 | echo -e "Getting all Cluster Policies" 33 | _response=$(curl -sS --request GET \ 34 | --header "Authorization: Bearer ${_access_token}" \ 35 | --header "Content-Type: application/json" \ 36 | "${_workspace_url}/api/2.0/policies/clusters/list") 37 | 38 | _total_count=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["total_count"])' 2> /dev/null) 39 | if [ -z "${_total_count}" ]; then 40 | echo "${_response}" 41 | exit 1 42 | else 43 | policy_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print([ p["policy_id"] for p in json.load(sys.stdin)["policies"] if p["name"] == "'"${_policy_name}"'" ][0])' 2> /dev/null) 44 | fi 45 | 46 | # Set the base payload 47 | base_payload=' 48 | "name": "'${_policy_name}'", 49 | "definition": "'${policy_definition//\"/$'\\"'}'" 50 | ' 51 | 52 | # Create the Policy if it doesn't exist, otherwise update it 53 | if [ "${_total_count}" == 0 ] || [ -z "${policy_id}" ]; then 54 | echo -e "Creating the Cluster Policy \"${_policy_name}\"" 55 | payload='{'${base_payload}'}' 56 | _response=$(curl -sS --request POST \ 57 | --header "Authorization: Bearer ${_access_token}" \ 58 | --header "Content-Type: application/json" \ 59 | "${_workspace_url}/api/2.0/policies/clusters/create" \ 60 | -d "$(echo ${payload})") 61 | policy_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["policy_id"])' 2> /dev/null) 62 | [ -z "${policy_id}" ] && { echo "${_response}"; exit 1; } 63 | else 64 | echo -e "Updating the Cluster Policy \"${_policy_name}\"(${policy_id})" 65 | payload='{"policy_id": "'${policy_id}'",'${base_payload}'}' 66 | _response=$(curl -sS --request POST \ 67 | --header "Authorization: Bearer ${_access_token}" \ 68 | --header "Content-Type: application/json" \ 69 | "${_workspace_url}/api/2.0/policies/clusters/edit" \ 70 | -d "$(echo ${payload})") 71 | [ "${_response}" != "{}" ] && { echo "${_response}"; exit 1; } 72 | fi 73 | 74 | # Pass the variables to Azure Pipelines 75 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 76 | [ -n "${policy_id}" ] && echo "##vso[task.setvariable variable=databricksPolicyId;issecret=false]${policy_id}" 77 | fi 78 | -------------------------------------------------------------------------------- /terraform/tests/azure/azure-devops-pipeline/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests for the azure-devops-pipeline module 3 | */ 4 | provider "azurerm" { 5 | features {} 6 | } 7 | 8 | terraform { 9 | required_version = "~> 1.5.7" 10 | 11 | required_providers { 12 | azurerm = { 13 | source = 
"hashicorp/azurerm" 14 | version = "~> 3" 15 | } 16 | azuredevops = { 17 | source = "microsoft/azuredevops" 18 | version = "~> 0.9" 19 | } 20 | random = { 21 | source = "hashicorp/random" 22 | version = "~> 3" 23 | } 24 | } 25 | } 26 | 27 | # Create a random string for test uniqueness 28 | resource "random_string" "suffix" { 29 | length = 10 30 | numeric = true 31 | lower = true 32 | upper = false 33 | special = false 34 | } 35 | 36 | # Set the rest of the test variables using the random string 37 | locals { 38 | pipeline_defaults = "tftest-pipeline-default-${random_string.suffix.result}" 39 | pipeline_with_variables = "tftest-pipeline-variables-${random_string.suffix.result}" 40 | project_name = "tftest-project-pipelines-${random_string.suffix.result}" 41 | github_endpoint = "tftest-endpoint-git-${random_string.suffix.result}" 42 | } 43 | 44 | # Build an Azure DevOps project with a GitHub Service connection 45 | module "project_with_github_endpoint" { 46 | source = "../../../modules/azure/azure-devops-project" 47 | project_name = local.project_name 48 | github_endpoints = [local.github_endpoint] 49 | # github_pat = random_string.suffix.result 50 | } 51 | 52 | # Build an Azure DevOps pipeline with default parameters 53 | module "test_pipeline_defaults" { 54 | source = "../../../modules/azure/azure-devops-pipeline" 55 | pipeline_name = local.pipeline_defaults 56 | pipeline_path = "azure-pipelines.yml" 57 | project_id = module.project_with_github_endpoint.id 58 | github_endpoint_id = module.project_with_github_endpoint.service_endpoints[local.github_endpoint] 59 | github_repo_url = "https://github.com/alexandruanghel/azdo-databricks" 60 | } 61 | 62 | # Build an Azure DevOps pipeline with two variables 63 | module "test_pipeline_with_variables" { 64 | source = "../../../modules/azure/azure-devops-pipeline" 65 | pipeline_name = local.pipeline_with_variables 66 | pipeline_path = "azure-pipelines.yml" 67 | project_id = module.project_with_github_endpoint.id 68 | github_endpoint_id = module.project_with_github_endpoint.service_endpoints[local.github_endpoint] 69 | github_repo_url = "https://github.com/alexandruanghel/azdo-databricks" 70 | github_branch = "master" 71 | pipeline_variables = { 72 | tftest_var1 = "TF Test 1" 73 | tftest_var2 = "TF Test 3" 74 | } 75 | } 76 | 77 | # Terraform output 78 | output "azure_devops_pipeline_tests" { 79 | value = { 80 | test_pipeline_defaults = { 81 | id = module.test_pipeline_defaults.id 82 | name = module.test_pipeline_defaults.name 83 | path = module.test_pipeline_defaults.path 84 | revision = module.test_pipeline_defaults.revision 85 | } 86 | test_pipeline_with_variables = { 87 | id = module.test_pipeline_with_variables.id 88 | name = module.test_pipeline_with_variables.name 89 | path = module.test_pipeline_with_variables.path 90 | revision = module.test_pipeline_with_variables.revision 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /arm/databricks-workspace-with-vnet-injection.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "The name of the Azure Databricks workspace to create." 
9 | } 10 | }, 11 | "pricingTier": { 12 | "defaultValue": "premium", 13 | "allowedValues": [ 14 | "trial", 15 | "standard", 16 | "premium" 17 | ], 18 | "type": "string", 19 | "metadata": { 20 | "description": "The pricing tier of the workspace." 21 | } 22 | }, 23 | "disablePublicIp": { 24 | "type": "string", 25 | "defaultValue": "false", 26 | "metadata": { 27 | "description": "Specifies whether to deploy Azure Databricks workspace with Secure Cluster Connectivity (No Public IP) enabled or not." 28 | } 29 | }, 30 | "customVirtualNetworkId": { 31 | "type": "string", 32 | "metadata": { 33 | "description": "The complete ARM resource Id of the custom virtual network." 34 | } 35 | }, 36 | "customPublicSubnetName": { 37 | "type": "string", 38 | "defaultValue": "public-subnet", 39 | "metadata": { 40 | "description": "The name of the public subnet in the custom VNet." 41 | } 42 | }, 43 | "customPrivateSubnetName": { 44 | "type": "string", 45 | "defaultValue": "private-subnet", 46 | "metadata": { 47 | "description": "The name of the private subnet in the custom VNet." 48 | } 49 | }, 50 | "location": { 51 | "type": "string", 52 | "defaultValue": "[resourceGroup().location]", 53 | "metadata": { 54 | "description": "Location for all resources." 55 | } 56 | } 57 | }, 58 | "variables": { 59 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('workspaceName'), '-', uniqueString(parameters('workspaceName'), resourceGroup().id))]", 60 | "managedResourceGroupId": "[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('managedResourceGroupName'))]" 61 | }, 62 | "resources": [ 63 | { 64 | "name": "[parameters('workspaceName')]", 65 | "type": "Microsoft.Databricks/workspaces", 66 | "apiVersion": "2018-04-01", 67 | "location": "[parameters('location')]", 68 | "sku": { 69 | "name": "[parameters('pricingTier')]" 70 | }, 71 | "comments": "The resource group specified will be locked after deployment.", 72 | "properties": { 73 | "managedResourceGroupId": "[variables('managedResourceGroupId')]", 74 | "parameters": { 75 | "customVirtualNetworkId": { 76 | "value": "[parameters('customVirtualNetworkId')]" 77 | }, 78 | "customPublicSubnetName": { 79 | "value": "[parameters('customPublicSubnetName')]" 80 | }, 81 | "customPrivateSubnetName": { 82 | "value": "[parameters('customPrivateSubnetName')]" 83 | }, 84 | "enableNoPublicIp": { 85 | "value": "[bool(parameters('disablePublicIp'))]" 86 | } 87 | } 88 | } 89 | } 90 | ], 91 | "outputs": { 92 | "workspaceId": { 93 | "type": "string", 94 | "value": "[resourceId('Microsoft.Databricks/workspaces', parameters('workspaceName'))]" 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /scripts/create_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python script that creates a Databricks Cluster using the Clusters API (https://docs.databricks.com/dev-tools/api/latest/clusters.html). 4 | It uses simple positional arguments. 5 | It returns the Cluster ID as a variable called databricksClusterId in the Azure Pipelines format. 
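Positional arguments, in order: workspace_url, access_token, cluster_name, cluster_type,
autotermination_minutes, spark_version, pool_or_node_type_id, num_workers and an optional
max_num_workers (autoscaling is enabled when max_num_workers is larger than num_workers).
Illustrative invocation (all values are hypothetical):
  python create_cluster.py https://adb-123.azuredatabricks.net "$TOKEN" my-cluster standard 30 13.3.x-scala2.12 pool-abc123 2 8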
6 | """ 7 | import sys 8 | 9 | import requests 10 | 11 | 12 | def main(): 13 | workspace_url = sys.argv[1] 14 | access_token = sys.argv[2] 15 | cluster_name = sys.argv[3] 16 | cluster_type = sys.argv[4] 17 | autotermination_minutes = sys.argv[5] 18 | spark_version = sys.argv[6] 19 | pool_or_node_type_id = sys.argv[7] 20 | num_workers = int(sys.argv[8]) 21 | stop_cluster = True 22 | 23 | max_num_workers = 0 24 | if len(sys.argv) > 9: 25 | if len(sys.argv[9]) > 0: 26 | max_num_workers = int(sys.argv[9]) 27 | 28 | base_url = '{0}/api/2.0/clusters'.format(workspace_url.rstrip("/")) 29 | headers = { 30 | "Content-Type": "application/json", 31 | "Authorization": "Bearer " + access_token 32 | } 33 | payload = { 34 | "cluster_name": cluster_name, 35 | "spark_version": spark_version, 36 | "autotermination_minutes": autotermination_minutes, 37 | "instance_pool_id": pool_or_node_type_id 38 | } 39 | if max_num_workers > num_workers: 40 | payload["autoscale"] = { 41 | "min_workers": num_workers, 42 | "max_workers": max_num_workers 43 | } 44 | else: 45 | payload["num_workers"] = num_workers 46 | 47 | if cluster_type.lower() == "credential passthrough": 48 | payload["spark_conf"] = { 49 | "spark.databricks.cluster.profile": "serverless", 50 | "spark.databricks.repl.allowedLanguages": "python,sql", 51 | "spark.databricks.passthrough.enabled": "true", 52 | "spark.databricks.pyspark.enableProcessIsolation": "true" 53 | } 54 | 55 | url = base_url + '/create' 56 | all_clusters = requests.get(url=base_url + '/list', headers=headers).json() 57 | if "clusters" in all_clusters: 58 | for cluster in all_clusters["clusters"]: 59 | if cluster_name == cluster["cluster_name"]: 60 | payload["cluster_id"] = cluster["cluster_id"] 61 | url = base_url + '/edit' 62 | break 63 | all_node_types = requests.get(url=base_url + '/list-node-types', headers=headers).json() 64 | for node_type in all_node_types["node_types"]: 65 | if pool_or_node_type_id == node_type["node_type_id"]: 66 | payload["node_type_id"] = pool_or_node_type_id 67 | payload.pop("instance_pool_id") 68 | break 69 | 70 | print(payload) 71 | response = requests.post(url=url, headers=headers, json=payload) 72 | if response.status_code == requests.codes.ok: 73 | if "cluster_id" in payload: 74 | cluster_id = payload["cluster_id"] 75 | else: 76 | cluster_id = response.json()['cluster_id'] 77 | 78 | # Stop the cluster immediately after creation 79 | if stop_cluster: 80 | requests.post(url=base_url + '/delete', headers=headers, data='{"cluster_id": "' + cluster_id + '"}') 81 | 82 | # Pass the variables to Azure Pipelines 83 | print("##vso[task.setvariable variable=databricksClusterId;issecret=false]{0}".format(cluster_id)) 84 | return 85 | else: 86 | return response.text 87 | 88 | 89 | if __name__ == '__main__': 90 | sys.exit(main()) 91 | -------------------------------------------------------------------------------- /terraform/tests/azure/key-vault/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests for the key-vault module 3 | */ 4 | provider "azurerm" { 5 | features { 6 | key_vault { 7 | purge_soft_delete_on_destroy = true 8 | } 9 | } 10 | } 11 | 12 | terraform { 13 | required_version = "~> 1.5.7" 14 | 15 | required_providers { 16 | azurerm = { 17 | source = "hashicorp/azurerm" 18 | version = "~> 3" 19 | } 20 | random = { 21 | source = "hashicorp/random" 22 | version = "~> 3" 23 | } 24 | } 25 | } 26 | 27 | # Minimum of variables required for the test 28 | variable "azure_location" { default = "westeurope" } 29 
| variable "resource_group_name" { default = null } 30 | 31 | # Create a random string for test uniqueness 32 | resource "random_string" "suffix" { 33 | length = 10 34 | numeric = true 35 | lower = true 36 | upper = false 37 | special = false 38 | } 39 | 40 | # Set the rest of the test variables using the random string 41 | locals { 42 | resource_group_name = var.resource_group_name == null ? "tftest-rg-${random_string.suffix.result}" : var.resource_group_name 43 | key_vault_defaults = "tftest-akv-${random_string.suffix.result}" 44 | key_vault_custom = "tftest-akvc-${random_string.suffix.result}" 45 | custom_tags = { Purpose = "Terraform-test-${random_string.suffix.result}" } 46 | } 47 | 48 | # Create an empty Resource Group to be used by the rest of the resources 49 | data "azurerm_client_config" "current" {} 50 | 51 | module "test_resource_group" { 52 | source = "../../../modules/azure/resource-group" 53 | azure_location = var.azure_location 54 | resource_group_name = local.resource_group_name 55 | owners = [data.azurerm_client_config.current.object_id] 56 | tags = local.custom_tags 57 | } 58 | 59 | # Marker for test dependencies 60 | resource "null_resource" "test_dependencies" { 61 | triggers = { 62 | rg = module.test_resource_group.id 63 | } 64 | depends_on = [module.test_resource_group] 65 | } 66 | 67 | # Build a Key Vault with default parameters 68 | module "test_key_vault_defaults" { 69 | source = "../../../modules/azure/key-vault" 70 | resource_group_name = local.resource_group_name 71 | key_vault_name = local.key_vault_defaults 72 | depends_on = [null_resource.test_dependencies] 73 | } 74 | 75 | # Build a Key Vault with custom parameters 76 | module "test_key_vault_custom" { 77 | source = "../../../modules/azure/key-vault" 78 | azure_location = var.azure_location 79 | resource_group_name = local.resource_group_name 80 | key_vault_name = local.key_vault_custom 81 | sku_name = "premium" 82 | soft_delete_retention_days = 10 83 | tags = local.custom_tags 84 | depends_on = [null_resource.test_dependencies] 85 | } 86 | 87 | # Terraform output 88 | output "key_vault_tests" { 89 | value = { 90 | test_key_vault_defaults = { 91 | id = module.test_key_vault_defaults.id 92 | name = module.test_key_vault_defaults.name 93 | uri = module.test_key_vault_defaults.uri 94 | policy = module.test_key_vault_defaults.policy 95 | } 96 | test_key_vault_custom = { 97 | id = module.test_key_vault_custom.id 98 | name = module.test_key_vault_custom.name 99 | uri = module.test_key_vault_custom.uri 100 | policy = module.test_key_vault_custom.policy 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /scripts/azdo_extension.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Manages an Azure DevOps extension (using https://docs.microsoft.com/en-us/cli/azure/devops/extension). 4 | # It uses both positional arguments and environment variables. 
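# Positional arguments: {install|uninstall} <extension_id> <publisher_id>.
# AZURE_DEVOPS_ORG_URL (or AZDO_ORG_SERVICE_URL) must be set; AZURE_DEVOPS_EXT_PAT (or AZDO_PERSONAL_ACCESS_TOKEN)
# is optional when a User Principal is already logged in with 'az login'.
# Illustrative example (the extension and publisher IDs are hypothetical):
#   ./scripts/azdo_extension.sh install my-extension-id my-publisher-id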
5 | # 6 | 7 | # Debug 8 | #set -x 9 | 10 | # Local variables 11 | _python="$(command -v python || command -v python3)" 12 | 13 | # Environment variables 14 | export AZURE_DEVOPS_ORG_URL=${AZURE_DEVOPS_ORG_URL:-${AZDO_ORG_SERVICE_URL}} 15 | export AZURE_DEVOPS_EXT_PAT=${AZURE_DEVOPS_EXT_PAT:-${AZDO_PERSONAL_ACCESS_TOKEN}} 16 | 17 | _usage() { 18 | echo -e "Usage: ${0} {install|uninstall} " 19 | exit 1 20 | } 21 | 22 | _install_extension() { 23 | # Install the Azure DevOps cli extension 24 | az extension add --name azure-devops 2> /dev/null || { az extension add --name azure-devops --debug; exit 1; } 25 | } 26 | 27 | _check_args() { 28 | # Check the input parameters 29 | [ -z "${2}" ] && _usage 30 | if [ -z "${AZURE_DEVOPS_ORG_URL}" ]; then 31 | echo "ERROR: The Azure DevOps organization URL was not defined" 32 | echo " Either AZURE_DEVOPS_ORG_URL or AZDO_ORG_SERVICE_URL variables must be set" 33 | exit 1 34 | fi 35 | } 36 | 37 | _check_auth() { 38 | # Check the existing Azure Authentication 39 | if [ -z "${AZURE_DEVOPS_EXT_PAT}" ]; then 40 | az_signed_in_user=$(az ad signed-in-user show --query userPrincipalName --output tsv) 41 | if [ -z "${az_signed_in_user}" ]; then 42 | echo "ERROR: User Principal not logged in, run 'az login' first (az login with a Service Principal is not supported)" 43 | echo " Or set the AZURE_DEVOPS_EXT_PAT (or AZDO_PERSONAL_ACCESS_TOKEN) environment variable for direct PAT login" 44 | exit 1 45 | fi 46 | fi 47 | } 48 | 49 | _install() { 50 | # Install the Azure DevOps extension 51 | local _extension_id="${1}" 52 | local _publisher_id="${2}" 53 | echo -e "Installing the extension \"${_extension_id}\" of publisher \"${_publisher_id}\"" 54 | 55 | _response=$(az devops extension install \ 56 | --extension-id "${_extension_id}" \ 57 | --publisher-id "${_publisher_id}" \ 58 | --organization "${AZURE_DEVOPS_ORG_URL}" 2>&1 ) 59 | if [ -n "$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["extensionId"])' 2> /dev/null)" ] || \ 60 | echo "${_response}" | grep "TF1590010" > /dev/null; then 61 | echo -e "Extension installed successfully or already installed" 62 | else 63 | echo -e "${_response}" 64 | exit 1 65 | fi 66 | } 67 | 68 | _uninstall() { 69 | # Uninstall the Azure DevOps extension 70 | local _extension_id="${1}" 71 | local _publisher_id="${2}" 72 | echo -e "Uninstalling the extension \"${_extension_id}\" of publisher \"${_publisher_id}\"" 73 | 74 | _response=$(az devops extension uninstall --yes \ 75 | --extension-id "${_extension_id}" \ 76 | --publisher-id "${_publisher_id}" \ 77 | --organization "${AZURE_DEVOPS_ORG_URL}" 2>&1 ) 78 | if [ -z "${_response}" ]; then 79 | echo -e "Extension uninstalled successfully" 80 | else 81 | echo -e "${_response}" 82 | exit 1 83 | fi 84 | } 85 | 86 | case "${1}" in 87 | install) 88 | _check_args "$@" 89 | _check_auth 90 | _install_extension 91 | _install "${2}" "${3}" 92 | ;; 93 | uninstall) 94 | _check_args "$@" 95 | _check_auth 96 | _install_extension 97 | _uninstall "${2}" "${3}" 98 | ;; 99 | *) 100 | _usage 101 | ;; 102 | esac 103 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-vnet/variables.tf: -------------------------------------------------------------------------------- 1 | variable "resource_group_name" { 2 | type = string 3 | description = "The name of the Resource Group in which the resources should exist." 
4 | } 5 | 6 | variable "azure_location" { 7 | type = string 8 | description = "(Optional) Azure location in which the resources should exist. If not set, it will use the location of the Resource Group." 9 | default = null 10 | } 11 | 12 | variable "virtual_network_name" { 13 | type = string 14 | description = "(Optional) The name of the Virtual Network where the Databricks clusters should be created. Default is workers-vnet." 15 | default = "workers-vnet" 16 | } 17 | 18 | variable "virtual_network_cidr" { 19 | type = string 20 | description = "(Optional) CIDR range for the Virtual Network (must be at least /24). Default is 10.179.0.0/16" 21 | default = "10.179.0.0/16" 22 | 23 | validation { 24 | condition = tonumber(regex("/(\\d+)", var.virtual_network_cidr)[0]) <= 24 25 | error_message = "The CIDR prefix for the Databricks Virtual Network must be at least /24." 26 | } 27 | } 28 | 29 | variable "private_subnet_name" { 30 | type = string 31 | description = "(Optional) The name of the Private Subnet within the Virtual Network. Default is private-subnet." 32 | default = "private-subnet" 33 | } 34 | 35 | variable "private_subnet_cidr" { 36 | type = string 37 | description = "(Optional) CIDR range for the Private Subnet (must be at least /26). Default is 10.179.0.0/18." 38 | default = "10.179.0.0/18" 39 | 40 | validation { 41 | condition = tonumber(regex("/(\\d+)", var.private_subnet_cidr)[0]) <= 26 42 | error_message = "The CIDR prefix for the Databricks Private Subnet must be at least /26." 43 | } 44 | } 45 | 46 | variable "public_subnet_name" { 47 | type = string 48 | description = "(Optional) The name of the Public Subnet within the Virtual Network. Default is public-subnet." 49 | default = "public-subnet" 50 | } 51 | 52 | variable "public_subnet_cidr" { 53 | type = string 54 | description = "(Optional) CIDR range for the Public Subnet (must be at least /26). Default is 10.179.64.0/18." 55 | default = "10.179.64.0/18" 56 | 57 | validation { 58 | condition = tonumber(regex("/(\\d+)", var.public_subnet_cidr)[0]) <= 26 59 | error_message = "The CIDR prefix for the Databricks Public Subnet must be at least /26." 60 | } 61 | } 62 | 63 | variable "network_security_group_name" { 64 | type = string 65 | description = "(Optional) The name of the Databricks Network Security Group attached to the subnets. Default is databricks-nsg." 66 | default = "databricks-nsg" 67 | } 68 | 69 | variable "use_nat_gateway" { 70 | type = bool 71 | description = "(Optional) Set true to deploy a NAT gateway for no public ip subnets. Default is false." 72 | default = false 73 | } 74 | 75 | variable "nat_gateway_name" { 76 | type = string 77 | description = "(Optional) The name of the NAT gateway to be attached to the subnets. Default is databricks-nat-gateway." 78 | default = "databricks-nat-gateway" 79 | } 80 | 81 | variable "nat_gateway_public_ip_name" { 82 | type = string 83 | description = "(Optional) The name of the NAT gateway public IP. Default is databricks-nat-public-ip." 84 | default = "databricks-nat-public-ip" 85 | } 86 | 87 | variable "service_endpoints" { 88 | type = list(string) 89 | description = "(Optional) A list of service endpoints to associate with the public subnet." 90 | default = [] 91 | } 92 | 93 | variable "tags" { 94 | type = map(string) 95 | description = "(Optional) A mapping of tags to assign to the resources." 
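  # Illustrative value (keys and values are hypothetical): { Environment = "dev", Project = "databricks-demo" }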
96 | default = {} 97 | } 98 | -------------------------------------------------------------------------------- /scripts/azdo_project.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Manages an Azure DevOps project (using https://docs.microsoft.com/en-us/cli/azure/devops/project). 4 | # It uses both positional arguments and environment variables. 5 | # 6 | 7 | # Debug 8 | #set -x 9 | 10 | # Local variables 11 | _python="$(command -v python || command -v python3)" 12 | 13 | # Environment variables 14 | export AZURE_DEVOPS_ORG_URL=${AZURE_DEVOPS_ORG_URL:-${AZDO_ORG_SERVICE_URL}} 15 | export AZURE_DEVOPS_EXT_PAT=${AZURE_DEVOPS_EXT_PAT:-${AZDO_PERSONAL_ACCESS_TOKEN}} 16 | 17 | 18 | _usage() { 19 | echo -e "Usage: ${0} {create|delete} " 20 | exit 1 21 | } 22 | 23 | _install_extension() { 24 | # Install the Azure DevOps cli extension 25 | az extension add --name azure-devops 2> /dev/null || { az extension add --name azure-devops --debug; exit 1; } 26 | } 27 | 28 | _check_args() { 29 | # Check the input parameters 30 | [ -z "${2}" ] && _usage 31 | if [ -z "${AZURE_DEVOPS_ORG_URL}" ]; then 32 | echo "ERROR: The Azure DevOps organization URL was not defined" 33 | echo " Either AZURE_DEVOPS_ORG_URL or AZDO_ORG_SERVICE_URL variables must be set" 34 | exit 1 35 | fi 36 | } 37 | 38 | _check_auth() { 39 | # Check the existing Azure Authentication 40 | if [ -z "${AZURE_DEVOPS_EXT_PAT}" ]; then 41 | az_signed_in_user=$(az ad signed-in-user show --query userPrincipalName --output tsv) 42 | if [ -z "${az_signed_in_user}" ]; then 43 | echo "ERROR: User Principal not logged in, run 'az login' first (az login with a Service Principal is not supported)" 44 | echo " Or set the AZURE_DEVOPS_EXT_PAT (or AZDO_PERSONAL_ACCESS_TOKEN) environment variable for direct PAT login" 45 | exit 1 46 | fi 47 | fi 48 | } 49 | 50 | _create_project() { 51 | # Create an Azure DevOps project 52 | local azdo_project_name="${1}" 53 | echo -e "Creating the Azure DevOps project \"${azdo_project_name}\" in organization \"${AZURE_DEVOPS_ORG_URL}\"" 54 | azdo_project_id=$(az devops project show --project "${azdo_project_name}" --org "${AZURE_DEVOPS_ORG_URL}" --query id --output tsv) 55 | if [ -n "${azdo_project_id}" ]; then 56 | echo -e "Azure DevOps project \"${azdo_project_name}\" already exists with id \"${azdo_project_id}\"" 57 | else 58 | azdo_project_id=$(az devops project create --name "${azdo_project_name}" --org "${AZURE_DEVOPS_ORG_URL}" \ 59 | | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["id"])' 2> /dev/null ) 60 | if [ -n "${azdo_project_id}" ]; then 61 | echo -e "Azure DevOps project \"${azdo_project_name}\" created successfully" 62 | else 63 | echo -e "ERROR: Azure DevOps project \"${azdo_project_name}\" was not created successfully" 64 | exit 1 65 | fi 66 | fi 67 | } 68 | 69 | _delete_project() { 70 | # Delete an Azure DevOps project 71 | local azdo_project_name="${1}" 72 | echo -e "Deleting the Azure DevOps project \"${azdo_project_name}\" from organization \"${AZURE_DEVOPS_ORG_URL}\"" 73 | 74 | azdo_project_id=$(az devops project show --project "${azdo_project_name}" --org "${AZURE_DEVOPS_ORG_URL}" --query id --output tsv) 75 | if ! 
az devops project delete --yes --id "${azdo_project_id}" --org "${AZURE_DEVOPS_ORG_URL}"; then 76 | echo -e "ERROR: Azure DevOps project \"${azdo_project_name}\" was not deleted successfully" 77 | exit 1 78 | else 79 | echo -e "Azure DevOps project \"${azdo_project_name}\" deleted successfully" 80 | fi 81 | } 82 | 83 | case "${1}" in 84 | create) 85 | _check_args "$@" 86 | _check_auth 87 | _install_extension 88 | _create_project "${2}" 89 | ;; 90 | delete) 91 | _check_args "$@" 92 | _check_auth 93 | _install_extension 94 | _delete_project "${2}" 95 | ;; 96 | *) 97 | _usage 98 | ;; 99 | esac 100 | -------------------------------------------------------------------------------- /terraform/deployments/azure-infrastructure/variables.tf: -------------------------------------------------------------------------------- 1 | variable "DATA_SERVICE_PRINCIPAL_CLIENT_ID" { 2 | type = string 3 | description = "The Service Principal Client ID used by the data pipeline. This must already exist in the Azure AD Tenant." 4 | } 5 | 6 | variable "PROJECT_GROUP_NAME" { 7 | type = string 8 | description = "The name of the Project User Group. This must already exist in the Azure AD Tenant." 9 | } 10 | 11 | variable "RESOURCE_GROUP_NAME" { 12 | type = string 13 | description = "The name of the Resource Group in which the resources should be built. This must already exist." 14 | } 15 | 16 | variable "KEY_VAULT_NAME" { 17 | type = string 18 | description = "The name of the Azure Key Vault. This must already exist in the Azure AD Tenant." 19 | } 20 | 21 | variable "STORAGE_ACCOUNT_NAME" { 22 | type = string 23 | description = "The name of the Storage Account." 24 | } 25 | 26 | variable "PIPELINE_CONTAINER_NAME" { 27 | type = string 28 | description = "ADLS Gen 2 Filesystem Container for the Pipeline Data." 29 | } 30 | 31 | variable "PROJECT_CONTAINER_NAME" { 32 | type = string 33 | description = "ADLS Gen 2 Filesystem Container for the Project Data. It will be mounted to the Databricks workspace." 34 | } 35 | 36 | variable "DATA_FACTORY_NAME" { 37 | type = string 38 | description = "The name of the Azure Data Factory." 39 | } 40 | 41 | variable "DATABRICKS_WORKSPACE_NAME" { 42 | type = string 43 | description = "The name of the Azure Databricks workspace." 44 | } 45 | 46 | variable "DATABRICKS_PRICING_TIER" { 47 | type = string 48 | description = "(Optional) The pricing tier to use for the Databricks workspace. Possible values are standard, premium, or trial. Default is premium." 49 | default = "premium" 50 | } 51 | 52 | variable "DATABRICKS_VNET_NAME" { 53 | type = string 54 | description = "(Optional) The name of the Virtual Network where the Databricks clusters should be created. Default is workers-vnet." 55 | default = "workers-vnet" 56 | } 57 | 58 | variable "DATABRICKS_VNET_CIDR" { 59 | type = string 60 | description = "(Optional) CIDR range for the Virtual Network (must be at least /24). Default is 10.179.0.0/16" 61 | default = "10.179.0.0/16" 62 | } 63 | 64 | variable "DATABRICKS_PRIVATE_SUBNET_NAME" { 65 | type = string 66 | description = "(Optional) The name of the Private Subnet within the Virtual Network. Default is private-subnet." 67 | default = "private-subnet" 68 | } 69 | 70 | variable "DATABRICKS_PRIVATE_SUBNET_CIDR" { 71 | type = string 72 | description = "(Optional) CIDR range for the Private Subnet (must be at least /26). Default is 10.179.0.0/18." 
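  # Must be contained in DATABRICKS_VNET_CIDR and must not overlap DATABRICKS_PUBLIC_SUBNET_CIDR.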
73 | default = "10.179.0.0/18" 74 | } 75 | 76 | variable "DATABRICKS_PUBLIC_SUBNET_NAME" { 77 | type = string 78 | description = "(Optional) The name of the Public Subnet within the Virtual Network. Default is public-subnet." 79 | default = "public-subnet" 80 | } 81 | 82 | variable "DATABRICKS_PUBLIC_SUBNET_CIDR" { 83 | type = string 84 | description = "(Optional) CIDR range for the Public Subnet (must be at least /26). Default is 10.179.64.0/18." 85 | default = "10.179.64.0/18" 86 | } 87 | 88 | variable "DATABRICKS_NSG_NAME" { 89 | type = string 90 | description = "(Optional) The name of the Network Security Group attached to the Databricks subnets. Default is databricks-nsg." 91 | default = "databricks-nsg" 92 | } 93 | 94 | variable "DATABRICKS_DISABLE_PUBLIC_IP" { 95 | type = bool 96 | description = "(Optional) Set to true to deploy the workspace with Secure Cluster Connectivity (No Public IP) enabled. Default is false." 97 | default = false 98 | } 99 | 100 | variable "deployment_tags" { 101 | type = map(string) 102 | description = "A mapping of tags to assign to all resources." 103 | default = { 104 | DeploymentName = "azure-infrastructure" 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-vnet/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Virtual Network 4 | for [Databricks VNet injection](https://docs.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/vnet-inject). 5 | 6 | ## Inputs 7 | 8 | | Name | Description | Type | Default | Required | 9 | |-----------------------------|---------------------------------------------------------------------------|----------------|----------------------------|:--------:| 10 | | resource_group_name | The name of the Resource Group in which the resources should exist | `string` | n/a | yes | 11 | | azure_location | Azure location in which the resources should exist | `string` | `null` | no | 12 | | virtual_network_name | The name of the Virtual Network | `string` | `workers-vnet` | no | 13 | | virtual_network_cidr | CIDR range for the Virtual Network | `string` | `10.179.0.0/16` | no | 14 | | private_subnet_name | The name of the Private Subnet within the Virtual Network | `string` | `private-subnet` | no | 15 | | private_subnet_cidr | CIDR range for the Private Subnet | `string` | `10.179.0.0/18` | no | 16 | | public_subnet_name | The name of the Public Subnet within the Virtual Network | `string` | `public-subnet` | no | 17 | | public_subnet_cidr | CIDR range for the Public Subnet | `string` | `10.179.64.0/18` | no | 18 | | network_security_group_name | The name of the Databricks Network Security Group attached to the subnets | `string` | `databricks-nsg` | no | 19 | | use_nat_gateway | Set true to deploy a NAT gateway for no public ip subnets | `bool` | `false` | no | 20 | | nat_gateway_name | The name of the NAT gateway to be attached to the subnets | `string` | `databricks-nat-gateway` | no | 21 | | nat_gateway_public_ip_name | The name of the NAT gateway public IP | `string` | `databricks-nat-public-ip` | no | 22 | | service_endpoints | A list of service endpoints to associate with the public subnet | `list(string)` | `[]` | no | 23 | | tags | A mapping of tags to assign to the resources | `map(string)` | `{}` | no | 24 | 25 | ## Outputs 26 | 27 | | Name | Description | 28 | 
|---------------------------|-------------------------------------------------------------------------| 29 | | virtual_network_id | The ID of the Virtual Network | 30 | | virtual_network_name | The name of the Virtual Network | 31 | | private_subnet_id | The ID of the Private Subnet within the Virtual Network | 32 | | private_subnet_name | The name of the Private Subnet within the Virtual Network | 33 | | public_subnet_id | The ID of the Public Subnet within the Virtual Network | 34 | | public_subnet_name | The name of the Public Subnet within the Virtual Network | 35 | | network_security_group_id | The ID of the Databricks Network Security Group attached to the subnets | 36 | | nat_gateway_id | The ID of the NAT gateway attached to the subnets | 37 | | nat_public_ip_id | The ID of the NAT gateway public IP | 38 | -------------------------------------------------------------------------------- /admin/setup-with-terraform.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Builds the core infrastructure using Terraform. 4 | # Variables are loaded from the vars.sh file. 5 | # Only users can run this script as granting admin-consent with a Service Principal is not supported by the az ad cli. 6 | # The user executing this script must be Owner on the Subscription and Global administrator on the AD Tenant. 7 | # 8 | 9 | # Debug 10 | #set -x 11 | #export TF_LOG="DEBUG" 12 | 13 | 14 | ### Variables 15 | 16 | # Local variables 17 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 18 | _realpath="$(command -v realpath || echo _realpath )" 19 | _setup_script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 20 | _scripts_dir=${_setup_script_dir}/../scripts 21 | _python="$(command -v python || command -v python3)" 22 | 23 | # Import variables 24 | source "${_setup_script_dir}/vars.sh" 25 | 26 | # Required variables 27 | ## the Personal Access Token to authenticate to GitHub 28 | AZDO_GITHUB_SERVICE_CONNECTION_PAT=${AZDO_GITHUB_SERVICE_CONNECTION_PAT:-${AZURE_DEVOPS_EXT_GITHUB_PAT}} 29 | 30 | ## the Azure DevOps organization URL 31 | AZDO_ORG_SERVICE_URL=${AZDO_ORG_SERVICE_URL:-${AZURE_DEVOPS_ORG_URL}} 32 | 33 | ## the Azure DevOps organization Personal Access Token 34 | AZDO_PERSONAL_ACCESS_TOKEN=${AZDO_PERSONAL_ACCESS_TOKEN:-${AZURE_DEVOPS_EXT_PAT}} 35 | 36 | # Check the required variables 37 | if [ -z "${AZDO_GITHUB_SERVICE_CONNECTION_PAT}" ]; then 38 | echo "ERROR: The GitHub PAT token was not defined" 39 | echo " Either AZURE_DEVOPS_EXT_GITHUB_PAT or AZDO_GITHUB_SERVICE_CONNECTION_PAT variables must be set" 40 | exit 1 41 | else 42 | export AZDO_GITHUB_SERVICE_CONNECTION_PAT 43 | fi 44 | if [ -z "${AZDO_ORG_SERVICE_URL}" ]; then 45 | echo "ERROR: The Azure DevOps organization URL was not defined" 46 | echo " Either AZURE_DEVOPS_ORG_URL or AZDO_ORG_SERVICE_URL variables must be set" 47 | exit 1 48 | else 49 | export AZDO_ORG_SERVICE_URL 50 | fi 51 | if [ -z "${AZDO_PERSONAL_ACCESS_TOKEN}" ]; then 52 | echo "ERROR: The Azure DevOps organization Personal Access Token was not defined" 53 | echo " Either AZURE_DEVOPS_EXT_PAT or AZDO_PERSONAL_ACCESS_TOKEN variables must be set" 54 | exit 1 55 | else 56 | export AZDO_PERSONAL_ACCESS_TOKEN 57 | fi 58 | 59 | # Make sure the current Azure CLI login works (using a Service Principal is not supported for this) 60 | echo -e "Checking the existing Azure Authentication\n----------------------" 61 | az_account=$(az account show) 62 | if [ -z "${az_account}" ]; then 
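  # An empty 'az account show' response means no Azure CLI login is active in this shell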
63 | echo -e "ERROR: Authenticating using the Azure CLI as a User is required for this setup" 64 | echo -e " Granting admin-consent with a Service Principal is not supported by the az ad cli" 65 | echo -e " This is due to https://github.com/Azure/azure-cli/issues/12137" 66 | echo -e " Please run 'az login' with a directory administrator User Principal" 67 | exit 1 68 | fi 69 | user_type=$(echo "${az_account}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["user"]["type"])') 70 | user_name=$(echo "${az_account}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["user"]["name"])') 71 | [ -z "${user_type}" ] || [ -z "${user_name}" ] && exit 1 72 | if [ "${user_type}" == "servicePrincipal" ]; then 73 | echo -e "ERROR: Authenticating using the Azure CLI is only supported as a User (not a Service Principal)" 74 | echo -e " Please run 'az login' with a directory administrator User Principal" 75 | exit 1 76 | else 77 | echo -e "Will use the current Azure CLI login (\"${user_name}\") to authenticate to Azure RM" 78 | fi 79 | 80 | # Set the Subscription 81 | if [ -n "${ARM_SUBSCRIPTION_ID}" ]; then 82 | echo -e "Setting the active subscription to \"${ARM_SUBSCRIPTION_ID}\"" 83 | az account set --subscription "${ARM_SUBSCRIPTION_ID}" || exit 1 84 | fi 85 | echo 86 | 87 | 88 | ### Run Terraform to build the core infrastructure 89 | echo 90 | echo -e "Building the core infrastructure with Terraform\n----------------------------------------------\n" 91 | source "${_scripts_dir}/terraform_azure.sh" apply "${_setup_script_dir}/terraform" -parallelism=3 "$@" -auto-approve 92 | echo 93 | -------------------------------------------------------------------------------- /terraform/tests/azure/data-factory/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests for the data-factory module 3 | */ 4 | provider "azurerm" { 5 | features {} 6 | } 7 | 8 | terraform { 9 | required_version = "~> 1.5.7" 10 | 11 | required_providers { 12 | azurerm = { 13 | source = "hashicorp/azurerm" 14 | version = "~> 3" 15 | } 16 | random = { 17 | source = "hashicorp/random" 18 | version = "~> 3" 19 | } 20 | } 21 | } 22 | 23 | # Minimum of variables required for the test 24 | variable "azure_location" { default = "westeurope" } 25 | variable "resource_group_name" { default = null } 26 | 27 | # Create a random string for test uniqueness 28 | resource "random_string" "suffix" { 29 | length = 10 30 | numeric = true 31 | lower = true 32 | upper = false 33 | special = false 34 | } 35 | 36 | # Set the rest of the test variables using the random string 37 | locals { 38 | resource_group_name = var.resource_group_name == null ? 
"tftest-rg-${random_string.suffix.result}" : var.resource_group_name 39 | data_factory_defaults = "tftest-defaults-${random_string.suffix.result}" 40 | data_factory_with_key_vault = "tftest-with-akv-${random_string.suffix.result}" 41 | key_vault_name = "tftest-akv-${random_string.suffix.result}" 42 | custom_tags = { Purpose = "Terraform-test-${random_string.suffix.result}" } 43 | } 44 | 45 | # Create an empty Resource Group to be used by the rest of the resources 46 | data "azurerm_client_config" "current" {} 47 | 48 | module "test_resource_group" { 49 | source = "../../../modules/azure/resource-group" 50 | azure_location = var.azure_location 51 | resource_group_name = local.resource_group_name 52 | owners = [data.azurerm_client_config.current.object_id] 53 | tags = local.custom_tags 54 | } 55 | 56 | # Build a Key Vault for the Azure Data Factory linked service 57 | module "key_vault" { 58 | source = "../../../modules/azure/key-vault" 59 | resource_group_name = module.test_resource_group.name 60 | azure_location = var.azure_location 61 | key_vault_name = local.key_vault_name 62 | } 63 | 64 | # Marker for test dependencies 65 | resource "null_resource" "test_dependencies" { 66 | triggers = { 67 | rg = join(",", [module.test_resource_group.id, module.key_vault.id]) 68 | } 69 | depends_on = [module.test_resource_group, module.key_vault.id] 70 | } 71 | 72 | # Build an Azure Data Factory with default parameters 73 | module "test_data_factory_defaults" { 74 | source = "../../../modules/azure/data-factory" 75 | resource_group_name = local.resource_group_name 76 | data_factory_name = local.data_factory_defaults 77 | depends_on = [null_resource.test_dependencies] 78 | } 79 | 80 | # Build an Azure Data Factory with a Key Vault linked service 81 | module "test_data_factory_with_key_vault" { 82 | source = "../../../modules/azure/data-factory" 83 | azure_location = var.azure_location 84 | resource_group_name = local.resource_group_name 85 | data_factory_name = local.data_factory_with_key_vault 86 | key_vault_ids = [module.key_vault.id] 87 | tags = local.custom_tags 88 | depends_on = [null_resource.test_dependencies] 89 | } 90 | 91 | # Terraform output 92 | output "data_factory_tests" { 93 | value = { 94 | test_data_factory_defaults = { 95 | id = module.test_data_factory_defaults.id 96 | name = module.test_data_factory_defaults.name 97 | principal_id = module.test_data_factory_defaults.principal_id 98 | key_vault_linked_services = module.test_data_factory_defaults.key_vault_linked_services 99 | } 100 | test_data_factory_with_key_vault = { 101 | id = module.test_data_factory_with_key_vault.id 102 | name = module.test_data_factory_with_key_vault.name 103 | principal_id = module.test_data_factory_with_key_vault.principal_id 104 | key_vault_linked_services = module.test_data_factory_with_key_vault.key_vault_linked_services 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /scripts/sync_group.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Syncs an Azure Active Directory Group to a Databricks workspace using the SCIM API (https://docs.databricks.com/dev-tools/api/latest/scim/index.html). 4 | # The group can contain users or service principals. 5 | # It uses simple positional arguments. 6 | # Returns the Databricks Group ID as a variable called databricksGroupId in the Azure Pipelines format. 
7 | # 8 | 9 | # Required parameters 10 | _workspace_url=${1} 11 | _access_token=${2} 12 | _group_name=${3} 13 | 14 | # Local variables 15 | _realpath() { [[ ${1} == /* ]] && echo "${1}" || echo "${PWD}"/"${1#./}"; } 16 | _realpath="$(command -v realpath || echo _realpath )" 17 | _script_dir=$(${_realpath} "$(dirname "${BASH_SOURCE[0]}")") 18 | _python="$(command -v python || command -v python3)" 19 | 20 | _usage() { 21 | echo -e "Usage: ${0} <workspace_url> <access_token> <group_name>" 22 | exit 1 23 | } 24 | 25 | # Parameters check 26 | [ -z "${_workspace_url}" ] && _usage 27 | [ -z "${_access_token}" ] && _usage 28 | [ -z "${_group_name}" ] && _usage 29 | 30 | # Use the az cli command to get the Group members from Azure AD 31 | echo -e "Checking the Group \"${_group_name}\" in the Azure AD Tenant" 32 | group_members=$(az ad group member list --group "${_group_name}" \ 33 | --query "[].[objectType, userPrincipalName || appId, displayName || appDisplayName ]" \ 34 | --output tsv) 35 | 36 | # Add the Group members as new users to the Databricks workspace 37 | IFS=$'\n' 38 | for member in ${group_members}; do 39 | IFS=$'\t' 40 | # ${member} will contain 3 tab separated strings as retrieved from Azure: principal_type principal_name display_name 41 | source "${_script_dir}/add_principal_to_workspace.sh" "${_workspace_url}" "${_access_token}" ${member} 42 | new_members="${principal_id} ${new_members}" 43 | done 44 | 45 | # Set the payload 46 | IFS=' ' 47 | payload=' 48 | { 49 | "schemas":[ "urn:ietf:params:scim:schemas:core:2.0:Group" ], 50 | "displayName":"'${_group_name}'", 51 | "members":[ 52 | '$( 53 | for last in ${new_members}; do true; done 54 | for member in ${new_members} 55 | do 56 | echo '{ "value":"'"${member}"'" }'"$([ "${member}" != "${last}" ] && echo ',')"'' 57 | 58 | done 59 | )' 60 | ] 61 | } 62 | ' 63 | 64 | # Check if the Group already exists in the Databricks workspace 65 | echo -e "Checking the Group \"${_group_name}\" in workspace \"${_workspace_url}\"" 66 | _response=$(curl -sS --request GET \ 67 | --header "Authorization: Bearer ${_access_token}" \ 68 | --header "Accept: application/scim+json" \ 69 | "${_workspace_url}/api/2.0/preview/scim/v2/Groups?filter=displayName+eq+%22${_group_name// /$'%20'}%22") 70 | group_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["Resources"][0]["id"])' 2> /dev/null) 71 | 72 | # Create the Group if it doesn't exist, otherwise update it 73 | if [ -z "${group_id}" ]; then 74 | echo -e "Creating the Group \"${_group_name}\"" 75 | _response=$(curl -sS --request POST \ 76 | --header "Authorization: Bearer ${_access_token}" \ 77 | --header "Content-Type: application/scim+json" \ 78 | "${_workspace_url}/api/2.0/preview/scim/v2/Groups" \ 79 | -d "${payload}") 80 | else 81 | echo -e "Updating the Group \"${_group_name}\"" 82 | _response=$(curl -sS --request PUT \ 83 | --header "Authorization: Bearer ${_access_token}" \ 84 | --header "Content-Type: application/scim+json" \ 85 | "${_workspace_url}/api/2.0/preview/scim/v2/Groups/${group_id}" \ 86 | -d "${payload}") 87 | fi 88 | 89 | # Check the response and extract the Databricks Group ID 90 | group_id=$(echo "${_response}" | ${_python} -c 'import sys,json; print(json.load(sys.stdin)["id"])') 91 | [ -z "${group_id}" ] && { echo "${_response}"; exit 1; } 92 | echo "Databricks Group ID: ${group_id}" 93 | 94 | # Pass the variables to Azure Pipelines 95 | if [ "${BASH_SOURCE[0]}" == "$0" ]; then 96 | [ -n "${group_id}" ] && echo "##vso[task.setvariable variable=databricksGroupId]${group_id}" || exit
1 97 | fi 98 | -------------------------------------------------------------------------------- /admin/vars.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### General variables 4 | AZURE_LOCATION=${AZURE_LOCATION:-"westeurope"} 5 | #ARM_SUBSCRIPTION_ID="" 6 | #ARM_TENANT_ID="" 7 | 8 | # the Azure DevOps organization url 9 | AZURE_DEVOPS_ORG_URL=${AZURE_DEVOPS_ORG_URL:-${AZDO_ORG_SERVICE_URL}} 10 | 11 | # the Azure DevOps organization Personal Access Token - required for Terraform, optional for Azure CLI 12 | AZURE_DEVOPS_EXT_PAT=${AZURE_DEVOPS_EXT_PAT:-${AZDO_PERSONAL_ACCESS_TOKEN}} 13 | 14 | # the Personal Access Token to authenticate to GitHub 15 | AZURE_DEVOPS_EXT_GITHUB_PAT=${AZURE_DEVOPS_EXT_GITHUB_PAT:-${AZDO_GITHUB_SERVICE_CONNECTION_PAT}} 16 | 17 | # a suffix to append to most names 18 | SUFFIX=${SUFFIX:-"demo"} 19 | 20 | ### Service Principals and Groups 21 | INFRA_SP_NAME=${INFRA_SP_NAME:-"spn-infra-${SUFFIX}"} 22 | DATA_SP_NAME=${DATA_SP_NAME:-"spn-data-${SUFFIX}"} 23 | PROJECT_GROUP_NAME=${PROJECT_GROUP_NAME:-"Project Group ${SUFFIX}"} 24 | 25 | ### Databricks resources 26 | DATABRICKS_RESOURCE_GROUP_NAME=${DATABRICKS_RESOURCE_GROUP_NAME:-"rg-databricks-${SUFFIX}"} 27 | KEY_VAULT_NAME=${KEY_VAULT_NAME:-"kv-main-${SUFFIX}"} 28 | SECRET_NAME="dataServicePrincipalClientSecret" 29 | 30 | ### Terraform resources 31 | # remove if not using Terraform 32 | TF_RESOURCE_GROUP_NAME=${TF_RESOURCE_GROUP_NAME:-"rg-terraform-${SUFFIX}"} 33 | TF_STORAGE_ACCOUNT_NAME=${TF_STORAGE_ACCOUNT_NAME:-"stterraform123${SUFFIX}"} 34 | TF_CONTAINER_NAME=${TF_CONTAINER_NAME:-"tfstate"} 35 | 36 | ### Azure DevOps variables 37 | AZURE_DEVOPS_PROJECT_NAME=${AZURE_DEVOPS_PROJECT_NAME:-"my-project-${SUFFIX}"} 38 | 39 | AZURE_DEVOPS_INFRA_ARM_ENDPOINT_NAME=${AZURE_DEVOPS_INFRA_ARM_ENDPOINT_NAME:-"my-infra-endpoint-${SUFFIX}"} 40 | AZURE_DEVOPS_DATA_ARM_ENDPOINT_NAME=${AZURE_DEVOPS_DATA_ARM_ENDPOINT_NAME:-"my-data-endpoint-${SUFFIX}"} 41 | 42 | AZURE_DEVOPS_GITHUB_ENDPOINT_NAME=${AZURE_DEVOPS_GITHUB_ENDPOINT_NAME:-"my-git-endpoint-${SUFFIX}"} 43 | AZURE_DEVOPS_GITHUB_REPO_URL=${AZURE_DEVOPS_GITHUB_REPO_URL:-"https://github.com/alexandruanghel/azdo-databricks"} 44 | AZURE_DEVOPS_GITHUB_BRANCH=${AZURE_DEVOPS_GITHUB_BRANCH:-"master"} 45 | 46 | AZURE_DEVOPS_INFRA_PIPELINE_NAME=${AZURE_DEVOPS_INFRA_PIPELINE_NAME:-"my-infra-pipeline-${SUFFIX}"} 47 | AZURE_DEVOPS_INFRA_PIPELINE_PATH_CLI=${AZURE_DEVOPS_INFRA_PIPELINE_PATH:-"pipelines/azure-pipelines-infra-with-azure-cli.yml"} 48 | AZURE_DEVOPS_INFRA_PIPELINE_PATH_TF=${AZURE_DEVOPS_INFRA_PIPELINE_PATH:-"pipelines/azure-pipelines-infra-with-terraform.yml"} 49 | AZURE_DEVOPS_DATA_PIPELINE_NAME=${AZURE_DEVOPS_DATA_PIPELINE_NAME:-"my-data-pipeline-${SUFFIX}"} 50 | AZURE_DEVOPS_DATA_PIPELINE_PATH=${AZURE_DEVOPS_DATA_PIPELINE_PATH:-"pipelines/azure-pipelines-data-factory-msi.yml"} 51 | #AZURE_DEVOPS_DATA_PIPELINE_PATH=${AZURE_DEVOPS_DATA_PIPELINE_PATH:-"pipelines/azure-pipelines-data-factory-accesstoken.yml"} 52 | 53 | ### Terraform variables 54 | export TF_VAR_AZURE_LOCATION="${AZURE_LOCATION}" 55 | 56 | export TF_VAR_INFRA_SP_NAME="${INFRA_SP_NAME}" 57 | export TF_VAR_DATA_SP_NAME="${DATA_SP_NAME}" 58 | export TF_VAR_PROJECT_GROUP_NAME="${PROJECT_GROUP_NAME}" 59 | 60 | export TF_VAR_DATABRICKS_RESOURCE_GROUP_NAME="${DATABRICKS_RESOURCE_GROUP_NAME}" 61 | export TF_VAR_KEY_VAULT_NAME="${KEY_VAULT_NAME}" 62 | export TF_VAR_SECRET_NAME="${SECRET_NAME}" 63 | 64 | export 
TF_VAR_TF_RESOURCE_GROUP_NAME="${TF_RESOURCE_GROUP_NAME}" 65 | export TF_VAR_TF_STORAGE_ACCOUNT_NAME="${TF_STORAGE_ACCOUNT_NAME}" 66 | export TF_VAR_TF_CONTAINER_NAME="${TF_CONTAINER_NAME}" 67 | 68 | export TF_VAR_AZURE_DEVOPS_PROJECT_NAME="${AZURE_DEVOPS_PROJECT_NAME}" 69 | export TF_VAR_AZURE_DEVOPS_INFRA_ARM_ENDPOINT_NAME="${AZURE_DEVOPS_INFRA_ARM_ENDPOINT_NAME}" 70 | export TF_VAR_AZURE_DEVOPS_DATA_ARM_ENDPOINT_NAME="${AZURE_DEVOPS_DATA_ARM_ENDPOINT_NAME}" 71 | export TF_VAR_AZURE_DEVOPS_GITHUB_ENDPOINT_NAME="${AZURE_DEVOPS_GITHUB_ENDPOINT_NAME}" 72 | export TF_VAR_AZURE_DEVOPS_GITHUB_REPO_URL="${AZURE_DEVOPS_GITHUB_REPO_URL}" 73 | export TF_VAR_AZURE_DEVOPS_GITHUB_BRANCH="${AZURE_DEVOPS_GITHUB_BRANCH}" 74 | export TF_VAR_AZURE_DEVOPS_INFRA_PIPELINE_NAME="${AZURE_DEVOPS_INFRA_PIPELINE_NAME}" 75 | export TF_VAR_AZURE_DEVOPS_INFRA_PIPELINE_PATH="${AZURE_DEVOPS_INFRA_PIPELINE_PATH_TF}" 76 | export TF_VAR_AZURE_DEVOPS_DATA_PIPELINE_NAME="${AZURE_DEVOPS_DATA_PIPELINE_NAME}" 77 | export TF_VAR_AZURE_DEVOPS_DATA_PIPELINE_PATH="${AZURE_DEVOPS_DATA_PIPELINE_PATH}" 78 | -------------------------------------------------------------------------------- /terraform/modules/azure/databricks-workspace/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Creates an Azure Databricks workspace with 4 | optional [VNet injection](https://docs.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/vnet-inject). 5 | 6 | ## Inputs 7 | 8 | | Name | Description | Type | Default | Required | 9 | |------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|---------------|------------------|:--------:| 10 | | resource_group_name | The name of the Resource Group in which the resources should exist | `string` | n/a | yes | 11 | | azure_location | Azure location in which the resources should exist | `string` | `null` | no | 12 | | workspace_name | The name of the Databricks workspace resource | `string` | n/a | yes | 13 | | managed_resource_group_name | The name of the Resource Group where Azure should place the managed Databricks resources | `string` | `null` | no | 14 | | pricing_tier | The pricing tier to use for the Databricks workspace | `string` | `premium` | no | 15 | | virtual_network_id | The Azure Resource ID of the Virtual Network for VNet injection | `string` | `null` | no | 16 | | private_subnet_name | The name of the Private Subnet within the Virtual Network | `string` | `private-subnet` | no | 17 | | private_subnet_network_security_group_association_id | The resource ID of the azurerm_subnet_network_security_group_association resource which is referred to by the private_subnet_name field | `string` | `null` | no | 18 | | public_subnet_name | The name of the Public Subnet within the Virtual Network | `string` | `public-subnet` | no | 19 | | public_subnet_network_security_group_association_id | The resource ID of the azurerm_subnet_network_security_group_association resource which is referred to by the public_subnet_name field | `string` | `null` | no | 20 | | tags | A mapping of tags to assign to the resources | `map(string)` | `{}` | no | 21 | 22 | ## Outputs 23 | 24 | | Name | Description | 25 | |-----------------------------|-------------------------------------------------------------------------------| 26 | | id | The Azure Resource ID of the Databricks workspace | 27 | | 
workspace_name | The name of the Databricks workspace | 28 | | workspace_id | The unique identifier of the Databricks workspace in Databricks control plane | 29 | | workspace_url | The workspace URL | 30 | | managed_resource_group_name | The name of the Managed Resource Group for managed Databricks resources | 31 | | managed_resource_group_id | The Azure Resource ID of the Managed Resource Group | 32 | --------------------------------------------------------------------------------
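A minimal usage sketch combining the two modules documented above. It only uses inputs and outputs listed in the databricks-vnet and databricks-workspace READMEs; the module paths assume the repository layout shown in the tree, the resource group is assumed to already exist, the resource and workspace names are illustrative, and this is not the repository's own deployment code (that lives under terraform/deployments/azure-infrastructure). The optional subnet/NSG association IDs are left at their documented null defaults here; a real deployment would typically pass them so the workspace is created only after the NSG associations exist.

terraform {
  required_version = "~> 1.5.7"
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~> 3"
    }
  }
}

provider "azurerm" {
  features {}
}

# Databricks VNet for VNet injection (resource group name and CIDR are illustrative)
module "databricks_vnet" {
  source               = "./terraform/modules/azure/databricks-vnet"
  resource_group_name  = "rg-databricks-demo"
  virtual_network_cidr = "10.179.0.0/16"
}

# VNet-injected Databricks workspace wired to the module outputs documented above
module "databricks_workspace" {
  source              = "./terraform/modules/azure/databricks-workspace"
  resource_group_name = "rg-databricks-demo"
  workspace_name      = "dbw-demo"
  pricing_tier        = "premium"
  virtual_network_id  = module.databricks_vnet.virtual_network_id
  private_subnet_name = module.databricks_vnet.private_subnet_name
  public_subnet_name  = module.databricks_vnet.public_subnet_name
}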