├── .gitignore ├── adfv2 ├── dataset │ ├── csv_files_key.json │ ├── csv_files_mi.json │ ├── parquet_files_key.json │ └── parquet_files_mi.json ├── factory │ ├── blog-datapipeline-adfv2.json │ ├── blog-datapipeline-adfv2dev.json │ └── blog-datapipeline-dev.json ├── linkedService │ ├── AzureDataLakeStorage1.json │ ├── AzureDataLakeStorage2KEY.json │ ├── AzureDatabricks1.json │ ├── AzureDatabricks_mi.json │ └── AzureKeyVault.json ├── pipeline │ ├── blog-datapipeline-pipeline-dbrmi.json │ ├── blog-datapipeline-pipeline-key.json │ └── blog-datapipeline-pipeline-mi.json └── publish_config.json ├── azure-pipelines.yml ├── cosmosdb_firewall_nopip_policy.json ├── data ├── dboPerson.txt └── dboRelation.txt ├── libraries ├── azure-cosmos-spark_3-1_2-12-4.2.0.jar └── graphframes-0.8.1-spark3.0-s_2.12.jar ├── notebooks ├── insert_data_CosmosDB_Gremlin.py └── mount_ADLSgen2_rawdata.py ├── pictures ├── 2_architecture.png ├── architecture_detailed.png └── old_architecture_detailed.png ├── readme.md └── scripts ├── 1_create_resources.sh ├── 2_configure_databricks.sh ├── 3_configure_access_storage_databricks.sh ├── 4_configure_secret_scope_databricks.sh ├── 5_configure_mount_storage_databricks.sh ├── 6_configure_firewall.sh └── 99_databricks_sqlapi.sh /.gitignore: -------------------------------------------------------------------------------- 1 | /oud -------------------------------------------------------------------------------- /adfv2/dataset/csv_files_key.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv_files_key", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage2KEY", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url", 9 | "akv_url": "@pipeline().globalParameters.akv_url" 10 | } 11 | }, 12 | "annotations": [], 13 | "type": "DelimitedText", 14 | "typeProperties": { 15 | "location": { 16 | "type": "AzureBlobFSLocation", 17 | "fileSystem": "rawdata" 18 | }, 19 | "columnDelimiter": ",", 20 | "escapeChar": "\\", 21 | "firstRowAsHeader": true, 22 | "quoteChar": "\"" 23 | }, 24 | "schema": [ 25 | { 26 | "name": "id", 27 | "type": "String" 28 | }, 29 | { 30 | "name": "name", 31 | "type": "String" 32 | }, 33 | { 34 | "name": "age", 35 | "type": "String" 36 | } 37 | ] 38 | }, 39 | "type": "Microsoft.DataFactory/factories/datasets" 40 | } -------------------------------------------------------------------------------- /adfv2/dataset/csv_files_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv_files_mi", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage1", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url" 9 | } 10 | }, 11 | "annotations": [], 12 | "type": "DelimitedText", 13 | "typeProperties": { 14 | "location": { 15 | "type": "AzureBlobFSLocation", 16 | "fileSystem": "rawdata" 17 | }, 18 | "columnDelimiter": ",", 19 | "escapeChar": "\\", 20 | "firstRowAsHeader": true, 21 | "quoteChar": "\"" 22 | }, 23 | "schema": [ 24 | { 25 | "name": "id", 26 | "type": "String" 27 | }, 28 | { 29 | "name": "name", 30 | "type": "String" 31 | }, 32 | { 33 | "name": "age", 34 | "type": "String" 35 | } 36 | ] 37 | }, 38 | "type": "Microsoft.DataFactory/factories/datasets" 39 | } -------------------------------------------------------------------------------- /adfv2/dataset/parquet_files_key.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "parquet_files_key", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage2KEY", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url", 9 | "akv_url": "@pipeline().globalParameters.akv_url" 10 | } 11 | }, 12 | "annotations": [], 13 | "type": "Parquet", 14 | "typeProperties": { 15 | "location": { 16 | "type": "AzureBlobFSLocation", 17 | "fileSystem": "defineddata" 18 | }, 19 | "compressionCodec": "snappy" 20 | }, 21 | "schema": [] 22 | }, 23 | "type": "Microsoft.DataFactory/factories/datasets" 24 | } -------------------------------------------------------------------------------- /adfv2/dataset/parquet_files_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "parquet_files_mi", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage1", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url" 9 | } 10 | }, 11 | "annotations": [], 12 | "type": "Parquet", 13 | "typeProperties": { 14 | "location": { 15 | "type": "AzureBlobFSLocation", 16 | "fileSystem": "defineddata" 17 | }, 18 | "compressionCodec": "snappy" 19 | }, 20 | "schema": [] 21 | }, 22 | "type": "Microsoft.DataFactory/factories/datasets" 23 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-adfv2.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-adfv2", 3 | "location": "westeurope" 4 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-adfv2dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-adfv2dev", 3 | "properties": { 4 | "globalParameters": { 5 | "stor_url": { 6 | "type": "string", 7 | "value": "1" 8 | }, 9 | "stor_name": { 10 | "type": "string", 11 | "value": "2" 12 | }, 13 | "akv_url": { 14 | "type": "string", 15 | "value": "3" 16 | }, 17 | "cluster_id": { 18 | "type": "string", 19 | "value": "4" 20 | }, 21 | "notebook_name": { 22 | "type": "string", 23 | "value": "5" 24 | }, 25 | "workspace_id_url": { 26 | "type": "string", 27 | "value": "6" 28 | }, 29 | "cosmosdb_name": { 30 | "type": "string", 31 | "value": "7" 32 | }, 33 | "vaultBaseUrl": { 34 | "type": "string", 35 | "value": "8" 36 | } 37 | } 38 | }, 39 | "location": "westeurope" 40 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-dev", 3 | "properties": { 4 | "globalParameters": { 5 | "notebook_name": { 6 | "type": "string", 7 | "value": "1" 8 | }, 9 | "stor_name": { 10 | "type": "string", 11 | "value": "2" 12 | }, 13 | "stor_url": { 14 | "type": "string", 15 | "value": "3" 16 | }, 17 | "cluster_id": { 18 | "type": "string", 19 | "value": "4" 20 | }, 21 | "akv_url": { 22 | "type": "string", 23 | "value": "5" 24 | }, 25 | "vaultBaseUrl": { 26 | "type": "string", 27 | "value": "6" 28 | }, 29 | "workspace_id_url": { 30 | "type": "string", 31 | "value": "7" 32 | }, 33 | "cosmosdb_name": { 34 | "type": "string", 35 | "value": "8" 36 | } 37 | } 38 | }, 39 | "location": 
"westeurope" 40 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDataLakeStorage1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDataLakeStorage1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureBlobFS", 6 | "annotations": [], 7 | "typeProperties": { 8 | "url": "@{linkedService().stor_url}" 9 | }, 10 | "parameters": { 11 | "stor_url": { 12 | "type": "String" 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDataLakeStorage2KEY.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDataLakeStorage2KEY", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureBlobFS", 6 | "annotations": [], 7 | "typeProperties": { 8 | "url": "@{linkedService().stor_url}", 9 | "accountKey": { 10 | "type": "AzureKeyVaultSecret", 11 | "store": { 12 | "referenceName": "AzureKeyVault", 13 | "type": "LinkedServiceReference", 14 | "parameters": { 15 | "akv_url": { 16 | "value": "@linkedService().akv_url", 17 | "type": "Expression" 18 | } 19 | } 20 | }, 21 | "secretName": "stor-key" 22 | } 23 | }, 24 | "parameters": { 25 | "stor_url": { 26 | "type": "String" 27 | }, 28 | "akv_url": { 29 | "type": "String" 30 | } 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDatabricks1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "parameters": { 6 | "workspace_id_url": { 7 | "type": "String" 8 | }, 9 | "cluster_id": { 10 | "type": "String" 11 | }, 12 | "vaultBaseUrl": { 13 | "type": "string", 14 | "defaultValue": "https://blogdatapipelineprodakv2.vault.azure.net/" 15 | } 16 | }, 17 | "annotations": [], 18 | "type": "AzureDatabricks", 19 | "typeProperties": { 20 | "domain": "@{linkedService().workspace_id_url}", 21 | "accessToken": { 22 | "type": "AzureKeyVaultSecret", 23 | "store": { 24 | "referenceName": "AzureKeyVault", 25 | "type": "LinkedServiceReference", 26 | "parameters": { 27 | "akv_url": { 28 | "value": "@linkedService().vaultBaseUrl", 29 | "type": "Expression" 30 | } 31 | } 32 | }, 33 | "secretName": "pattoken" 34 | }, 35 | "existingClusterId": "@{linkedService().cluster_id}" 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDatabricks_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks_mi", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "parameters": { 6 | "workspace_id_url": { 7 | "type": "String" 8 | }, 9 | "cluster_id": { 10 | "type": "String" 11 | }, 12 | "dbr_resource_id": { 13 | "type": "String" 14 | } 15 | }, 16 | "annotations": [], 17 | "type": "AzureDatabricks", 18 | "typeProperties": { 19 | "domain": "@{linkedService().workspace_id_url}", 20 | "authentication": "MSI", 21 | "workspaceResourceId": "@{linkedService().dbr_resource_id}", 22 | "existingClusterId": "@{linkedService().cluster_id}" 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- 
/adfv2/linkedService/AzureKeyVault.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureKeyVault", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureKeyVault", 6 | "annotations": [], 7 | "typeProperties": { 8 | "baseUrl": "@{linkedService().akv_url}" 9 | }, 10 | "parameters": { 11 | "akv_url": { 12 | "type": "String" 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-dbrmi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-pipeline-dbrmi", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "type": "Copy", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [ 17 | { 18 | "name": "Source", 19 | "value": "rawdata//" 20 | }, 21 | { 22 | "name": "Destination", 23 | "value": "defineddata//" 24 | } 25 | ], 26 | "typeProperties": { 27 | "source": { 28 | "type": "DelimitedTextSource", 29 | "storeSettings": { 30 | "type": "AzureBlobFSReadSettings", 31 | "recursive": true, 32 | "wildcardFileName": "*" 33 | }, 34 | "formatSettings": { 35 | "type": "DelimitedTextReadSettings", 36 | "skipLineCount": 0 37 | } 38 | }, 39 | "sink": { 40 | "type": "ParquetSink", 41 | "storeSettings": { 42 | "type": "AzureBlobFSWriteSettings" 43 | }, 44 | "formatSettings": { 45 | "type": "ParquetWriteSettings" 46 | } 47 | }, 48 | "enableStaging": false, 49 | "validateDataConsistency": false 50 | }, 51 | "inputs": [ 52 | { 53 | "referenceName": "csv_files_mi", 54 | "type": "DatasetReference" 55 | } 56 | ], 57 | "outputs": [ 58 | { 59 | "referenceName": "parquet_files_mi", 60 | "type": "DatasetReference" 61 | } 62 | ] 63 | }, 64 | { 65 | "name": "Create graph and write CosmosDB", 66 | "type": "DatabricksNotebook", 67 | "dependsOn": [ 68 | { 69 | "activity": "Copy csv to parquet", 70 | "dependencyConditions": [ 71 | "Succeeded" 72 | ] 73 | } 74 | ], 75 | "policy": { 76 | "timeout": "7.00:00:00", 77 | "retry": 0, 78 | "retryIntervalInSeconds": 30, 79 | "secureOutput": false, 80 | "secureInput": false 81 | }, 82 | "userProperties": [], 83 | "typeProperties": { 84 | "notebookPath": { 85 | "value": "@pipeline().globalParameters.notebook_name", 86 | "type": "Expression" 87 | }, 88 | "baseParameters": { 89 | "cosmosdb_name": { 90 | "value": "@pipeline().globalParameters.cosmosdb_name", 91 | "type": "Expression" 92 | }, 93 | "stor_name": { 94 | "value": "@pipeline().globalParameters.stor_name", 95 | "type": "Expression" 96 | } 97 | } 98 | }, 99 | "linkedServiceName": { 100 | "referenceName": "AzureDatabricks_mi", 101 | "type": "LinkedServiceReference", 102 | "parameters": { 103 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 104 | "cluster_id": "@pipeline().globalParameters.cluster_id", 105 | "dbr_resource_id": "@pipeline().globalParameters.dbr_resource_id" 106 | } 107 | } 108 | } 109 | ], 110 | "annotations": [], 111 | "lastPublishTime": "2020-10-20T10:52:16Z" 112 | }, 113 | "type": "Microsoft.DataFactory/factories/pipelines" 114 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-key.json: -------------------------------------------------------------------------------- 1 | { 2 
| "name": "blog-datapipeline-pipeline-key", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "description": "Copy csv to parquet", 8 | "type": "Copy", 9 | "dependsOn": [], 10 | "policy": { 11 | "timeout": "7.00:00:00", 12 | "retry": 0, 13 | "retryIntervalInSeconds": 30, 14 | "secureOutput": false, 15 | "secureInput": false 16 | }, 17 | "userProperties": [ 18 | { 19 | "name": "Source", 20 | "value": "rawdata//" 21 | }, 22 | { 23 | "name": "Destination", 24 | "value": "defineddata//" 25 | } 26 | ], 27 | "typeProperties": { 28 | "source": { 29 | "type": "DelimitedTextSource", 30 | "storeSettings": { 31 | "type": "AzureBlobFSReadSettings", 32 | "recursive": true, 33 | "wildcardFileName": "*" 34 | }, 35 | "formatSettings": { 36 | "type": "DelimitedTextReadSettings", 37 | "skipLineCount": 0 38 | } 39 | }, 40 | "sink": { 41 | "type": "ParquetSink", 42 | "storeSettings": { 43 | "type": "AzureBlobFSWriteSettings" 44 | }, 45 | "formatSettings": { 46 | "type": "ParquetWriteSettings" 47 | } 48 | }, 49 | "enableStaging": false, 50 | "validateDataConsistency": false 51 | }, 52 | "inputs": [ 53 | { 54 | "referenceName": "csv_files_key", 55 | "type": "DatasetReference" 56 | } 57 | ], 58 | "outputs": [ 59 | { 60 | "referenceName": "parquet_files_key", 61 | "type": "DatasetReference" 62 | } 63 | ] 64 | }, 65 | { 66 | "name": "Create graph and write CosmosDB", 67 | "type": "DatabricksNotebook", 68 | "dependsOn": [ 69 | { 70 | "activity": "Copy csv to parquet", 71 | "dependencyConditions": [ 72 | "Succeeded" 73 | ] 74 | } 75 | ], 76 | "policy": { 77 | "timeout": "7.00:00:00", 78 | "retry": 0, 79 | "retryIntervalInSeconds": 30, 80 | "secureOutput": false, 81 | "secureInput": false 82 | }, 83 | "userProperties": [], 84 | "typeProperties": { 85 | "notebookPath": { 86 | "value": "@pipeline().globalParameters.notebook_name", 87 | "type": "Expression" 88 | }, 89 | "baseParameters": { 90 | "cosmosdb_name": { 91 | "value": "@pipeline().globalParameters.cosmosdb_name", 92 | "type": "Expression" 93 | }, 94 | "stor_name": { 95 | "value": "@pipeline().globalParameters.stor_name", 96 | "type": "Expression" 97 | } 98 | } 99 | }, 100 | "linkedServiceName": { 101 | "referenceName": "AzureDatabricks1", 102 | "type": "LinkedServiceReference", 103 | "parameters": { 104 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 105 | "cluster_id": "@pipeline().globalParameters.cluster_id", 106 | "vaultBaseUrl": "@pipeline().globalParameters.vaultBaseUrl" 107 | } 108 | } 109 | } 110 | ], 111 | "annotations": [], 112 | "lastPublishTime": "2020-10-20T10:52:17Z" 113 | }, 114 | "type": "Microsoft.DataFactory/factories/pipelines" 115 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-pipeline-mi", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "type": "Copy", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [ 17 | { 18 | "name": "Source", 19 | "value": "rawdata//" 20 | }, 21 | { 22 | "name": "Destination", 23 | "value": "defineddata//" 24 | } 25 | ], 26 | "typeProperties": { 27 | "source": { 28 | "type": "DelimitedTextSource", 29 | "storeSettings": { 30 | "type": "AzureBlobFSReadSettings", 31 | 
"recursive": true, 32 | "wildcardFileName": "*" 33 | }, 34 | "formatSettings": { 35 | "type": "DelimitedTextReadSettings", 36 | "skipLineCount": 0 37 | } 38 | }, 39 | "sink": { 40 | "type": "ParquetSink", 41 | "storeSettings": { 42 | "type": "AzureBlobFSWriteSettings" 43 | }, 44 | "formatSettings": { 45 | "type": "ParquetWriteSettings" 46 | } 47 | }, 48 | "enableStaging": false, 49 | "validateDataConsistency": false 50 | }, 51 | "inputs": [ 52 | { 53 | "referenceName": "csv_files_mi", 54 | "type": "DatasetReference" 55 | } 56 | ], 57 | "outputs": [ 58 | { 59 | "referenceName": "parquet_files_mi", 60 | "type": "DatasetReference" 61 | } 62 | ] 63 | }, 64 | { 65 | "name": "Create graph and write CosmosDB", 66 | "type": "DatabricksNotebook", 67 | "dependsOn": [ 68 | { 69 | "activity": "Copy csv to parquet", 70 | "dependencyConditions": [ 71 | "Succeeded" 72 | ] 73 | } 74 | ], 75 | "policy": { 76 | "timeout": "7.00:00:00", 77 | "retry": 0, 78 | "retryIntervalInSeconds": 30, 79 | "secureOutput": false, 80 | "secureInput": false 81 | }, 82 | "userProperties": [], 83 | "typeProperties": { 84 | "notebookPath": { 85 | "value": "@pipeline().globalParameters.notebook_name", 86 | "type": "Expression" 87 | }, 88 | "baseParameters": { 89 | "cosmosdb_name": { 90 | "value": "@pipeline().globalParameters.cosmosdb_name", 91 | "type": "Expression" 92 | }, 93 | "stor_name": { 94 | "value": "@pipeline().globalParameters.stor_name", 95 | "type": "Expression" 96 | } 97 | } 98 | }, 99 | "linkedServiceName": { 100 | "referenceName": "AzureDatabricks1", 101 | "type": "LinkedServiceReference", 102 | "parameters": { 103 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 104 | "cluster_id": "@pipeline().globalParameters.cluster_id", 105 | "vaultBaseUrl": "@pipeline().globalParameters.vaultBaseUrl" 106 | } 107 | } 108 | } 109 | ], 110 | "annotations": [], 111 | "lastPublishTime": "2020-10-20T10:52:16Z" 112 | }, 113 | "type": "Microsoft.DataFactory/factories/pipelines" 114 | } -------------------------------------------------------------------------------- /adfv2/publish_config.json: -------------------------------------------------------------------------------- 1 | {"publishBranch":"adf_publish","includeFactoryTemplate":true} -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | # Azure DevOps settings 3 | AzureServiceConnectionId: 'blog-datapipelineprod-servcon' 4 | # Change environment variables used in bash scripts with your own 5 | RG: 'blog-datapipelineprod-rg' 6 | SUB: '<>' 7 | AKV: 'blogdatapipelineakv123' # unique value 8 | STOR: 'blogdatapipelinestor123' # unique value 9 | COSMOSDBNAME: 'blog-datapipeline-cosmos123' #unique value 10 | DBRWORKSPACE: 'blog-datapipeline-dbr123' #unique value 11 | # fixed Environment variables, no need for unique values 12 | LOC: 'westeurope' 13 | SPN: 'blog-datapipeline-spn' 14 | ADFV2: 'blog-datapipeline-adfv2' 15 | VNET: 'blog-datapipeline-vnet' 16 | # global settings 17 | ACCESS_STOR_AADDBR: 0 18 | SECRETSCOPE_KEYVAULT: 0 # see script 4_configure_secret_scope_databricks.sh in key vault is used 19 | MOUNT_STORAGE_DATABRICKS: 0 # can only be used if access_store_AAD is true 20 | ENABLE_FIREWALL: 1 21 | 22 | trigger: 23 | - master 24 | 25 | pool: 26 | vmImage: 'ubuntu-latest' 27 | 28 | resources: 29 | repositories: 30 | - repository: blog-datapipeline-cicd # change with your own repo name when necessary 31 | type: 
git 32 | name: blog-datapipeline-devops 33 | ref: master 34 | - repository: blog-datapipeline-deployadfv2 # change with your own repo name when necessary 35 | type: git 36 | name: blog-datapipeline-devops 37 | ref: adf_publish 38 | 39 | steps: 40 | - checkout: blog-datapipeline-cicd 41 | path: blog-datapipeline-cicd 42 | - task: AzurePowerShell@4 43 | displayName: 'Create ADFv2 instance with MI' 44 | inputs: 45 | azureSubscription: $(AzureServiceConnectionId) 46 | ScriptType: InlineScript 47 | Inline: "Set-AzDataFactoryV2 -ResourceGroupName $(RG) -Location $(LOC) -Name $(ADFV2) -Force" 48 | azurePowerShellVersion: LatestVersion 49 | - task: AzureCLI@1 50 | displayName: 'Create resources' 51 | inputs: 52 | azureSubscription: $(AzureServiceConnectionId) 53 | scriptType: bash 54 | scriptPath: '../blog-datapipeline-cicd/scripts/1_create_resources.sh' 55 | - task: AzureCLI@1 56 | displayName: 'Configure Databricks' 57 | inputs: 58 | azureSubscription: $(AzureServiceConnectionId) 59 | scriptType: bash 60 | scriptPath: "../blog-datapipeline-cicd/scripts/2_configure_databricks.sh" 61 | - task: AzureCLI@1 62 | displayName: 'Configure access to storage account' 63 | inputs: 64 | azureSubscription: $(AzureServiceConnectionId) 65 | scriptType: bash 66 | scriptPath: '../blog-datapipeline-cicd/scripts/3_configure_access_storage_databricks.sh' 67 | - task: AzureCLI@1 68 | displayName: 'Configure Secret Scope Databricks' 69 | inputs: 70 | azureSubscription: $(AzureServiceConnectionId) 71 | scriptType: bash 72 | scriptPath: '../blog-datapipeline-cicd/scripts/4_configure_secret_scope_databricks.sh' 73 | - task: AzureCLI@1 74 | displayName: 'Configure Mounting to Databricks' 75 | inputs: 76 | azureSubscription: $(AzureServiceConnectionId) 77 | scriptType: bash 78 | scriptPath: '../blog-datapipeline-cicd/scripts/5_configure_mount_storage_databricks.sh' 79 | - task: AzureCLI@1 80 | displayName: 'Configure Firewall' 81 | inputs: 82 | azureSubscription: $(AzureServiceConnectionId) 83 | scriptType: bash 84 | scriptPath: '../blog-datapipeline-cicd/scripts/6_configure_firewall.sh' 85 | - checkout: blog-datapipeline-deployadfv2 86 | path: blog-datapipeline-deployadfv2 87 | - task: AzureResourceManagerTemplateDeployment@3 88 | displayName: 'Deploy ARM template ADFv2' 89 | inputs: 90 | azureResourceManagerConnection: $(AzureServiceConnectionId) 91 | subscriptionId: $(SUB) 92 | resourceGroupName: $(RG) 93 | location: $(LOC) 94 | csmFile: '../blog-datapipeline-deployadfv2/blog-datapipeline-adfv2/ARMTemplateForFactory.json' 95 | csmParametersFile: '../blog-datapipeline-deployadfv2/blog-datapipeline-adfv2/ARMTemplateParametersForFactory.json' 96 | overrideParameters: "-factoryName $(ADFV2) -dataFactory_properties_globalParameters_akv_url_value $(akv_url) -dataFactory_properties_globalParameters_stor_url_value $(stor_url) -dataFactory_properties_globalParameters_stor_name_value $(STOR) -dataFactory_properties_globalParameters_cosmosdb_name_value $(COSMOSDBNAME) -dataFactory_properties_globalParameters_dbr_resource_id_value $(dbr_resource_id) -dataFactory_properties_globalParameters_workspace_id_url_value $(workspace_id_url) -dataFactory_properties_globalParameters_cluster_id_value $(cluster_id) -dataFactory_properties_globalParameters_vaultBaseUrl_value $(akv_url) -dataFactory_location $(LOC) -dataFactory_properties_globalParameters_notebook_name_value /insert_data_CosmosDB_Gremlin.py" 97 | - task: AzurePowerShell@4 98 | displayName: 'Run ADFv2 pipeline - standard' 99 | inputs: 100 | azureSubscription: 
$(AzureServiceConnectionId) 101 | ScriptType: InlineScript 102 | Inline: "Invoke-AzDataFactoryV2Pipeline -ResourceGroupName $(RG) -DataFactoryName $(ADFV2) -PipelineName \"blog-datapipeline-pipeline-dbrmi\"" 103 | azurePowerShellVersion: LatestVersion -------------------------------------------------------------------------------- /cosmosdb_firewall_nopip_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "displayName": "Azure Cosmos DB accounts should have firewall rules and no public IPs assigned", 4 | "policyType": "Custom", 5 | "mode": "All", 6 | "description": "Firewall rules should be defined on your Azure Cosmos DB accounts to prevent traffic from unauthorized sources. Accounts that have the virtual network filter enabled and not public IP addressess assigned are deemed compliant. Accounts disabling public access are also deemed compliant.", 7 | "metadata": { 8 | "category": "Cosmos DB", 9 | "createdBy": "<>", 10 | "createdOn": "2021-03-14T14:28:01.3849493Z", 11 | "updatedBy": "<>", 12 | "updatedOn": "2021-03-14T14:45:32.3184974Z" 13 | }, 14 | "parameters": { 15 | "effect": { 16 | "type": "String", 17 | "metadata": { 18 | "displayName": "Policy Effect", 19 | "description": "The desired effect of the policy." 20 | }, 21 | "allowedValues": [ 22 | "Audit", 23 | "Deny", 24 | "Disabled" 25 | ], 26 | "defaultValue": "Deny" 27 | } 28 | }, 29 | "policyRule": { 30 | "if": { 31 | "allOf": [ 32 | { 33 | "field": "type", 34 | "equals": "Microsoft.DocumentDB/databaseAccounts" 35 | }, 36 | { 37 | "anyOf": [ 38 | { 39 | "field": "Microsoft.DocumentDB/databaseAccounts/publicNetworkAccess", 40 | "exists": "false" 41 | }, 42 | { 43 | "field": "Microsoft.DocumentDB/databaseAccounts/publicNetworkAccess", 44 | "equals": "Enabled" 45 | } 46 | ] 47 | }, 48 | { 49 | "anyOf": [ 50 | { 51 | "field": "Microsoft.DocumentDB/databaseAccounts/isVirtualNetworkFilterEnabled", 52 | "exists": "false" 53 | }, 54 | { 55 | "field": "Microsoft.DocumentDB/databaseAccounts/isVirtualNetworkFilterEnabled", 56 | "equals": "false" 57 | }, 58 | { 59 | "field": "Microsoft.DocumentDB/databaseAccounts/ipRules[*].ipAddressOrRange", 60 | "exists": "true" 61 | } 62 | ] 63 | } 64 | ] 65 | }, 66 | "then": { 67 | "effect": "[parameters('effect')]" 68 | } 69 | } 70 | }, 71 | "id": "/subscriptions/<>/providers/Microsoft.Authorization/policyDefinitions/f0a8662e-cd7a-4b91-b766-fd2ebf133bfa", 72 | "type": "Microsoft.Authorization/policyDefinitions", 73 | "name": "f0a8662e-cd7a-4b91-b766-fd2ebf133bfa" 74 | } 75 | -------------------------------------------------------------------------------- /data/dboPerson.txt: -------------------------------------------------------------------------------- 1 | id,name,age 2 | "a","Alice",34 3 | "c","Charlie",30 4 | "d","David",29 5 | "e","Esther",32 6 | "f","Fanny",36 7 | "g","Gab",60 8 | -------------------------------------------------------------------------------- /data/dboRelation.txt: -------------------------------------------------------------------------------- 1 | fromid,toid,relationtype 2 | "a","b","friend" 3 | "b","c","friend" 4 | "c","b","follow" 5 | "f","c","follow" 6 | "e","f","follow" 7 | "e","d","follow" 8 | "d","a","friend" 9 | "a","e","friend" 10 | -------------------------------------------------------------------------------- /libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar -------------------------------------------------------------------------------- /libraries/graphframes-0.8.1-spark3.0-s_2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/libraries/graphframes-0.8.1-spark3.0-s_2.12.jar -------------------------------------------------------------------------------- /notebooks/insert_data_CosmosDB_Gremlin.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md Azure Databricks notebooks by Rene Bremer 3 | # MAGIC 4 | # MAGIC Copyright (c) Microsoft Corporation. All rights reserved. 5 | # MAGIC 6 | # MAGIC Licensed under the MIT License. 7 | 8 | # COMMAND ---------- 9 | 10 | par_cosmosdb_name = dbutils.widgets.get("cosmosdb_name") 11 | par_stor_name = dbutils.widgets.get("stor_name") 12 | 13 | # COMMAND ---------- 14 | 15 | # Databricks notebook source 16 | # DBTITLE 1,Get parquet data from ADLSgen2 17 | from pyspark.sql.functions import * 18 | try: 19 | mnt_defineddata = dbutils.fs.ls('/mnt/defineddata') 20 | defineddata_mounted = 1 21 | except: 22 | defineddata_mounted = 0 23 | spn_id = dbutils.secrets.get(scope="dbrkeys",key="spn-id") 24 | 25 | if defineddata_mounted == 1: 26 | print ("try to get data from mounted storage") 27 | dfperson = spark.read.parquet("/mnt/defineddata/dboPerson.parquet").withColumn("entity", lit("person")) 28 | dfrelation = spark.read.parquet("/mnt/defineddata/dboRelation.parquet") 29 | elif spn_id != "": 30 | print ("try to get data from using spn") 31 | spark.conf.set("fs.azure.account.auth.type." + par_stor_name + ".dfs.core.windows.net", "OAuth") 32 | spark.conf.set("fs.azure.account.oauth.provider.type." + par_stor_name + ".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 33 | spark.conf.set("fs.azure.account.oauth2.client.id." + par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="spn-id")) 34 | spark.conf.set("fs.azure.account.oauth2.client.secret." + par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="spn-key")) 35 | spark.conf.set("fs.azure.account.oauth2.client.endpoint." + par_stor_name + ".dfs.core.windows.net", "https://login.microsoftonline.com/" + dbutils.secrets.get(scope="dbrkeys",key="tenant-id") + "/oauth2/token") 36 | dfperson = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboPerson.parquet").withColumn("entity", lit("person")) 37 | dfrelation = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboRelation.parquet") 38 | else: 39 | print ("try to get data from using storage access key, not recommended in production") 40 | spark.conf.set("fs.azure.account.key." 
+ par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="stor-key")) 41 | dfperson = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboPerson.parquet").withColumn("entity", lit("person")) 42 | dfrelation = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboRelation.parquet") 43 | 44 | columns_new = [col.replace("fromid", "src") for col in dfrelation.columns] 45 | dfrelation = dfrelation.toDF(*columns_new) 46 | 47 | columns_new = [col.replace("toid", "dst") for col in dfrelation.columns] 48 | dfrelation = dfrelation.toDF(*columns_new) 49 | 50 | # COMMAND ---------- 51 | 52 | from graphframes import GraphFrame 53 | g = GraphFrame(dfperson, dfrelation) 54 | 55 | # COMMAND ---------- 56 | 57 | from pyspark.sql.types import StringType 58 | from urllib.parse import quote 59 | 60 | def urlencode(value): 61 | return quote(value, safe="") 62 | 63 | udf_urlencode = udf(urlencode, StringType()) 64 | 65 | # COMMAND ---------- 66 | 67 | def to_cosmosdb_vertices(dfVertices, labelColumn, partitionKey = ""): 68 | dfVertices = dfVertices.withColumn("id", udf_urlencode("id")) 69 | dfVertices = dfVertices.withColumn("age", udf_urlencode("age")) 70 | dfVertices = dfVertices.withColumn("name", udf_urlencode("name")) 71 | 72 | columns = ["id", labelColumn] 73 | 74 | if partitionKey: 75 | columns.append(partitionKey) 76 | 77 | #columns.extend(['nvl2({x}, array(named_struct("id", uuid(), "_value", {x})), NULL) AS {x}'.format(x=x) \ 78 | # for x in dfVertices.columns if x not in columns]) 79 | 80 | return dfVertices.selectExpr(*columns).withColumnRenamed(labelColumn, "label") 81 | 82 | # COMMAND ---------- 83 | 84 | cosmosDbVertices = dfperson 85 | #display(dfperson) 86 | 87 | # COMMAND ---------- 88 | 89 | from pyspark.sql.functions import concat_ws, col 90 | 91 | def to_cosmosdb_edges(g, labelColumn, partitionKey = ""): 92 | dfEdges = g.edges 93 | 94 | if partitionKey: 95 | dfEdges = dfEdges.alias("e") \ 96 | .join(g.vertices.alias("sv"), col("e.src") == col("sv.id")) \ 97 | .join(g.vertices.alias("dv"), col("e.dst") == col("dv.id")) \ 98 | .selectExpr("e.*", "sv." + partitionKey, "dv." 
+ partitionKey + " AS _sinkPartition") 99 | 100 | dfEdges = dfEdges \ 101 | .withColumn("id", udf_urlencode(concat_ws("_", col("src"), col(labelColumn), col("dst")))) \ 102 | .withColumn("_isEdge", lit(True)) \ 103 | .withColumn("_vertexId", udf_urlencode("src")) \ 104 | .withColumn("_sink", udf_urlencode("dst")) \ 105 | .withColumnRenamed(labelColumn, "label") \ 106 | .drop("src", "dst") 107 | 108 | return dfEdges 109 | 110 | # COMMAND ---------- 111 | 112 | cosmosDbEdges = to_cosmosdb_edges(g, "relationtype") 113 | display(cosmosDbEdges) 114 | 115 | # COMMAND ---------- 116 | 117 | cfg = { 118 | "spark.cosmos.accountEndpoint" : "https://" + par_cosmosdb_name + ".documents.azure.com:443/", 119 | "spark.cosmos.accountKey" : dbutils.secrets.get(scope="dbrkeys",key="cosmosdb-key"), 120 | "spark.cosmos.database" : "peopledb", 121 | "spark.cosmos.container" : "peoplegraph" 122 | } 123 | 124 | cosmosDbFormat ="cosmos.oltp" 125 | 126 | cosmosDbVertices.write.format("cosmos.oltp").options(**cfg).mode("APPEND").save() 127 | cosmosDbEdges.write.format("cosmos.oltp").options(**cfg).mode("APPEND").save() 128 | 129 | #cosmosDbConfig = { 130 | # "Endpoint" : "https://" + par_cosmosdb_name + ".documents.azure.com:443/", 131 | # "Masterkey" : dbutils.secrets.get(scope="dbrkeys",key="cosmosdb-key"), 132 | # "Database" : "peopledb", 133 | # "Collection" : "peoplegraph", 134 | # "Upsert" : "true" 135 | #} 136 | 137 | #cosmosDbFormat = "com.microsoft.azure.cosmosdb.spark" 138 | 139 | #cosmosDbVertices.write.format(cosmosDbFormat).mode("append").options(**cosmosDbConfig).save() 140 | #cosmosDbEdges.write.format(cosmosDbFormat).mode("append").options(**cosmosDbConfig).save() 141 | 142 | 143 | # COMMAND ---------- 144 | 145 | 146 | -------------------------------------------------------------------------------- /notebooks/mount_ADLSgen2_rawdata.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md Azure Databricks notebooks by Rene Bremer 3 | # MAGIC 4 | # MAGIC Copyright (c) Microsoft Corporation. All rights reserved. 5 | # MAGIC 6 | # MAGIC Licensed under the MIT License. 7 | 8 | # COMMAND ---------- 9 | 10 | par_stor_name = dbutils.widgets.get("stor_name") 11 | 12 | # COMMAND ---------- 13 | 14 | # Databricks notebook source 15 | # "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="",key=""), 16 | 17 | configs = {"fs.azure.account.auth.type": "OAuth", 18 | "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", 19 | "fs.azure.account.oauth2.client.id": dbutils.secrets.get(scope="dbrkeys",key="spn-id"), 20 | "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="dbrkeys",key="spn-key"), 21 | "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/" + dbutils.secrets.get(scope="dbrkeys",key="tenant-id") + "/oauth2/token"} 22 | 23 | # Optionally, you can add to the source URI of your mount point. 
24 | dbutils.fs.mount( 25 | source = "abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/", 26 | mount_point = "/mnt/defineddata", 27 | extra_configs = configs) 28 | 29 | # COMMAND ---------- 30 | 31 | %sh 32 | ls -l /dbfs/mnt/defineddata 33 | 34 | # COMMAND ---------- -------------------------------------------------------------------------------- /pictures/2_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/2_architecture.png -------------------------------------------------------------------------------- /pictures/architecture_detailed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/architecture_detailed.png -------------------------------------------------------------------------------- /pictures/old_architecture_detailed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/old_architecture_detailed.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### Data Factory pipeline CI/CD project using DevOps, Databricks and Cosmos DB 2 | 3 | ![Architecture](https://github.com/rebremer/blog-datapipeline-cicd/blob/master/pictures/architecture_detailed.png) 4 | 5 | In this GitHub project, a data pipeline is created that does the following: 6 | 1. Set up an Azure DevOps CI/CD project 7 | 2. Deploy and configure Azure resources using Azure DevOps 8 | 3. 
Run Data Factory pipeline adding data to Cosmos DB Graph API using a Databricks notebook 9 | 10 | Details can be found in this blog: https://rebremer.medium.com/how-to-bring-your-modern-data-pipeline-to-production-2f14e42ac200 -------------------------------------------------------------------------------- /scripts/1_create_resources.sh: -------------------------------------------------------------------------------- 1 | # Resource group 2 | az group create -n $RG -l $LOC 3 | # Key vault 4 | az keyvault create -l $LOC -n $AKV -g $RG 5 | tenantId=$(az account show --query tenantId -o tsv) 6 | az keyvault secret set -n tenant-id --vault-name $AKV --value $tenantId 7 | # Storage account 8 | az storage account create -n $STOR -g $RG -l $LOC --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true 9 | az storage container create --account-name $STOR -n "rawdata" 10 | az storage container create --account-name $STOR -n "defineddata" 11 | az storage blob upload -f "../data/dboPerson.txt" -c "rawdata" -n "dboPerson.txt" --account-name $STOR 12 | az storage blob upload -f "../data/dboRelation.txt" -c "rawdata" -n "dboRelation.txt" --account-name $STOR 13 | # Databricks 14 | az extension add --name databricks 15 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 16 | if ["$dbr_response" = ""]; then 17 | vnetaddressrange="10.210.0.0" 18 | subnet1addressrange="10.210.0.0" 19 | subnet2addressrange="10.210.1.0" 20 | az network vnet create -g $RG -n $VNET --address-prefix $vnetaddressrange/16 -l $LOC 21 | az network nsg create -g $RG -n "public-subnet-nsg" 22 | az network nsg create -g $RG -n "private-subnet-nsg" 23 | az network vnet subnet create -g $RG --vnet-name $VNET -n "public-subnet" --address-prefixes $subnet1addressrange/24 --network-security-group "public-subnet-nsg" 24 | az network vnet subnet create -g $RG --vnet-name $VNET -n "private-subnet" --address-prefixes $subnet2addressrange/24 --network-security-group "private-subnet-nsg" 25 | az network vnet subnet update --resource-group $RG --name "public-subnet" --vnet-name $VNET --delegations Microsoft.Databricks/workspaces 26 | az network vnet subnet update --resource-group $RG --name "private-subnet" --vnet-name $VNET --delegations Microsoft.Databricks/workspaces 27 | dbr_response=$(az databricks workspace create -l $LOC -n $DBRWORKSPACE -g $RG --sku premium --vnet $VNET --public-subnet "public-subnet" --private-subnet "private-subnet") 28 | fi 29 | # Variables 30 | dbr_resource_id=$(jq .id -r <<< "$dbr_response") 31 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 32 | workspace_id_url="https://"$workspaceUrl_no_http"/" 33 | akv_url="https://"$AKV".vault.azure.net/" 34 | stor_url="https://"$STOR".dfs.core.windows.net/" 35 | echo "##vso[task.setvariable variable=dbr_resource_id]$dbr_resource_id" 36 | echo "##vso[task.setvariable variable=workspace_id_url]$workspace_id_url" 37 | echo "##vso[task.setvariable variable=akv_url]$akv_url" 38 | echo "##vso[task.setvariable variable=stor_url]$stor_url" 39 | # Cosmos DB graph API 40 | cosmosdbdatabase="peopledb" 41 | cosmosdbgraph="peoplegraph" 42 | az cosmosdb create -n $COSMOSDBNAME -g $RG --capabilities EnableGremlin 43 | az cosmosdb gremlin database create -a $COSMOSDBNAME -n $cosmosdbdatabase -g $RG 44 | az cosmosdb gremlin graph create -g $RG -a $COSMOSDBNAME -d $cosmosdbdatabase -n $cosmosdbgraph --partition-key-path "/name" 45 | cosmosdb_response=$(az cosmosdb keys list -n $COSMOSDBNAME -g $RG) 46 | cosmosdb_key=$(jq .primaryMasterKey -r <<< 
"$cosmosdb_response") 47 | az keyvault secret set -n cosmosdb-key --vault-name $AKV --value $cosmosdb_key 48 | # Datafactory 49 | az extension add --name datafactory 50 | api_response=$(az datafactory show -n $ADFV2 -g $RG) 51 | adfv2_id=$(jq .identity.principalId -r <<< "$api_response") 52 | az keyvault set-policy -n $AKV --secret-permissions set get list --object-id $adfv2_id 53 | # Assign RBAC rights ADFv2 MI on storage account. 54 | # Service connection SPN needs to have owner rights on account 55 | scope="/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.Storage/storageAccounts/$STOR" 56 | az role assignment create --assignee-object-id $adfv2_id --role "Storage Blob Data Contributor" --scope $scope 57 | # Assign RBAC rights ADFv2 MI on Databricks 58 | # Service connection SPN needs to have owner rights on account 59 | az role assignment create --assignee-object-id $adfv2_id --role "Contributor" --scope $dbr_resource_id 60 | -------------------------------------------------------------------------------- /scripts/2_configure_databricks.sh: -------------------------------------------------------------------------------- 1 | # 1a. Get tenantID and resource id 2 | tenantId=$(az account show --query tenantId -o tsv) 3 | wsId=$(az resource show \ 4 | --resource-type Microsoft.Databricks/workspaces \ 5 | -g "$RG" \ 6 | -n "$DBRWORKSPACE" \ 7 | --query id -o tsv) 8 | # 1b. Get two bearer tokens in Azure 9 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 10 | token=$(jq .accessToken -r <<< "$token_response") 11 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 12 | azToken=$(jq .accessToken -r <<< "$token_response") 13 | # 14 | # Databricks 15 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 16 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 17 | workspace_id_url="https://"$workspaceUrl_no_http"/" 18 | # 19 | # 2. Upload notebook to Databricks Workspace 20 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/workspace/import \ 21 | -H "Authorization: Bearer $token" \ 22 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 23 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 24 | -F path="/mount_ADLSgen2_rawdata.py" -F format=SOURCE -F language=PYTHON -F overwrite=true -F content=@../notebooks/mount_ADLSgen2_rawdata.py) 25 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/workspace/import \ 26 | -H "Authorization: Bearer $token" \ 27 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 28 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 29 | -F path="/insert_data_CosmosDB_Gremlin.py" -F format=SOURCE -F language=PYTHON -F overwrite=true -F content=@../notebooks/insert_data_CosmosDB_Gremlin.py) 30 | # 31 | # 3. 
Upload libraries to Databricks DBFS 32 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/dbfs/put \ 33 | -H "Authorization: Bearer $token" \ 34 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 35 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 36 | -F path="/azure-cosmos-spark_3-1_2-12-4.2.0.jar" -F contents=@../libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar -F overwrite=true) 37 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/dbfs/put \ 38 | -H "Authorization: Bearer $token" \ 39 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 40 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 41 | -F path="/graphframes-0.8.1-spark3.0-s_2.12.jar" -F contents=@../libraries/graphframes-0.8.1-spark3.0-s_2.12.jar -F overwrite=true) 42 | # 43 | # 4. Create Databricks cluster 44 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/clusters/create \ 45 | -H "Authorization: Bearer $token" \ 46 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 47 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 48 | -d "{\"cluster_name\": \"clusterPAT6\",\"spark_version\": \"6.6.x-scala2.11\",\"node_type_id\": \"Standard_D3_v2\", \"autotermination_minutes\":60, \"num_workers\" : 1}") 49 | cluster_id=$(jq .cluster_id -r <<< "$api_response") 50 | echo "##vso[task.setvariable variable=cluster_id]$cluster_id" 51 | sleep 1m 52 | # 53 | # 5. Add libraries to cluster 54 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/libraries/install \ 55 | -H "Authorization: Bearer $token" \ 56 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 57 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 58 | -d "{\"cluster_id\": \"$cluster_id\", \"libraries\": [{\"jar\": \"dbfs:/azure-cosmos-spark_3-1_2-12-4.2.0.jar\"},{\"jar\": \"dbfs:/graphframes-0.8.1-spark3.0-s_2.12.jar\"}]}") 59 | -------------------------------------------------------------------------------- /scripts/3_configure_access_storage_databricks.sh: -------------------------------------------------------------------------------- 1 | if [ $ACCESS_STOR_AADDBR = 0 ]; then 2 | echo "Usage access key of storage account to authenticate" 3 | key_response=$(az storage account keys list -g $RG -n $STOR) 4 | stor_key=$(jq .[0].value -r <<< "$key_response") 5 | az keyvault secret set -n stor-key --vault-name $AKV --value $stor_key 6 | else 7 | echo "Assigning Azure AD SPN to Databricks for authentication to storage account" 8 | spn_response=$(az ad sp create-for-rbac -n $SPN --skip-assignment) 9 | spn_id=$(jq .appId -r <<< "$spn_response") 10 | spn_key=$(jq .password -r <<< "$spn_response") 11 | # 12 | az keyvault secret set -n spn-id --vault-name $AKV --value $spn_id 13 | az keyvault secret set -n spn-key --vault-name $AKV --value $spn_key 14 | # 15 | spn_response=$(az ad sp show --id $spn_id) 16 | spn_object_id=$(jq .objectId -r <<< "$spn_response") 17 | scope="/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.Storage/storageAccounts/$STOR" 18 | az role assignment create --assignee-object-id $spn_object_id --role "Storage Blob Data Contributor" --scope $scope 19 | # 20 | # In case a Databricks secret scope is used and this script is run afterwards with elevated user rights, 21 | # also run script 4_configure_secret_scope_databricks.sh to add spn_id and spn_key to Databricks secret scope 22 | fi 23 | -------------------------------------------------------------------------------- /scripts/4_configure_secret_scope_databricks.sh: 
-------------------------------------------------------------------------------- 1 | # Databricks 2 | az extension add --name databricks 3 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 4 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 5 | workspace_id_url="https://"$workspaceUrl_no_http"/" 6 | # Get two bearer tokens in Azure 7 | tenantId=$(az account show --query tenantId -o tsv) 8 | wsId=$(az resource show \ 9 | --resource-type Microsoft.Databricks/workspaces \ 10 | -g "$RG" \ 11 | -n "$DBRWORKSPACE" \ 12 | --query id -o tsv) 13 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 14 | token=$(jq .accessToken -r <<< "$token_response") 15 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 16 | azToken=$(jq .accessToken -r <<< "$token_response") 17 | # 18 | if [ $SECRETSCOPE_KEYVAULT = 0 ]; then 19 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/scopes/create \ 20 | -H "Authorization: Bearer $token" \ 21 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 22 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 23 | -d "{\"scope\": \"dbrkeys\"}") 24 | # 2a2. Move keys from key vault to Databricks backed secret scope 25 | keyvault_response=$(az keyvault secret show -n spn-id --vault-name $AKV) 26 | spn_id=$(jq .value -r <<< "$keyvault_response") 27 | keyvault_response=$(az keyvault secret show -n spn-key --vault-name $AKV) 28 | spn_key=$(jq .value -r <<< "$keyvault_response") 29 | keyvault_response=$(az keyvault secret show -n stor-key --vault-name $AKV) 30 | stor_key=$(jq .value -r <<< "$keyvault_response") 31 | keyvault_response=$(az keyvault secret show -n cosmosdb-key --vault-name $AKV) 32 | cosmosdb_key=$(jq .value -r <<< "$keyvault_response") 33 | keyvault_response=$(az keyvault secret show -n tenant-id --vault-name $AKV) 34 | tenant_id=$(jq .value -r <<< "$keyvault_response") 35 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 36 | -H "Authorization: Bearer $token" \ 37 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 38 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 39 | -d "{\"scope\": \"dbrkeys\", \"key\": \"spn-id\", \"string_value\": \"$spn_id\"}") 40 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 41 | -H "Authorization: Bearer $token" \ 42 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 43 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 44 | -d "{\"scope\": \"dbrkeys\", \"key\": \"spn-key\", \"string_value\": \"$spn_key\"}") 45 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 46 | -H "Authorization: Bearer $token" \ 47 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 48 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 49 | -d "{\"scope\": \"dbrkeys\", \"key\": \"stor-key\", \"string_value\": \"$stor_key\"}") 50 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 51 | -H "Authorization: Bearer $token" \ 52 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 53 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 54 | -d "{\"scope\": \"dbrkeys\", \"key\": \"cosmosdb-key\", \"string_value\": \"$cosmosdb_key\"}") 55 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 56 | -H "Authorization: Bearer $token" \ 57 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 58 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 59 | -d "{\"scope\": 
\"dbrkeys\", \"key\": \"tenant-id\", \"string_value\": \"$tenant_id\"}") 60 | else 61 | #2b. Create secret sope backed by Azure Key Vault, only works with Azure AD token 62 | # 63 | # 20201213: This does not work from Azure DevOps SPN (only when) 64 | # See https://github.com/databricks/databricks-cli/issues/338 for workaround 65 | # 66 | echo "Create secret sope backed by Azure Key Vault" 67 | # 68 | akv_url="https://"$AKV".vault.azure.net/" 69 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/scopes/create \ 70 | -H "Authorization: Bearer $token" \ 71 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 72 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 73 | -d "{\"scope\": \"dbrkeys\", \"scope_backend_type\": \"AZURE_KEYVAULT\", \"backend_azure_keyvault\":{\"resource_id\": \"/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.KeyVault/vaults/$AKV\", \"dns_name\": \"$akv_url\"}}") 74 | error_code=$(jq .error_code -r <<< "$api_response") 75 | echo $error_code 76 | message=$(jq .message -r <<< "$api_response") 77 | echo $message 78 | fi -------------------------------------------------------------------------------- /scripts/5_configure_mount_storage_databricks.sh: -------------------------------------------------------------------------------- 1 | if [ $MOUNT_STORAGE_DATABRICKS = 0 ]; then 2 | echo "Storage shall not be mounted" 3 | exit 0 4 | fi 5 | # 6 | keyvault_response=$(az keyvault secret show -n spn-id --vault-name $AKV) 7 | spn_id=$(jq .value -r <<< "$keyvault_response") 8 | if ["$spn_id" = ""]; then 9 | 10 | echo "No spn present, storage cannot be mounted" 11 | exit 0 12 | fi 13 | # Databricks 14 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 15 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 16 | workspace_id_url="https://"$workspaceUrl_no_http"/" 17 | # 18 | # 1. Create job 19 | # Get two bearer tokens in Azure 20 | tenantId=$(az account show --query tenantId -o tsv) 21 | wsId=$(az resource show \ 22 | --resource-type Microsoft.Databricks/workspaces \ 23 | -g "$RG" \ 24 | -n "$DBRWORKSPACE" \ 25 | --query id -o tsv) 26 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 27 | token=$(jq .accessToken -r <<< "$token_response") 28 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 29 | azToken=$(jq .accessToken -r <<< "$token_response") 30 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/jobs/create \ 31 | -H "Authorization: Bearer $token" \ 32 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 33 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 34 | -d "{\"name\": \"mount storage\", \"existing_cluster_id\": \"$cluster_id\", \"notebook_task\": {\"notebook_path\": \"/mount_ADLSgen2_rawdata.py\", \"base_parameters\": [{\"key\":\"stor_name\", \"value\":\"$STOR\"}]}}") 35 | job_id=$(jq .job_id -r <<< "$api_response") 36 | # 37 | # 2. Run job to run notebook to mount storage 38 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/jobs/run-now \ 39 | -H "Authorization: Bearer $token" \ 40 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 41 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 42 | -d "{\"job_id\": $job_id}") 43 | run_id=$(jq .run_id -r <<< "$api_response") 44 | # 45 | # 3. 
Wait until the job is finished (mainly waiting for the cluster created in script 2 to start) 46 | i=0 47 | while [ $i -lt 10 ] 48 | do 49 | echo "Time waited for job to finish: $i minutes" 50 | ((i++)) 51 | api_response=$(curl -v -X GET ${workspace_id_url}api/2.0/jobs/runs/get\?run_id=$run_id \ 52 | -H "Authorization: Bearer $token" \ 53 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 54 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" 55 | ) 56 | state=$(jq .state.life_cycle_state -r <<< "$api_response") 57 | echo "job state: $state" 58 | if [[ "$state" == 'TERMINATED' || "$state" == 'SKIPPED' || "$state" == 'INTERNAL_ERROR' ]]; then 59 | break 60 | fi 61 | sleep 1m 62 | done 63 | -------------------------------------------------------------------------------- /scripts/6_configure_firewall.sh: -------------------------------------------------------------------------------- 1 | if [ $ENABLE_FIREWALL = 0 ]; then 2 | echo "Firewall will not be enabled" 3 | exit 0 4 | fi 5 | # 6 | # 0. Service endpoints 7 | az network vnet subnet update --resource-group $RG --vnet-name $VNET --name "public-subnet" --service-endpoints "Microsoft.KeyVault" "Microsoft.Storage" "Microsoft.AzureCosmosDB" 8 | cli_response=$(az network vnet subnet show --resource-group "$RG" --vnet-name "$VNET" --name "public-subnet") 9 | subnet_id=$(jq .id -r <<< "$cli_response") 10 | # 11 | # 1. Cosmos DB 12 | #az cosmosdb account update --resource-group "$RG" --name "$COSMOSDBNAME" --enable-public-network false 13 | # Databricks 14 | az cosmosdb update -g $RG -n $COSMOSDBNAME --enable-virtual-network true 15 | az cosmosdb network-rule add -g $RG -n $COSMOSDBNAME --subnet $subnet_id 16 | # 17 | # 2. Key vault 18 | az keyvault update --resource-group "$RG" --name "$AKV" --default-action Deny 19 | # Databricks 20 | az keyvault network-rule add -g "$RG" -n "$AKV" --subnet $subnet_id 21 | # Azure Data Factory 22 | az keyvault update --resource-group "$RG" --name "$AKV" --bypass AzureServices 23 | # 24 | # 3. Storage account 25 | az storage account update --resource-group "$RG" --name "$STOR" --default-action Deny 26 | # Databricks 27 | az storage account network-rule add -g $RG --account-name $STOR --subnet $subnet_id 28 | # Azure Data Factory 29 | az storage account update -g "$RG" -n "$STOR" --bypass AzureServices 30 | # 31 | sleep 1m # avoid race conditions -------------------------------------------------------------------------------- /scripts/99_databricks_sqlapi.sh: -------------------------------------------------------------------------------- 1 | RG="<>" 2 | DBRWORKSPACE="<>" 3 | # 1a. Get tenantID and resource id 4 | tenantId=$(az account show --query tenantId -o tsv) 5 | wsId=$(az resource show \ 6 | --resource-type Microsoft.Databricks/workspaces \ 7 | -g "$RG" \ 8 | -n "$DBRWORKSPACE" \ 9 | --query id -o tsv) 10 | # 1b. Get two bearer tokens in Azure 11 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 12 | token=$(jq .accessToken -r <<< "$token_response") 13 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 14 | azToken=$(jq .accessToken -r <<< "$token_response") 15 | # 16 | # 1c. Databricks variables 17 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 18 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 19 | workspace_id_url="https://"$workspaceUrl_no_http"/" 20 | # 21 | # 2. 
Use SQL Warehouses API 2.0, create a SQL warehouse 22 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/sql/warehouses/ \ 23 | -H "Authorization: Bearer $token" \ 24 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 25 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 26 | -d "{\"name\": \"testAzureADSQL\",\"cluster_size\": \"Small\",\"min_num_clusters\": 1,\"max_num_clusters\": 10, \"enable_photon\": \"true\"}") 27 | warehouse_id=$(jq .id -r <<< "$api_response") 28 | # 29 | # 3. Use Queries and Dashboards API, list queries 30 | api_response=$(curl -v -X GET ${workspace_id_url}api/2.0/preview/sql/queries \ 31 | -H "Authorization: Bearer $token" \ 32 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 33 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId") --------------------------------------------------------------------------------
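Note: azure-pipelines.yml triggers blog-datapipeline-pipeline-dbrmi with Invoke-AzDataFactoryV2Pipeline but does not wait for the run to finish. A minimal sketch of starting and polling the run from bash instead, analogous to the job polling in scripts/5_configure_mount_storage_databricks.sh (assuming the az datafactory extension's pipeline create-run and pipeline-run show commands, with RG and ADFV2 set as in azure-pipelines.yml):

# sketch, assumptions above: start the Data Factory pipeline run and capture its run id
az extension add --name datafactory
run_response=$(az datafactory pipeline create-run -g $RG --factory-name $ADFV2 --name "blog-datapipeline-pipeline-dbrmi")
run_id=$(jq .runId -r <<< "$run_response")
# poll the run status for at most 10 minutes
i=0
while [ $i -lt 10 ]
do
  ((i++))
  status=$(az datafactory pipeline-run show -g $RG --factory-name $ADFV2 --run-id $run_id --query status -o tsv)
  echo "pipeline run status: $status"
  if [[ "$status" == 'Succeeded' || "$status" == 'Failed' || "$status" == 'Cancelled' ]]; then
    break
  fi
  sleep 1m
done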