├── .gitignore ├── adfv2 ├── dataset │ ├── csv_files_key.json │ ├── csv_files_mi.json │ ├── parquet_files_key.json │ └── parquet_files_mi.json ├── factory │ ├── blog-datapipeline-adfv2.json │ ├── blog-datapipeline-adfv2dev.json │ └── blog-datapipeline-dev.json ├── linkedService │ ├── AzureDataLakeStorage1.json │ ├── AzureDataLakeStorage2KEY.json │ ├── AzureDatabricks1.json │ ├── AzureDatabricks_mi.json │ └── AzureKeyVault.json ├── pipeline │ ├── blog-datapipeline-pipeline-dbrmi.json │ ├── blog-datapipeline-pipeline-key.json │ └── blog-datapipeline-pipeline-mi.json └── publish_config.json ├── azure-pipelines.yml ├── cosmosdb_firewall_nopip_policy.json ├── data ├── dboPerson.txt └── dboRelation.txt ├── libraries ├── azure-cosmos-spark_3-1_2-12-4.2.0.jar └── graphframes-0.8.1-spark3.0-s_2.12.jar ├── notebooks ├── insert_data_CosmosDB_Gremlin.py └── mount_ADLSgen2_rawdata.py ├── pictures ├── 2_architecture.png ├── architecture_detailed.png └── old_architecture_detailed.png ├── readme.md └── scripts ├── 1_create_resources.sh ├── 2_configure_databricks.sh ├── 3_configure_access_storage_databricks.sh ├── 4_configure_secret_scope_databricks.sh ├── 5_configure_mount_storage_databricks.sh ├── 6_configure_firewall.sh └── 99_databricks_sqlapi.sh /.gitignore: -------------------------------------------------------------------------------- 1 | /oud -------------------------------------------------------------------------------- /adfv2/dataset/csv_files_key.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv_files_key", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage2KEY", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url", 9 | "akv_url": "@pipeline().globalParameters.akv_url" 10 | } 11 | }, 12 | "annotations": [], 13 | "type": "DelimitedText", 14 | "typeProperties": { 15 | "location": { 16 | "type": "AzureBlobFSLocation", 17 | "fileSystem": "rawdata" 18 | }, 19 | "columnDelimiter": ",", 20 | "escapeChar": "\\", 21 | "firstRowAsHeader": true, 22 | "quoteChar": "\"" 23 | }, 24 | "schema": [ 25 | { 26 | "name": "id", 27 | "type": "String" 28 | }, 29 | { 30 | "name": "name", 31 | "type": "String" 32 | }, 33 | { 34 | "name": "age", 35 | "type": "String" 36 | } 37 | ] 38 | }, 39 | "type": "Microsoft.DataFactory/factories/datasets" 40 | } -------------------------------------------------------------------------------- /adfv2/dataset/csv_files_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv_files_mi", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage1", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url" 9 | } 10 | }, 11 | "annotations": [], 12 | "type": "DelimitedText", 13 | "typeProperties": { 14 | "location": { 15 | "type": "AzureBlobFSLocation", 16 | "fileSystem": "rawdata" 17 | }, 18 | "columnDelimiter": ",", 19 | "escapeChar": "\\", 20 | "firstRowAsHeader": true, 21 | "quoteChar": "\"" 22 | }, 23 | "schema": [ 24 | { 25 | "name": "id", 26 | "type": "String" 27 | }, 28 | { 29 | "name": "name", 30 | "type": "String" 31 | }, 32 | { 33 | "name": "age", 34 | "type": "String" 35 | } 36 | ] 37 | }, 38 | "type": "Microsoft.DataFactory/factories/datasets" 39 | } -------------------------------------------------------------------------------- /adfv2/dataset/parquet_files_key.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "parquet_files_key", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage2KEY", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url", 9 | "akv_url": "@pipeline().globalParameters.akv_url" 10 | } 11 | }, 12 | "annotations": [], 13 | "type": "Parquet", 14 | "typeProperties": { 15 | "location": { 16 | "type": "AzureBlobFSLocation", 17 | "fileSystem": "defineddata" 18 | }, 19 | "compressionCodec": "snappy" 20 | }, 21 | "schema": [] 22 | }, 23 | "type": "Microsoft.DataFactory/factories/datasets" 24 | } -------------------------------------------------------------------------------- /adfv2/dataset/parquet_files_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "parquet_files_mi", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "AzureDataLakeStorage1", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "stor_url": "@pipeline().globalParameters.stor_url" 9 | } 10 | }, 11 | "annotations": [], 12 | "type": "Parquet", 13 | "typeProperties": { 14 | "location": { 15 | "type": "AzureBlobFSLocation", 16 | "fileSystem": "defineddata" 17 | }, 18 | "compressionCodec": "snappy" 19 | }, 20 | "schema": [] 21 | }, 22 | "type": "Microsoft.DataFactory/factories/datasets" 23 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-adfv2.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-adfv2", 3 | "location": "westeurope" 4 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-adfv2dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-adfv2dev", 3 | "properties": { 4 | "globalParameters": { 5 | "stor_url": { 6 | "type": "string", 7 | "value": "1" 8 | }, 9 | "stor_name": { 10 | "type": "string", 11 | "value": "2" 12 | }, 13 | "akv_url": { 14 | "type": "string", 15 | "value": "3" 16 | }, 17 | "cluster_id": { 18 | "type": "string", 19 | "value": "4" 20 | }, 21 | "notebook_name": { 22 | "type": "string", 23 | "value": "5" 24 | }, 25 | "workspace_id_url": { 26 | "type": "string", 27 | "value": "6" 28 | }, 29 | "cosmosdb_name": { 30 | "type": "string", 31 | "value": "7" 32 | }, 33 | "vaultBaseUrl": { 34 | "type": "string", 35 | "value": "8" 36 | } 37 | } 38 | }, 39 | "location": "westeurope" 40 | } -------------------------------------------------------------------------------- /adfv2/factory/blog-datapipeline-dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-dev", 3 | "properties": { 4 | "globalParameters": { 5 | "notebook_name": { 6 | "type": "string", 7 | "value": "1" 8 | }, 9 | "stor_name": { 10 | "type": "string", 11 | "value": "2" 12 | }, 13 | "stor_url": { 14 | "type": "string", 15 | "value": "3" 16 | }, 17 | "cluster_id": { 18 | "type": "string", 19 | "value": "4" 20 | }, 21 | "akv_url": { 22 | "type": "string", 23 | "value": "5" 24 | }, 25 | "vaultBaseUrl": { 26 | "type": "string", 27 | "value": "6" 28 | }, 29 | "workspace_id_url": { 30 | "type": "string", 31 | "value": "7" 32 | }, 33 | "cosmosdb_name": { 34 | "type": "string", 35 | "value": "8" 36 | } 37 | } 38 | }, 39 | "location": 
"westeurope" 40 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDataLakeStorage1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDataLakeStorage1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureBlobFS", 6 | "annotations": [], 7 | "typeProperties": { 8 | "url": "@{linkedService().stor_url}" 9 | }, 10 | "parameters": { 11 | "stor_url": { 12 | "type": "String" 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDataLakeStorage2KEY.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDataLakeStorage2KEY", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureBlobFS", 6 | "annotations": [], 7 | "typeProperties": { 8 | "url": "@{linkedService().stor_url}", 9 | "accountKey": { 10 | "type": "AzureKeyVaultSecret", 11 | "store": { 12 | "referenceName": "AzureKeyVault", 13 | "type": "LinkedServiceReference", 14 | "parameters": { 15 | "akv_url": { 16 | "value": "@linkedService().akv_url", 17 | "type": "Expression" 18 | } 19 | } 20 | }, 21 | "secretName": "stor-key" 22 | } 23 | }, 24 | "parameters": { 25 | "stor_url": { 26 | "type": "String" 27 | }, 28 | "akv_url": { 29 | "type": "String" 30 | } 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDatabricks1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks1", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "parameters": { 6 | "workspace_id_url": { 7 | "type": "String" 8 | }, 9 | "cluster_id": { 10 | "type": "String" 11 | }, 12 | "vaultBaseUrl": { 13 | "type": "string", 14 | "defaultValue": "https://blogdatapipelineprodakv2.vault.azure.net/" 15 | } 16 | }, 17 | "annotations": [], 18 | "type": "AzureDatabricks", 19 | "typeProperties": { 20 | "domain": "@{linkedService().workspace_id_url}", 21 | "accessToken": { 22 | "type": "AzureKeyVaultSecret", 23 | "store": { 24 | "referenceName": "AzureKeyVault", 25 | "type": "LinkedServiceReference", 26 | "parameters": { 27 | "akv_url": { 28 | "value": "@linkedService().vaultBaseUrl", 29 | "type": "Expression" 30 | } 31 | } 32 | }, 33 | "secretName": "pattoken" 34 | }, 35 | "existingClusterId": "@{linkedService().cluster_id}" 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /adfv2/linkedService/AzureDatabricks_mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureDatabricks_mi", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "parameters": { 6 | "workspace_id_url": { 7 | "type": "String" 8 | }, 9 | "cluster_id": { 10 | "type": "String" 11 | }, 12 | "dbr_resource_id": { 13 | "type": "String" 14 | } 15 | }, 16 | "annotations": [], 17 | "type": "AzureDatabricks", 18 | "typeProperties": { 19 | "domain": "@{linkedService().workspace_id_url}", 20 | "authentication": "MSI", 21 | "workspaceResourceId": "@{linkedService().dbr_resource_id}", 22 | "existingClusterId": "@{linkedService().cluster_id}" 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- 
/adfv2/linkedService/AzureKeyVault.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AzureKeyVault", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "type": "AzureKeyVault", 6 | "annotations": [], 7 | "typeProperties": { 8 | "baseUrl": "@{linkedService().akv_url}" 9 | }, 10 | "parameters": { 11 | "akv_url": { 12 | "type": "String" 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-dbrmi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-pipeline-dbrmi", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "type": "Copy", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [ 17 | { 18 | "name": "Source", 19 | "value": "rawdata//" 20 | }, 21 | { 22 | "name": "Destination", 23 | "value": "defineddata//" 24 | } 25 | ], 26 | "typeProperties": { 27 | "source": { 28 | "type": "DelimitedTextSource", 29 | "storeSettings": { 30 | "type": "AzureBlobFSReadSettings", 31 | "recursive": true, 32 | "wildcardFileName": "*" 33 | }, 34 | "formatSettings": { 35 | "type": "DelimitedTextReadSettings", 36 | "skipLineCount": 0 37 | } 38 | }, 39 | "sink": { 40 | "type": "ParquetSink", 41 | "storeSettings": { 42 | "type": "AzureBlobFSWriteSettings" 43 | }, 44 | "formatSettings": { 45 | "type": "ParquetWriteSettings" 46 | } 47 | }, 48 | "enableStaging": false, 49 | "validateDataConsistency": false 50 | }, 51 | "inputs": [ 52 | { 53 | "referenceName": "csv_files_mi", 54 | "type": "DatasetReference" 55 | } 56 | ], 57 | "outputs": [ 58 | { 59 | "referenceName": "parquet_files_mi", 60 | "type": "DatasetReference" 61 | } 62 | ] 63 | }, 64 | { 65 | "name": "Create graph and write CosmosDB", 66 | "type": "DatabricksNotebook", 67 | "dependsOn": [ 68 | { 69 | "activity": "Copy csv to parquet", 70 | "dependencyConditions": [ 71 | "Succeeded" 72 | ] 73 | } 74 | ], 75 | "policy": { 76 | "timeout": "7.00:00:00", 77 | "retry": 0, 78 | "retryIntervalInSeconds": 30, 79 | "secureOutput": false, 80 | "secureInput": false 81 | }, 82 | "userProperties": [], 83 | "typeProperties": { 84 | "notebookPath": { 85 | "value": "@pipeline().globalParameters.notebook_name", 86 | "type": "Expression" 87 | }, 88 | "baseParameters": { 89 | "cosmosdb_name": { 90 | "value": "@pipeline().globalParameters.cosmosdb_name", 91 | "type": "Expression" 92 | }, 93 | "stor_name": { 94 | "value": "@pipeline().globalParameters.stor_name", 95 | "type": "Expression" 96 | } 97 | } 98 | }, 99 | "linkedServiceName": { 100 | "referenceName": "AzureDatabricks_mi", 101 | "type": "LinkedServiceReference", 102 | "parameters": { 103 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 104 | "cluster_id": "@pipeline().globalParameters.cluster_id", 105 | "dbr_resource_id": "@pipeline().globalParameters.dbr_resource_id" 106 | } 107 | } 108 | } 109 | ], 110 | "annotations": [], 111 | "lastPublishTime": "2020-10-20T10:52:16Z" 112 | }, 113 | "type": "Microsoft.DataFactory/factories/pipelines" 114 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-key.json: -------------------------------------------------------------------------------- 1 | { 2 
| "name": "blog-datapipeline-pipeline-key", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "description": "Copy csv to parquet", 8 | "type": "Copy", 9 | "dependsOn": [], 10 | "policy": { 11 | "timeout": "7.00:00:00", 12 | "retry": 0, 13 | "retryIntervalInSeconds": 30, 14 | "secureOutput": false, 15 | "secureInput": false 16 | }, 17 | "userProperties": [ 18 | { 19 | "name": "Source", 20 | "value": "rawdata//" 21 | }, 22 | { 23 | "name": "Destination", 24 | "value": "defineddata//" 25 | } 26 | ], 27 | "typeProperties": { 28 | "source": { 29 | "type": "DelimitedTextSource", 30 | "storeSettings": { 31 | "type": "AzureBlobFSReadSettings", 32 | "recursive": true, 33 | "wildcardFileName": "*" 34 | }, 35 | "formatSettings": { 36 | "type": "DelimitedTextReadSettings", 37 | "skipLineCount": 0 38 | } 39 | }, 40 | "sink": { 41 | "type": "ParquetSink", 42 | "storeSettings": { 43 | "type": "AzureBlobFSWriteSettings" 44 | }, 45 | "formatSettings": { 46 | "type": "ParquetWriteSettings" 47 | } 48 | }, 49 | "enableStaging": false, 50 | "validateDataConsistency": false 51 | }, 52 | "inputs": [ 53 | { 54 | "referenceName": "csv_files_key", 55 | "type": "DatasetReference" 56 | } 57 | ], 58 | "outputs": [ 59 | { 60 | "referenceName": "parquet_files_key", 61 | "type": "DatasetReference" 62 | } 63 | ] 64 | }, 65 | { 66 | "name": "Create graph and write CosmosDB", 67 | "type": "DatabricksNotebook", 68 | "dependsOn": [ 69 | { 70 | "activity": "Copy csv to parquet", 71 | "dependencyConditions": [ 72 | "Succeeded" 73 | ] 74 | } 75 | ], 76 | "policy": { 77 | "timeout": "7.00:00:00", 78 | "retry": 0, 79 | "retryIntervalInSeconds": 30, 80 | "secureOutput": false, 81 | "secureInput": false 82 | }, 83 | "userProperties": [], 84 | "typeProperties": { 85 | "notebookPath": { 86 | "value": "@pipeline().globalParameters.notebook_name", 87 | "type": "Expression" 88 | }, 89 | "baseParameters": { 90 | "cosmosdb_name": { 91 | "value": "@pipeline().globalParameters.cosmosdb_name", 92 | "type": "Expression" 93 | }, 94 | "stor_name": { 95 | "value": "@pipeline().globalParameters.stor_name", 96 | "type": "Expression" 97 | } 98 | } 99 | }, 100 | "linkedServiceName": { 101 | "referenceName": "AzureDatabricks1", 102 | "type": "LinkedServiceReference", 103 | "parameters": { 104 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 105 | "cluster_id": "@pipeline().globalParameters.cluster_id", 106 | "vaultBaseUrl": "@pipeline().globalParameters.vaultBaseUrl" 107 | } 108 | } 109 | } 110 | ], 111 | "annotations": [], 112 | "lastPublishTime": "2020-10-20T10:52:17Z" 113 | }, 114 | "type": "Microsoft.DataFactory/factories/pipelines" 115 | } -------------------------------------------------------------------------------- /adfv2/pipeline/blog-datapipeline-pipeline-mi.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blog-datapipeline-pipeline-mi", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy csv to parquet", 7 | "type": "Copy", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [ 17 | { 18 | "name": "Source", 19 | "value": "rawdata//" 20 | }, 21 | { 22 | "name": "Destination", 23 | "value": "defineddata//" 24 | } 25 | ], 26 | "typeProperties": { 27 | "source": { 28 | "type": "DelimitedTextSource", 29 | "storeSettings": { 30 | "type": "AzureBlobFSReadSettings", 31 | 
"recursive": true, 32 | "wildcardFileName": "*" 33 | }, 34 | "formatSettings": { 35 | "type": "DelimitedTextReadSettings", 36 | "skipLineCount": 0 37 | } 38 | }, 39 | "sink": { 40 | "type": "ParquetSink", 41 | "storeSettings": { 42 | "type": "AzureBlobFSWriteSettings" 43 | }, 44 | "formatSettings": { 45 | "type": "ParquetWriteSettings" 46 | } 47 | }, 48 | "enableStaging": false, 49 | "validateDataConsistency": false 50 | }, 51 | "inputs": [ 52 | { 53 | "referenceName": "csv_files_mi", 54 | "type": "DatasetReference" 55 | } 56 | ], 57 | "outputs": [ 58 | { 59 | "referenceName": "parquet_files_mi", 60 | "type": "DatasetReference" 61 | } 62 | ] 63 | }, 64 | { 65 | "name": "Create graph and write CosmosDB", 66 | "type": "DatabricksNotebook", 67 | "dependsOn": [ 68 | { 69 | "activity": "Copy csv to parquet", 70 | "dependencyConditions": [ 71 | "Succeeded" 72 | ] 73 | } 74 | ], 75 | "policy": { 76 | "timeout": "7.00:00:00", 77 | "retry": 0, 78 | "retryIntervalInSeconds": 30, 79 | "secureOutput": false, 80 | "secureInput": false 81 | }, 82 | "userProperties": [], 83 | "typeProperties": { 84 | "notebookPath": { 85 | "value": "@pipeline().globalParameters.notebook_name", 86 | "type": "Expression" 87 | }, 88 | "baseParameters": { 89 | "cosmosdb_name": { 90 | "value": "@pipeline().globalParameters.cosmosdb_name", 91 | "type": "Expression" 92 | }, 93 | "stor_name": { 94 | "value": "@pipeline().globalParameters.stor_name", 95 | "type": "Expression" 96 | } 97 | } 98 | }, 99 | "linkedServiceName": { 100 | "referenceName": "AzureDatabricks1", 101 | "type": "LinkedServiceReference", 102 | "parameters": { 103 | "workspace_id_url": "@pipeline().globalParameters.workspace_id_url", 104 | "cluster_id": "@pipeline().globalParameters.cluster_id", 105 | "vaultBaseUrl": "@pipeline().globalParameters.vaultBaseUrl" 106 | } 107 | } 108 | } 109 | ], 110 | "annotations": [], 111 | "lastPublishTime": "2020-10-20T10:52:16Z" 112 | }, 113 | "type": "Microsoft.DataFactory/factories/pipelines" 114 | } -------------------------------------------------------------------------------- /adfv2/publish_config.json: -------------------------------------------------------------------------------- 1 | {"publishBranch":"adf_publish","includeFactoryTemplate":true} -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | # Azure DevOps settings 3 | AzureServiceConnectionId: 'blog-datapipelineprod-servcon' 4 | # Change environment variables used in bash scripts with your own 5 | RG: 'blog-datapipelineprod-rg' 6 | SUB: '<>' 7 | AKV: 'blogdatapipelineakv123' # unique value 8 | STOR: 'blogdatapipelinestor123' # unique value 9 | COSMOSDBNAME: 'blog-datapipeline-cosmos123' #unique value 10 | DBRWORKSPACE: 'blog-datapipeline-dbr123' #unique value 11 | # fixed Environment variables, no need for unique values 12 | LOC: 'westeurope' 13 | SPN: 'blog-datapipeline-spn' 14 | ADFV2: 'blog-datapipeline-adfv2' 15 | VNET: 'blog-datapipeline-vnet' 16 | # global settings 17 | ACCESS_STOR_AADDBR: 0 18 | SECRETSCOPE_KEYVAULT: 0 # see script 4_configure_secret_scope_databricks.sh in key vault is used 19 | MOUNT_STORAGE_DATABRICKS: 0 # can only be used if access_store_AAD is true 20 | ENABLE_FIREWALL: 1 21 | 22 | trigger: 23 | - master 24 | 25 | pool: 26 | vmImage: 'ubuntu-latest' 27 | 28 | resources: 29 | repositories: 30 | - repository: blog-datapipeline-cicd # change with your own repo name when necessary 31 | type: 
git 32 | name: blog-datapipeline-devops 33 | ref: master 34 | - repository: blog-datapipeline-deployadfv2 # change with your own repo name when necessary 35 | type: git 36 | name: blog-datapipeline-devops 37 | ref: adf_publish 38 | 39 | steps: 40 | - checkout: blog-datapipeline-cicd 41 | path: blog-datapipeline-cicd 42 | - task: AzurePowerShell@4 43 | displayName: 'Create ADFv2 instance with MI' 44 | inputs: 45 | azureSubscription: $(AzureServiceConnectionId) 46 | ScriptType: InlineScript 47 | Inline: "Set-AzDataFactoryV2 -ResourceGroupName $(RG) -Location $(LOC) -Name $(ADFV2) -Force" 48 | azurePowerShellVersion: LatestVersion 49 | - task: AzureCLI@1 50 | displayName: 'Create resources' 51 | inputs: 52 | azureSubscription: $(AzureServiceConnectionId) 53 | scriptType: bash 54 | scriptPath: '../blog-datapipeline-cicd/scripts/1_create_resources.sh' 55 | - task: AzureCLI@1 56 | displayName: 'Configure Databricks' 57 | inputs: 58 | azureSubscription: $(AzureServiceConnectionId) 59 | scriptType: bash 60 | scriptPath: "../blog-datapipeline-cicd/scripts/2_configure_databricks.sh" 61 | - task: AzureCLI@1 62 | displayName: 'Configure access to storage account' 63 | inputs: 64 | azureSubscription: $(AzureServiceConnectionId) 65 | scriptType: bash 66 | scriptPath: '../blog-datapipeline-cicd/scripts/3_configure_access_storage_databricks.sh' 67 | - task: AzureCLI@1 68 | displayName: 'Configure Secret Scope Databricks' 69 | inputs: 70 | azureSubscription: $(AzureServiceConnectionId) 71 | scriptType: bash 72 | scriptPath: '../blog-datapipeline-cicd/scripts/4_configure_secret_scope_databricks.sh' 73 | - task: AzureCLI@1 74 | displayName: 'Configure Mounting to Databricks' 75 | inputs: 76 | azureSubscription: $(AzureServiceConnectionId) 77 | scriptType: bash 78 | scriptPath: '../blog-datapipeline-cicd/scripts/5_configure_mount_storage_databricks.sh' 79 | - task: AzureCLI@1 80 | displayName: 'Configure Firewall' 81 | inputs: 82 | azureSubscription: $(AzureServiceConnectionId) 83 | scriptType: bash 84 | scriptPath: '../blog-datapipeline-cicd/scripts/6_configure_firewall.sh' 85 | - checkout: blog-datapipeline-deployadfv2 86 | path: blog-datapipeline-deployadfv2 87 | - task: AzureResourceManagerTemplateDeployment@3 88 | displayName: 'Deploy ARM template ADFv2' 89 | inputs: 90 | azureResourceManagerConnection: $(AzureServiceConnectionId) 91 | subscriptionId: $(SUB) 92 | resourceGroupName: $(RG) 93 | location: $(LOC) 94 | csmFile: '../blog-datapipeline-deployadfv2/blog-datapipeline-adfv2/ARMTemplateForFactory.json' 95 | csmParametersFile: '../blog-datapipeline-deployadfv2/blog-datapipeline-adfv2/ARMTemplateParametersForFactory.json' 96 | overrideParameters: "-factoryName $(ADFV2) -dataFactory_properties_globalParameters_akv_url_value $(akv_url) -dataFactory_properties_globalParameters_stor_url_value $(stor_url) -dataFactory_properties_globalParameters_stor_name_value $(STOR) -dataFactory_properties_globalParameters_cosmosdb_name_value $(COSMOSDBNAME) -dataFactory_properties_globalParameters_dbr_resource_id_value $(dbr_resource_id) -dataFactory_properties_globalParameters_workspace_id_url_value $(workspace_id_url) -dataFactory_properties_globalParameters_cluster_id_value $(cluster_id) -dataFactory_properties_globalParameters_vaultBaseUrl_value $(akv_url) -dataFactory_location $(LOC) -dataFactory_properties_globalParameters_notebook_name_value /insert_data_CosmosDB_Gremlin.py" 97 | - task: AzurePowerShell@4 98 | displayName: 'Run ADFv2 pipeline - standard' 99 | inputs: 100 | azureSubscription: 
$(AzureServiceConnectionId) 101 | ScriptType: InlineScript 102 | Inline: "Invoke-AzDataFactoryV2Pipeline -ResourceGroupName $(RG) -DataFactoryName $(ADFV2) -PipelineName \"blog-datapipeline-pipeline-dbrmi\"" 103 | azurePowerShellVersion: LatestVersion -------------------------------------------------------------------------------- /cosmosdb_firewall_nopip_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "displayName": "Azure Cosmos DB accounts should have firewall rules and no public IPs assigned", 4 | "policyType": "Custom", 5 | "mode": "All", 6 | "description": "Firewall rules should be defined on your Azure Cosmos DB accounts to prevent traffic from unauthorized sources. Accounts that have the virtual network filter enabled and not public IP addressess assigned are deemed compliant. Accounts disabling public access are also deemed compliant.", 7 | "metadata": { 8 | "category": "Cosmos DB", 9 | "createdBy": "<>", 10 | "createdOn": "2021-03-14T14:28:01.3849493Z", 11 | "updatedBy": "<>", 12 | "updatedOn": "2021-03-14T14:45:32.3184974Z" 13 | }, 14 | "parameters": { 15 | "effect": { 16 | "type": "String", 17 | "metadata": { 18 | "displayName": "Policy Effect", 19 | "description": "The desired effect of the policy." 20 | }, 21 | "allowedValues": [ 22 | "Audit", 23 | "Deny", 24 | "Disabled" 25 | ], 26 | "defaultValue": "Deny" 27 | } 28 | }, 29 | "policyRule": { 30 | "if": { 31 | "allOf": [ 32 | { 33 | "field": "type", 34 | "equals": "Microsoft.DocumentDB/databaseAccounts" 35 | }, 36 | { 37 | "anyOf": [ 38 | { 39 | "field": "Microsoft.DocumentDB/databaseAccounts/publicNetworkAccess", 40 | "exists": "false" 41 | }, 42 | { 43 | "field": "Microsoft.DocumentDB/databaseAccounts/publicNetworkAccess", 44 | "equals": "Enabled" 45 | } 46 | ] 47 | }, 48 | { 49 | "anyOf": [ 50 | { 51 | "field": "Microsoft.DocumentDB/databaseAccounts/isVirtualNetworkFilterEnabled", 52 | "exists": "false" 53 | }, 54 | { 55 | "field": "Microsoft.DocumentDB/databaseAccounts/isVirtualNetworkFilterEnabled", 56 | "equals": "false" 57 | }, 58 | { 59 | "field": "Microsoft.DocumentDB/databaseAccounts/ipRules[*].ipAddressOrRange", 60 | "exists": "true" 61 | } 62 | ] 63 | } 64 | ] 65 | }, 66 | "then": { 67 | "effect": "[parameters('effect')]" 68 | } 69 | } 70 | }, 71 | "id": "/subscriptions/<>/providers/Microsoft.Authorization/policyDefinitions/f0a8662e-cd7a-4b91-b766-fd2ebf133bfa", 72 | "type": "Microsoft.Authorization/policyDefinitions", 73 | "name": "f0a8662e-cd7a-4b91-b766-fd2ebf133bfa" 74 | } 75 | -------------------------------------------------------------------------------- /data/dboPerson.txt: -------------------------------------------------------------------------------- 1 | id,name,age 2 | "a","Alice",34 3 | "c","Charlie",30 4 | "d","David",29 5 | "e","Esther",32 6 | "f","Fanny",36 7 | "g","Gab",60 8 | -------------------------------------------------------------------------------- /data/dboRelation.txt: -------------------------------------------------------------------------------- 1 | fromid,toid,relationtype 2 | "a","b","friend" 3 | "b","c","friend" 4 | "c","b","follow" 5 | "f","c","follow" 6 | "e","f","follow" 7 | "e","d","follow" 8 | "d","a","friend" 9 | "a","e","friend" 10 | -------------------------------------------------------------------------------- /libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar -------------------------------------------------------------------------------- /libraries/graphframes-0.8.1-spark3.0-s_2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/libraries/graphframes-0.8.1-spark3.0-s_2.12.jar -------------------------------------------------------------------------------- /notebooks/insert_data_CosmosDB_Gremlin.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md Azure Databricks notebooks by Rene Bremer 3 | # MAGIC 4 | # MAGIC Copyright (c) Microsoft Corporation. All rights reserved. 5 | # MAGIC 6 | # MAGIC Licensed under the MIT License. 7 | 8 | # COMMAND ---------- 9 | 10 | par_cosmosdb_name = dbutils.widgets.get("cosmosdb_name") 11 | par_stor_name = dbutils.widgets.get("stor_name") 12 | 13 | # COMMAND ---------- 14 | 15 | # Databricks notebook source 16 | # DBTITLE 1,Get parquet data from ADLSgen2 17 | from pyspark.sql.functions import * 18 | try: 19 | mnt_defineddata = dbutils.fs.ls('/mnt/defineddata') 20 | defineddata_mounted = 1 21 | except: 22 | defineddata_mounted = 0 23 | spn_id = dbutils.secrets.get(scope="dbrkeys",key="spn-id") 24 | 25 | if defineddata_mounted == 1: 26 | print ("try to get data from mounted storage") 27 | dfperson = spark.read.parquet("/mnt/defineddata/dboPerson.parquet").withColumn("entity", lit("person")) 28 | dfrelation = spark.read.parquet("/mnt/defineddata/dboRelation.parquet") 29 | elif spn_id != "": 30 | print ("try to get data from using spn") 31 | spark.conf.set("fs.azure.account.auth.type." + par_stor_name + ".dfs.core.windows.net", "OAuth") 32 | spark.conf.set("fs.azure.account.oauth.provider.type." + par_stor_name + ".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 33 | spark.conf.set("fs.azure.account.oauth2.client.id." + par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="spn-id")) 34 | spark.conf.set("fs.azure.account.oauth2.client.secret." + par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="spn-key")) 35 | spark.conf.set("fs.azure.account.oauth2.client.endpoint." + par_stor_name + ".dfs.core.windows.net", "https://login.microsoftonline.com/" + dbutils.secrets.get(scope="dbrkeys",key="tenant-id") + "/oauth2/token") 36 | dfperson = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboPerson.parquet").withColumn("entity", lit("person")) 37 | dfrelation = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboRelation.parquet") 38 | else: 39 | print ("try to get data from using storage access key, not recommended in production") 40 | spark.conf.set("fs.azure.account.key." 
+ par_stor_name + ".dfs.core.windows.net", dbutils.secrets.get(scope="dbrkeys",key="stor-key")) 41 | dfperson = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboPerson.parquet").withColumn("entity", lit("person")) 42 | dfrelation = spark.read.parquet("abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/dboRelation.parquet") 43 | 44 | columns_new = [col.replace("fromid", "src") for col in dfrelation.columns] 45 | dfrelation = dfrelation.toDF(*columns_new) 46 | 47 | columns_new = [col.replace("toid", "dst") for col in dfrelation.columns] 48 | dfrelation = dfrelation.toDF(*columns_new) 49 | 50 | # COMMAND ---------- 51 | 52 | from graphframes import GraphFrame 53 | g = GraphFrame(dfperson, dfrelation) 54 | 55 | # COMMAND ---------- 56 | 57 | from pyspark.sql.types import StringType 58 | from urllib.parse import quote 59 | 60 | def urlencode(value): 61 | return quote(value, safe="") 62 | 63 | udf_urlencode = udf(urlencode, StringType()) 64 | 65 | # COMMAND ---------- 66 | 67 | def to_cosmosdb_vertices(dfVertices, labelColumn, partitionKey = ""): 68 | dfVertices = dfVertices.withColumn("id", udf_urlencode("id")) 69 | dfVertices = dfVertices.withColumn("age", udf_urlencode("age")) 70 | dfVertices = dfVertices.withColumn("name", udf_urlencode("name")) 71 | 72 | columns = ["id", labelColumn] 73 | 74 | if partitionKey: 75 | columns.append(partitionKey) 76 | 77 | #columns.extend(['nvl2({x}, array(named_struct("id", uuid(), "_value", {x})), NULL) AS {x}'.format(x=x) \ 78 | # for x in dfVertices.columns if x not in columns]) 79 | 80 | return dfVertices.selectExpr(*columns).withColumnRenamed(labelColumn, "label") 81 | 82 | # COMMAND ---------- 83 | 84 | cosmosDbVertices = dfperson 85 | #display(dfperson) 86 | 87 | # COMMAND ---------- 88 | 89 | from pyspark.sql.functions import concat_ws, col 90 | 91 | def to_cosmosdb_edges(g, labelColumn, partitionKey = ""): 92 | dfEdges = g.edges 93 | 94 | if partitionKey: 95 | dfEdges = dfEdges.alias("e") \ 96 | .join(g.vertices.alias("sv"), col("e.src") == col("sv.id")) \ 97 | .join(g.vertices.alias("dv"), col("e.dst") == col("dv.id")) \ 98 | .selectExpr("e.*", "sv." + partitionKey, "dv." 
+ partitionKey + " AS _sinkPartition") 99 | 100 | dfEdges = dfEdges \ 101 | .withColumn("id", udf_urlencode(concat_ws("_", col("src"), col(labelColumn), col("dst")))) \ 102 | .withColumn("_isEdge", lit(True)) \ 103 | .withColumn("_vertexId", udf_urlencode("src")) \ 104 | .withColumn("_sink", udf_urlencode("dst")) \ 105 | .withColumnRenamed(labelColumn, "label") \ 106 | .drop("src", "dst") 107 | 108 | return dfEdges 109 | 110 | # COMMAND ---------- 111 | 112 | cosmosDbEdges = to_cosmosdb_edges(g, "relationtype") 113 | display(cosmosDbEdges) 114 | 115 | # COMMAND ---------- 116 | 117 | cfg = { 118 | "spark.cosmos.accountEndpoint" : "https://" + par_cosmosdb_name + ".documents.azure.com:443/", 119 | "spark.cosmos.accountKey" : dbutils.secrets.get(scope="dbrkeys",key="cosmosdb-key"), 120 | "spark.cosmos.database" : "peopledb", 121 | "spark.cosmos.container" : "peoplegraph" 122 | } 123 | 124 | cosmosDbFormat ="cosmos.oltp" 125 | 126 | cosmosDbVertices.write.format("cosmos.oltp").options(**cfg).mode("APPEND").save() 127 | cosmosDbEdges.write.format("cosmos.oltp").options(**cfg).mode("APPEND").save() 128 | 129 | #cosmosDbConfig = { 130 | # "Endpoint" : "https://" + par_cosmosdb_name + ".documents.azure.com:443/", 131 | # "Masterkey" : dbutils.secrets.get(scope="dbrkeys",key="cosmosdb-key"), 132 | # "Database" : "peopledb", 133 | # "Collection" : "peoplegraph", 134 | # "Upsert" : "true" 135 | #} 136 | 137 | #cosmosDbFormat = "com.microsoft.azure.cosmosdb.spark" 138 | 139 | #cosmosDbVertices.write.format(cosmosDbFormat).mode("append").options(**cosmosDbConfig).save() 140 | #cosmosDbEdges.write.format(cosmosDbFormat).mode("append").options(**cosmosDbConfig).save() 141 | 142 | 143 | # COMMAND ---------- 144 | 145 | 146 | -------------------------------------------------------------------------------- /notebooks/mount_ADLSgen2_rawdata.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md Azure Databricks notebooks by Rene Bremer 3 | # MAGIC 4 | # MAGIC Copyright (c) Microsoft Corporation. All rights reserved. 5 | # MAGIC 6 | # MAGIC Licensed under the MIT License. 7 | 8 | # COMMAND ---------- 9 | 10 | par_stor_name = dbutils.widgets.get("stor_name") 11 | 12 | # COMMAND ---------- 13 | 14 | # Databricks notebook source 15 | # "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="",key=""), 16 | 17 | configs = {"fs.azure.account.auth.type": "OAuth", 18 | "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", 19 | "fs.azure.account.oauth2.client.id": dbutils.secrets.get(scope="dbrkeys",key="spn-id"), 20 | "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="dbrkeys",key="spn-key"), 21 | "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/" + dbutils.secrets.get(scope="dbrkeys",key="tenant-id") + "/oauth2/token"} 22 | 23 | # Optionally, you can add to the source URI of your mount point. 
24 | dbutils.fs.mount( 25 | source = "abfss://defineddata@" + par_stor_name + ".dfs.core.windows.net/", 26 | mount_point = "/mnt/defineddata", 27 | extra_configs = configs) 28 | 29 | # COMMAND ---------- 30 | 31 | %sh 32 | ls -l /dbfs/mnt/defineddata 33 | 34 | # COMMAND ---------- -------------------------------------------------------------------------------- /pictures/2_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/2_architecture.png -------------------------------------------------------------------------------- /pictures/architecture_detailed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/architecture_detailed.png -------------------------------------------------------------------------------- /pictures/old_architecture_detailed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rebremer/blog-datapipeline-cicd/68d47fde48a92a5044eaf119bae98bbd08e5ab95/pictures/old_architecture_detailed.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### Data Factory pipeline CI/CD project using DevOps, Databricks and Cosmos DB 2 | 3 | ![Architecture](https://github.com/rebremer/blog-datapipeline-cicd/blob/master/pictures/architecture_detailed.png) 4 | 5 | In this GitHub project, a data pipeline is created that does the following: 6 | 1. Set up an Azure DevOps CI/CD project 7 | 2. Deploy and configure Azure resources using Azure DevOps 8 | 3. 
Run Data Factory pipeline adding data to Cosmos DB Graph API using a Databricks notebook 9 | 10 | Details can be found in this blog: https://rebremer.medium.com/how-to-bring-your-modern-data-pipeline-to-production-2f14e42ac200 -------------------------------------------------------------------------------- /scripts/1_create_resources.sh: -------------------------------------------------------------------------------- 1 | # Resource group 2 | az group create -n $RG -l $LOC 3 | # Key vault 4 | az keyvault create -l $LOC -n $AKV -g $RG 5 | tenantId=$(az account show --query tenantId -o tsv) 6 | az keyvault secret set -n tenant-id --vault-name $AKV --value $tenantId 7 | # Storage account 8 | az storage account create -n $STOR -g $RG -l $LOC --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true 9 | az storage container create --account-name $STOR -n "rawdata" 10 | az storage container create --account-name $STOR -n "defineddata" 11 | az storage blob upload -f "../data/dboPerson.txt" -c "rawdata" -n "dboPerson.txt" --account-name $STOR 12 | az storage blob upload -f "../data/dboRelation.txt" -c "rawdata" -n "dboRelation.txt" --account-name $STOR 13 | # Databricks 14 | az extension add --name databricks 15 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 16 | if ["$dbr_response" = ""]; then 17 | vnetaddressrange="10.210.0.0" 18 | subnet1addressrange="10.210.0.0" 19 | subnet2addressrange="10.210.1.0" 20 | az network vnet create -g $RG -n $VNET --address-prefix $vnetaddressrange/16 -l $LOC 21 | az network nsg create -g $RG -n "public-subnet-nsg" 22 | az network nsg create -g $RG -n "private-subnet-nsg" 23 | az network vnet subnet create -g $RG --vnet-name $VNET -n "public-subnet" --address-prefixes $subnet1addressrange/24 --network-security-group "public-subnet-nsg" 24 | az network vnet subnet create -g $RG --vnet-name $VNET -n "private-subnet" --address-prefixes $subnet2addressrange/24 --network-security-group "private-subnet-nsg" 25 | az network vnet subnet update --resource-group $RG --name "public-subnet" --vnet-name $VNET --delegations Microsoft.Databricks/workspaces 26 | az network vnet subnet update --resource-group $RG --name "private-subnet" --vnet-name $VNET --delegations Microsoft.Databricks/workspaces 27 | dbr_response=$(az databricks workspace create -l $LOC -n $DBRWORKSPACE -g $RG --sku premium --vnet $VNET --public-subnet "public-subnet" --private-subnet "private-subnet") 28 | fi 29 | # Variables 30 | dbr_resource_id=$(jq .id -r <<< "$dbr_response") 31 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 32 | workspace_id_url="https://"$workspaceUrl_no_http"/" 33 | akv_url="https://"$AKV".vault.azure.net/" 34 | stor_url="https://"$STOR".dfs.core.windows.net/" 35 | echo "##vso[task.setvariable variable=dbr_resource_id]$dbr_resource_id" 36 | echo "##vso[task.setvariable variable=workspace_id_url]$workspace_id_url" 37 | echo "##vso[task.setvariable variable=akv_url]$akv_url" 38 | echo "##vso[task.setvariable variable=stor_url]$stor_url" 39 | # Cosmos DB graph API 40 | cosmosdbdatabase="peopledb" 41 | cosmosdbgraph="peoplegraph" 42 | az cosmosdb create -n $COSMOSDBNAME -g $RG --capabilities EnableGremlin 43 | az cosmosdb gremlin database create -a $COSMOSDBNAME -n $cosmosdbdatabase -g $RG 44 | az cosmosdb gremlin graph create -g $RG -a $COSMOSDBNAME -d $cosmosdbdatabase -n $cosmosdbgraph --partition-key-path "/name" 45 | cosmosdb_response=$(az cosmosdb keys list -n $COSMOSDBNAME -g $RG) 46 | cosmosdb_key=$(jq .primaryMasterKey -r <<< 
"$cosmosdb_response") 47 | az keyvault secret set -n cosmosdb-key --vault-name $AKV --value $cosmosdb_key 48 | # Datafactory 49 | az extension add --name datafactory 50 | api_response=$(az datafactory show -n $ADFV2 -g $RG) 51 | adfv2_id=$(jq .identity.principalId -r <<< "$api_response") 52 | az keyvault set-policy -n $AKV --secret-permissions set get list --object-id $adfv2_id 53 | # Assign RBAC rights ADFv2 MI on storage account. 54 | # Service connection SPN needs to have owner rights on account 55 | scope="/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.Storage/storageAccounts/$STOR" 56 | az role assignment create --assignee-object-id $adfv2_id --role "Storage Blob Data Contributor" --scope $scope 57 | # Assign RBAC rights ADFv2 MI on Databricks 58 | # Service connection SPN needs to have owner rights on account 59 | az role assignment create --assignee-object-id $adfv2_id --role "Contributor" --scope $dbr_resource_id 60 | -------------------------------------------------------------------------------- /scripts/2_configure_databricks.sh: -------------------------------------------------------------------------------- 1 | # 1a. Get tenantID and resource id 2 | tenantId=$(az account show --query tenantId -o tsv) 3 | wsId=$(az resource show \ 4 | --resource-type Microsoft.Databricks/workspaces \ 5 | -g "$RG" \ 6 | -n "$DBRWORKSPACE" \ 7 | --query id -o tsv) 8 | # 1b. Get two bearer tokens in Azure 9 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 10 | token=$(jq .accessToken -r <<< "$token_response") 11 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 12 | azToken=$(jq .accessToken -r <<< "$token_response") 13 | # 14 | # Databricks 15 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 16 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 17 | workspace_id_url="https://"$workspaceUrl_no_http"/" 18 | # 19 | # 2. Upload notebook to Databricks Workspace 20 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/workspace/import \ 21 | -H "Authorization: Bearer $token" \ 22 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 23 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 24 | -F path="/mount_ADLSgen2_rawdata.py" -F format=SOURCE -F language=PYTHON -F overwrite=true -F content=@../notebooks/mount_ADLSgen2_rawdata.py) 25 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/workspace/import \ 26 | -H "Authorization: Bearer $token" \ 27 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 28 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 29 | -F path="/insert_data_CosmosDB_Gremlin.py" -F format=SOURCE -F language=PYTHON -F overwrite=true -F content=@../notebooks/insert_data_CosmosDB_Gremlin.py) 30 | # 31 | # 3. 
Upload libraries to Databricks DBFS 32 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/dbfs/put \ 33 | -H "Authorization: Bearer $token" \ 34 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 35 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 36 | -F path="/azure-cosmos-spark_3-1_2-12-4.2.0.jar" -F contents=@../libraries/azure-cosmos-spark_3-1_2-12-4.2.0.jar -F overwrite=true) 37 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/dbfs/put \ 38 | -H "Authorization: Bearer $token" \ 39 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 40 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 41 | -F path="/graphframes-0.8.1-spark3.0-s_2.12.jar" -F contents=@../libraries/graphframes-0.8.1-spark3.0-s_2.12.jar -F overwrite=true) 42 | # 43 | # 4. Create Databricks cluster 44 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/clusters/create \ 45 | -H "Authorization: Bearer $token" \ 46 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 47 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 48 | -d "{\"cluster_name\": \"clusterPAT6\",\"spark_version\": \"6.6.x-scala2.11\",\"node_type_id\": \"Standard_D3_v2\", \"autotermination_minutes\":60, \"num_workers\" : 1}") 49 | cluster_id=$(jq .cluster_id -r <<< "$api_response") 50 | echo "##vso[task.setvariable variable=cluster_id]$cluster_id" 51 | sleep 1m 52 | # 53 | # 5. Add libraries to cluster 54 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/libraries/install \ 55 | -H "Authorization: Bearer $token" \ 56 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 57 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 58 | -d "{\"cluster_id\": \"$cluster_id\", \"libraries\": [{\"jar\": \"dbfs:/azure-cosmos-spark_3-1_2-12-4.2.0.jar\"},{\"jar\": \"dbfs:/graphframes-0.8.1-spark3.0-s_2.12.jar\"}]}") 59 | -------------------------------------------------------------------------------- /scripts/3_configure_access_storage_databricks.sh: -------------------------------------------------------------------------------- 1 | if [ $ACCESS_STOR_AADDBR = 0 ]; then 2 | echo "Usage access key of storage account to authenticate" 3 | key_response=$(az storage account keys list -g $RG -n $STOR) 4 | stor_key=$(jq .[0].value -r <<< "$key_response") 5 | az keyvault secret set -n stor-key --vault-name $AKV --value $stor_key 6 | else 7 | echo "Assigning Azure AD SPN to Databricks for authentication to storage account" 8 | spn_response=$(az ad sp create-for-rbac -n $SPN --skip-assignment) 9 | spn_id=$(jq .appId -r <<< "$spn_response") 10 | spn_key=$(jq .password -r <<< "$spn_response") 11 | # 12 | az keyvault secret set -n spn-id --vault-name $AKV --value $spn_id 13 | az keyvault secret set -n spn-key --vault-name $AKV --value $spn_key 14 | # 15 | spn_response=$(az ad sp show --id $spn_id) 16 | spn_object_id=$(jq .objectId -r <<< "$spn_response") 17 | scope="/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.Storage/storageAccounts/$STOR" 18 | az role assignment create --assignee-object-id $spn_object_id --role "Storage Blob Data Contributor" --scope $scope 19 | # 20 | # In case a Databricks secret scope is used and this script is run afterwards with elevated user rights, 21 | # also run script 4_configure_secret_scope_databricks.sh to add spn_id and spn_key to Databricks secret scope 22 | fi 23 | -------------------------------------------------------------------------------- /scripts/4_configure_secret_scope_databricks.sh: 
-------------------------------------------------------------------------------- 1 | # Databricks 2 | az extension add --name databricks 3 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 4 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 5 | workspace_id_url="https://"$workspaceUrl_no_http"/" 6 | # Get two bearer tokens in Azure 7 | tenantId=$(az account show --query tenantId -o tsv) 8 | wsId=$(az resource show \ 9 | --resource-type Microsoft.Databricks/workspaces \ 10 | -g "$RG" \ 11 | -n "$DBRWORKSPACE" \ 12 | --query id -o tsv) 13 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 14 | token=$(jq .accessToken -r <<< "$token_response") 15 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 16 | azToken=$(jq .accessToken -r <<< "$token_response") 17 | # 18 | if [ $SECRETSCOPE_KEYVAULT = 0 ]; then 19 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/scopes/create \ 20 | -H "Authorization: Bearer $token" \ 21 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 22 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 23 | -d "{\"scope\": \"dbrkeys\"}") 24 | # 2a2. Move keys from key vault to Databricks backed secret scope 25 | keyvault_response=$(az keyvault secret show -n spn-id --vault-name $AKV) 26 | spn_id=$(jq .value -r <<< "$keyvault_response") 27 | keyvault_response=$(az keyvault secret show -n spn-key --vault-name $AKV) 28 | spn_key=$(jq .value -r <<< "$keyvault_response") 29 | keyvault_response=$(az keyvault secret show -n stor-key --vault-name $AKV) 30 | stor_key=$(jq .value -r <<< "$keyvault_response") 31 | keyvault_response=$(az keyvault secret show -n cosmosdb-key --vault-name $AKV) 32 | cosmosdb_key=$(jq .value -r <<< "$keyvault_response") 33 | keyvault_response=$(az keyvault secret show -n tenant-id --vault-name $AKV) 34 | tenant_id=$(jq .value -r <<< "$keyvault_response") 35 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 36 | -H "Authorization: Bearer $token" \ 37 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 38 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 39 | -d "{\"scope\": \"dbrkeys\", \"key\": \"spn-id\", \"string_value\": \"$spn_id\"}") 40 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 41 | -H "Authorization: Bearer $token" \ 42 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 43 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 44 | -d "{\"scope\": \"dbrkeys\", \"key\": \"spn-key\", \"string_value\": \"$spn_key\"}") 45 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 46 | -H "Authorization: Bearer $token" \ 47 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 48 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 49 | -d "{\"scope\": \"dbrkeys\", \"key\": \"stor-key\", \"string_value\": \"$stor_key\"}") 50 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 51 | -H "Authorization: Bearer $token" \ 52 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 53 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 54 | -d "{\"scope\": \"dbrkeys\", \"key\": \"cosmosdb-key\", \"string_value\": \"$cosmosdb_key\"}") 55 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/put \ 56 | -H "Authorization: Bearer $token" \ 57 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 58 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 59 | -d "{\"scope\": 
\"dbrkeys\", \"key\": \"tenant-id\", \"string_value\": \"$tenant_id\"}") 60 | else 61 | #2b. Create secret sope backed by Azure Key Vault, only works with Azure AD token 62 | # 63 | # 20201213: This does not work from Azure DevOps SPN (only when) 64 | # See https://github.com/databricks/databricks-cli/issues/338 for workaround 65 | # 66 | echo "Create secret sope backed by Azure Key Vault" 67 | # 68 | akv_url="https://"$AKV".vault.azure.net/" 69 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/secrets/scopes/create \ 70 | -H "Authorization: Bearer $token" \ 71 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 72 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 73 | -d "{\"scope\": \"dbrkeys\", \"scope_backend_type\": \"AZURE_KEYVAULT\", \"backend_azure_keyvault\":{\"resource_id\": \"/subscriptions/$SUB/resourceGroups/$RG/providers/Microsoft.KeyVault/vaults/$AKV\", \"dns_name\": \"$akv_url\"}}") 74 | error_code=$(jq .error_code -r <<< "$api_response") 75 | echo $error_code 76 | message=$(jq .message -r <<< "$api_response") 77 | echo $message 78 | fi -------------------------------------------------------------------------------- /scripts/5_configure_mount_storage_databricks.sh: -------------------------------------------------------------------------------- 1 | if [ $MOUNT_STORAGE_DATABRICKS = 0 ]; then 2 | echo "Storage shall not be mounted" 3 | exit 0 4 | fi 5 | # 6 | keyvault_response=$(az keyvault secret show -n spn-id --vault-name $AKV) 7 | spn_id=$(jq .value -r <<< "$keyvault_response") 8 | if ["$spn_id" = ""]; then 9 | 10 | echo "No spn present, storage cannot be mounted" 11 | exit 0 12 | fi 13 | # Databricks 14 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 15 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 16 | workspace_id_url="https://"$workspaceUrl_no_http"/" 17 | # 18 | # 1. Create job 19 | # Get two bearer tokens in Azure 20 | tenantId=$(az account show --query tenantId -o tsv) 21 | wsId=$(az resource show \ 22 | --resource-type Microsoft.Databricks/workspaces \ 23 | -g "$RG" \ 24 | -n "$DBRWORKSPACE" \ 25 | --query id -o tsv) 26 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 27 | token=$(jq .accessToken -r <<< "$token_response") 28 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 29 | azToken=$(jq .accessToken -r <<< "$token_response") 30 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/jobs/create \ 31 | -H "Authorization: Bearer $token" \ 32 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 33 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 34 | -d "{\"name\": \"mount storage\", \"existing_cluster_id\": \"$cluster_id\", \"notebook_task\": {\"notebook_path\": \"/mount_ADLSgen2_rawdata.py\", \"base_parameters\": [{\"key\":\"stor_name\", \"value\":\"$STOR\"}]}}") 35 | job_id=$(jq .job_id -r <<< "$api_response") 36 | # 37 | # 2. Run job to run notebook to mount storage 38 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/jobs/run-now \ 39 | -H "Authorization: Bearer $token" \ 40 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 41 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 42 | -d "{\"job_id\": $job_id}") 43 | run_id=$(jq .run_id -r <<< "$api_response") 44 | # 45 | # 3. 
Wait until the job is finished (mainly waiting for the cluster created in script 2 to start) 46 | i=0 47 | while [ $i -lt 10 ] 48 | do 49 | echo "Time waited for job to finish: $i minutes" 50 | ((i++)) 51 | api_response=$(curl -v -X GET ${workspace_id_url}api/2.0/jobs/runs/get\?run_id=$run_id \ 52 | -H "Authorization: Bearer $token" \ 53 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 54 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" 55 | ) 56 | state=$(jq .state.life_cycle_state -r <<< "$api_response") 57 | echo "job state: $state" 58 | if [[ "$state" == 'TERMINATED' || "$state" == 'SKIPPED' || "$state" == 'INTERNAL_ERROR' ]]; then 59 | break 60 | fi 61 | sleep 1m 62 | done 63 | -------------------------------------------------------------------------------- /scripts/6_configure_firewall.sh: -------------------------------------------------------------------------------- 1 | if [ $ENABLE_FIREWALL = 0 ]; then 2 | echo "Firewall will not be enabled" 3 | exit 0 4 | fi 5 | # 6 | # 0. Service endpoints 7 | az network vnet subnet update --resource-group $RG --vnet-name $VNET --name "public-subnet" --service-endpoints "Microsoft.KeyVault" "Microsoft.Storage" "Microsoft.AzureCosmosDB" 8 | cli_response=$(az network vnet subnet show --resource-group "$RG" --vnet-name "$VNET" --name "public-subnet") 9 | subnet_id=$(jq .id -r <<< "$cli_response") 10 | # 11 | # 1. Cosmos DB 12 | #az cosmosdb account update --resource-group "$RG" --name "$COSMOSDBNAME" --enable-public-network false 13 | # Databricks 14 | az cosmosdb update -g $RG -n $COSMOSDBNAME --enable-virtual-network true 15 | az cosmosdb network-rule add -g $RG -n $COSMOSDBNAME --subnet $subnet_id 16 | # 17 | # 2. Key vault 18 | az keyvault update --resource-group "$RG" --name "$AKV" --default-action Deny 19 | # Databricks 20 | az keyvault network-rule add -g "$RG" -n "$AKV" --subnet $subnet_id 21 | # Azure Data Factory 22 | az keyvault update --resource-group "$RG" --name "$AKV" --bypass AzureServices 23 | # 24 | # 3. Storage account 25 | az storage account update --resource-group "$RG" --name "$STOR" --default-action Deny 26 | # Databricks 27 | az storage account network-rule add -g $RG --account-name $STOR --subnet $subnet_id 28 | # Azure Data Factory 29 | az storage account update -g "$RG" -n "$STOR" --bypass AzureServices 30 | # 31 | sleep 1m # avoid race conditions -------------------------------------------------------------------------------- /scripts/99_databricks_sqlapi.sh: -------------------------------------------------------------------------------- 1 | RG="<>" 2 | DBRWORKSPACE="<>" 3 | # 1a. Get tenantID and resource id 4 | tenantId=$(az account show --query tenantId -o tsv) 5 | wsId=$(az resource show \ 6 | --resource-type Microsoft.Databricks/workspaces \ 7 | -g "$RG" \ 8 | -n "$DBRWORKSPACE" \ 9 | --query id -o tsv) 10 | # 1b. Get two bearer tokens in Azure 11 | token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d) 12 | token=$(jq .accessToken -r <<< "$token_response") 13 | token_response=$(az account get-access-token --resource https://management.core.windows.net/) 14 | azToken=$(jq .accessToken -r <<< "$token_response") 15 | # 16 | # 1c. Databricks variables 17 | dbr_response=$(az databricks workspace show -g $RG -n $DBRWORKSPACE) 18 | workspaceUrl_no_http=$(jq .workspaceUrl -r <<< "$dbr_response") 19 | workspace_id_url="https://"$workspaceUrl_no_http"/" 20 | # 21 | # 2. 
Use SQL Warehouses API 2.0, create a SQL warehouse 22 | api_response=$(curl -v -X POST ${workspace_id_url}api/2.0/sql/warehouses/ \ 23 | -H "Authorization: Bearer $token" \ 24 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 25 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \ 26 | -d "{\"name\": \"testAzureADSQL\",\"cluster_size\": \"Small\",\"min_num_clusters\": 1,\"max_num_clusters\": 10, \"enable_photon\": \"true\"}") 27 | warehouse_id=$(jq .id -r <<< "$api_response") 28 | # 29 | # 3. Use Queries and Dashboards API, list queries 30 | api_response=$(curl -v -X GET ${workspace_id_url}api/2.0/preview/sql/queries \ 31 | -H "Authorization: Bearer $token" \ 32 | -H "X-Databricks-Azure-SP-Management-Token:$azToken" \ 33 | -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId") --------------------------------------------------------------------------------
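Note: azure-pipelines.yml triggers blog-datapipeline-pipeline-dbrmi with Invoke-AzDataFactoryV2Pipeline but does not wait for the run to finish. A minimal sketch of starting and polling the run from bash instead, analogous to the job polling in scripts/5_configure_mount_storage_databricks.sh (assuming the az datafactory extension's pipeline create-run and pipeline-run show commands, with RG and ADFV2 set as in azure-pipelines.yml):

# sketch, assumptions above: start the Data Factory pipeline run and capture its run id
az extension add --name datafactory
run_response=$(az datafactory pipeline create-run -g $RG --factory-name $ADFV2 --name "blog-datapipeline-pipeline-dbrmi")
run_id=$(jq .runId -r <<< "$run_response")
# poll the run status for at most 10 minutes
i=0
while [ $i -lt 10 ]
do
  ((i++))
  status=$(az datafactory pipeline-run show -g $RG --factory-name $ADFV2 --run-id $run_id --query status -o tsv)
  echo "pipeline run status: $status"
  if [[ "$status" == 'Succeeded' || "$status" == 'Failed' || "$status" == 'Cancelled' ]]; then
    break
  fi
  sleep 1m
done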