├── images ├── mlops-full.png ├── Environments.png ├── mlops-simple.png ├── mlflow-creation.png ├── mlops-databricks.png ├── mlops-overview.png ├── mlflow-experiment.png ├── 01AddingGitHubSecrets.png ├── 02AddingGitHubSecrets.png ├── databricks-git-devops.png ├── multi-stage-pipeline.png ├── 01AddingPipelineVariables.png └── 02AddingPipelineVariables.png ├── notebooks └── MLOpsDemo │ ├── images │ ├── Environments.png │ ├── mlflow-creation.png │ ├── mlflow-experiment.png │ ├── 01AddingGitHubSecrets.png │ ├── 02AddingGitHubSecrets.png │ ├── databricks-git-devops.png │ ├── 01AddingPipelineVariables.png │ └── 02AddingPipelineVariables.png │ ├── train.py │ ├── inference.py │ ├── serving_build_container_image.py │ ├── serving_deploy_to_aci.py │ └── serving_deploy_to_aks.py ├── LICENSE ├── README.md └── azure-pipelines.yml /images/mlops-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-full.png -------------------------------------------------------------------------------- /images/Environments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/Environments.png -------------------------------------------------------------------------------- /images/mlops-simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-simple.png -------------------------------------------------------------------------------- /images/mlflow-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlflow-creation.png -------------------------------------------------------------------------------- /images/mlops-databricks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-databricks.png -------------------------------------------------------------------------------- /images/mlops-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-overview.png -------------------------------------------------------------------------------- /images/mlflow-experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlflow-experiment.png -------------------------------------------------------------------------------- /images/01AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/01AddingGitHubSecrets.png -------------------------------------------------------------------------------- /images/02AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/02AddingGitHubSecrets.png -------------------------------------------------------------------------------- /images/databricks-git-devops.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/databricks-git-devops.png -------------------------------------------------------------------------------- /images/multi-stage-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/multi-stage-pipeline.png -------------------------------------------------------------------------------- /images/01AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/01AddingPipelineVariables.png -------------------------------------------------------------------------------- /images/02AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/02AddingPipelineVariables.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/Environments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/Environments.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/mlflow-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/mlflow-creation.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/mlflow-experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/mlflow-experiment.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/01AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/01AddingGitHubSecrets.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/02AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/02AddingGitHubSecrets.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/databricks-git-devops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/databricks-git-devops.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/01AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/01AddingPipelineVariables.png 
-------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/02AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/02AddingPipelineVariables.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ahmed Mostafa; forked from Sascha Dittmann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/train.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #import numpy as np 3 | #dbutils.widgets.removeAll() 4 | #dbutils.widgets.dropdown("alpha", "0.5", [str(x)[:3] for x in np.arange(0,1.1,0.1)]) 5 | #dbutils.widgets.dropdown("l1_ratio", "0.5", [str(x)[:3] for x in np.arange(0,1.1,0.1)]) 6 | 7 | # COMMAND ---------- 8 | 9 | # MAGIC %md 10 | # MAGIC # Training the Model 11 | # MAGIC First, train a linear regression model that takes two hyperparameters: *alpha* and *l1_ratio*. 12 | # MAGIC 13 | # MAGIC > The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality 14 | # MAGIC > P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 15 | # MAGIC > Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. 
16 | 17 | # COMMAND ---------- 18 | 19 | import os 20 | import warnings 21 | import sys 22 | 23 | import pandas as pd 24 | import numpy as np 25 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 26 | from sklearn.model_selection import train_test_split 27 | from sklearn.linear_model import ElasticNet 28 | 29 | import logging 30 | logging.basicConfig(level=logging.WARN) 31 | logger = logging.getLogger(__name__) 32 | 33 | # COMMAND ---------- 34 | 35 | import mlflow 36 | import mlflow.sklearn 37 | 38 | mlflow.set_experiment("/MLOpsDemo/DrinksQuality") 39 | 40 | # COMMAND ---------- 41 | 42 | def eval_metrics(actual, pred): 43 | rmse = np.sqrt(mean_squared_error(actual, pred)) 44 | mae = mean_absolute_error(actual, pred) 45 | r2 = r2_score(actual, pred) 46 | return rmse, mae, r2 47 | 48 | # COMMAND ---------- 49 | 50 | try: 51 | alpha = float(dbutils.widgets.getArgument("alpha")) 52 | except: 53 | alpha = 0.5 54 | try: 55 | l1_ratio = float(dbutils.widgets.getArgument("l1_ratio")) 56 | except: 57 | l1_ratio = 0.5 58 | 59 | # COMMAND ---------- 60 | 61 | warnings.filterwarnings("ignore") 62 | np.random.seed(40) 63 | 64 | # Read the wine-quality csv file from the URL 65 | csv_url =\ 66 | 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 67 | try: 68 | data = pd.read_csv(csv_url, sep=';') 69 | except Exception as e: 70 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 71 | 72 | # Split the data into training and test sets. (0.75, 0.25) split. 73 | train, test = train_test_split(data) 74 | 75 | # The predicted column is "quality" which is a scalar from [3, 9] 76 | train_x = train.drop(["quality"], axis=1) 77 | test_x = test.drop(["quality"], axis=1) 78 | train_y = train[["quality"]] 79 | test_y = test[["quality"]] 80 | 81 | # COMMAND ---------- 82 | 83 | with mlflow.start_run(): 84 | lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) 85 | lr.fit(train_x, train_y) 86 | 87 | predicted_qualities = lr.predict(test_x) 88 | 89 | (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) 90 | 91 | print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) 92 | print(" RMSE: %s" % rmse) 93 | print(" MAE: %s" % mae) 94 | print(" R2: %s" % r2) 95 | 96 | mlflow.log_param("alpha", alpha) 97 | mlflow.log_param("l1_ratio", l1_ratio) 98 | mlflow.log_metric("rmse", rmse) 99 | mlflow.log_metric("r2", r2) 100 | mlflow.log_metric("mae", mae) 101 | 102 | mlflow.sklearn.log_model(lr, "model") 103 | 104 | # COMMAND ---------- 105 | 106 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/inference.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Inference 3 | 4 | # COMMAND ---------- 5 | 6 | from mlflow.tracking.client import MlflowClient 7 | from mlflow.entities import ViewType 8 | 9 | experiment_name = "/MLOpsDemo/DrinksQuality" 10 | experiment = MlflowClient().get_experiment_by_name(experiment_name) 11 | experiment_ids = eval('[' + experiment.experiment_id + ']') 12 | # all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()] 13 | print("Experiment IDs:", experiment_ids) 14 | 15 | query = "metrics.rmse < 0.8" 16 | runs = MlflowClient().search_runs(experiment_ids, query, ViewType.ALL) 17 | 18 | rmse_low = None 19 | run_id = None 20 | for run in runs: 21 | if (rmse_low == None or 
run.data.metrics['rmse'] < rmse_low): 22 | rmse_low = run.data.metrics['rmse'] 23 | run_id = run.info.run_id 24 | print("Lowest RMSE:", rmse_low) 25 | print("Run ID:", run_id) 26 | 27 | model_uri = "runs:/" + run_id + "/model" 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md ## Load MLflow Model as a scikit-learn Model 32 | # MAGIC You can use the MLflow API to load the model from the MLflow server that was produced by a given run. 33 | # MAGIC 34 | # MAGIC Once you load it, it is a just a scikit-learn model and you can explore it or use it. 35 | 36 | # COMMAND ---------- 37 | 38 | import mlflow.sklearn 39 | model = mlflow.sklearn.load_model(model_uri=model_uri) 40 | model.coef_ 41 | 42 | # COMMAND ---------- 43 | 44 | import numpy as np 45 | import pandas as pd 46 | 47 | cols = ['alcohol', 'chlorides', 'citric acid', 'density', 'fixed acidity', 'free sulfur dioxide', 'pH', 'residual sugar', 'sulphates', 'total sulfur dioxide', 'volatile acidity'] 48 | d = [12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66] 49 | d = np.array([d]) 50 | 51 | data = pd.DataFrame(d, columns=cols) 52 | display(data) 53 | 54 | # COMMAND ---------- 55 | 56 | #Get a prediction for a row of the dataset 57 | model.predict(data) 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md ## Use an MLflow Model for Batch Inference 62 | # MAGIC You can get a PySpark UDF to do some batch inference using one of the models. 63 | 64 | # COMMAND ---------- 65 | 66 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 67 | try: 68 | data = pd.read_csv(csv_url, sep=';') 69 | except Exception as e: 70 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 71 | 72 | # Create a Spark DataFrame from the original pandas DataFrame minus the column you want to predict. 73 | # Use this to simulate what this would be like if you had a big data set e.g. click logs that was 74 | # regularly being updated that you wanted to score. 75 | dataframe = spark.createDataFrame(data.drop(["quality"], axis=1)) 76 | display(dataframe) 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %md Use the MLflow API to create a PySpark UDF from a run. See [Export a python_function model as an Apache Spark UDF](https://mlflow.org/docs/latest/models.html#export-a-python-function-model-as-an-apache-spark-udf). 81 | 82 | # COMMAND ---------- 83 | 84 | import mlflow.pyfunc 85 | pyfunc_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri) 86 | 87 | # COMMAND ---------- 88 | 89 | # MAGIC %md Add a column to the data by applying the PySpark UDF to the DataFrame. 90 | 91 | # COMMAND ---------- 92 | 93 | predicted_df = dataframe.withColumn("prediction", pyfunc_udf('alcohol', 'chlorides', 'citric acid', 'density', 'fixed acidity', 'free sulfur dioxide', 'pH', 'residual sugar', 'sulphates', 'total sulfur dioxide', 'volatile acidity')) 94 | display(predicted_df) 95 | 96 | # COMMAND ---------- 97 | 98 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_build_container_image.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Create or load an Azure ML Workspace 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 13 | 14 | # COMMAND ---------- 15 | 16 | import azureml 17 | from azureml.core import Workspace 18 | from azureml.core.authentication import ServicePrincipalAuthentication 19 | 20 | workspace_name = dbutils.secrets.get(scope = "azureml", key = "workspace_name") 21 | workspace_location = "westeurope" 22 | resource_group = dbutils.secrets.get(scope = "azureml", key = "resource_group") 23 | subscription_id = dbutils.secrets.get(scope = "azureml", key = "subscription_id") 24 | 25 | # COMMAND ---------- 26 | 27 | svc_pr = ServicePrincipalAuthentication( 28 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 29 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 30 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 31 | 32 | workspace = Workspace.create(name = workspace_name, 33 | location = workspace_location, 34 | resource_group = resource_group, 35 | subscription_id = subscription_id, 36 | auth=svc_pr, 37 | exist_ok=True) 38 | 39 | # COMMAND ---------- 40 | 41 | workspace 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md ## Build an Azure Container Image for model deployment 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md ### Use MLflow to build a Container Image for the trained model 50 | # MAGIC 51 | # MAGIC Use the `mlflow.azuereml.build_image` function to build an Azure Container Image for the trained MLflow model. This function also registers the MLflow model with a specified Azure ML workspace. The resulting image can be deployed to Azure Container Instances (ACI) or Azure Kubernetes Service (AKS) for real-time serving. 
52 | 53 | # COMMAND ---------- 54 | 55 | from mlflow.tracking.client import MlflowClient 56 | from mlflow.entities import ViewType 57 | 58 | experiment_name = "/MLOpsDemo/DrinksQuality" 59 | experiment = MlflowClient().get_experiment_by_name(experiment_name) 60 | experiment_ids = eval('[' + experiment.experiment_id + ']') 61 | # all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()] 62 | print("Experiment IDs:", experiment_ids) 63 | 64 | query = "metrics.rmse < 0.8" 65 | runs = MlflowClient().search_runs(experiment_ids, query, ViewType.ALL) 66 | 67 | rmse_low = None 68 | run_id = None 69 | for run in runs: 70 | if (rmse_low == None or run.data.metrics['rmse'] < rmse_low): 71 | rmse_low = run.data.metrics['rmse'] 72 | run_id = run.info.run_id 73 | print("Lowest RMSE:", rmse_low) 74 | print("Run ID:", run_id) 75 | 76 | model_uri = "runs:/" + run_id + "/model" 77 | 78 | # COMMAND ---------- 79 | 80 | import mlflow.azureml 81 | 82 | model_image, azure_model = mlflow.azureml.build_image(model_uri=model_uri, 83 | workspace=workspace, 84 | model_name="drinksquality", 85 | image_name="drinksquality", 86 | description="Sklearn ElasticNet image for predicting wine quality", 87 | synchronous=False) 88 | 89 | model_image.wait_for_creation(show_output=True) 90 | 91 | # COMMAND ---------- 92 | 93 | dbutils.notebook.exit('{"model_image_id": "%s"}' % model_image.id) -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_deploy_to_aci.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 5 | 6 | # COMMAND ---------- 7 | 8 | #dbutils.widgets.removeAll() 9 | #dbutils.widgets.text("model_image_id", "") 10 | 11 | # COMMAND ---------- 12 | 13 | # model_image_id = dbutils.widgets.getArgument("model_image_id") 14 | # print("Model Image ID:", model_image_id) 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md ### Create or load an Azure ML Workspace 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 
23 | 24 | # COMMAND ---------- 25 | 26 | import azureml 27 | from azureml.core import Workspace 28 | from azureml.core.authentication import ServicePrincipalAuthentication 29 | 30 | workspace_name = dbutils.secrets.get(scope = "azureml", key = "workspace_name") 31 | workspace_location = "westeurope" 32 | resource_group = dbutils.secrets.get(scope = "azureml", key = "resource_group") 33 | subscription_id = dbutils.secrets.get(scope = "azureml", key = "subscription_id") 34 | 35 | svc_pr = ServicePrincipalAuthentication( 36 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 37 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 38 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 39 | 40 | workspace = Workspace.create(name = workspace_name, 41 | location = workspace_location, 42 | resource_group = resource_group, 43 | subscription_id = subscription_id, 44 | auth=svc_pr, 45 | exist_ok=True) 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md 50 | # MAGIC Get the latest model image from the workspace 51 | 52 | # COMMAND ---------- 53 | 54 | model_image_id = workspace.images['drinksquality'].id 55 | print("Model Image ID:", model_image_id) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md ## Deploy the model to "dev" using [Azure Container Instances (ACI)](https://docs.microsoft.com/en-us/azure/container-instances/) 60 | # MAGIC 61 | # MAGIC The [ACI platform](https://docs.microsoft.com/en-us/azure/container-instances/) is the recommended environment for staging and developmental model deployments. 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md ### Create an ACI webservice deployment using the model's Container Image 66 | # MAGIC 67 | # MAGIC Using the Azure ML SDK, deploy the Container Image for the trained MLflow model to ACI. 68 | 69 | # COMMAND ---------- 70 | 71 | from azureml.core.webservice import AciWebservice, Webservice 72 | from azureml.core.image import Image 73 | 74 | model_image = Image(workspace, id=model_image_id) 75 | 76 | dev_webservice_name = "drinks-quality-aci" 77 | dev_webservice_deployment_config = AciWebservice.deploy_configuration() 78 | dev_webservice = Webservice.deploy_from_image(name=dev_webservice_name, image=model_image, deployment_config=dev_webservice_deployment_config, workspace=workspace, deployment_target=None, overwrite=True) 79 | 80 | # COMMAND ---------- 81 | 82 | while dev_webservice.state != "Healthy": 83 | dev_webservice.update_deployment_state() 84 | 85 | # COMMAND ---------- 86 | 87 | # dev_webservice.wait_for_deployment() 88 | 89 | # COMMAND ---------- 90 | 91 | # MAGIC %md ## Query the deployed model in "dev" 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md ### Load dataset 96 | 97 | # COMMAND ---------- 98 | 99 | import numpy as np 100 | import pandas as pd 101 | 102 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 103 | try: 104 | data = pd.read_csv(csv_url, sep=';') 105 | except Exception as e: 106 | logger.exception("Unable to download training & test CSV, check your internet connection. 
Error: %s", e) 107 | 108 | data = data.drop(["quality"], axis=1)[:10] 109 | 110 | # COMMAND ---------- 111 | 112 | # MAGIC %md ## Create sample input vector 113 | 114 | # COMMAND ---------- 115 | 116 | query_input = data.to_json(orient='split') 117 | query_input = eval(query_input) 118 | query_input.pop('index', None) 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md #### Evaluate the sample input vector by sending an HTTP request 123 | # MAGIC Query the ACI webservice's scoring endpoint by sending an HTTP POST request that contains the input vector. 124 | 125 | # COMMAND ---------- 126 | 127 | import requests 128 | import json 129 | 130 | def query_endpoint_example(scoring_uri, inputs, service_key=None): 131 | headers = { 132 | "Content-Type": "application/json", 133 | } 134 | if service_key is not None: 135 | headers["Authorization"] = "Bearer {service_key}".format(service_key=service_key) 136 | 137 | print("Sending batch prediction request with inputs: {}".format(inputs)) 138 | response = requests.post(scoring_uri, data=json.dumps(inputs), headers=headers) 139 | preds = json.loads(response.text) 140 | print("Received response: {}".format(preds)) 141 | return preds 142 | 143 | # COMMAND ---------- 144 | 145 | print("Webservice URL:", dev_webservice.scoring_uri) 146 | 147 | # COMMAND ---------- 148 | 149 | dev_prediction = query_endpoint_example(scoring_uri=dev_webservice.scoring_uri, inputs=query_input) -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_deploy_to_aks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Create or load an Azure ML Workspace 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 
13 | 14 | # COMMAND ---------- 15 | 16 | import azureml 17 | from azureml.core import Workspace 18 | from azureml.core.authentication import ServicePrincipalAuthentication 19 | 20 | workspace_name = "azuredevopsml" 21 | workspace_location = "westeurope" 22 | resource_group = "azuredevopsrg" 23 | subscription_id = "46be3785-50c9-401f-b7e5-1e72664f6e93" 24 | 25 | svc_pr = ServicePrincipalAuthentication( 26 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 27 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 28 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 29 | 30 | aksml_workspace = Workspace.create(name = workspace_name, 31 | location = workspace_location, 32 | resource_group = resource_group, 33 | subscription_id = subscription_id, 34 | auth=svc_pr, 35 | exist_ok=True) 36 | 37 | # COMMAND ---------- 38 | 39 | # MAGIC %md ## Deploy the model to production using [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/). 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md ### Create a new AKS cluster 44 | # MAGIC 45 | # MAGIC If you do not have an active AKS cluster for model deployment, create one using the Azure ML SDK. 46 | 47 | # COMMAND ---------- 48 | 49 | model_image_id = aksml_workspace.images['drinksquality'].id 50 | print("Model Image ID:", model_image_id) 51 | 52 | # COMMAND ---------- 53 | 54 | from azureml.core.compute import AksCompute, ComputeTarget 55 | from azureml.core.compute_target import ComputeTargetException 56 | aks_name = 'drinksqualityaks' 57 | 58 | # COMMAND ---------- 59 | 60 | # from azureml.core.webservice import AksWebservice 61 | # deployment_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1) 62 | 63 | prov_config = AksCompute.provisioning_configuration() 64 | # prov_config = AksCompute.provisioning_configuration(agent_count=3, vm_size="Standard_B4ms") 65 | 66 | 67 | # COMMAND ---------- 68 | 69 | print(aksml_workspace.compute_targets) 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | computes = ComputeTarget.list(aksml_workspace) 75 | aks_exists = False 76 | for target in computes: 77 | print(target) 78 | print(target.type) 79 | print(target.get_status()) 80 | if target.type == "AKS": 81 | if target.get_status() == "Failed": 82 | # delete and recreate the target 83 | target.delete() 84 | # prov_config = AksCompute.provisioning_configuration(agent_count=3, vm_size="Standard_B4ms") 85 | aks_target = ComputeTarget.create(aksml_workspace, name = aks_name, provisioning_configuration = prov_config) 86 | if target.get_status() == "Succeeded": 87 | # attach to existing AKS 88 | aks_exists = True 89 | 90 | print(aks_exists) 91 | 92 | # COMMAND ---------- 93 | 94 | if aks_exists: 95 | aks_target = aksml_workspace.compute_targets['drinksqualityaks'] 96 | else: 97 | aks_target = ComputeTarget.create(aksml_workspace, name = aks_name, provisioning_configuration = prov_config) 98 | 99 | # COMMAND ---------- 100 | 101 | type(aks_target) 102 | 103 | # COMMAND ---------- 104 | 105 | aks_target.wait_for_completion(show_output=True) 106 | 107 | # COMMAND ---------- 108 | 109 | # MAGIC %md ### Deploy the model's image to the specified AKS cluster 110 | 111 | # COMMAND ---------- 112 | 113 | from azureml.core.webservice import Webservice, AksWebservice 114 | from azureml.core.image import Image 115 | 116 | # Get Model 117 | model_image = Image(aksml_workspace, id=model_image_id) 118 | # Get Webservice 119 | 
prod_webservice_name = "drinks-quality-aks" 120 | 121 | # COMMAND ---------- 122 | 123 | from azureml.core.webservice import Webservice, AksWebservice 124 | from azureml.core.image import Image 125 | 126 | # Get Model 127 | model_image = Image(aksml_workspace, id=model_image_id) 128 | 129 | # Get Webservice 130 | prod_webservice_name = "drinks-quality-aks" 131 | try: 132 | prod_webservice = Webservice(aksml_workspace, prod_webservice_name) 133 | print('updating existing webservice.') 134 | prod_webservice.update(image=model_image) 135 | prod_webservice.wait_for_deployment(show_output = True) 136 | except: 137 | print('creating new webservice.') 138 | # Set configuration and service name 139 | prod_webservice_deployment_config = AksWebservice.deploy_configuration() 140 | # Deploy from image 141 | prod_webservice = Webservice.deploy_from_image(workspace = aksml_workspace, 142 | name = prod_webservice_name, 143 | image = model_image, 144 | deployment_config = prod_webservice_deployment_config, 145 | deployment_target = aks_target) 146 | # Wait for the deployment to complete 147 | prod_webservice.wait_for_deployment(show_output = True) 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md ## Query the deployed model in production 152 | 153 | # COMMAND ---------- 154 | 155 | # MAGIC %md ### Load dataset 156 | 157 | # COMMAND ---------- 158 | 159 | import numpy as np 160 | import pandas as pd 161 | 162 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 163 | try: 164 | data = pd.read_csv(csv_url, sep=';') 165 | except Exception as e: 166 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 167 | 168 | data = data.drop(["quality"], axis=1)[:10] 169 | 170 | # COMMAND ---------- 171 | 172 | # MAGIC %md ### Create sample input vector 173 | 174 | # COMMAND ---------- 175 | 176 | query_input = data.to_json(orient='split') 177 | query_input = eval(query_input) 178 | query_input.pop('index', None) 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md #### Evaluate the sample input vector by sending an HTTP request 183 | # MAGIC Query the AKS webservice's scoring endpoint by sending an HTTP POST request that includes the input vector. The production AKS deployment may require an authorization token (service key) for queries. Include this key in the HTTP request header. 
184 | 
185 | # COMMAND ----------
186 | 
187 | import requests
188 | import json
189 | 
190 | def query_endpoint_example(scoring_uri, inputs, service_key=None):
191 | headers = {
192 | "Content-Type": "application/json",
193 | }
194 | if service_key is not None:
195 | headers["Authorization"] = "Bearer {service_key}".format(service_key=service_key)
196 | 
197 | print("Sending batch prediction request with inputs: {}".format(inputs))
198 | response = requests.post(scoring_uri, data=json.dumps(inputs), headers=headers)
199 | preds = json.loads(response.text)
200 | print("Received response: {}".format(preds))
201 | return preds
202 | 
203 | # COMMAND ----------
204 | 
205 | prod_scoring_uri = prod_webservice.scoring_uri
206 | prod_service_key = prod_webservice.get_keys()[0] if len(prod_webservice.get_keys()) > 0 else None
207 | print("Webservice URL:", prod_scoring_uri)
208 | 
209 | # COMMAND ----------
210 | 
211 | prod_prediction1 = query_endpoint_example(scoring_uri=prod_scoring_uri, service_key=prod_service_key, inputs=query_input)
212 | 
213 | # COMMAND ----------
214 | 
215 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://dev.azure.com/ahmosta/MLOpsDatabricks/_apis/build/status/MLOpsDatabricks?branchName=master)](https://dev.azure.com/ahmosta/MLOpsDatabricks/_build/latest?definitionId=1&branchName=master)
2 | 
3 | # Large-scale Data/MLOps with Azure & Databricks
4 | ## How to implement DataOps/MLOps using Azure DevOps, Databricks, MLFlow, and Azure ML
5 | 
6 | Operationalizing Data Analytics and Machine Learning workloads can be challenging, because the ecosystem of platforms and services used to build such workloads is large, which increases the complexity of deploying them to production. The complexity also increases with the continuous adoption of containers and container orchestration frameworks such as Kubernetes.
7 | 
8 | This repo demonstrates an approach to implementing DevOps pipelines for large-scale Data Analytics and Machine Learning (also called Data/MLOps) using a combination of [Azure Databricks](https://azure.microsoft.com/en-us/services/databricks/), [MLFlow](http://mlflow.org), and [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/).
9 | 
10 | The DevOps pipeline is implemented in [Azure DevOps](https://azure.microsoft.com/en-us/services/devops/), and it deploys the workload in containerized form, simulating staging & production environments, to [Azure Container Instances](https://azure.microsoft.com/en-us/product-categories/containers/) and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/).
11 | 
12 | The diagram below shows a high-level overview of a generic DevOps process: build pipelines produce the project's artifacts, followed by a testing & release process. Such a process enables faster deployment of individual modules without impacting the overall system, as well as the flexibility to deploy to one or more environments.
13 | 
14 | ![mlops-full](/images/mlops-full.png)
15 | 
16 | This tutorial fits ML as well as DataOps workloads. To simplify things, it will walk you through how to implement Data/ML Ops in the following general form:
17 | 
18 | ![mlops-simple](./images/mlops-simple.png)
19 | 
20 | 
21 | ## Why Azure Databricks?
22 | Azure Databricks offers great capabilities for developing & building analytics & ML workloads covering data ingestion, data engineering, and data science for various applications (e.g. data management, batch processing, stream processing, graph processing, and distributed machine learning).
23 | 
24 | Such capabilities are offered in a unified experience for collaboration between the different team stakeholders, supporting code written in Scala, Python, SQL, and R, and both standard and Apache Spark applications.
25 | 
26 | Additionally, the platform provides a very convenient infrastructure management layer:
27 | - Deploying workloads to highly scalable clusters is very easy.
28 | - Clusters can be configured to auto-scale to adapt to the processing workload.
29 | - Clusters can be set up to be ephemeral; they automatically terminate once a job is done or after a period of inactivity. This feature adds great value for managing costs.
30 | - Databricks supports running containers through [Databricks Container Services](https://docs.microsoft.com/en-us/azure/databricks/clusters/custom-containers).
31 | 
32 | Such execution capabilities make Azure Databricks a great fit, not only for developing & building workloads, but also for running those workloads and serving data as well.
33 | 
34 | ## Why Azure ML and MLFlow?
35 | This tutorial also demonstrates how Databricks Notebooks can leverage MLFlow and Azure ML. While the two can seem similar, where each shines depends on where it is used.
36 | 
37 | MLFlow is natively supported within Databricks, as MLFlow manages the machine learning experiment and its runs within the Databricks workspace development environment. Therefore, it conveniently offers data scientists & engineers ML management and tracking capabilities for their models without having to leave their development environment.
38 | 
39 | The integration between MLFlow and AzureML, however, provides such management across environments: AzureML is eventually used - from within the MLFlow Experiment - to build a Docker container image for the best-scoring model, publish that image to the Azure Container Registry, and afterwards deploy it to either Azure Container Instances or Azure Kubernetes Service.
40 | 
41 | ![mlops-databricks](./images/mlops-databricks.png)
42 | 
43 | ## Using This Sample Project
44 | 
45 | This repo is configured to run on Azure DevOps, therefore you need to prepare your environment with the following steps.
46 | 
47 | The DevOps pipeline is a "multi-stage" pipeline and it is defined using the YAML file [azure-pipelines.yml](./azure-pipelines.yml) for Azure DevOps.
48 | 
49 | > Note: Building GitHub workflow actions is in progress as well.
50 | 
51 | ## Required Accounts And Resources
52 | 
53 | This example uses Azure DevOps as a CI/CD toolset, as well as Microsoft Azure services to host the trained Machine Learning model.
54 | 
55 | * At the time of creating this tutorial, GitHub Actions were still in beta. If you want to try this new feature, you have to [sign up for the beta](https://github.com/features/actions) first.
56 | 
57 | ### Azure Databricks Workspace
58 | 
59 | In your Azure subscription, you need to [create an Azure Databricks workspace](https://docs.azuredatabricks.net/getting-started/try-databricks.html#step-2-create-a-databricks-workspace) to get started.
60 | 
61 | > NOTE: I recommend placing the Azure Databricks Workspace in a new Resource Group, to be able to clean everything up more easily afterwards.
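
If you prefer the command line over the portal, the snippet below is a minimal sketch of the same step using the Azure CLI. It assumes the Azure CLI `databricks` extension is installed; the resource group name, workspace name, and region are placeholders that you should replace with your own values.

``` bash
# Placeholder names and region; adjust to your needs
az group create --name mlops-databricks-rg --location westeurope

# Requires the Azure CLI "databricks" extension: az extension add --name databricks
az databricks workspace create \
  --resource-group mlops-databricks-rg \
  --name mlops-databricks-ws \
  --location westeurope \
  --sku standard
```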
62 | 
63 | ## Importing This DevOps Project
64 | 
65 | As soon as you have access to the Azure DevOps platform, you're able to [create a project](https://docs.microsoft.com/en-us/azure/devops/user-guide/sign-up-invite-teammates?view=azure-devops#create-a-project) to host your MLOps pipeline.
66 | 
67 | As soon as this is created, you can [import this GitHub repository](https://docs.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops) into your Azure DevOps project.
68 | 
69 | ### Connecting Azure Databricks Notebooks to the Azure DevOps *Repo*
70 | 
71 | It is recommended to connect your notebooks to the Azure DevOps repo. This ensures your changes & updates are pushed to the repo automatically and get built properly. The pipeline is automatically triggered by any commit/push to the repo.
72 | 
73 | To configure this, go to "User Settings" and click on "Git Integration". ![databricks and azure devops integration](./images/databricks-git-devops.png)
74 | 
75 | ### Create MLFlow Experiment
76 | The Databricks notebooks use MLFlow under the hood; you need to create the MLFlow experiment after importing the notebooks into the Databricks workspace.
77 | 
78 | ![databricks mlflow experiment](./images/mlflow-experiment.png)
79 | 
80 | Clicking on the link shown above will open a screen where you can specify the name of the experiment and its location on DBFS. For this demo, make sure the MLFlow experiment's name is DrinksQuality.
81 | 
82 | ![databricks mlflow creation](./images/mlflow-creation.png)
83 | 
84 | 
85 | ## Set up The Build Pipeline
86 | 
87 | By importing the GitHub files, you also imported the [azure-pipelines.yml](./azure-pipelines.yml) file.
88 | 
89 | This file can be used to create your first Build Pipeline.
90 | 
91 | This Build Pipeline uses a feature called "[Multi-Stage Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/stages?view=azure-devops&tabs=yaml)". This feature might not be enabled for you, so in order to use it, you should [enable this preview feature](https://docs.microsoft.com/en-us/azure/devops/project/navigation/preview-features?view=azure-devops).
92 | 
93 | ![Azure DevOps multi-stage-pipeline](./images/multi-stage-pipeline.png)
94 | 
95 | ## Connecting Azure Databricks
96 | 
97 | ### Connecting the Azure DevOps *pipeline* to Azure Databricks
98 | 
99 | To be able to run this pipeline, you also need to connect your Azure Databricks Workspace with the pipeline.
100 | 
101 | Therefore, you first need to [generate an access token on Databricks](https://docs.azuredatabricks.net/dev-tools/api/latest/authentication.html#generate-a-token).
102 | 
103 | This token must be stored as an encrypted secret in your Azure DevOps Build Pipeline...
104 | 
105 | ![Adding an Azure Pipeline Variable](./images/01AddingPipelineVariables.png "Adding an Azure Pipeline Variable")
106 | 
107 | > NOTE: The variable must be called *databricks.token* as it is referenced within the pipeline YAML file.
108 | > NOTE: There are additional variables that need to be defined to ease the build & deployment operation. You're free to decide whether those variables should be defined as secrets or text values.
109 | 
110 | ![Azure Pipeline Variables](./images/02AddingPipelineVariables.png)
111 | 
112 | ... or your GitHub Project.
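
Before storing the token in Azure DevOps or GitHub, you can optionally sanity-check it from any shell that has the Databricks CLI installed. This is a minimal sketch; the workspace URL and token values are placeholders for your own workspace and the token you just generated.

``` bash
# Placeholders: use your own workspace URL and personal access token
export DATABRICKS_HOST="https://<region>.azuredatabricks.net"
export DATABRICKS_TOKEN="<personal-access-token>"

pip install -U databricks-cli

# The CLI reads DATABRICKS_HOST / DATABRICKS_TOKEN from the environment,
# so listing clusters should work without an interactive "configure" step.
databricks clusters list
```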
113 | 
114 | ![Adding a GitHub Secret](./images/01AddingGitHubSecrets.png "Adding a GitHub Secret")
115 | 
116 | > NOTE: The GitHub Secret must be called *DATABRICKS_TOKEN*
117 | 
118 | ![GitHub Secrets](./images/02AddingGitHubSecrets.png)
119 | 
120 | ## Connecting the Azure ML Service Workspace
121 | 
122 | ### Step 1: Create Azure AD Service Principal
123 | 
124 | The Databricks notebooks will also be used for serving your model, by creating and leveraging an Azure Machine Learning Workspace (and other resources) for you.
125 | 
126 | The Azure Databricks service requires access rights to do that; therefore you need to create a Service Principal in your Azure Active Directory.
127 | 
128 | You can do that directly in the [Cloud Shell](https://docs.microsoft.com/en-us/azure/cloud-shell/overview) of the Azure Portal, by using one of these two commands:
129 | 
130 | ``` bash
131 | az ad sp create-for-rbac -n "http://MLOps-Databricks"
132 | ```
133 | 
134 | > Least Privilege Principle: If you want to narrow that down to a specific Resource Group and Azure Role, use the following command
135 | 
136 | ``` bash
137 | az ad sp create-for-rbac -n "http://MLOps-Databricks" --role contributor --scopes /subscriptions/{SubID}/resourceGroups/{ResourceGroup1}
138 | ```
139 | 
140 | > Make a note of the result of this command, as you will need it in a later step.
141 | 
142 | ### Step 2: Install / Update Databricks CLI
143 | 
144 | Azure Databricks has its own place to store secrets.
145 | 
146 | At the time of creating this example, this store can only be accessed via the Databricks command-line interface (CLI).
147 | 
148 | Although not required, you can install this CLI on your local machine or in the Azure Cloud Shell.
149 | 
150 | ``` bash
151 | pip install -U databricks-cli
152 | ```
153 | 
154 | > NOTE: You need Python 2.7.9 or later / 3.6 or later to install and use the Databricks command-line interface (CLI)
155 | 
156 | ### Step 3 (optional): Store Databricks Secrets
157 | 
158 | Using the Databricks CLI, you can now create your own section (scope) for your secrets...
159 | 
160 | ``` bash
161 | databricks secrets create-scope --scope azureml
162 | ```
163 | 
164 | ... and add the required secrets to the scope.
165 | 
166 | ``` bash
167 | # Use the "tenant" property from the Azure AD Service Principal command output
168 | databricks secrets put --scope azureml --key tenant_id
169 | # Use the "appId" property from the Azure AD Service Principal command output
170 | databricks secrets put --scope azureml --key client_id
171 | # Use the "password" property from the Azure AD Service Principal command output
172 | databricks secrets put --scope azureml --key client_secret
173 | 
174 | databricks secrets put --scope azureml --key subscription_id
175 | databricks secrets put --scope azureml --key resource_group
176 | databricks secrets put --scope azureml --key workspace_name
177 | ```
178 | > NOTE: The Azure DevOps Pipeline installs and defines these secrets automatically. Databricks secret scopes can be passed as parameters, which gives the notebooks the flexibility to use different secrets between environments.
179 | 
180 | ## OPTIONAL: Pre-Approval Checks (Azure DevOps)
181 | 
182 | To avoid high costs from the Azure Kubernetes Service, which will be created by the "Deploy To Production" stage, I recommend that you [set up a Pre-Approval Check](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals?view=azure-devops) for the drinks-quality-production environment.
183 | 
184 | This can be done in the Environments section of your Azure Pipelines.
185 | 
186 | ![Azure Pipeline Environments](./images/Environments.png)
187 | 
188 | ## Issues, Takeaways & TODOs:
189 | 
190 | - Notebooks should be configured to pull variables from Databricks Secrets.
191 | - Notebook secret values should be defined in separate secret scopes.
192 | - Secret scopes can keep the same variable names; their values are updated from the Azure DevOps pipeline using the Databricks CLI.
193 | - Manage the AzureML workspace & environments from within the Azure DevOps pipeline instead of through the Python SDK (within Databricks notebooks).
194 | - Use Databricks automated clusters (job clusters) instead of interactive clusters.
195 | - Multi-Stage pipelines are very nice, but they might become harder to maintain. Think about separating your pipelines and connecting them together.
196 | 
197 | _Disclaimer:_ This work is inspired by and based on efforts by Sascha Dittmann.
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 | 
6 | trigger:
7 | - master
8 | 
9 | variables:
10 | # databricks.host: https://ukwest.azuredatabricks.net/?o=79284026118
11 | # databricks.notebook.path: /MLOpsDemo
12 | databricks.cluster.name: mlopscluster-devops
13 | databricks.cluster.id:
14 | databricks.cluster.spark_version: 6.4.x-cpu-ml-scala2.11
15 | databricks.cluster.node_type_id: Standard_DS3_v2
16 | databricks.cluster.driver_node_type_id: Standard_DS3_v2
17 | databricks.cluster.autotermination_minutes: 120
18 | databricks.cluster.workers.min: 1
19 | databricks.cluster.workers.max: 3
20 | databricks.job.train.name: Drinks Quality (Train)
21 | databricks.job.train.id:
22 | azureml.sdk: azureml-sdk[databricks]
23 | databricks.job.buildimage.name: Drinks Quality (Build Container Image)
24 | databricks.job.buildimage.id:
25 | azureml.image.id:
26 | databricks.job.deploytoaci.name: Drinks Quality (Deploy To ACI)
27 | databricks.job.deploytoaci.id:
28 | databricks.job.deploytoaks.name: Drinks Quality (Deploy To AKS)
29 | databricks.job.deploytoaks.id:
30 | 
31 | stages:
32 | - stage: Build
33 | displayName: 'Train, Evaluate & Register Model'
34 | jobs:
35 | - job: Train
36 | displayName: 'Train, Evaluate & Register Model'
37 | pool:
38 | vmImage: 'ubuntu-latest'
39 | steps:
40 | - task: UsePythonVersion@0
41 | displayName: 'Use Python 3.6'
42 | inputs:
43 | versionSpec: '3.6'
44 | addToPath: true
45 | architecture: 'x64'
46 | - task: Bash@3
47 | displayName: 'Install Databricks CLI'
48 | inputs:
49 | targetType: 'inline'
50 | script: 'pip install -U databricks-cli'
51 | - task: Bash@3
52 | displayName: 'Configure Databricks CLI'
53 | inputs:
54 | targetType: 'inline'
55 | script: |
56 | # We need to pipe the conf into databricks configure --token since
57 | # that command only takes inputs from stdin.
58 | conf=`cat << EOM 59 | $(databricks.host) 60 | $(databricks.token) 61 | EOM` 62 | 63 | # For password auth there are three lines expected 64 | # hostname, username, password 65 | echo "$conf" | databricks configure --token 66 | - task: Bash@3 67 | displayName: 'Configure Databricks Secrets' 68 | inputs: 69 | targetType: 'inline' 70 | script: | 71 | databricks secrets create-scope --scope azureml 72 | 73 | # Use the "tenant" property from the Azure AD Service Principal command output 74 | databricks secrets put --scope azureml --key tenant_id --string-value "$(tenant_id)" 75 | 76 | # Use the "appId" property from the Azure AD Service Principal command output 77 | databricks secrets put --scope azureml --key client_id --string-value "$(client_id)" 78 | # Use the "password" property from the Azure AD Service Principal command output 79 | databricks secrets put --scope azureml --key client_secret --string-value "$(client_secret)" 80 | databricks secrets put --scope azureml --key subscription_id --string-value "$(subscription_id)" 81 | databricks secrets put --scope azureml --key resource_group --string-value "$(resource_group)" 82 | databricks secrets put --scope azureml --key workspace_name --string-value "$(workspace_name)" 83 | - task: Bash@3 84 | displayName: 'Create Notebook Path' 85 | inputs: 86 | targetType: 'inline' 87 | script: 'databricks workspace mkdirs "$(databricks.notebook.path)"' 88 | - task: Bash@3 89 | displayName: 'Import Notebooks' 90 | inputs: 91 | targetType: 'inline' 92 | script: 'databricks workspace import_dir --overwrite "$(devops.repo.notebook.path)" "$(databricks.notebook.path)"' 93 | - task: Bash@3 94 | displayName: 'Create / Get Cluster' 95 | inputs: 96 | targetType: 'inline' 97 | script: | 98 | cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}') 99 | 100 | if [ -z "$cluster_id" ] 101 | then 102 | JSON=`cat << EOM 103 | { 104 | "cluster_name": "$(databricks.cluster.name)", 105 | "spark_version": "$(databricks.cluster.spark_version)", 106 | "spark_conf": { 107 | "spark.databricks.delta.preview.enabled": "true" 108 | }, 109 | "node_type_id": "$(databricks.cluster.node_type_id)", 110 | "driver_node_type_id": "$(databricks.cluster.driver_node_type_id)", 111 | "spark_env_vars": { 112 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 113 | }, 114 | "autotermination_minutes": $(databricks.cluster.autotermination_minutes), 115 | "enable_elastic_disk": true, 116 | "autoscale": { 117 | "min_workers": $(databricks.cluster.workers.min), 118 | "max_workers": $(databricks.cluster.workers.max) 119 | }, 120 | "init_scripts_safe_mode": false 121 | } 122 | EOM` 123 | 124 | cluster_id=$(databricks clusters create --json "$JSON" | jq -r ".cluster_id") 125 | sleep 10 126 | fi 127 | 128 | echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id" 129 | - task: Bash@3 130 | displayName: 'Start Cluster' 131 | inputs: 132 | targetType: 'inline' 133 | script: | 134 | echo "Checking Cluster State (Cluster ID: $(databricks.cluster.id))..." 135 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 136 | echo "Cluster State: $cluster_state" 137 | 138 | if [ $cluster_state == "TERMINATED" ] 139 | then 140 | echo "Starting Databricks Cluster..." 
141 | databricks clusters start --cluster-id "$(databricks.cluster.id)" 142 | sleep 30 143 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 144 | echo "Cluster State: $cluster_state" 145 | fi 146 | 147 | while [ $cluster_state == "PENDING" ] 148 | do 149 | sleep 30 150 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 151 | echo "Cluster State: $cluster_state" 152 | done 153 | 154 | if [ $cluster_state == "RUNNING" ] 155 | then 156 | exit 0 157 | else 158 | exit 1 159 | fi 160 | - task: Bash@3 161 | displayName: 'Install Azure ML SDK' 162 | inputs: 163 | targetType: 'inline' 164 | script: | 165 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 166 | if [ -z "$library_status" ] 167 | then 168 | echo "Installing $(azureml.sdk) library to $(databricks.cluster.id)..." 169 | databricks libraries install --cluster-id "$(databricks.cluster.id)" --pypi-package "$(azureml.sdk)" 170 | sleep 10 171 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 172 | echo "Library Status: $library_status" 173 | fi 174 | 175 | while [ $library_status == "PENDING" -o $library_status == "INSTALLING" ] 176 | do 177 | sleep 30 178 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 179 | echo "Library Status: $library_status" 180 | done 181 | 182 | if [ $library_status == "INSTALLED" ] 183 | then 184 | exit 0 185 | else 186 | exit 1 187 | fi 188 | - task: Bash@3 189 | displayName: 'Create / Get Training Job' 190 | inputs: 191 | targetType: 'inline' 192 | script: | 193 | job_id=$(databricks jobs list | grep "$(databricks.job.train.name)" | awk '{print $1}') 194 | 195 | if [ -z "$job_id" ] 196 | then 197 | echo "Creating $(databricks.job.train.name) job..." 198 | JSON=`cat << EOM 199 | { 200 | "notebook_task": { 201 | "notebook_path": "$(databricks.notebook.path)/train", 202 | "base_parameters": { 203 | "alpha": "0.5", 204 | "l1_ratio": "0.5" 205 | } 206 | }, 207 | "existing_cluster_id": "$(databricks.cluster.id)", 208 | "name": "$(databricks.job.train.name)", 209 | "max_concurrent_runs": 30, 210 | "timeout_seconds": 86400, 211 | "libraries": [], 212 | "email_notifications": {} 213 | } 214 | EOM` 215 | 216 | job_id=$(databricks jobs create --json "$JSON" | jq ".job_id") 217 | fi 218 | 219 | echo "##vso[task.setvariable variable=databricks.job.train.id;]$job_id" 220 | - task: Bash@3 221 | displayName: 'Run Training Jobs' 222 | inputs: 223 | targetType: 'inline' 224 | script: | 225 | echo "Running job with ID $(databricks.job.train.id) with alpha=0.5, l1_ratio=0.5..." 
226 | run_id1=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.5", "l1_ratio": "0.5" }' | jq ".run_id") 227 | echo " Run ID: $run_id1" 228 | 229 | run_state=$(databricks runs get --run-id $run_id1 | jq -r ".state.life_cycle_state") 230 | echo "Run State (ID $run_id1): $run_state" 231 | while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 232 | do 233 | sleep 30 234 | run_state=$(databricks runs get --run-id $run_id1 | jq -r ".state.life_cycle_state") 235 | echo "Run State (ID $run_id1): $run_state" 236 | done 237 | result_state1=$(databricks runs get --run-id $run_id1 | jq -r ".state.result_state") 238 | state_message1=$(databricks runs get --run-id $run_id1 | jq -r ".state.state_message") 239 | echo "Result State (ID $run_id1): $result_state1, Message: $state_message1" 240 | 241 | # echo "Running job with ID $(databricks.job.train.id) with alpha=0.3, l1_ratio=0.3..." 242 | # run_id2=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.3", "l1_ratio": "0.3" }' | jq ".run_id") 243 | # echo " Run ID: $run_id2" 244 | 245 | # echo "Running job with ID $(databricks.job.train.id) with alpha=0.1, l1_ratio=0.1..." 246 | # run_id3=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.1", "l1_ratio": "0.1" }' | jq ".run_id") 247 | # echo " Run ID: $run_id3" 248 | 249 | # run_state=$(databricks runs get --run-id $run_id2 | jq -r ".state.life_cycle_state") 250 | # echo "Run State (ID $run_id2): $run_state" 251 | # while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 252 | # do 253 | # sleep 30 254 | # run_state=$(databricks runs get --run-id $run_id2 | jq -r ".state.life_cycle_state") 255 | # echo "Run State (ID $run_id2): $run_state" 256 | # done 257 | # result_state2=$(databricks runs get --run-id $run_id2 | jq -r ".state.result_state") 258 | # state_message2=$(databricks runs get --run-id $run_id2 | jq -r ".state.state_message") 259 | # echo "Result State (ID $run_id2): $result_state2, Message: $state_message2" 260 | 261 | # run_state=$(databricks runs get --run-id $run_id3 | jq -r ".state.life_cycle_state") 262 | # echo "Run State (ID $run_id3): $run_state" 263 | # while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 264 | # do 265 | # sleep 30 266 | # run_state=$(databricks runs get --run-id $run_id3 | jq -r ".state.life_cycle_state") 267 | # echo "Run State (ID $run_id3): $run_state" 268 | # done 269 | # result_state3=$(databricks runs get --run-id $run_id3 | jq -r ".state.result_state") 270 | # state_message3=$(databricks runs get --run-id $run_id3 | jq -r ".state.state_message") 271 | # echo "Result State (ID $run_id3): $result_state3, Message: $state_message3" 272 | 273 | # if [ $result_state1 == "SUCCESS" -a $result_state2 == "SUCCESS" -a $result_state3 == "SUCCESS" ] 274 | if [ $result_state1 == "SUCCESS" ] 275 | then 276 | exit 0 277 | echo "Training completed successfully.." 278 | else 279 | exit 1 280 | echo "Training had some errors.." 
    - task: Bash@3
      displayName: 'Build Container Image'
      inputs:
        targetType: 'inline'
        script: |
          job_id=$(databricks jobs list | grep "$(databricks.job.buildimage.name)" | awk '{print $1}')

          if [ -z "$job_id" ]
          then
            JSON=`cat << EOM
          {
            "notebook_task": {
              "notebook_path": "$(databricks.notebook.path)/serving_build_container_image"
            },
            "existing_cluster_id": "$(databricks.cluster.id)",
            "name": "$(databricks.job.buildimage.name)",
            "max_concurrent_runs": 30,
            "timeout_seconds": 86400,
            "libraries": [],
            "email_notifications": {}
          }
          EOM`

            echo "Creating a job for Building Container Image ..."
            job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
            echo "##vso[task.setvariable variable=databricks.job.buildimage.id;]$job_id"
          fi

          echo "Creating a run for JOB ID $job_id"
          #run_id=$(databricks runs submit --json "$JSON" | jq ".run_id")
          run_id=$(databricks jobs run-now --job-id $job_id | jq ".run_id")
          echo " Run ID: $run_id"

          run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
          echo "Run State (ID $run_id): $run_state"
          while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
          do
            sleep 30
            run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
            echo "Run State (ID $run_id): $run_state"
          done
          result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
          state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
          echo "Result State (ID $run_id): $result_state, Message: '$state_message'"

          if [ "$result_state" == "SUCCESS" ]
          then
            mkdir -p metadata
            databricks runs get-output --run-id $run_id | jq -r .notebook_output.result | tee metadata/image.json
            exit 0
          else
            exit 1
          fi
    # - task: CopyFiles@2
    #   displayName: 'Copy Files to Artifact Staging Directory'
    #   inputs:
    #     SourceFolder: '$(Build.SourcesDirectory)'
    #     Contents: '**/metadata/*'
    #     TargetFolder: '$(Build.ArtifactStagingDirectory)'
    # - task: PublishBuildArtifacts@1
    #   displayName: 'Publish Artifact: drop'
    #   inputs:
    #     PathtoPublish: '$(Build.ArtifactStagingDirectory)'
    #     ArtifactName: 'drop'
    #     publishLocation: 'Container'
    # - task: PublishPipelineArtifact@1
    #   displayName: 'Publish Artifact: drop'
    #   inputs:
    #     targetPath: '$(Build.ArtifactStagingDirectory)'
    #     artifact: 'drop'
    #     publishLocation: 'pipeline'
- stage: Staging
  displayName: 'Deploy to Staging'
  dependsOn: Build
  condition: succeeded()
  jobs:
  # track deployments on the environment
  - deployment: DeployToACI
    displayName: 'Deploy to Azure Container Instance'
    pool:
      vmImage: 'ubuntu-latest'
    # creates an environment if it doesn’t exist
    environment: 'drinks-quality-staging'
    strategy:
      # default deployment strategy
      runOnce:
        deploy:
          steps:
          # - task: DownloadPipelineArtifact@2
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     artifactName: 'drop'
          #     targetPath: '$(System.ArtifactsDirectory)'
          # - task: DownloadBuildArtifacts@0
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     downloadType: 'single'
          #     artifactName: 'drop'
          #     downloadPath: '$(System.ArtifactsDirectory)'
          - task: UsePythonVersion@0
            displayName: 'Use Python 3.6'
            inputs:
              versionSpec: '3.6'
              addToPath: true
              architecture: 'x64'
          - task: Bash@3
            displayName: 'Install Databricks CLI'
            inputs:
              targetType: 'inline'
              script: 'pip install -U databricks-cli'
          - task: Bash@3
            displayName: 'Configure Databricks CLI'
            inputs:
              targetType: 'inline'
              script: |
                # We need to pipe the conf into `databricks configure --token`, since
                # that command only takes input from stdin.
                conf=`cat << EOM
                $(databricks.host)
                $(databricks.token)
                EOM`

                # Two lines are expected for token auth: host and token
                # (password auth would expect three: hostname, username, password).
                echo "$conf" | databricks configure --token
          - task: Bash@3
            displayName: 'Get Cluster ID'
            inputs:
              targetType: 'inline'
              script: |
                cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}')
                if [ -z "$cluster_id" ]
                then
                  echo "ERROR: Unable to get Cluster ID"
                  exit 1
                fi
                echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id"
          - task: Bash@3
            displayName: 'Create / Get Deploy ACI Job'
            inputs:
              targetType: 'inline'
              script: |
                echo "Check if a job already exists"
                job_id=$(databricks jobs list | grep "$(databricks.job.deploytoaci.name)" | awk '{print $1}')
                if [ -z "$job_id" ]
                then
                  JSON=`cat << EOM
                {
                  "notebook_task": {
                    "notebook_path": "$(databricks.notebook.path)/serving_deploy_to_aci"
                  },
                  "existing_cluster_id": "$(databricks.cluster.id)",
                  "name": "$(databricks.job.deploytoaci.name)",
                  "max_concurrent_runs": 30,
                  "timeout_seconds": 86400,
                  "libraries": [],
                  "email_notifications": {}
                }
                EOM`

                  job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
                fi

                echo "##vso[task.setvariable variable=databricks.job.deploytoaci.id;]$job_id"
          # - task: Bash@3
          #   displayName: 'Get Image ID'
          #   inputs:
          #     targetType: 'inline'
          #     script: |
          #       echo "Retrieving Image ID..."
          #       model_image_id=$(cat image.json | jq -r ".model_image_id")
          #       if [ -z "$model_image_id" ]
          #       then
          #         echo "ERROR: Unable to get Image ID"
          #         exit 1
          #       fi
          #       echo " Image ID: $model_image_id"
          #       echo "##vso[task.setvariable variable=azureml.image.id;]$model_image_id"
          #     workingDirectory: '$(System.ArtifactsDirectory)/drop/metadata'
          - task: Bash@3
            displayName: 'Run the Deploy To ACI job'
            inputs:
              targetType: 'inline'
              script: |
                echo "Running job with ID $(databricks.job.deploytoaci.id)"
                run_id=$(databricks jobs run-now --job-id $(databricks.job.deploytoaci.id) | jq ".run_id")
                echo " Run ID: $run_id"

                run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                echo "Run State (ID $run_id): $run_state"
                while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
                do
                  sleep 30
                  run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                  echo "Run State (ID $run_id): $run_state"
                done
                result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
                state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
                echo "Result State (ID $run_id): $result_state, Message: $state_message"

                if [ "$result_state" == "SUCCESS" ]
                then
                  exit 0
                else
                  exit 1
                fi
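# The `environment:` values above and below map to Azure DevOps Environments
# ('drinks-quality-staging' / 'drinks-quality-production'); deployment history is tracked
# against them, and approvals or checks can be configured on the production environment to
# gate the next stage.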
- stage: Production
  displayName: 'Deploy to Production'
  dependsOn: Staging
  condition: succeeded()
  jobs:
  # track deployments on the environment
  - deployment: DeployToAKS
    displayName: 'Deploy to Azure Kubernetes Service'
    pool:
      vmImage: 'ubuntu-latest'
    # creates an environment if it doesn’t exist
    environment: 'drinks-quality-production'
    strategy:
      # default deployment strategy
      runOnce:
        deploy:
          steps:
          # - task: DownloadBuildArtifacts@0
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     downloadType: 'single'
          #     artifactName: 'drop'
          #     downloadPath: '$(System.ArtifactsDirectory)'
          - task: UsePythonVersion@0
            displayName: 'Use Python 3.6'
            inputs:
              versionSpec: '3.6'
              addToPath: true
              architecture: 'x64'
          - task: Bash@3
            displayName: 'Install Databricks CLI'
            inputs:
              targetType: 'inline'
              script: 'pip install -U databricks-cli'
          - task: Bash@3
            displayName: 'Configure Databricks CLI'
            inputs:
              targetType: 'inline'
              script: |
                # We need to pipe the conf into `databricks configure --token`, since
                # that command only takes input from stdin.
                conf=`cat << EOM
                $(databricks.host)
                $(databricks.token)
                EOM`

                # Two lines are expected for token auth: host and token
                # (password auth would expect three: hostname, username, password).
                echo "$conf" | databricks configure --token
          - task: Bash@3
            displayName: 'Get Cluster ID'
            inputs:
              targetType: 'inline'
              script: |
                cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}')
                if [ -z "$cluster_id" ]
                then
                  echo "ERROR: Unable to get Cluster ID"
                  exit 1
                fi
                echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id"
          - task: Bash@3
            displayName: 'Create / Get Deploy AKS Job'
            inputs:
              targetType: 'inline'
              script: |
                job_id=$(databricks jobs list | grep "$(databricks.job.deploytoaks.name)" | awk '{print $1}')

                if [ -z "$job_id" ]
                then
                  JSON=`cat << EOM
                {
                  "notebook_task": {
                    "notebook_path": "$(databricks.notebook.path)/serving_deploy_to_aks",
                    "base_parameters": {
                      "model_image_id": ""
                    }
                  },
                  "existing_cluster_id": "$(databricks.cluster.id)",
                  "name": "$(databricks.job.deploytoaks.name)",
                  "max_concurrent_runs": 30,
                  "timeout_seconds": 86400,
                  "libraries": [],
                  "email_notifications": {}
                }
                EOM`

                  job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
                fi

                echo "##vso[task.setvariable variable=databricks.job.deploytoaks.id;]$job_id"
          # - task: Bash@3
          #   displayName: 'Get Image ID'
          #   inputs:
          #     targetType: 'inline'
          #     script: |
          #       echo "Retrieving Image ID..."
          #       model_image_id=$(cat image.json | jq -r ".model_image_id")
          #       if [ -z "$model_image_id" ]
          #       then
          #         echo "ERROR: Unable to get Image ID"
          #         exit 1
          #       fi
          #       echo " Image ID: $model_image_id"
          #       echo "##vso[task.setvariable variable=azureml.image.id;]$model_image_id"
          #     workingDirectory: '$(System.ArtifactsDirectory)/drop/metadata'
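          # The next task passes $(azureml.image.id) to the serving_deploy_to_aks notebook as
          # model_image_id. With the artifact-based 'Get Image ID' task above commented out,
          # azureml.image.id is expected to be supplied as a pipeline variable instead.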
          - task: Bash@3
            displayName: 'Deploy To AKS'
            inputs:
              targetType: 'inline'
              script: |
                echo "Running job with ID $(databricks.job.deploytoaks.id) with model_id=$(azureml.image.id)..."
                run_id=$(databricks jobs run-now --job-id $(databricks.job.deploytoaks.id) --notebook-params '{ "model_image_id": "$(azureml.image.id)" }' | jq ".run_id")
                echo " Run ID: $run_id"

                run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                echo "Run State (ID $run_id): $run_state"
                while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
                do
                  sleep 30
                  run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                  echo "Run State (ID $run_id): $run_state"
                done
                result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
                state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
                echo "Result State (ID $run_id): $result_state, Message: $state_message"

                if [ "$result_state" == "SUCCESS" ]
                then
                  exit 0
                else
                  exit 1
                fi
--------------------------------------------------------------------------------