├── images ├── mlops-full.png ├── Environments.png ├── mlops-simple.png ├── mlflow-creation.png ├── mlops-databricks.png ├── mlops-overview.png ├── mlflow-experiment.png ├── 01AddingGitHubSecrets.png ├── 02AddingGitHubSecrets.png ├── databricks-git-devops.png ├── multi-stage-pipeline.png ├── 01AddingPipelineVariables.png └── 02AddingPipelineVariables.png ├── notebooks └── MLOpsDemo │ ├── images │ ├── Environments.png │ ├── mlflow-creation.png │ ├── mlflow-experiment.png │ ├── 01AddingGitHubSecrets.png │ ├── 02AddingGitHubSecrets.png │ ├── databricks-git-devops.png │ ├── 01AddingPipelineVariables.png │ └── 02AddingPipelineVariables.png │ ├── train.py │ ├── inference.py │ ├── serving_build_container_image.py │ ├── serving_deploy_to_aci.py │ └── serving_deploy_to_aks.py ├── LICENSE ├── README.md └── azure-pipelines.yml /images/mlops-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-full.png -------------------------------------------------------------------------------- /images/Environments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/Environments.png -------------------------------------------------------------------------------- /images/mlops-simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-simple.png -------------------------------------------------------------------------------- /images/mlflow-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlflow-creation.png -------------------------------------------------------------------------------- /images/mlops-databricks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-databricks.png -------------------------------------------------------------------------------- /images/mlops-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlops-overview.png -------------------------------------------------------------------------------- /images/mlflow-experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/mlflow-experiment.png -------------------------------------------------------------------------------- /images/01AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/01AddingGitHubSecrets.png -------------------------------------------------------------------------------- /images/02AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/02AddingGitHubSecrets.png -------------------------------------------------------------------------------- /images/databricks-git-devops.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/databricks-git-devops.png -------------------------------------------------------------------------------- /images/multi-stage-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/multi-stage-pipeline.png -------------------------------------------------------------------------------- /images/01AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/01AddingPipelineVariables.png -------------------------------------------------------------------------------- /images/02AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/images/02AddingPipelineVariables.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/Environments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/Environments.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/mlflow-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/mlflow-creation.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/mlflow-experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/mlflow-experiment.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/01AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/01AddingGitHubSecrets.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/02AddingGitHubSecrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/02AddingGitHubSecrets.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/databricks-git-devops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/databricks-git-devops.png -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/01AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/01AddingPipelineVariables.png 
-------------------------------------------------------------------------------- /notebooks/MLOpsDemo/images/02AddingPipelineVariables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedsmostafa/databricks-azureml-mlops/HEAD/notebooks/MLOpsDemo/images/02AddingPipelineVariables.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ahmed Mostafa; forked from Sascha Dittmann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/train.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #import numpy as np 3 | #dbutils.widgets.removeAll() 4 | #dbutils.widgets.dropdown("alpha", "0.5", [str(x)[:3] for x in np.arange(0,1.1,0.1)]) 5 | #dbutils.widgets.dropdown("l1_ratio", "0.5", [str(x)[:3] for x in np.arange(0,1.1,0.1)]) 6 | 7 | # COMMAND ---------- 8 | 9 | # MAGIC %md 10 | # MAGIC # Training the Model 11 | # MAGIC First, train a linear regression model that takes two hyperparameters: *alpha* and *l1_ratio*. 12 | # MAGIC 13 | # MAGIC > The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality 14 | # MAGIC > P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 15 | # MAGIC > Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. 
16 | 17 | # COMMAND ---------- 18 | 19 | import os 20 | import warnings 21 | import sys 22 | 23 | import pandas as pd 24 | import numpy as np 25 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 26 | from sklearn.model_selection import train_test_split 27 | from sklearn.linear_model import ElasticNet 28 | 29 | import logging 30 | logging.basicConfig(level=logging.WARN) 31 | logger = logging.getLogger(__name__) 32 | 33 | # COMMAND ---------- 34 | 35 | import mlflow 36 | import mlflow.sklearn 37 | 38 | mlflow.set_experiment("/MLOpsDemo/DrinksQuality") 39 | 40 | # COMMAND ---------- 41 | 42 | def eval_metrics(actual, pred): 43 | rmse = np.sqrt(mean_squared_error(actual, pred)) 44 | mae = mean_absolute_error(actual, pred) 45 | r2 = r2_score(actual, pred) 46 | return rmse, mae, r2 47 | 48 | # COMMAND ---------- 49 | 50 | try: 51 | alpha = float(dbutils.widgets.getArgument("alpha")) 52 | except: 53 | alpha = 0.5 54 | try: 55 | l1_ratio = float(dbutils.widgets.getArgument("l1_ratio")) 56 | except: 57 | l1_ratio = 0.5 58 | 59 | # COMMAND ---------- 60 | 61 | warnings.filterwarnings("ignore") 62 | np.random.seed(40) 63 | 64 | # Read the wine-quality csv file from the URL 65 | csv_url =\ 66 | 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 67 | try: 68 | data = pd.read_csv(csv_url, sep=';') 69 | except Exception as e: 70 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 71 | 72 | # Split the data into training and test sets. (0.75, 0.25) split. 73 | train, test = train_test_split(data) 74 | 75 | # The predicted column is "quality" which is a scalar from [3, 9] 76 | train_x = train.drop(["quality"], axis=1) 77 | test_x = test.drop(["quality"], axis=1) 78 | train_y = train[["quality"]] 79 | test_y = test[["quality"]] 80 | 81 | # COMMAND ---------- 82 | 83 | with mlflow.start_run(): 84 | lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) 85 | lr.fit(train_x, train_y) 86 | 87 | predicted_qualities = lr.predict(test_x) 88 | 89 | (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) 90 | 91 | print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) 92 | print(" RMSE: %s" % rmse) 93 | print(" MAE: %s" % mae) 94 | print(" R2: %s" % r2) 95 | 96 | mlflow.log_param("alpha", alpha) 97 | mlflow.log_param("l1_ratio", l1_ratio) 98 | mlflow.log_metric("rmse", rmse) 99 | mlflow.log_metric("r2", r2) 100 | mlflow.log_metric("mae", mae) 101 | 102 | mlflow.sklearn.log_model(lr, "model") 103 | 104 | # COMMAND ---------- 105 | 106 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/inference.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Inference 3 | 4 | # COMMAND ---------- 5 | 6 | from mlflow.tracking.client import MlflowClient 7 | from mlflow.entities import ViewType 8 | 9 | experiment_name = "/MLOpsDemo/DrinksQuality" 10 | experiment = MlflowClient().get_experiment_by_name(experiment_name) 11 | experiment_ids = eval('[' + experiment.experiment_id + ']') 12 | # all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()] 13 | print("Experiment IDs:", experiment_ids) 14 | 15 | query = "metrics.rmse < 0.8" 16 | runs = MlflowClient().search_runs(experiment_ids, query, ViewType.ALL) 17 | 18 | rmse_low = None 19 | run_id = None 20 | for run in runs: 21 | if (rmse_low == None or 
run.data.metrics['rmse'] < rmse_low): 22 | rmse_low = run.data.metrics['rmse'] 23 | run_id = run.info.run_id 24 | print("Lowest RMSE:", rmse_low) 25 | print("Run ID:", run_id) 26 | 27 | model_uri = "runs:/" + run_id + "/model" 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md ## Load MLflow Model as a scikit-learn Model 32 | # MAGIC You can use the MLflow API to load the model from the MLflow server that was produced by a given run. 33 | # MAGIC 34 | # MAGIC Once you load it, it is a just a scikit-learn model and you can explore it or use it. 35 | 36 | # COMMAND ---------- 37 | 38 | import mlflow.sklearn 39 | model = mlflow.sklearn.load_model(model_uri=model_uri) 40 | model.coef_ 41 | 42 | # COMMAND ---------- 43 | 44 | import numpy as np 45 | import pandas as pd 46 | 47 | cols = ['alcohol', 'chlorides', 'citric acid', 'density', 'fixed acidity', 'free sulfur dioxide', 'pH', 'residual sugar', 'sulphates', 'total sulfur dioxide', 'volatile acidity'] 48 | d = [12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66] 49 | d = np.array([d]) 50 | 51 | data = pd.DataFrame(d, columns=cols) 52 | display(data) 53 | 54 | # COMMAND ---------- 55 | 56 | #Get a prediction for a row of the dataset 57 | model.predict(data) 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md ## Use an MLflow Model for Batch Inference 62 | # MAGIC You can get a PySpark UDF to do some batch inference using one of the models. 63 | 64 | # COMMAND ---------- 65 | 66 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 67 | try: 68 | data = pd.read_csv(csv_url, sep=';') 69 | except Exception as e: 70 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 71 | 72 | # Create a Spark DataFrame from the original pandas DataFrame minus the column you want to predict. 73 | # Use this to simulate what this would be like if you had a big data set e.g. click logs that was 74 | # regularly being updated that you wanted to score. 75 | dataframe = spark.createDataFrame(data.drop(["quality"], axis=1)) 76 | display(dataframe) 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %md Use the MLflow API to create a PySpark UDF from a run. See [Export a python_function model as an Apache Spark UDF](https://mlflow.org/docs/latest/models.html#export-a-python-function-model-as-an-apache-spark-udf). 81 | 82 | # COMMAND ---------- 83 | 84 | import mlflow.pyfunc 85 | pyfunc_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri) 86 | 87 | # COMMAND ---------- 88 | 89 | # MAGIC %md Add a column to the data by applying the PySpark UDF to the DataFrame. 90 | 91 | # COMMAND ---------- 92 | 93 | predicted_df = dataframe.withColumn("prediction", pyfunc_udf('alcohol', 'chlorides', 'citric acid', 'density', 'fixed acidity', 'free sulfur dioxide', 'pH', 'residual sugar', 'sulphates', 'total sulfur dioxide', 'volatile acidity')) 94 | display(predicted_df) 95 | 96 | # COMMAND ---------- 97 | 98 | -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_build_container_image.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Create or load an Azure ML Workspace 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 13 | 14 | # COMMAND ---------- 15 | 16 | import azureml 17 | from azureml.core import Workspace 18 | from azureml.core.authentication import ServicePrincipalAuthentication 19 | 20 | workspace_name = dbutils.secrets.get(scope = "azureml", key = "workspace_name") 21 | workspace_location = "westeurope" 22 | resource_group = dbutils.secrets.get(scope = "azureml", key = "resource_group") 23 | subscription_id = dbutils.secrets.get(scope = "azureml", key = "subscription_id") 24 | 25 | # COMMAND ---------- 26 | 27 | svc_pr = ServicePrincipalAuthentication( 28 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 29 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 30 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 31 | 32 | workspace = Workspace.create(name = workspace_name, 33 | location = workspace_location, 34 | resource_group = resource_group, 35 | subscription_id = subscription_id, 36 | auth=svc_pr, 37 | exist_ok=True) 38 | 39 | # COMMAND ---------- 40 | 41 | workspace 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md ## Build an Azure Container Image for model deployment 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md ### Use MLflow to build a Container Image for the trained model 50 | # MAGIC 51 | # MAGIC Use the `mlflow.azuereml.build_image` function to build an Azure Container Image for the trained MLflow model. This function also registers the MLflow model with a specified Azure ML workspace. The resulting image can be deployed to Azure Container Instances (ACI) or Azure Kubernetes Service (AKS) for real-time serving. 
52 | 53 | # COMMAND ---------- 54 | 55 | from mlflow.tracking.client import MlflowClient 56 | from mlflow.entities import ViewType 57 | 58 | experiment_name = "/MLOpsDemo/DrinksQuality" 59 | experiment = MlflowClient().get_experiment_by_name(experiment_name) 60 | experiment_ids = eval('[' + experiment.experiment_id + ']') 61 | # all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()] 62 | print("Experiment IDs:", experiment_ids) 63 | 64 | query = "metrics.rmse < 0.8" 65 | runs = MlflowClient().search_runs(experiment_ids, query, ViewType.ALL) 66 | 67 | rmse_low = None 68 | run_id = None 69 | for run in runs: 70 | if (rmse_low == None or run.data.metrics['rmse'] < rmse_low): 71 | rmse_low = run.data.metrics['rmse'] 72 | run_id = run.info.run_id 73 | print("Lowest RMSE:", rmse_low) 74 | print("Run ID:", run_id) 75 | 76 | model_uri = "runs:/" + run_id + "/model" 77 | 78 | # COMMAND ---------- 79 | 80 | import mlflow.azureml 81 | 82 | model_image, azure_model = mlflow.azureml.build_image(model_uri=model_uri, 83 | workspace=workspace, 84 | model_name="drinksquality", 85 | image_name="drinksquality", 86 | description="Sklearn ElasticNet image for predicting wine quality", 87 | synchronous=False) 88 | 89 | model_image.wait_for_creation(show_output=True) 90 | 91 | # COMMAND ---------- 92 | 93 | dbutils.notebook.exit('{"model_image_id": "%s"}' % model_image.id) -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_deploy_to_aci.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 5 | 6 | # COMMAND ---------- 7 | 8 | #dbutils.widgets.removeAll() 9 | #dbutils.widgets.text("model_image_id", "") 10 | 11 | # COMMAND ---------- 12 | 13 | # model_image_id = dbutils.widgets.getArgument("model_image_id") 14 | # print("Model Image ID:", model_image_id) 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md ### Create or load an Azure ML Workspace 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 
23 | 24 | # COMMAND ---------- 25 | 26 | import azureml 27 | from azureml.core import Workspace 28 | from azureml.core.authentication import ServicePrincipalAuthentication 29 | 30 | workspace_name = dbutils.secrets.get(scope = "azureml", key = "workspace_name") 31 | workspace_location = "westeurope" 32 | resource_group = dbutils.secrets.get(scope = "azureml", key = "resource_group") 33 | subscription_id = dbutils.secrets.get(scope = "azureml", key = "subscription_id") 34 | 35 | svc_pr = ServicePrincipalAuthentication( 36 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 37 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 38 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 39 | 40 | workspace = Workspace.create(name = workspace_name, 41 | location = workspace_location, 42 | resource_group = resource_group, 43 | subscription_id = subscription_id, 44 | auth=svc_pr, 45 | exist_ok=True) 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md 50 | # MAGIC Get the latest model image from the workspace 51 | 52 | # COMMAND ---------- 53 | 54 | model_image_id = workspace.images['drinksquality'].id 55 | print("Model Image ID:", model_image_id) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md ## Deploy the model to "dev" using [Azure Container Instances (ACI)](https://docs.microsoft.com/en-us/azure/container-instances/) 60 | # MAGIC 61 | # MAGIC The [ACI platform](https://docs.microsoft.com/en-us/azure/container-instances/) is the recommended environment for staging and developmental model deployments. 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md ### Create an ACI webservice deployment using the model's Container Image 66 | # MAGIC 67 | # MAGIC Using the Azure ML SDK, deploy the Container Image for the trained MLflow model to ACI. 68 | 69 | # COMMAND ---------- 70 | 71 | from azureml.core.webservice import AciWebservice, Webservice 72 | from azureml.core.image import Image 73 | 74 | model_image = Image(workspace, id=model_image_id) 75 | 76 | dev_webservice_name = "drinks-quality-aci" 77 | dev_webservice_deployment_config = AciWebservice.deploy_configuration() 78 | dev_webservice = Webservice.deploy_from_image(name=dev_webservice_name, image=model_image, deployment_config=dev_webservice_deployment_config, workspace=workspace, deployment_target=None, overwrite=True) 79 | 80 | # COMMAND ---------- 81 | 82 | while dev_webservice.state != "Healthy": 83 | dev_webservice.update_deployment_state() 84 | 85 | # COMMAND ---------- 86 | 87 | # dev_webservice.wait_for_deployment() 88 | 89 | # COMMAND ---------- 90 | 91 | # MAGIC %md ## Query the deployed model in "dev" 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md ### Load dataset 96 | 97 | # COMMAND ---------- 98 | 99 | import numpy as np 100 | import pandas as pd 101 | 102 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 103 | try: 104 | data = pd.read_csv(csv_url, sep=';') 105 | except Exception as e: 106 | logger.exception("Unable to download training & test CSV, check your internet connection. 
Error: %s", e) 107 | 108 | data = data.drop(["quality"], axis=1)[:10] 109 | 110 | # COMMAND ---------- 111 | 112 | # MAGIC %md ## Create sample input vector 113 | 114 | # COMMAND ---------- 115 | 116 | query_input = data.to_json(orient='split') 117 | query_input = eval(query_input) 118 | query_input.pop('index', None) 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md #### Evaluate the sample input vector by sending an HTTP request 123 | # MAGIC Query the ACI webservice's scoring endpoint by sending an HTTP POST request that contains the input vector. 124 | 125 | # COMMAND ---------- 126 | 127 | import requests 128 | import json 129 | 130 | def query_endpoint_example(scoring_uri, inputs, service_key=None): 131 | headers = { 132 | "Content-Type": "application/json", 133 | } 134 | if service_key is not None: 135 | headers["Authorization"] = "Bearer {service_key}".format(service_key=service_key) 136 | 137 | print("Sending batch prediction request with inputs: {}".format(inputs)) 138 | response = requests.post(scoring_uri, data=json.dumps(inputs), headers=headers) 139 | preds = json.loads(response.text) 140 | print("Received response: {}".format(preds)) 141 | return preds 142 | 143 | # COMMAND ---------- 144 | 145 | print("Webservice URL:", dev_webservice.scoring_uri) 146 | 147 | # COMMAND ---------- 148 | 149 | dev_prediction = query_endpoint_example(scoring_uri=dev_webservice.scoring_uri, inputs=query_input) -------------------------------------------------------------------------------- /notebooks/MLOpsDemo/serving_deploy_to_aks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md ## Serving Models with Microsoft Azure ML 3 | # MAGIC 4 | # MAGIC ##### NOTE: I do not recommend using *Run All* because it takes several minutes to deploy and update models; models cannot be queried until they are active. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Create or load an Azure ML Workspace 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md Before models can be deployed to Azure ML, you must create or obtain an Azure ML Workspace. The `azureml.core.Workspace.create()` function will load a workspace of a specified name or create one if it does not already exist. For more information about creating an Azure ML Workspace, see the [Azure ML Workspace management documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace). 
13 | 14 | # COMMAND ---------- 15 | 16 | import azureml 17 | from azureml.core import Workspace 18 | from azureml.core.authentication import ServicePrincipalAuthentication 19 | 20 | workspace_name = "azuredevopsml" 21 | workspace_location = "westeurope" 22 | resource_group = "azuredevopsrg" 23 | subscription_id = "46be3785-50c9-401f-b7e5-1e72664f6e93" 24 | 25 | svc_pr = ServicePrincipalAuthentication( 26 | tenant_id = dbutils.secrets.get(scope = "azureml", key = "tenant_id"), 27 | service_principal_id = dbutils.secrets.get(scope = "azureml", key = "client_id"), 28 | service_principal_password = dbutils.secrets.get(scope = "azureml", key = "client_secret")) 29 | 30 | aksml_workspace = Workspace.create(name = workspace_name, 31 | location = workspace_location, 32 | resource_group = resource_group, 33 | subscription_id = subscription_id, 34 | auth=svc_pr, 35 | exist_ok=True) 36 | 37 | # COMMAND ---------- 38 | 39 | # MAGIC %md ## Deploy the model to production using [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/). 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md ### Create a new AKS cluster 44 | # MAGIC 45 | # MAGIC If you do not have an active AKS cluster for model deployment, create one using the Azure ML SDK. 46 | 47 | # COMMAND ---------- 48 | 49 | model_image_id = aksml_workspace.images['drinksquality'].id 50 | print("Model Image ID:", model_image_id) 51 | 52 | # COMMAND ---------- 53 | 54 | from azureml.core.compute import AksCompute, ComputeTarget 55 | from azureml.core.compute_target import ComputeTargetException 56 | aks_name = 'drinksqualityaks' 57 | 58 | # COMMAND ---------- 59 | 60 | # from azureml.core.webservice import AksWebservice 61 | # deployment_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1) 62 | 63 | prov_config = AksCompute.provisioning_configuration() 64 | # prov_config = AksCompute.provisioning_configuration(agent_count=3, vm_size="Standard_B4ms") 65 | 66 | 67 | # COMMAND ---------- 68 | 69 | print(aksml_workspace.compute_targets) 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | computes = ComputeTarget.list(aksml_workspace) 75 | aks_exists = False 76 | for target in computes: 77 | print(target) 78 | print(target.type) 79 | print(target.get_status()) 80 | if target.type == "AKS": 81 | if target.get_status() == "Failed": 82 | # delete and recreate the target 83 | target.delete() 84 | # prov_config = AksCompute.provisioning_configuration(agent_count=3, vm_size="Standard_B4ms") 85 | aks_target = ComputeTarget.create(aksml_workspace, name = aks_name, provisioning_configuration = prov_config) 86 | if target.get_status() == "Succeeded": 87 | # attach to existing AKS 88 | aks_exists = True 89 | 90 | print(aks_exists) 91 | 92 | # COMMAND ---------- 93 | 94 | if aks_exists: 95 | aks_target = aksml_workspace.compute_targets['drinksqualityaks'] 96 | else: 97 | aks_target = ComputeTarget.create(aksml_workspace, name = aks_name, provisioning_configuration = prov_config) 98 | 99 | # COMMAND ---------- 100 | 101 | type(aks_target) 102 | 103 | # COMMAND ---------- 104 | 105 | aks_target.wait_for_completion(show_output=True) 106 | 107 | # COMMAND ---------- 108 | 109 | # MAGIC %md ### Deploy the model's image to the specified AKS cluster 110 | 111 | # COMMAND ---------- 112 | 113 | from azureml.core.webservice import Webservice, AksWebservice 114 | from azureml.core.image import Image 115 | 116 | # Get Model 117 | model_image = Image(aksml_workspace, id=model_image_id) 118 | # Get Webservice 119 | 
prod_webservice_name = "drinks-quality-aks" 120 | 121 | # COMMAND ---------- 122 | 123 | from azureml.core.webservice import Webservice, AksWebservice 124 | from azureml.core.image import Image 125 | 126 | # Get Model 127 | model_image = Image(aksml_workspace, id=model_image_id) 128 | 129 | # Get Webservice 130 | prod_webservice_name = "drinks-quality-aks" 131 | try: 132 | prod_webservice = Webservice(aksml_workspace, prod_webservice_name) 133 | print('updating existing webservice.') 134 | prod_webservice.update(image=model_image) 135 | prod_webservice.wait_for_deployment(show_output = True) 136 | except: 137 | print('creating new webservice.') 138 | # Set configuration and service name 139 | prod_webservice_deployment_config = AksWebservice.deploy_configuration() 140 | # Deploy from image 141 | prod_webservice = Webservice.deploy_from_image(workspace = aksml_workspace, 142 | name = prod_webservice_name, 143 | image = model_image, 144 | deployment_config = prod_webservice_deployment_config, 145 | deployment_target = aks_target) 146 | # Wait for the deployment to complete 147 | prod_webservice.wait_for_deployment(show_output = True) 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md ## Query the deployed model in production 152 | 153 | # COMMAND ---------- 154 | 155 | # MAGIC %md ### Load dataset 156 | 157 | # COMMAND ---------- 158 | 159 | import numpy as np 160 | import pandas as pd 161 | 162 | csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' 163 | try: 164 | data = pd.read_csv(csv_url, sep=';') 165 | except Exception as e: 166 | logger.exception("Unable to download training & test CSV, check your internet connection. Error: %s", e) 167 | 168 | data = data.drop(["quality"], axis=1)[:10] 169 | 170 | # COMMAND ---------- 171 | 172 | # MAGIC %md ### Create sample input vector 173 | 174 | # COMMAND ---------- 175 | 176 | query_input = data.to_json(orient='split') 177 | query_input = eval(query_input) 178 | query_input.pop('index', None) 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md #### Evaluate the sample input vector by sending an HTTP request 183 | # MAGIC Query the AKS webservice's scoring endpoint by sending an HTTP POST request that includes the input vector. The production AKS deployment may require an authorization token (service key) for queries. Include this key in the HTTP request header. 
184 | 
185 | # COMMAND ----------
186 | 
187 | import requests
188 | import json
189 | 
190 | def query_endpoint_example(scoring_uri, inputs, service_key=None):
191 | headers = {
192 | "Content-Type": "application/json",
193 | }
194 | if service_key is not None:
195 | headers["Authorization"] = "Bearer {service_key}".format(service_key=service_key)
196 | 
197 | print("Sending batch prediction request with inputs: {}".format(inputs))
198 | response = requests.post(scoring_uri, data=json.dumps(inputs), headers=headers)
199 | preds = json.loads(response.text)
200 | print("Received response: {}".format(preds))
201 | return preds
202 | 
203 | # COMMAND ----------
204 | 
205 | prod_scoring_uri = prod_webservice.scoring_uri
206 | prod_service_key = prod_webservice.get_keys()[0] if len(prod_webservice.get_keys()) > 0 else None
207 | print("Webservice URL:", prod_scoring_uri)
208 | 
209 | # COMMAND ----------
210 | 
211 | prod_prediction1 = query_endpoint_example(scoring_uri=prod_scoring_uri, service_key=prod_service_key, inputs=query_input)
212 | 
213 | # COMMAND ----------
214 | 
215 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://dev.azure.com/ahmosta/MLOpsDatabricks/_apis/build/status/MLOpsDatabricks?branchName=master)](https://dev.azure.com/ahmosta/MLOpsDatabricks/_build/latest?definitionId=1&branchName=master)
2 | 
3 | # Large-scale Data/MLOps with Azure & Databricks
4 | ## How to implement DataOps/MLOps using Azure DevOps, Databricks, MLFlow, and Azure ML
5 | 
6 | Operationalizing Data Analytics and Machine Learning workloads can be challenging, because the ecosystem of platforms and services used to build such workloads is large, which increases the complexity of deploying them to production. The complexity also increases with the continuous adoption of containers and container orchestration frameworks such as Kubernetes.
7 | 
8 | This repo demonstrates an approach to implementing DevOps pipelines for large-scale Data Analytics and Machine Learning (also called Data/MLOps) using a combination of [Azure Databricks](https://azure.microsoft.com/en-us/services/databricks/), [MLFlow](http://mlflow.org), and [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/).
9 | 
10 | The DevOps pipeline is implemented in [Azure DevOps](https://azure.microsoft.com/en-us/services/devops/), and it deploys the workload in containerized form, simulating staging & production environments, to [Azure Container Instances](https://azure.microsoft.com/en-us/product-categories/containers/) and [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service/).
11 | 
12 | The diagram below shows a high-level overview of a generic DevOps process: build pipelines produce the project's artifacts, followed by a testing & release process. Such a process enables faster deployment of individual modules without impacting the overall system, as well as the flexibility to deploy to one or more environments.
13 | 
14 | ![mlops-full](/images/mlops-full.png)
15 | 
16 | This tutorial fits ML as well as DataOps workloads. To simplify things, it will walk you through how to implement Data/ML Ops in the following general form:
17 | 
18 | ![mlops-simple](./images/mlops-simple.png)
19 | 
20 | 
21 | ## Why Azure Databricks?
22 | Azure Databricks offers great capabilities for developing & building analytics & ML workloads covering data ingestion, data engineering, and data science for various applications (e.g. data management, batch processing, stream processing, graph processing, and distributed machine learning).
23 | 
24 | Such capabilities are offered in a unified experience for collaboration between the different team stakeholders, supporting code written in Scala, Python, SQL, and R, and both standard and Apache Spark applications.
25 | 
26 | Additionally, the platform provides a very convenient infrastructure management layer:
27 | - Deploying workloads to highly scalable clusters is very easy.
28 | - Clusters can be configured to auto-scale to adapt to the processing workload.
29 | - Clusters can be set up to be ephemeral; they automatically terminate once a job is done or after a period of inactivity. This feature adds great value for managing costs.
30 | - Databricks supports running containers through [Databricks Container Services](https://docs.microsoft.com/en-us/azure/databricks/clusters/custom-containers).
31 | 
32 | Such execution capabilities make Azure Databricks a great fit, not only for developing & building workloads, but also for running those workloads and serving data as well.
33 | 
34 | ## Why Azure ML and MLFlow?
35 | This tutorial also demonstrates how Databricks Notebooks can leverage MLFlow and Azure ML. While the two can seem similar, where each shines depends on where it is used.
36 | 
37 | MLFlow is natively supported within Databricks, as MLFlow manages the machine learning experiment and its runs within the Databricks workspace development environment. Therefore, it conveniently offers data scientists & engineers ML management and tracking capabilities for their models without having to leave their development environment.
38 | 
39 | The integration between MLFlow and AzureML, however, provides such management across environments: AzureML is eventually used - from within the MLFlow Experiment - to build a Docker container image for the best-scoring model, publish that image to the Azure Container Registry, and afterwards deploy it to either Azure Container Instances or Azure Kubernetes Service.
40 | 
41 | ![mlops-databricks](./images/mlops-databricks.png)
42 | 
43 | ## Using This Sample Project
44 | 
45 | This repo is configured to run on Azure DevOps, therefore you need to prepare your environment with the following steps.
46 | 
47 | The DevOps pipeline is a "multi-stage" pipeline and it is defined using the YAML file [azure-pipelines.yml](./azure-pipelines.yml) for Azure DevOps.
48 | 
49 | > Note: Building GitHub workflow actions is in progress as well.
50 | 
51 | ## Required Accounts And Resources
52 | 
53 | This example uses Azure DevOps as a CI/CD toolset, as well as Microsoft Azure services to host the trained Machine Learning model.
54 | 
55 | * At the time of creating this tutorial, GitHub Actions were still in beta. If you want to try this new feature, you have to [sign up for the beta](https://github.com/features/actions) first.
56 | 
57 | ### Azure Databricks Workspace
58 | 
59 | In your Azure subscription, you need to [create an Azure Databricks workspace](https://docs.azuredatabricks.net/getting-started/try-databricks.html#step-2-create-a-databricks-workspace) to get started.
60 | 
61 | > NOTE: I recommend placing the Azure Databricks Workspace in a new Resource Group, to be able to clean everything up more easily afterwards.
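
If you prefer the command line over the portal, the snippet below is a minimal sketch of the same step using the Azure CLI. It assumes the Azure CLI `databricks` extension is installed; the resource group name, workspace name, and region are placeholders that you should replace with your own values.

``` bash
# Placeholder names and region; adjust to your needs
az group create --name mlops-databricks-rg --location westeurope

# Requires the Azure CLI "databricks" extension: az extension add --name databricks
az databricks workspace create \
  --resource-group mlops-databricks-rg \
  --name mlops-databricks-ws \
  --location westeurope \
  --sku standard
```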
62 | 
63 | ## Importing This DevOps Project
64 | 
65 | As soon as you have access to the Azure DevOps platform, you're able to [create a project](https://docs.microsoft.com/en-us/azure/devops/user-guide/sign-up-invite-teammates?view=azure-devops#create-a-project) to host your MLOps pipeline.
66 | 
67 | As soon as this is created, you can [import this GitHub repository](https://docs.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops) into your Azure DevOps project.
68 | 
69 | ### Connecting Azure Databricks Notebooks to the Azure DevOps *Repo*
70 | 
71 | It is recommended to connect your notebooks to the Azure DevOps repo. This ensures your changes & updates are pushed to the repo automatically and get built properly. The pipeline is automatically triggered by any commit/push to the repo.
72 | 
73 | To configure this, go to "User Settings" and click on "Git Integration". ![databricks and azure devops integration](./images/databricks-git-devops.png)
74 | 
75 | ### Create MLFlow Experiment
76 | The Databricks notebooks use MLFlow under the hood; you need to create the MLFlow experiment after importing the notebooks into the Databricks workspace.
77 | 
78 | ![databricks mlflow experiment](./images/mlflow-experiment.png)
79 | 
80 | Clicking on the link shown above will open a screen where you can specify the name of the experiment and its location on DBFS. For this demo, make sure the MLFlow experiment's name is DrinksQuality.
81 | 
82 | ![databricks mlflow creation](./images/mlflow-creation.png)
83 | 
84 | 
85 | ## Set up The Build Pipeline
86 | 
87 | By importing the GitHub files, you also imported the [azure-pipelines.yml](./azure-pipelines.yml) file.
88 | 
89 | This file can be used to create your first Build Pipeline.
90 | 
91 | This Build Pipeline uses a feature called "[Multi-Stage Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/stages?view=azure-devops&tabs=yaml)". This feature might not be enabled for you, so in order to use it, you should [enable this preview feature](https://docs.microsoft.com/en-us/azure/devops/project/navigation/preview-features?view=azure-devops).
92 | 
93 | ![Azure DevOps multi-stage-pipeline](./images/multi-stage-pipeline.png)
94 | 
95 | ## Connecting Azure Databricks
96 | 
97 | ### Connecting the Azure DevOps *pipeline* to Azure Databricks
98 | 
99 | To be able to run this pipeline, you also need to connect your Azure Databricks Workspace with the pipeline.
100 | 
101 | Therefore, you first need to [generate an access token on Databricks](https://docs.azuredatabricks.net/dev-tools/api/latest/authentication.html#generate-a-token).
102 | 
103 | This token must be stored as an encrypted secret in your Azure DevOps Build Pipeline...
104 | 
105 | ![Adding an Azure Pipeline Variable](./images/01AddingPipelineVariables.png "Adding an Azure Pipeline Variable")
106 | 
107 | > NOTE: The variable must be called *databricks.token* as it is referenced within the pipeline YAML file.
108 | > NOTE: There are additional variables that need to be defined to ease the build & deployment operation. You're free to decide whether those variables should be defined as secrets or text values.
109 | 
110 | ![Azure Pipeline Variables](./images/02AddingPipelineVariables.png)
111 | 
112 | ... or your GitHub Project.
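
Before storing the token in Azure DevOps or GitHub, you can optionally sanity-check it from any shell that has the Databricks CLI installed. This is a minimal sketch; the workspace URL and token values are placeholders for your own workspace and the token you just generated.

``` bash
# Placeholders: use your own workspace URL and personal access token
export DATABRICKS_HOST="https://<region>.azuredatabricks.net"
export DATABRICKS_TOKEN="<personal-access-token>"

pip install -U databricks-cli

# The CLI reads DATABRICKS_HOST / DATABRICKS_TOKEN from the environment,
# so listing clusters should work without an interactive "configure" step.
databricks clusters list
```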
113 | 
114 | ![Adding a GitHub Secret](./images/01AddingGitHubSecrets.png "Adding a GitHub Secret")
115 | 
116 | > NOTE: The GitHub Secret must be called *DATABRICKS_TOKEN*
117 | 
118 | ![GitHub Secrets](./images/02AddingGitHubSecrets.png)
119 | 
120 | ## Connecting the Azure ML Service Workspace
121 | 
122 | ### Step 1: Create Azure AD Service Principal
123 | 
124 | The Databricks notebooks will also be used for serving your model, by creating and leveraging an Azure Machine Learning Workspace (and other resources) for you.
125 | 
126 | The Azure Databricks service requires access rights to do that; therefore you need to create a Service Principal in your Azure Active Directory.
127 | 
128 | You can do that directly in the [Cloud Shell](https://docs.microsoft.com/en-us/azure/cloud-shell/overview) of the Azure Portal, by using one of these two commands:
129 | 
130 | ``` bash
131 | az ad sp create-for-rbac -n "http://MLOps-Databricks"
132 | ```
133 | 
134 | > Least Privilege Principle: If you want to narrow that down to a specific Resource Group and Azure Role, use the following command
135 | 
136 | ``` bash
137 | az ad sp create-for-rbac -n "http://MLOps-Databricks" --role contributor --scopes /subscriptions/{SubID}/resourceGroups/{ResourceGroup1}
138 | ```
139 | 
140 | > Make a note of the result of this command, as you will need it in a later step.
141 | 
142 | ### Step 2: Install / Update Databricks CLI
143 | 
144 | Azure Databricks has its own place to store secrets.
145 | 
146 | At the time of creating this example, this store can only be accessed via the Databricks command-line interface (CLI).
147 | 
148 | Although not required, you can install this CLI on your local machine or in the Azure Cloud Shell.
149 | 
150 | ``` bash
151 | pip install -U databricks-cli
152 | ```
153 | 
154 | > NOTE: You need Python 2.7.9 or later / 3.6 or later to install and use the Databricks command-line interface (CLI)
155 | 
156 | ### Step 3 (optional): Store Databricks Secrets
157 | 
158 | Using the Databricks CLI, you can now create your own section (scope) for your secrets...
159 | 
160 | ``` bash
161 | databricks secrets create-scope --scope azureml
162 | ```
163 | 
164 | ... and add the required secrets to the scope.
165 | 
166 | ``` bash
167 | # Use the "tenant" property from the Azure AD Service Principal command output
168 | databricks secrets put --scope azureml --key tenant_id
169 | # Use the "appId" property from the Azure AD Service Principal command output
170 | databricks secrets put --scope azureml --key client_id
171 | # Use the "password" property from the Azure AD Service Principal command output
172 | databricks secrets put --scope azureml --key client_secret
173 | 
174 | databricks secrets put --scope azureml --key subscription_id
175 | databricks secrets put --scope azureml --key resource_group
176 | databricks secrets put --scope azureml --key workspace_name
177 | ```
178 | > NOTE: The Azure DevOps Pipeline installs and defines these secrets automatically. Databricks secret scopes can be passed as parameters, which gives the notebooks the flexibility to use different secrets between environments.
179 | 
180 | ## OPTIONAL: Pre-Approval Checks (Azure DevOps)
181 | 
182 | To avoid high costs from the Azure Kubernetes Service, which will be created by the "Deploy To Production" stage, I recommend that you [set up a Pre-Approval Check](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals?view=azure-devops) for the drinks-quality-production environment.
183 | 
184 | This can be done in the Environments section of your Azure Pipelines.
185 | 
186 | ![Azure Pipeline Environments](./images/Environments.png)
187 | 
188 | ## Issues, Takeaways & TODOs:
189 | 
190 | - Notebooks should be configured to pull variables from Databricks Secrets.
191 | - Notebook secret values should be defined in separate secret scopes.
192 | - Secret scopes can keep the same variable names; their values are updated from the Azure DevOps pipeline using the Databricks CLI.
193 | - Manage the AzureML workspace & environments from within the Azure DevOps pipeline instead of through the Python SDK (within Databricks notebooks).
194 | - Use Databricks automated clusters (job clusters) instead of interactive clusters.
195 | - Multi-Stage pipelines are very nice, but they might become harder to maintain. Think about separating your pipelines and connecting them together.
196 | 
197 | _Disclaimer:_ This work is inspired by and based on efforts by Sascha Dittmann.
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 | 
6 | trigger:
7 | - master
8 | 
9 | variables:
10 | # databricks.host: https://ukwest.azuredatabricks.net/?o=79284026118
11 | # databricks.notebook.path: /MLOpsDemo
12 | databricks.cluster.name: mlopscluster-devops
13 | databricks.cluster.id:
14 | databricks.cluster.spark_version: 6.4.x-cpu-ml-scala2.11
15 | databricks.cluster.node_type_id: Standard_DS3_v2
16 | databricks.cluster.driver_node_type_id: Standard_DS3_v2
17 | databricks.cluster.autotermination_minutes: 120
18 | databricks.cluster.workers.min: 1
19 | databricks.cluster.workers.max: 3
20 | databricks.job.train.name: Drinks Quality (Train)
21 | databricks.job.train.id:
22 | azureml.sdk: azureml-sdk[databricks]
23 | databricks.job.buildimage.name: Drinks Quality (Build Container Image)
24 | databricks.job.buildimage.id:
25 | azureml.image.id:
26 | databricks.job.deploytoaci.name: Drinks Quality (Deploy To ACI)
27 | databricks.job.deploytoaci.id:
28 | databricks.job.deploytoaks.name: Drinks Quality (Deploy To AKS)
29 | databricks.job.deploytoaks.id:
30 | 
31 | stages:
32 | - stage: Build
33 | displayName: 'Train, Evaluate & Register Model'
34 | jobs:
35 | - job: Train
36 | displayName: 'Train, Evaluate & Register Model'
37 | pool:
38 | vmImage: 'ubuntu-latest'
39 | steps:
40 | - task: UsePythonVersion@0
41 | displayName: 'Use Python 3.6'
42 | inputs:
43 | versionSpec: '3.6'
44 | addToPath: true
45 | architecture: 'x64'
46 | - task: Bash@3
47 | displayName: 'Install Databricks CLI'
48 | inputs:
49 | targetType: 'inline'
50 | script: 'pip install -U databricks-cli'
51 | - task: Bash@3
52 | displayName: 'Configure Databricks CLI'
53 | inputs:
54 | targetType: 'inline'
55 | script: |
56 | # We need to pipe the conf into databricks configure --token since
57 | # that command only takes inputs from stdin.
58 | conf=`cat << EOM 59 | $(databricks.host) 60 | $(databricks.token) 61 | EOM` 62 | 63 | # For password auth there are three lines expected 64 | # hostname, username, password 65 | echo "$conf" | databricks configure --token 66 | - task: Bash@3 67 | displayName: 'Configure Databricks Secrets' 68 | inputs: 69 | targetType: 'inline' 70 | script: | 71 | databricks secrets create-scope --scope azureml 72 | 73 | # Use the "tenant" property from the Azure AD Service Principal command output 74 | databricks secrets put --scope azureml --key tenant_id --string-value "$(tenant_id)" 75 | 76 | # Use the "appId" property from the Azure AD Service Principal command output 77 | databricks secrets put --scope azureml --key client_id --string-value "$(client_id)" 78 | # Use the "password" property from the Azure AD Service Principal command output 79 | databricks secrets put --scope azureml --key client_secret --string-value "$(client_secret)" 80 | databricks secrets put --scope azureml --key subscription_id --string-value "$(subscription_id)" 81 | databricks secrets put --scope azureml --key resource_group --string-value "$(resource_group)" 82 | databricks secrets put --scope azureml --key workspace_name --string-value "$(workspace_name)" 83 | - task: Bash@3 84 | displayName: 'Create Notebook Path' 85 | inputs: 86 | targetType: 'inline' 87 | script: 'databricks workspace mkdirs "$(databricks.notebook.path)"' 88 | - task: Bash@3 89 | displayName: 'Import Notebooks' 90 | inputs: 91 | targetType: 'inline' 92 | script: 'databricks workspace import_dir --overwrite "$(devops.repo.notebook.path)" "$(databricks.notebook.path)"' 93 | - task: Bash@3 94 | displayName: 'Create / Get Cluster' 95 | inputs: 96 | targetType: 'inline' 97 | script: | 98 | cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}') 99 | 100 | if [ -z "$cluster_id" ] 101 | then 102 | JSON=`cat << EOM 103 | { 104 | "cluster_name": "$(databricks.cluster.name)", 105 | "spark_version": "$(databricks.cluster.spark_version)", 106 | "spark_conf": { 107 | "spark.databricks.delta.preview.enabled": "true" 108 | }, 109 | "node_type_id": "$(databricks.cluster.node_type_id)", 110 | "driver_node_type_id": "$(databricks.cluster.driver_node_type_id)", 111 | "spark_env_vars": { 112 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 113 | }, 114 | "autotermination_minutes": $(databricks.cluster.autotermination_minutes), 115 | "enable_elastic_disk": true, 116 | "autoscale": { 117 | "min_workers": $(databricks.cluster.workers.min), 118 | "max_workers": $(databricks.cluster.workers.max) 119 | }, 120 | "init_scripts_safe_mode": false 121 | } 122 | EOM` 123 | 124 | cluster_id=$(databricks clusters create --json "$JSON" | jq -r ".cluster_id") 125 | sleep 10 126 | fi 127 | 128 | echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id" 129 | - task: Bash@3 130 | displayName: 'Start Cluster' 131 | inputs: 132 | targetType: 'inline' 133 | script: | 134 | echo "Checking Cluster State (Cluster ID: $(databricks.cluster.id))..." 135 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 136 | echo "Cluster State: $cluster_state" 137 | 138 | if [ $cluster_state == "TERMINATED" ] 139 | then 140 | echo "Starting Databricks Cluster..." 
141 | databricks clusters start --cluster-id "$(databricks.cluster.id)" 142 | sleep 30 143 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 144 | echo "Cluster State: $cluster_state" 145 | fi 146 | 147 | while [ $cluster_state == "PENDING" ] 148 | do 149 | sleep 30 150 | cluster_state=$(databricks clusters get --cluster-id "$(databricks.cluster.id)" | jq -r ".state") 151 | echo "Cluster State: $cluster_state" 152 | done 153 | 154 | if [ $cluster_state == "RUNNING" ] 155 | then 156 | exit 0 157 | else 158 | exit 1 159 | fi 160 | - task: Bash@3 161 | displayName: 'Install Azure ML SDK' 162 | inputs: 163 | targetType: 'inline' 164 | script: | 165 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 166 | if [ -z "$library_status" ] 167 | then 168 | echo "Installing $(azureml.sdk) library to $(databricks.cluster.id)..." 169 | databricks libraries install --cluster-id "$(databricks.cluster.id)" --pypi-package "$(azureml.sdk)" 170 | sleep 10 171 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 172 | echo "Library Status: $library_status" 173 | fi 174 | 175 | while [ $library_status == "PENDING" -o $library_status == "INSTALLING" ] 176 | do 177 | sleep 30 178 | library_status=$(databricks libraries list --cluster-id $(databricks.cluster.id) | jq -c '.library_statuses[] | select( .library.pypi.package == "$(azureml.sdk)" ) | .status' -r) 179 | echo "Library Status: $library_status" 180 | done 181 | 182 | if [ $library_status == "INSTALLED" ] 183 | then 184 | exit 0 185 | else 186 | exit 1 187 | fi 188 | - task: Bash@3 189 | displayName: 'Create / Get Training Job' 190 | inputs: 191 | targetType: 'inline' 192 | script: | 193 | job_id=$(databricks jobs list | grep "$(databricks.job.train.name)" | awk '{print $1}') 194 | 195 | if [ -z "$job_id" ] 196 | then 197 | echo "Creating $(databricks.job.train.name) job..." 198 | JSON=`cat << EOM 199 | { 200 | "notebook_task": { 201 | "notebook_path": "$(databricks.notebook.path)/train", 202 | "base_parameters": { 203 | "alpha": "0.5", 204 | "l1_ratio": "0.5" 205 | } 206 | }, 207 | "existing_cluster_id": "$(databricks.cluster.id)", 208 | "name": "$(databricks.job.train.name)", 209 | "max_concurrent_runs": 30, 210 | "timeout_seconds": 86400, 211 | "libraries": [], 212 | "email_notifications": {} 213 | } 214 | EOM` 215 | 216 | job_id=$(databricks jobs create --json "$JSON" | jq ".job_id") 217 | fi 218 | 219 | echo "##vso[task.setvariable variable=databricks.job.train.id;]$job_id" 220 | - task: Bash@3 221 | displayName: 'Run Training Jobs' 222 | inputs: 223 | targetType: 'inline' 224 | script: | 225 | echo "Running job with ID $(databricks.job.train.id) with alpha=0.5, l1_ratio=0.5..." 
226 | run_id1=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.5", "l1_ratio": "0.5" }' | jq ".run_id") 227 | echo " Run ID: $run_id1" 228 | 229 | run_state=$(databricks runs get --run-id $run_id1 | jq -r ".state.life_cycle_state") 230 | echo "Run State (ID $run_id1): $run_state" 231 | while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 232 | do 233 | sleep 30 234 | run_state=$(databricks runs get --run-id $run_id1 | jq -r ".state.life_cycle_state") 235 | echo "Run State (ID $run_id1): $run_state" 236 | done 237 | result_state1=$(databricks runs get --run-id $run_id1 | jq -r ".state.result_state") 238 | state_message1=$(databricks runs get --run-id $run_id1 | jq -r ".state.state_message") 239 | echo "Result State (ID $run_id1): $result_state1, Message: $state_message1" 240 | 241 | # echo "Running job with ID $(databricks.job.train.id) with alpha=0.3, l1_ratio=0.3..." 242 | # run_id2=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.3", "l1_ratio": "0.3" }' | jq ".run_id") 243 | # echo " Run ID: $run_id2" 244 | 245 | # echo "Running job with ID $(databricks.job.train.id) with alpha=0.1, l1_ratio=0.1..." 246 | # run_id3=$(databricks jobs run-now --job-id $(databricks.job.train.id) --notebook-params '{ "alpha": "0.1", "l1_ratio": "0.1" }' | jq ".run_id") 247 | # echo " Run ID: $run_id3" 248 | 249 | # run_state=$(databricks runs get --run-id $run_id2 | jq -r ".state.life_cycle_state") 250 | # echo "Run State (ID $run_id2): $run_state" 251 | # while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 252 | # do 253 | # sleep 30 254 | # run_state=$(databricks runs get --run-id $run_id2 | jq -r ".state.life_cycle_state") 255 | # echo "Run State (ID $run_id2): $run_state" 256 | # done 257 | # result_state2=$(databricks runs get --run-id $run_id2 | jq -r ".state.result_state") 258 | # state_message2=$(databricks runs get --run-id $run_id2 | jq -r ".state.state_message") 259 | # echo "Result State (ID $run_id2): $result_state2, Message: $state_message2" 260 | 261 | # run_state=$(databricks runs get --run-id $run_id3 | jq -r ".state.life_cycle_state") 262 | # echo "Run State (ID $run_id3): $run_state" 263 | # while [ $run_state == "RUNNING" -o $run_state == "PENDING" ] 264 | # do 265 | # sleep 30 266 | # run_state=$(databricks runs get --run-id $run_id3 | jq -r ".state.life_cycle_state") 267 | # echo "Run State (ID $run_id3): $run_state" 268 | # done 269 | # result_state3=$(databricks runs get --run-id $run_id3 | jq -r ".state.result_state") 270 | # state_message3=$(databricks runs get --run-id $run_id3 | jq -r ".state.state_message") 271 | # echo "Result State (ID $run_id3): $result_state3, Message: $state_message3" 272 | 273 | # if [ $result_state1 == "SUCCESS" -a $result_state2 == "SUCCESS" -a $result_state3 == "SUCCESS" ] 274 | if [ $result_state1 == "SUCCESS" ] 275 | then 276 | exit 0 277 | echo "Training completed successfully.." 278 | else 279 | exit 1 280 | echo "Training had some errors.." 
    - task: Bash@3
      displayName: 'Build Container Image'
      inputs:
        targetType: 'inline'
        script: |
          job_id=$(databricks jobs list | grep "$(databricks.job.buildimage.name)" | awk '{print $1}')

          if [ -z "$job_id" ]
          then
            JSON=`cat << EOM
          {
            "notebook_task": {
              "notebook_path": "$(databricks.notebook.path)/serving_build_container_image"
            },
            "existing_cluster_id": "$(databricks.cluster.id)",
            "name": "$(databricks.job.buildimage.name)",
            "max_concurrent_runs": 30,
            "timeout_seconds": 86400,
            "libraries": [],
            "email_notifications": {}
          }
          EOM`

            echo "Creating a job for Building Container Image ..."
            job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
            echo "##vso[task.setvariable variable=databricks.job.buildimage.id;]$job_id"
          fi

          echo "Creating a run for JOB ID $job_id"
          #run_id=$(databricks runs submit --json "$JSON" | jq ".run_id")
          run_id=$(databricks jobs run-now --job-id $job_id | jq ".run_id")
          echo " Run ID: $run_id"

          run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
          echo "Run State (ID $run_id): $run_state"
          while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
          do
            sleep 30
            run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
            echo "Run State (ID $run_id): $run_state"
          done
          result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
          state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
          echo "Result State (ID $run_id): $result_state, Message: '$state_message'"

          if [ "$result_state" == "SUCCESS" ]
          then
            mkdir -p metadata
            databricks runs get-output --run-id $run_id | jq -r .notebook_output.result | tee metadata/image.json
            exit 0
          else
            exit 1
          fi
    # - task: CopyFiles@2
    #   displayName: 'Copy Files to Artifact Staging Directory'
    #   inputs:
    #     SourceFolder: '$(Build.SourcesDirectory)'
    #     Contents: '**/metadata/*'
    #     TargetFolder: '$(Build.ArtifactStagingDirectory)'
    # - task: PublishBuildArtifacts@1
    #   displayName: 'Publish Artifact: drop'
    #   inputs:
    #     PathtoPublish: '$(Build.ArtifactStagingDirectory)'
    #     ArtifactName: 'drop'
    #     publishLocation: 'Container'
    # - task: PublishPipelineArtifact@1
    #   displayName: 'Publish Artifact: drop'
    #   inputs:
    #     targetPath: '$(Build.ArtifactStagingDirectory)'
    #     artifact: 'drop'
    #     publishLocation: 'pipeline'
- stage: Staging
  displayName: 'Deploy to Staging'
  dependsOn: Build
  condition: succeeded()
  jobs:
  # track deployments on the environment
  - deployment: DeployToACI
    displayName: 'Deploy to Azure Container Instance'
    pool:
      vmImage: 'ubuntu-latest'
    # creates an environment if it doesn’t exist
    environment: 'drinks-quality-staging'
    strategy:
      # default deployment strategy
      runOnce:
        deploy:
          steps:
          # - task: DownloadPipelineArtifact@2
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     artifactName: 'drop'
          #     targetPath: '$(System.ArtifactsDirectory)'
          # - task: DownloadBuildArtifacts@0
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     downloadType: 'single'
          #     artifactName: 'drop'
          #     downloadPath: '$(System.ArtifactsDirectory)'
          - task: UsePythonVersion@0
            displayName: 'Use Python 3.6'
            inputs:
              versionSpec: '3.6'
              addToPath: true
              architecture: 'x64'
          - task: Bash@3
            displayName: 'Install Databricks CLI'
            inputs:
              targetType: 'inline'
              script: 'pip install -U databricks-cli'
          - task: Bash@3
            displayName: 'Configure Databricks CLI'
            inputs:
              targetType: 'inline'
              script: |
                # We need to pipe the conf into `databricks configure --token`, since
                # that command only takes input from stdin.
                conf=`cat << EOM
                $(databricks.host)
                $(databricks.token)
                EOM`

                # Two lines are expected for token auth: host and token
                # (password auth would expect three: hostname, username, password).
                echo "$conf" | databricks configure --token
          - task: Bash@3
            displayName: 'Get Cluster ID'
            inputs:
              targetType: 'inline'
              script: |
                cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}')
                if [ -z "$cluster_id" ]
                then
                  echo "ERROR: Unable to get Cluster ID"
                  exit 1
                fi
                echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id"
          - task: Bash@3
            displayName: 'Create / Get Deploy ACI Job'
            inputs:
              targetType: 'inline'
              script: |
                echo "Check if a job already exists"
                job_id=$(databricks jobs list | grep "$(databricks.job.deploytoaci.name)" | awk '{print $1}')
                if [ -z "$job_id" ]
                then
                  JSON=`cat << EOM
                {
                  "notebook_task": {
                    "notebook_path": "$(databricks.notebook.path)/serving_deploy_to_aci"
                  },
                  "existing_cluster_id": "$(databricks.cluster.id)",
                  "name": "$(databricks.job.deploytoaci.name)",
                  "max_concurrent_runs": 30,
                  "timeout_seconds": 86400,
                  "libraries": [],
                  "email_notifications": {}
                }
                EOM`

                  job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
                fi

                echo "##vso[task.setvariable variable=databricks.job.deploytoaci.id;]$job_id"
          # - task: Bash@3
          #   displayName: 'Get Image ID'
          #   inputs:
          #     targetType: 'inline'
          #     script: |
          #       echo "Retrieving Image ID..."
          #       model_image_id=$(cat image.json | jq -r ".model_image_id")
          #       if [ -z "$model_image_id" ]
          #       then
          #         echo "ERROR: Unable to get Image ID"
          #         exit 1
          #       fi
          #       echo " Image ID: $model_image_id"
          #       echo "##vso[task.setvariable variable=azureml.image.id;]$model_image_id"
          #     workingDirectory: '$(System.ArtifactsDirectory)/drop/metadata'
          - task: Bash@3
            displayName: 'Run the Deploy To ACI job'
            inputs:
              targetType: 'inline'
              script: |
                echo "Running job with ID $(databricks.job.deploytoaci.id)"
                run_id=$(databricks jobs run-now --job-id $(databricks.job.deploytoaci.id) | jq ".run_id")
                echo " Run ID: $run_id"

                run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                echo "Run State (ID $run_id): $run_state"
                while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
                do
                  sleep 30
                  run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                  echo "Run State (ID $run_id): $run_state"
                done
                result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
                state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
                echo "Result State (ID $run_id): $result_state, Message: $state_message"

                if [ "$result_state" == "SUCCESS" ]
                then
                  exit 0
                else
                  exit 1
                fi
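# The `environment:` values above and below map to Azure DevOps Environments
# ('drinks-quality-staging' / 'drinks-quality-production'); deployment history is tracked
# against them, and approvals or checks can be configured on the production environment to
# gate the next stage.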
- stage: Production
  displayName: 'Deploy to Production'
  dependsOn: Staging
  condition: succeeded()
  jobs:
  # track deployments on the environment
  - deployment: DeployToAKS
    displayName: 'Deploy to Azure Kubernetes Service'
    pool:
      vmImage: 'ubuntu-latest'
    # creates an environment if it doesn’t exist
    environment: 'drinks-quality-production'
    strategy:
      # default deployment strategy
      runOnce:
        deploy:
          steps:
          # - task: DownloadBuildArtifacts@0
          #   displayName: 'Download Artifact: drop'
          #   inputs:
          #     buildType: 'current'
          #     downloadType: 'single'
          #     artifactName: 'drop'
          #     downloadPath: '$(System.ArtifactsDirectory)'
          - task: UsePythonVersion@0
            displayName: 'Use Python 3.6'
            inputs:
              versionSpec: '3.6'
              addToPath: true
              architecture: 'x64'
          - task: Bash@3
            displayName: 'Install Databricks CLI'
            inputs:
              targetType: 'inline'
              script: 'pip install -U databricks-cli'
          - task: Bash@3
            displayName: 'Configure Databricks CLI'
            inputs:
              targetType: 'inline'
              script: |
                # We need to pipe the conf into `databricks configure --token`, since
                # that command only takes input from stdin.
                conf=`cat << EOM
                $(databricks.host)
                $(databricks.token)
                EOM`

                # Two lines are expected for token auth: host and token
                # (password auth would expect three: hostname, username, password).
                echo "$conf" | databricks configure --token
          - task: Bash@3
            displayName: 'Get Cluster ID'
            inputs:
              targetType: 'inline'
              script: |
                cluster_id=$(databricks clusters list | grep "$(databricks.cluster.name)" | awk '{print $1}')
                if [ -z "$cluster_id" ]
                then
                  echo "ERROR: Unable to get Cluster ID"
                  exit 1
                fi
                echo "##vso[task.setvariable variable=databricks.cluster.id;]$cluster_id"
          - task: Bash@3
            displayName: 'Create / Get Deploy AKS Job'
            inputs:
              targetType: 'inline'
              script: |
                job_id=$(databricks jobs list | grep "$(databricks.job.deploytoaks.name)" | awk '{print $1}')

                if [ -z "$job_id" ]
                then
                  JSON=`cat << EOM
                {
                  "notebook_task": {
                    "notebook_path": "$(databricks.notebook.path)/serving_deploy_to_aks",
                    "base_parameters": {
                      "model_image_id": ""
                    }
                  },
                  "existing_cluster_id": "$(databricks.cluster.id)",
                  "name": "$(databricks.job.deploytoaks.name)",
                  "max_concurrent_runs": 30,
                  "timeout_seconds": 86400,
                  "libraries": [],
                  "email_notifications": {}
                }
                EOM`

                  job_id=$(databricks jobs create --json "$JSON" | jq ".job_id")
                fi

                echo "##vso[task.setvariable variable=databricks.job.deploytoaks.id;]$job_id"
          # - task: Bash@3
          #   displayName: 'Get Image ID'
          #   inputs:
          #     targetType: 'inline'
          #     script: |
          #       echo "Retrieving Image ID..."
          #       model_image_id=$(cat image.json | jq -r ".model_image_id")
          #       if [ -z "$model_image_id" ]
          #       then
          #         echo "ERROR: Unable to get Image ID"
          #         exit 1
          #       fi
          #       echo " Image ID: $model_image_id"
          #       echo "##vso[task.setvariable variable=azureml.image.id;]$model_image_id"
          #     workingDirectory: '$(System.ArtifactsDirectory)/drop/metadata'
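          # The next task passes $(azureml.image.id) to the serving_deploy_to_aks notebook as
          # model_image_id. With the artifact-based 'Get Image ID' task above commented out,
          # azureml.image.id is expected to be supplied as a pipeline variable instead.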
          - task: Bash@3
            displayName: 'Deploy To AKS'
            inputs:
              targetType: 'inline'
              script: |
                echo "Running job with ID $(databricks.job.deploytoaks.id) with model_id=$(azureml.image.id)..."
                run_id=$(databricks jobs run-now --job-id $(databricks.job.deploytoaks.id) --notebook-params '{ "model_image_id": "$(azureml.image.id)" }' | jq ".run_id")
                echo " Run ID: $run_id"

                run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                echo "Run State (ID $run_id): $run_state"
                while [ "$run_state" == "RUNNING" -o "$run_state" == "PENDING" ]
                do
                  sleep 30
                  run_state=$(databricks runs get --run-id $run_id | jq -r ".state.life_cycle_state")
                  echo "Run State (ID $run_id): $run_state"
                done
                result_state=$(databricks runs get --run-id $run_id | jq -r ".state.result_state")
                state_message=$(databricks runs get --run-id $run_id | jq -r ".state.state_message")
                echo "Result State (ID $run_id): $result_state, Message: $state_message"

                if [ "$result_state" == "SUCCESS" ]
                then
                  exit 0
                else
                  exit 1
                fi
--------------------------------------------------------------------------------